\documentclass[accepted]{uai2023}

\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{graphicx}
\usepackage{tocloft}
\usepackage{subfigure}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{wrapfig}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% \refstepcounter{lotdepth}


\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{condition}[theorem]{Condition}
\newtheorem{remark}[theorem]{Remark}

\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Modified Retrace for Off-Policy Temporal Difference Learning}

\author[1,2]{Xingguo Chen}
\author[1]{Xingzhou Ma}
\author[1]{Yang Li}
\author[2]{Guang Yang}
\author[1]{Shangdong Yang}
\author[2,3]{Yang Gao}
\affil[1]{%
    Jiangsu Key Laboratory of Big Data Security \& Intelligent Processing\\
    Nanjing University of Posts and Telecommunications\\
    Nanjing, Jiangsu, China
}
\affil[2]{%
    National Key Laboratory for Novel Software Technology\\
    Nanjing University\\
    Nanjing, Jiangsu, China
}
\affil[3]{%
Shenzhen Research Institute of Nanjing University\\
Shenzhen, China
}
 
\begin{document}
\maketitle

\begin{abstract}
Off-policy learning is key to extending reinforcement learning,
as it allows a target policy to be learned from data generated by
a different behavior policy. However, it is known to be unstable
when combined with bootstrapping and function approximation,
a combination referred to as ``the deadly triad''.
Retrace is an efficient and  convergent off-policy algorithm with  
tabular value functions which employs  truncated importance sampling ratios.
Unfortunately, Retrace is known to be unstable with linear function
approximation.
In this paper, we propose modified Retrace  to correct the
 off-policy return, derive a new off-policy temporal difference
learning algorithm (TD-MRetrace) with linear function
approximation, and obtain a convergence guarantee under standard assumptions.
Experimental results on counterexamples and
control tasks validate
the effectiveness of the proposed algorithm compared with traditional
algorithms.
\end{abstract}


\section{Importance of the positive definite matrix}
Positive definite matrices play an important role in the convergence
analysis of reinforcement learning algorithms with linear function
approximation.
The convergence of  TD(0) is established by \citet{sutton1988learning},
where the key is  the positive definite matrix 
$\textbf{A}_{\text{on}}$
  based on the invariance of the on-policy state
  distribution. 
Off-policy learning seeks to learn a target policy while
exploring actions according to a behavior policy to avoid
getting stuck in local optima. 
However, due to the inconsistency between the behavior policy $\mu$
and the target policy $\pi$, off-policy learning may be unstable
when combined with function approximation and bootstrapping,
known as ``the deadly triad'' \citep{sutton2018book}.
The fundamental reason is that the positive definiteness of the matrix
$\textbf{A}_{\text{off}}$ is not guaranteed 
 \citep{sutton2016emphatic}.

\citet{baird1995residual} proposed residual algorithms by minimizing 
mean squared Bellman errors to solve the residual fixed point in 
closed-form.
The key matrix is positive definite, thus ensuring the stability of the
algorithms.
However, residual methods require double sampling in non-deterministic
environments to remove dependencies between successor states.
 More importantly, the residual fixed point is in most cases 
 worse than the TD fixed point  \citep{scherrer2010should,yang2021dhqn}.

\begin{table*}
  \caption{Comparisons of learning algorithms with linear
  function approximation.}
  \label{comparison}
  \centering
  \begin{tabular}{llll}
    \toprule
    Name & definition  & update rules  & positive definite 
    \\
    \midrule
    TD&&$\Delta\theta_t=\alpha_t\delta^{\mu}(\theta_t)\phi_t$&yes\\
    Off-policy
    TD&$\rho_t=\frac{\pi(a_t|s_t)}{\mu(a_t|s_t)}$&$\Delta\theta_t=\alpha_t\rho_t\delta^{\mu}(\theta_t)\phi_t$&no\\
    Retrace&$c_t=\min\left(1,\frac{\pi(a_t|s_t)}{\mu(a_t|s_t)}\right)$&$\Delta\theta_t=\alpha_tc_t\delta^{\pi}(\theta_t)\phi_t$&no\\
    MRetrace&$x_t=\min_{a}\Bigl\{\frac{\mu(a|s_t)}{\pi(a|s_t)}\Bigr\}$&
    $\Delta\theta_t=\alpha_t\rho_t\left(r_{t+1}+(x_{t+1}\gamma
    \mathbb{E}_{\pi}[\phi_{t+1}]-\phi_t)^{\top}\theta_t\right)\phi_t$&yes\\
     TD-MRetrace&&$\Delta\omega_t=\alpha_t\big[\delta^{\mu}(\omega_t)+
     \gamma{\theta_{t}}^{\top} (\mathbb{E}_\pi[\phi_{t+1}]-\phi_{t+1})\big]\phi_t$&yes\\
    \bottomrule
  \end{tabular}
\end{table*}


Stable algorithms to solve the TD fixed point mainly include two 
approaches \citep{chen2016reinforcement,chen2023fixed}.
Gradient based methods guarantee the positive definiteness of the
correlation matrix by constructing different objective functions.
\citet{sutton2008convergent} proposed the first convergent off-policy 
temporal difference learning algorithm,  gradient TD (GTD),  
which minimizes the norm of the expected TD update (NEU)%
\footnote{The NEU objective first appeared in \citet{yao2008preconditioned} and was defined by
\citet{sutton2009fast}.}
 and
involves a positive definite matrix $\textbf{A}_{\text{GTD}}=\left(\begin{matrix}
\sqrt{\eta}\textbf{I}& \textbf{A}_{\text{off}}\\
-\textbf{A}_{\text{off}}^{\top} & 0
\end{matrix}\right)$, where $\textbf{I}$ is the identity matrix and $\eta$ is 
the stepsize ratio  of the auxiliary  parameter to
the learning parameter.
Subsequently, \citet{sutton2009fast} proposed GTD2 algorithm with
 positive definite matrix
$\textbf{A}_{\text{GTD2}}=\left(\begin{matrix}
\sqrt{\eta}\textbf{C}& \textbf{A}_{\text{off}}\\
-\textbf{A}_{\text{off}}^{\top} & 0
\end{matrix}\right)$
and TD with gradient correction (TDC) algorithm with  positive definite matrix
$\textbf{A}_{\text{TDC}}=\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{off}}$,
both of which minimize the mean square projected Bellman error (MSPBE),
where $\textbf{C}=\mathbb{E}[\phi\phi^{\top}]$ and $\phi$ is feature
of a state or state-action pair.
\citet{hackman2012faster} proposed Hybrid TD (HTD) algorithm
with a positive definite matrix 
$\textbf{A}_{\text{HTD}}=\textbf{A}_{\text{off}}^{\top}\textbf{A}_{\text{on}}^{-1}\textbf{A}_{\text{off}}$,
which replaces $\textbf{C}^{-1}$ in $\textbf{A}_{\text{TDC}}$ as
$\textbf{A}_{\text{on}}^{-1}$ to accelerate the learning rate.
\citet{liu2015finite,liu2016proximal,liu2018proximal} proposed
accelerated GTD-MP and GTD2-MP algorithm via rewriting  the objective functions,
NEU and MSPBE, in the form of a convex-concave saddle-point formulation.
\citet{zhang2021average} proposed Diff-GQ1 algorithm w.r.t saddle-point
formulation of GTD2 and Diff-GQ2 algorithm w.r.t  
two-stage gradient evaluation, both of which minimize MSPBE in the
average-reward setting.
Second-order information is used as a preconditioner \citep{yao2008preconditioned}
to accelerate TD learning, e.g., Quasi Newton TD \citep{givchi2015quasi} and
accelerated TD  \citep{pan2017accelerated}.
The main disadvantage of gradient based methods is slow convergence
due to one more parameter to be updated \citep{hallak2017consistent}.

The other approach,
based on importance sampling (IS) ratios,
corrects the returns by reweighting the state
distribution between on-policy and off-policy updates.
It was first proposed by \citet{precup2001off}
where the positive definite matrix is $\textbf{A}_{\text{on}}$.
\citet{sutton2016emphatic} proposed  emphatic TD (ETD) algorithm
with followon trace to correct from beginning of the excursion
based on IS ratios,
where the positive definite matrix is
$\textbf{A}_{\text{ETD}}=\Phi^{\top}\textbf{D}_f(\textbf{I}-\gamma 
\textbf{P}_{\pi})\Phi$ and $\textbf{D}_f$ is a diagonal matrix with diagonal
elements $f=(\textbf{I}-\gamma \textbf{P}_{\pi}^{\top})^{-1}d_{\mu}$.
\citet{hallak2016generalized} introduced an additional parameter into ETD
to tradeoff bias for variance reduction.  
\citet{zhang2020provably} proposed convergent off-policy actor-critic
algorithm in which the followon trace's variance is reduced by emphasis
approximation.
\citet{zhang2022truncated} proposed truncated emphatic TD (TETD), 
where the positive definite matrix is
$\textbf{A}_{\text{TETD}}=\Phi^{\top}\textbf{D}_{f_k}(\textbf{I}-\gamma 
\textbf{P}_{\pi})\Phi$, $f_k$ is a truncated followon trace of length $k$.
The main disadvantage of ETD and TETD is that the followon trace may 
be of very high variance.


\citet{munos2016safe} proposed the Retrace algorithm with safe and efficient
IS ratios truncated at 1,
 which guarantees convergence with a contraction
 mapping in the case of a look-up table.
 However, based on an action-value extension to Baird's counterexample,
 it was pointed out that Retrace is not guaranteed to be stable when combined with function
 approximation \citep{touati2018convergent}.
Then, a convergent gradient-based Retrace (GRetrace) was proposed 
based on a quadratic convex-concave saddle-point formulation, which minimizes
MSPBE \citep{touati2018convergent}.
However, this returns to the disadvantage of slow convergence of the gradient
TD learning families.


\textbf{Our contributions:} 
In this paper, we explore modified Retrace  to correct the
 off-policy return, and derive a new off-policy temporal difference
learning algorithm (TD-MRetrace).
Its key matrix is positive definite, thus
ensuring the learning stability.

The rest of this paper is organized as follows. First, 
 related notations and background are introduced. Second, we
revisit the fundamental reason why Retrace with linear function 
approximation is not stable, propose
Modified Retrace (MRetrace) to correct off-policy update, and derive an
off-policy learning algorithm, TD-MRetrace (see Table \ref{comparison}). 
After that, we show a convergence guarantee for TD-MRetrace algorithm under
standard conditions in the off-policy setting.
Finally, we experimentally verify the proposed algorithm on both prediction
tasks and control tasks.


\section{Notation and background}



A reinforcement learning agent interacts with its environment,
which we model as a discounted Markov Decision Process
$\langle S, A, R,T, \gamma\rangle$, where 
$S$ is a finite state space, $|S|=n$, $A$ is an action space,
$T:S\times A\times S\rightarrow [0,1]$ is a transition function,
$R:S\times A\times S\rightarrow \mathbb{R}$ is a reward function,
$\gamma\in[0,1)$ is a discount factor. 
Policy $\pi:S\times A\rightarrow [0,1]$ offers the probability $\pi(a|s)$
to choose action $a$ in state $s$.
State value function for policy $\pi$, denoted $V^{\pi}:S\rightarrow
\mathbb{R}$, represents the expected sum of discounted rewards in
the MDP under policy $\pi$:
$V^{\pi}(s)=\mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty}\gamma^tr_t|s_0=s\right]$.
Action value function $Q^{\pi}:S\times A\rightarrow
\mathbb{R}$ is defined as 
$Q^{\pi}(s,a)=\mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty}\gamma^tr_t|s_0=s,a_0=a\right]$.
$V^{\pi}$ is the fixed point of the Bellman operator over the value function
$\mathcal{T}^{\pi}V=r+\gamma \textbf{P}_{\pi}V$,
where $r$ is the expected immediate reward and $\textbf{P}_{\pi}$ 
denotes the $n\times n$ matrix of transition probabilities
\begin{equation}
[\textbf{P}_{\pi}]_{ij}\dot{=}\sum_{a\in A}\pi(a|i)T(i,a,j).
\end{equation}
Assume that the stationary state distribution $d_{\pi}$ under policy $\pi$
exists. Then one special property is
the invariance  of distribution $d_{\pi}$,
\begin{equation}
d_{\pi} = \textbf{P}_{\pi}^{\top}d_{\pi}.
\label{property}
\end{equation}

When the state space is too large to preserve $V^{\pi}(s)$,
a linear function approximation is used to generalize
between different states $V^{\pi}(s)\approx V_{\theta}(s)= \theta^{\top}\phi(s)
=\sum_{i=1}^m\theta_i\phi_i(s)$, where $\theta$ is the weight vector,
 $\phi(s)$ is the feature vector of state $s$, and the feature size is far 
 less than the state space $m\ll n$.
 The action value function is generalized as $Q(s,a)\approx 
Q_{\theta}(s,a)=\theta^{\top}\phi(s,a)$, where $\phi(s,a)$
is the feature vector of the state-action pair.
  Notably, equation 
 $V_{\theta}=\mathcal{T}^{\pi}V_{\theta}$  no longer holds because 
 the number of parameters is far less than the number of equations.
 A common and efficient solution is the TD fixed point 
 $V_{\theta}=\Pi\mathcal{T}^{\pi}V_{\theta}$  
 with projection
 $\Pi=\Phi(\Phi^{\top}\textbf{D}_{\pi}\Phi)^{-1}\Phi^{\top}\textbf{D}_{\pi}$,
 where $\Phi$ is the $n\times m$ matrix with the $\phi(s)$ as its rows,
 $\textbf{D}_{\pi}$ is the $n\times n$ diagonal matrix with $d_{\pi}$ on its diagonal. 
It can be learned by the on-policy TD(0) algorithm:
\begin{equation}
\begin{split}
\theta_{t+1}
\dot{=}&\theta_t+\alpha_t\left(r_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_{t}\right)\phi_t\\
=&\theta_t+\alpha_t\left(r_{t+1}\phi_t-\phi_t(\phi_t-\gamma\phi_{t+1})^{\top}\theta_t\right),
\end{split}
\label{onpolicytd}
\end{equation}
where $\alpha_t>0$ is a step-size parameter, and we have used the shorthand
$\phi_t\dot{=}\phi(s_t)$. 
The convergence analysis of algorithms
with linear function approximation is mainly
based on the ODE (Ordinary Differential Equations) approach
\citep{borkar2000ode}, where the key relies on the matrix $\textbf{A}$
 being positive definite, i.e. $\forall x\neq 0$, $x^{\top}\textbf{A}x>0$. 
Let $\textbf{A}_{
\text{on}}$ denote the key matrix  of the expected update (\ref{onpolicytd}):
\begin{equation}
\begin{split}
\textbf{A}_{\text{on}}
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\pi}\left[\phi_t(\phi_t-\gamma\phi_{t+1})^{\top}\right]\\
&=\Phi^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma
  \textbf{P}_{\pi})\Phi.
\end{split}
\end{equation}
With property (\ref{property}),  $\textbf{A}_{
\text{on}}$ is proved to be positive definite, thus the convergence of the
on-policy TD algorithm is established \citep{sutton1988learning}.

In this paper, we are concerned with off-policy learning, where
 the target policy $\pi$ is different from the behavior policy $\mu$ that
 generates  experiences $\langle s_t,
a_t,r_{t+1},s_{t+1},a_{t+1}\rangle$.
There are two ways to implement the off-policy learning.
One is to use the experiences of the behavior policy and 
simply multiply the whole on-policy TD update (\ref{onpolicytd}) by the
importance sampling ratio $\rho_t=\frac{\pi(a_t|s_t)}{\mu(a_t|s_t)}$,
e.g., off-policy TD:
\begin{equation}
\begin{split}
\theta_{t+1}
\dot{=}&\theta_t+\rho_t\alpha_t(r_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_{t})\phi_t\\
=&\theta_t+\alpha_t\left(\rho_tr_{t+1}\phi_t-\rho_t\phi_t(\phi_t-\gamma\phi_{t+1})^{\top}\theta_t\right).
\end{split}
\label{offpolicytd}
\end{equation} 
Its key matrix is:
\begin{equation}
\begin{split}
\textbf{A}_{\text{off}}&
=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[\rho_t\phi_t(\phi_t-\gamma\phi_{t+1})^{\top}\right]\\
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[\frac{\pi(a|s)}{\mu(a|s)}\phi_t(\phi_t-\gamma\phi_{t+1})^{\top}\right]\\
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\pi}\left[\phi_t(\phi_t-\gamma\phi_{t+1})^{\top}\right]\\
&=\Phi^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{P}_{\pi})\Phi.
\end{split}
\label{aoff}
\end{equation}
The other is to directly use the target policy:
\begin{equation}
\begin{split}
\theta_{t+1}
\dot{=}&\theta_t+\alpha_t(r_{t+1}+\gamma\theta_t^{\top}\mathbb{E}_{\pi}[\phi_{t+1}]-\theta_t^{\top}\phi_{t})\phi_t\\
=&\theta_t+\alpha_t\left(r_{t+1}\phi_t-\phi_t(\phi_t-\gamma\mathbb{E}_{\pi}[\phi_{t+1}])^{\top}\theta_t\right).
\end{split}
\label{Q}
\end{equation} 
% Q-learning can be seen as a special case of (\ref{Q}), if we set
% $\theta_t^{\top}\mathbb{E}_{\pi}[\phi_{t+1}]\dot{=}\max_a\theta_t^{\top}\phi(s_{t+1},a)=\max_a
% Q_{\theta_t}(s_{t+1},a)$.
The key matrices of these two off-policy learning algorithms share the same form 
$\textbf{A}_{\text{off}}=\Phi^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{P}_{\pi})\Phi$.



\subsection{2-state counterexample}

% \begin{figure}
% \begin{center}
% 	\includegraphics[scale=0.1]{2StateExample.pdf}
% 	\caption{The 2-state counterexample.}
% \end{center}
% \end{figure}






The $\theta\rightarrow 2\theta$ problem has only two states
\citep{tsitsiklis1997analysis,sutton2016emphatic}.
From each state, there are two actions, \emph{left} and \emph{right}, which
take the agent to the left or right state. All rewards are zeros.
\begin{wrapfigure}{r}{4cm}
\centering
\includegraphics[scale=0.1]{2StateExample.pdf}
% \caption{The 2-state counterexample.}
\end{wrapfigure}
The features $\Phi=(1,2)^{\top}$ are assigned to the left and the right state.
The behavior policy takes the equal probability to \emph{left} or \emph{right}
in both states, i.e., $\textbf{P}_{\mu}= \left[
\begin{array}{ll}
0.5 &0.5\\
0.5 &0.5
\end{array}
\right].$
The target policy only selects action right in both states, i.e.,
$\textbf{P}_{\pi}=
\left[
\begin{array}{ll}
0 &1\\
0 &1
\end{array}
\right].$
The state distribution of the behavior policy is $d_{\mu}=(0.5,0.5)^{\top}$.
The discount factor is $\gamma=0.9$.



For the counterexample, the key matrix of the off-policy TD is
$\textbf{A}_{\text{off}}=\Phi^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{P}_{\pi})\Phi=-0.2$.
 This means that off-policy TD is not stable.



\subsection{Instability of Retrace}
Retrace algorithm belongs to the second implementation of 
off-policy learning.
It employs truncated IS ratios $c_t=\min(1,\rho_t)$ 
and guarantees convergence with a look-up value function \citep{munos2016safe}.
We revisit Retrace(0) with linear function
approximation by \citet{touati2018convergent}, where the truncated IS ratios are
multiplied to the whole TD error:
\begin{equation}
\begin{split}
\theta_{t+1}
\dot{=}&\theta_t+c_t\alpha_t\left(r_{t+1}+\gamma\theta_t^{\top}\mathbb{E}_{\pi}[\phi_{t+1}]-\theta_t^{\top}\phi_{t}\right)\phi_t\\
=&\theta_t+\alpha_t\left(c_tr_{t+1}\phi_t-c_t\phi_t(\phi_t-\gamma\mathbb{E}_{\pi}[\phi_{t+1}])^{\top}\theta_t\right),\\
\end{split}
\label{touatiretrace0}
\end{equation}
where $\mathbb{E}_{\pi}[\phi_{t+1}]=\sum_a\pi(a|s_{t+1})\phi(s_{t+1},a)$.
The key matrix  of the
expected  Retrace's update (\ref{touatiretrace0}) is:
\begin{equation}
\footnotesize
\begin{split}
\textbf{A}_{\text{
Retrace(0)}}&=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[c_t\phi_t(\phi_t-\gamma\mathbb{E}_{\pi}[\phi_{t+1}])^{\top}\right]\\
 &=\Phi^{\top}\textbf{D}_{\mu}\textbf{D}_{c}(\textbf{I}-\gamma
  \textbf{P}_{\pi})\Phi,
\end{split}
\end{equation}
where $\textbf{D}_{c}$ is the $n\times n$ diagonal matrix with $d_{c}$ on its diagonal, each component of $d_{c}$ is 
\begin{equation}
d_c(s)=\sum_a\min(\mu(a|s),\pi(a|s)).
\label{dc}
\end{equation}

In the counterexample, according to (\ref{dc}), 
$d_c=(0.5,0.5)^{\top}$.
Then, the key matrix of Retrace(0) algorithm for this example is: 
$\textbf{A}_{\text{Retrace(0)}}=\Phi^{\top}\textbf{D}_{\mu}\textbf{D}_{c}(\textbf{I}-\gamma
\textbf{P}_{\pi})\Phi=-0.1$.
Thus,  
Retrace(0) with linear function approximation is not stable.







\section{TD-MRetrace algorithm}
In this section we propose a mechanism to correct off-policy update and derive
new algorithms.
\subsection{Modified retrace}
Importance sampling ratios, $\rho_t=\frac{\pi(a_t|s_t)}{\mu(a_t|s_t)}$,
represent the ``off-policyness''   of the current state and action between the target
policy and the behavior policy.  The farther the target policy deviates,
the more unstable the learning algorithm will be. 
In this sense, the maximum of the ``off-policyness'', 
$\max_a\rho_t=\max_a\Bigl\{\frac{\pi(a|s_t)}{\mu(a|s_t)}\Bigr\}$, 
is the key to the instability of off-policy learning algorithms.

In order to reduce the impact of the deviation of the target policy,
 we introduce modified retrace (MRetrace) that takes the reciprocal of the 
above maximum degree as follows:
\begin{equation}
x(s_t)\dot{=}\frac{1}{\max_a\rho_t}=\min_a\Bigl\{\frac{1}{\rho_t}\Bigr\}=\min_a\Bigl\{\frac{\mu(a|s_t)}{\pi(a|s_t)}\Bigr\}.
\end{equation}
Obviously, $x(s_t)\leq 1$\footnote{Note that $\sum_a\mu(a|s_t)=1$,
$\sum_a\pi(a|s_t)=1$.}, and $x(s_t)=1$ only when 
$\forall a$, $\pi(a|s_t)=\mu(a|s_t)$.

\subsubsection{MRetrace learning for prediction}
We use the first way to learn state values for prediction.
The resulting temporal difference learning algorithm,
which we call MRetrace learning, is
\begin{equation}
\begin{split}
\theta_{t+1}
\dot{=}&\theta_t+\alpha_t\rho_t\big(
r_{t+1}+x_t\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_{t}\big)\phi_t
\\=&\theta_t+\alpha_t\big(\rho_tr_{t+1}\phi_t-\rho_t\phi_t(\phi_t-x_t\gamma\phi_{t+1})^{\top}\theta_t\big)
\\=&\theta_t+\alpha_t\big(\textbf{b}_t-\textbf{A}_t\theta_t\big),
\end{split}
\label{mretrace}
\end{equation}
where $x_t$ is short for $x(s_t)$,
$\textbf{b}_t=\rho_tr_{t+1}\phi_t$, $\textbf{A}_t=\rho_t\phi_t\left(\phi_t-x_t\gamma\phi_{t+1}\right)^{\top}$.
Then,
\begin{equation}
\begin{split}
\textbf{b}=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}[\textbf{b}_{t}]
=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[\rho_tr_{t+1}\phi_t\right]=\Phi^{\top}\textbf{D}_{\mu}r_{\pi},
\label{bvalue}
\end{split}
\end{equation}
where $r_{\pi}$ is the expected reward vector under policy $\pi$ with each component
$r_{\pi}(s)=\sum_{a}\pi(a|s)\sum_{s'}T(s,a,s')R(s,a,s')$.
The key matrix of MRetrace is
\begin{equation}
\begin{split}
\textbf{A}=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[\textbf{A}_{t}\right]
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[\rho_t\phi_t(\phi_t-x_t\gamma
\phi_{t+1})^{\top}\right]\\
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}\left[\frac{\pi(a|s)}{\mu(a|s)}\phi_t(\phi_t-x_t\gamma
\phi_{t+1})^{\top}\right]\\
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\pi}\left[\phi_t(\phi_t-x_t\gamma
\phi_{t+1})^{\top}\right]\\
&=\Phi^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi})\Phi,
\label{aEvaVal}  
\end{split}
\end{equation}
where $\textbf{D}_{x}$ is the $n\times n$ diagonal matrix with 
$d_x$ on its diagonal, each component of $d_x$ is
$d_x(s)=\min_b\Bigl\{\frac{\mu(b|s)}{\pi(b|s)}\Bigr\}$.




\subsubsection{MRetrace learning for control}
We use the second way to learn action values for control.
The update rule is as follows:
\begin{equation}
\begin{split}
\theta_{t+1} 
&= \theta_t + \alpha_t\rho_t(r_{t+1} + x_{t+1}\gamma \theta_t^\top
\mathbb{E}_\pi[\phi_{t+1}]-\theta_t^\top\phi_t)\phi_t\\
&=\theta_t + \alpha_t\rho_t(r_{t+1}\phi_t -\phi_t(\phi_t- x_{t+1}\gamma 
\mathbb{E}_\pi[\phi_{t+1}])^{\top}\theta_t)\\
&=\theta_t + \alpha_t(\textbf{b}_t-\textbf{A}_t\theta_t),
\label{mretracecontrol}
\end{split}
\end{equation}
where $\textbf{b}_t=\rho_tr_{t+1}\phi_t$, and
$\textbf{A}_t=\rho_t\phi_t(\phi_t- x_{t+1}\gamma
\mathbb{E}_{\pi}[\phi_{t+1}])^\top$.
Then,
\begin{equation}
\label{bcontrol}
\textbf{b}=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}[\textbf{b}_{t}]
=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}[\rho_tr_{t+1}\phi_t]
=\Phi^{\top} \textbf{D}_{\mu} r_{\pi}.
\end{equation}
The key matrix is 
\begin{equation}
\label{acontrol}
\begin{split}
\textbf{A}
=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}[\textbf{A}_t]
&=\lim_{t\rightarrow\infty}\mathbb{E}_{\mu}[\rho_t\phi_t(\phi_t-
x_{t+1}\gamma \mathbb{E}_{\pi}[\phi_{t+1}])^\top]\\
&={\Phi}^\top \textbf{D}_\mu(\textbf{I}-\gamma
\textbf{D}_{x}\textbf{P}_{\pi})\Phi.
\end{split}
\end{equation}

It is worth noting that if we remove the importance
 sampling ratios $\rho_{t}$ from (\ref{mretracecontrol}),
 the key matrix  $\textbf{A}$ remains the same
 since the terms, $r_{t+1}$, $\phi_t$ and $\mathbb{E}_{\pi}[\phi_{t+1}]$
 in the state action  values are independent of $\rho_{t}$.
 But we still keep $\rho_{t}$ for reasons explained below.
 When the successor state is composed
 of afterstate and dynamics, e.g., Tetris game,
 one usually learns the afterstate values.
 The distribution of these afterstates is generated by the
 behavior policy. Therefore, $\rho_{t}$ is needed to correct the
 target returns.

From (\ref{bvalue}), (\ref{aEvaVal}), (\ref{bcontrol}) and (\ref{acontrol}), we
can see that the expectation of updates for MRetrace learning algorithms share
the same form.
 The only difference is that the feature matrix is defined on state 
 for prediction and on state-action pair for control.
 
 For the 2-state counterexample, $d_x=(0.5,0.5)^{\top}$, 
the value of the new key
matrix is as follows:
\begin{equation}
\textbf{A}=
\Phi^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
\textbf{D}_{x}\textbf{P}_{\pi})\Phi=1.15.
\end{equation}
This shows that our algorithm is convergent on this counterexample.
The specific theoretical proof and experimental verification are left to
following sections.


\subsection{About the solved TD fixed point}
MRetrace enhances stability by reducing the impact of off-policy.
It is important to show what solution it seeks.


When the parameter vector $\theta$ in (\ref{mretrace}) is no longer updated,
it means that the MRetrace algorithm converges.  
In this case, $\textbf{b}-\textbf{A}\theta=0$. That is
$\theta=\textbf{A}^{-1}\textbf{b}$ if $\textbf{A}$ is invertible.
It is the solution to  the following expectation equation:
\begin{equation}
\mathbb{E}_{\mu}\left[\rho(r+x\gamma
\theta^{\top}\mathbb{E}_{\pi}[\phi']-\theta^{\top}\phi)\phi\right]=0.
\label{tdfixedpoint}
\end{equation}

\begin{lemma}
The TD fixed point (\ref{tdfixedpoint}) follows from
$V_{\theta}=\Pi\mathcal{T}_{x}^{\pi} V_{\theta}$,
where the modified Bellman operator 
$\mathcal{T}_{x}^{\pi}$ is defined as
\begin{equation}
\mathcal{T}_{x}^{\pi} V\dot{=}r+\gamma \textbf{D}_x \textbf{P}_{\pi}V.
\end{equation} 
\end{lemma}
\begin{proof}
\begin{equation}
\begin{split}
0&=\mathbb{E}_{\mu}\left[\rho(r+x\gamma
\theta^{\top}\mathbb{E}_{\pi}[\phi']-\theta^{\top}\phi)\phi\right]\\
&=\sum_{s}d_{\mu}(s)\mathbb{E}_{\pi}[(r+x\gamma V_{\theta}(s')-V_{\theta}(s))\phi(s)]\\
&=\Phi^{\top}\textbf{D}_{\mu}(\mathcal{T}_{x}^{\pi}V_{\theta}-V_{\theta}).
\end{split}
\end{equation}
We have
\begin{equation}
\begin{split}
\Phi^{\top}\textbf{D}_{\mu}\mathcal{T}_{x}^{\pi}V_{\theta}
&=\Phi^{\top}\textbf{D}_{\mu}V_{\theta}\\
&=\Phi^{\top}\textbf{D}_{\mu}\Phi\theta.
\end{split}
\end{equation}
Then,
$\theta=(\Phi^{\top}\textbf{D}_{\mu}\Phi)^{-1}\Phi^{\top}\textbf{D}_{\mu}\mathcal{T}_{x}^{\pi}V_{\theta}$.
That is
$V_{\theta}=\Phi\theta
=\Phi(\Phi^{\top}\textbf{D}_{\mu}\Phi)^{-1}\Phi^{\top}\textbf{D}_{\mu}\mathcal{T}_{x}^{\pi}V_{\theta}
=\Pi\mathcal{T}_{x}^{\pi} V_{\theta}.$
\end{proof}

According to \citet{scherrer2010should}, (\ref{tdfixedpoint})
is a TD fixed point due to the projection direction ($\textbf{D}_{\mu}\Phi$)
in the projection operator $\Pi$. Note that
it is neither the TD fixed point of the behavior policy, 
nor the exact TD fixed point of the target policy in MDP $\langle S, A, R,T,
\gamma\rangle$.


Define a discount variable $\gamma^{\mu,\pi}$ on state $s$ as
$\gamma^{\mu,\pi}(s)=\gamma x(s).$
Then, the modified Bellman operator 
$\mathcal{T}_{x}^{\pi}$ in MDP $\langle S, A, R,T,
\gamma\rangle$ equals the Bellman operator $\mathcal{T}^{\pi}$
in MDP $\langle S, A, R,T, \gamma^{\mu,\pi}\rangle$.

Thus, MRetrace (\ref{mretrace}) solves 
the TD fixed point of the target policy in MDP
$\langle S, A, R,T, \gamma^{\mu,\pi}\rangle$. 

\subsection{TD-MRetrace algorithm}
Remember that our objective is to solve the TD fixed point
of the target policy in MDP $\langle S, A, R,T,\gamma\rangle$.

Consider another weight vector $\omega$, 
the off-policy TD error $\delta^{\pi}(\omega_t)$ can be 
decomposed as follows:
\begin{equation}
\begin{split}
\delta^{\pi}(\omega_t)&= r_{t+1} + \gamma
\omega_t^{\top}\mathbb{E}_{\pi}[\phi_{t+1}]-\omega_t^{\top}\phi_t\\
&= r_{t+1} + \gamma \omega_t^{\top}
(\mathbb{E}_{\pi}[\phi_{t+1}]-\phi_{t+1}+\phi_{t+1})-\omega_t^{\top}\phi_t\\
&= r_{t+1} + \gamma \omega_t^{\top}\phi_{t+1}-\omega_t^{\top}\phi_t
+\gamma \omega_t^{\top}(\mathbb{E}_{\pi}[\phi_{t+1}]-\phi_{t+1})\\
&=\delta^{\mu}(\omega_t)+\delta^{\text{off}}(\omega_t),
\end{split}
\end{equation}
where the on-policy TD error 
$\delta^{\mu}(\omega_t)\dot{=}r_{t+1} + \gamma
\omega_t^{\top}\phi_{t+1}-\omega_t^{\top}\phi_t$, and the off-policy correction $\delta^{\text{off}}(\omega_t)
\dot{=}\gamma \omega_t^{\top}
(\mathbb{E}_{\pi}[\phi_{t+1}]-\phi_{t+1})$.

It is a hybrid approach
that combines the on-policy update and the off-policy update together
\citep{hackman2012faster}.
When our target and behavior policy are the same,
$\mathbb{E}_{\pi}[\phi_{t+1}]-\phi_{t+1}=0$, the update
 becomes the expected Sarsa update.
 Therefore, the instability is due to the off-policy correction
 $\delta^{\text{off}}(\omega_t)$.
 
 
 Let the off-policy correction be approximated as
 $\delta^{\text{off}}(\omega_t)\approx \delta^{\text{off}}(\theta_t)$
 based on the proposed MRetrace.
 Then, the off-policy TD error can be approximated as follows:
 \begin{equation}
\begin{split}
\delta^{\pi}(\omega_t)&=\delta^{\mu}(\omega_t)+\delta^{\text{off}}(\omega_t)\\
&\approx \delta^{\mu}(\omega_t)+\delta^{\text{off}}(\theta_t)\\
&=r_{t+1} +\gamma \omega_t^{\top}\phi_{t+1}-\omega_t^{\top}\phi_t
+ \gamma\theta_t^{\top} (\mathbb{E}_{\pi}[\phi_{t+1}]-\phi_{t+1}).
\end{split}
\end{equation}


The resultant algorithm,
which we call TD-MRetrace, is
\begin{equation}
\begin{split}
\omega_{t+1}=
&\omega_t +
\alpha_t\delta^{\mu}(\omega_t)\phi_t+\alpha_t\delta^{\text{off}}(\theta_t)\phi_t\\
=&\omega_t + \alpha_t\big[r_{t+1}+\gamma
\omega_t^{\top}\phi_{t+1}-\omega_t^{\top}\phi_t\big]\phi_t \\
&+\alpha_t\gamma{\theta_{t}}^{\top}
(\mathbb{E}_\pi[\phi_{t+1}]-\phi_{t+1})\phi_t,
\end{split}
\label{tdMRetrace}
\end{equation}
where $\theta_t$ is generated by (\ref{mretrace}). 
Note that the update to $\omega_t$ is the sum of two terms,
and that the first term is exactly the same as the on-policy update.
The second term has nothing to do with $\omega$ and can be regarded
as a correction of the reward in the off-policy case.
Once $\theta$ converges, $\omega$ will converge as in on-policy TD learning.


\section{Convergence}

The purpose of this section is to establish that  the 
TD-MRetrace algorithm converges with probability one under
standard assumptions when $\{\phi_t,r_t,\mathbb{E}_{\pi}[\phi_{t+1}]\}$ is obtained by
the off-policy subsampling process
\citep{sutton2008convergent}.

Let $s$ be a state randomly drawn from $d_{\mu}$,  and let $s'$ be a state
obtained by following $\pi$ for one time step in  the MDP from $s$. Let the
behavior policy $\mu$  select all actions of the  target policy $\pi$ with
positive probability in every state, and the  target policy is deterministic.
Further, let $r(s,s')$ be the reward incurred.

\begin{assumption}
\label{assump1}
The Markov chain ($s_t$) is aperiodic and irreducible, 
so that $\lim_{t\rightarrow\infty}\mathbb{P}(s_t=s'|s_0=s)=d_{\mu}(s')$
exists and  is unique. 
\end{assumption}

This assumption implies that the state distribution
vector $d_{\mu}$ of the behavior policy $\mu$ is the fixed point of 
\begin{equation}
d_{\mu}=\textbf{P}_{\mu}^{\top}d_{\mu},
\end{equation} 
 where each element of the matrix $\textbf{P}_{\mu}$ is given by
 \begin{equation}
 [\textbf{P}_{\mu}]_{ss'}=\sum_{a}\mu(a|s)T(s,a,s').
 \end{equation}

\begin{assumption}
$\{\phi_t,r_t,\mathbb{E}_{\pi}[\phi_{t+1}]\}$ is such that 
$\mathbb{E}_{\mu}[||\phi_t||^2|s_{t_1}]$, $\mathbb{E}_{\mu}[r_t^2|s_{t_1}]$,
$\mathbb{E}_{\pi}[||\phi_{t+1}||^2|s_{t_1}]$ are uniformly bounded.
\label{assump2}
\end{assumption}

\begin{assumption}
The feature matrix $\Phi$ has full column rank.
\label{fullrank}
\end{assumption} 

\begin{assumption}
Step-size sequence $\alpha_t$ satisfies 
 $\alpha_t\in (0,1]$,  $
\sum_{t=0}^{\infty}\alpha_t=\infty,
$ and
$
\sum_{t=0}^{\infty}\alpha_t^2<\infty.
$
\label{assump3}
\end{assumption}


\begin{theorem}[Convergence of MRetrace with an off-policy sub-sampled
process]
\label{theorem1}
Assume that Assumptions~\ref{assump1},
 \ref{assump2},
  \ref{fullrank}, and
\ref{assump3} hold.
Let the parameter $\theta_t$ be updated by  iteration
(\ref{mretrace}).
Let 
 $\textbf{A}
 =\mathbb{E}_{\mu}\left[\rho_t\phi_t(\phi_t-x_t\gamma\mathbb{E}_{\pi}[\phi_{t+1}])^{\top}\right]$,
 $\textbf{b}=\mathbb{E}_{\mu}[\rho_tr_t\phi_t]$.
Then the parameter vector $\theta_t$ converges with probability one 
to the TD fixed-point $\theta^{*}=\textbf{A}^{-1}\textbf{b}$
(\ref{tdfixedpoint}).
\end{theorem}

\begin{proof}


The proof follows from the procedures of
\citet{sutton2008convergent,sutton2009fast} for GTD and GTD2, which are based on
the ordinary-differential-equation (ODE) approach \citep{borkar2000ode}. 
 First, 
$\textbf{A}$ and $\textbf{b}$ are well-defined according to  Assumption
\ref{assump1} and \ref{assump2}.

Now we apply Theorem 2.2 of \citet{borkar2000ode}. 
We write 
$\theta_{t+1}=\theta_t+\alpha_t(-\textbf{A}\theta_t+\textbf{b}+(\textbf{A}-\textbf{A}_{t+1})\theta_t+(\textbf{b}_{t+1}-\textbf{b}))
=\theta_t+\alpha_t(h(\theta_t)+M_{t+1})$,
where $h(\theta)=\textbf{b}-\textbf{A}\theta$ and $M_{t+1}=(\textbf{A}-\textbf{A}_{t+1})\theta_t+\textbf{b}_{t+1}-\textbf{b}$.
Let $\mathcal{F}_t=\sigma(\theta_1,M_1,\ldots,\theta_{t-1},M_t).$
Theorem 2.2 requires the verification of the following conditions:
 (i) The function $h$ is Lipschitz and
$h_{\infty}(\theta)=\lim_{r\rightarrow\infty}h(r\theta)/r$
is well-defined for every $\theta\in\mathbb{R}^m$;
(ii-a) The sequence $(M_t,\mathcal{F}_t)$ is a martingale difference sequence,
and (ii-b) for some $C_0>0$, $\mathbb{E}[||M_{t+1}||^2|\mathcal{F}_t]\leq
C_0(1+||\theta_t||^2)$ holds for any initial parameter vector $\theta_1$;
(iii) The sequence $\alpha_t$ satisfies $0<\alpha_t\leq 1$, 
$\sum_{t=0}^{\infty}\alpha_t=\infty$, and
$\sum_{t=0}^{\infty}\alpha_t^2<\infty$;
(iv) The ODE $\dot{\theta}=h_{\infty}(\theta)$ has the origin as a globally
asymptotically stable equilibrium;
 and (v) The ODE $\dot{\theta}=h(\theta)$ has a unique globally asymptotically
stable equilibrium. 

Clearly, $h(\theta)$ is Lipschitz with coefficient $||\textbf{A}||$ and
$h_{\infty}(\theta)=-\textbf{A}\theta$. By construction, $(M_t,\mathcal{F}_t)$
satisfies $\mathbb{E}[M_{t+1}|\mathcal{F}_t]=0$ and $M_t\in \mathcal{F}_t$,
i.e., it is a martingale difference sequence.
Condition (ii-b) can be shown to
hold by a simple application of the triangle inequality and
the boundedness of the second moments of $\{\phi_t,r_t,\mathbb{E}_{\pi}[\phi_{t+1}]\}$ (Assumption~\ref{assump2}).
Condition (iii) is satisfied by our conditions on the step-size sequences
$\alpha_t$.

For the last two conditions, we begin by showing that the matrix
 $\textbf{A}
 =\mathbb{E}_{\mu}[\phi_t(\phi_t-x_t\gamma\mathbb{E}_{\pi}[\phi_{t+1}])^{\top}]
 =\Phi^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi})\Phi$ is positive definite.

Note that $\textbf{A}$ consists of $\Phi^{\top}$ and $\Phi$ wrapped around 
an $n\times n$ matrix $\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi})$.
Since the feature matrix $\Phi$ has full column rank (Assumption~\ref{fullrank}),
 $\textbf{A}$ is positive definite whenever the key matrix
 $\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi})$ is positive definite.

Based on two theorems shown by \citet{sutton1988learning,sutton2016emphatic},
positive definiteness of the key matrix is assured if all of its
columns and rows sum to positive numbers. 
The first theorem is that any matrix $\textbf{M}$ is positive definite if and only if
the symmetric matrix $\textbf{S}=\textbf{M}+\textbf{M}^{\top}$ is positive definite.
The second theorem is that any symmetric real matrix $\textbf{S}$ is positive definite
if all of its diagonal entries are positive and greater than the sum of
the absolute values of the corresponding off-diagonal entries.
For the key matrix, $\textbf{M}=\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi})$, 
the diagonal entries are positive and the off-diagonal entries are nonpositive, so
all we have to show is that all components of both $(\textbf{M}\textbf{1})$
and $(\textbf{1}^{\top}\textbf{M})$ are positive, where $\textbf{1}$ is the
column vector with all components equal to 1. They can be verified as follows:
\begin{equation}
\begin{split}
\textbf{M}\textbf{1}=\textbf{D}_{\mu}(\textbf{I}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi})\textbf{1}
  &=\textbf{D}_{\mu}(\textbf{1}-\gamma
  \textbf{D}_{x}\textbf{P}_{\pi}\textbf{1})\\
  &=\textbf{D}_{\mu}(\textbf{1}-\gamma
  \textbf{D}_{x}\textbf{1})\\
  &=\textbf{D}_{\mu}(\textbf{1}-\gamma d_x)
\end{split}
\end{equation}
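Since $\mu(\cdot|s)$ and $\pi(\cdot|s)$ both sum to one over actions,
$\min_b\bigl\{\frac{\mu(b|s)}{\pi(b|s)}\bigr\}\leq 1$, so each component of
$\textbf{M}\textbf{1}$ satisfies
$[\textbf{D}_{\mu}(\textbf{1}-\gamma d_x)](s)=
d_{\mu}(s)(1-\gamma\min_b\bigl\{\frac{\mu(b|s)}{\pi(b|s)}\bigr\})\geq
d_{\mu}(s)(1-\gamma)>0.$
For the column sums, we first bound each entry of
$\textbf{D}_{x}\textbf{P}_{\pi}$ by the corresponding entry of
$\textbf{P}_{\mu}$: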

\begin{equation}
\begin{split}
[\textbf{D}_{x}\textbf{P}_{\pi}]_{ij}&=\min_b\Bigl\{\frac{\mu(b|i)}{\pi(b|i)}\Bigr\}\sum_{a}\pi(a|i)T(i,a,j)\\
&=\sum_{a}\pi(a|i)\min_b\Bigl\{\frac{\mu(b|i)}{\pi(b|i)}\Bigr\}T(i,a,j)\\
&\leq\sum_{a}\pi(a|i)\frac{\mu(a|i)}{\pi(a|i)}T(i,a,j)\\
&= \sum_{a} \mu(a|i)T(i,a,j)\\
&=[\textbf{P}_{\mu}]_{ij}.
\end{split}
\end{equation}

\begin{equation}
\begin{split}
\textbf{1}^{\top}\textbf{M}&=\textbf{1}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
\textbf{D}_{x}\textbf{P}_{\pi}) \\
&=d_{\mu}^{\top}(\textbf{I}-\gamma
\textbf{D}_{x}\textbf{P}_{\pi})\\ &=d_{\mu}^{\top}-\gamma
d_{\mu}^{\top}\textbf{D}_{x}\textbf{P}_{\pi}\\
&\geq d_{\mu}^{\top}-\gamma d_{\mu}^{\top}\textbf{P}_{\mu}\\
&=d_{\mu}^{\top}-\gamma d_{\mu}^{\top}\\
&=(1-\gamma)d_{\mu}^{\top}
\end{split}
\end{equation}
Hence each component of the vector $\textbf{1}^{\top}\textbf{M}$
is at least $(1-\gamma)d_{\mu}(s)>0$.
The row sums and the column sums of the key matrix are therefore all positive,
so the key matrix, and hence $\textbf{A}$, is positive definite.
Thus, (iv) is satisfied. 

Finally, for the ODE $\dot{\theta}=h(\theta)$, note that
$\theta^{*}=\textbf{A}^{-1}\textbf{b}$ is the unique globally asymptotically stable equilibrium with
$\overline{V}(\theta)=\frac{1}{2}||-\textbf{A}\theta+\textbf{b}||^2$ as 
its associated strict Liapunov function. 
The claim now follows.
\end{proof}

\begin{figure*}
\begin{center}
	\subfigure[Stochastic updates in 2-state.]{
	\includegraphics[scale=0.31]{2StateExampleSteps.pdf}
	}
	\subfigure[Deterministic updates in 2-state.]{
	\includegraphics[scale=0.31]{2StateExampleSweeps.pdf}
	}
	\subfigure[Stochastic updates in Baird's.]{
	\includegraphics[scale=0.31]{BairdSteps.pdf}
	}
	
    \subfigure[Deterministic updates in Baird's.]{
	\includegraphics[scale=0.31]{BairdSweeps.pdf}
	}
	\subfigure[Sensitivity test in 2-state.]{
	\includegraphics[scale=0.31]{2StateExampleSensitive.pdf}
	}
    \subfigure[Sensitivity test in Baird's.]{
	\includegraphics[scale=0.31]{BairdSensitive.pdf}
	}
	\caption{Comparisons of various temporal difference updates in
	 counterexamples.}
	\label{learning}
	\end{center}
\end{figure*}

\begin{theorem}[Convergence of TD-MRetrace with an off-policy sub-sampled
process]
Assume that Assumptions~\ref{assump1},
 \ref{assump2},
  \ref{fullrank}, and
\ref{assump3} hold.
Let the parameter $\omega_t$ be updated by iteration (\ref{tdMRetrace})
and $\theta_t$ be updated by  iteration
(\ref{mretrace}).
Then the parameter vector $\omega_t$ converges with probability one.
\end{theorem}
\begin{proof}
A proof sketch is given as follows.
By Theorem~\ref{theorem1}, $\theta_t$ converges with probability one. Hence,
$\delta^{\text{off}}(\theta_t)$ is stable.
Define a new reward
$r_{t+1}^{\text{new}}\dot{=}\delta^{\text{off}}(\theta_t)+r_{t+1}$,
which can be regarded as a correction to the reward function in the
off-policy case.
Therefore, TD-MRetrace is actually an on-policy TD learning algorithm. It is
guaranteed to converge, just like TD.
\end{proof}







\section{Experimental studies}


In the experiments, 
we focus on two questions about the proposed TD-MRetrace algorithm: 
(1) Does it converge in practice, as the theory predicts?
(2) What is the quality of the TD fixed point it finds?
We conducted two sets of experiments:
counterexamples to test stability, and control tasks to test utility. 




\subsection{About stability in counterexamples}
In the 2-state counterexample and 
 Baird's counterexample, we implemented two update
 styles, stochastic updates and deterministic updates, and performed
 parameter sensitivity tests for the algorithms that converged.
 The compared algorithms include Retrace, ETD, GTD, GTD2,
 TDC, and GRetrace. 
Each algorithm was run 100 times independently.




The algorithms' learning curves (mean as a line and
standard deviation as a shaded region) and the sensitivity tests are shown in Figure~\ref{learning},
where the theta value equals the root mean squared value error (RMSVE), since there is only
one scalar parameter in the 2-state counterexample and the true value is zero.
 We can see that
(i) Retrace diverges in all cases. (ii) Deterministic ETD  converges to zero the
fastest. On the other hand, ETD converges 
 with a high variance at the beginning in the 2-state
counterexample, and diverges in Baird's  counterexample
which is consistent with results of computational experiments about ETD
\citep[see][Page 282]{sutton2018book}.
(iii) MRetrace converges to zero relatively fast in all cases.
(iv) MRetrace performs best in parameter sensitivity tests.


% 
% This result is not surprising.
% MRetrace updates only one  parameter vector, 
% and thus avoids the slow convergence
% problem compared to the gradient-based approaches. 






\subsection{Learning to control}



We conducted two groups of experiments to test the solution quality.
In the first set of experiments,
 we removed  function approximation from ``the deadly triad'',
 and used tabular value functions instead. 
 Under these settings, algorithms should converge. What we care
 about is whether the proposed algorithm can obtain the optimal solution.
 Therefore, we adopted the classic maze task.
In the second set of experiments, 
we directly address ``the deadly triad'': 
linear function approximation, bootstrapping, and off-policy learning.
 Therefore, we adopted the classic Tetris task,
 which was used as a benchmark challenge for various optimization techniques
 including reinforcement learning.


% \begin{figure}[ht]
% \begin{center}
% \subfigure[Maze map]{
% \input{MazeMap.tex}
% {\label{mazemap}}
% }
% \subfigure[Learning curves]{
% \resizebox{7cm}{5cm}{%
% 	\includegraphics{mazeCurve.pdf}
% 	}
% 	\label{mazecurve}
% }
% 	\caption{Comparisons of learning algorithms in Maze.}
% 	\end{center}
% \end{figure}

\begin{figure}[htbp]
  \begin{center}
    \includegraphics[scale=0.47]{mazeCurve.pdf}
    \caption{Comparisons of learning algorithms in Maze.}
    \label{mazecurve}
    \end{center}
\end{figure}

 
\subsubsection{25$\times$25 Maze}

We use a 25$\times$25 version of Maze, 
% see Figure \ref{mazemap}.
as shown in the figure on the right at the beginning of this section.
The reward for each step is set to $-1$, except for the end state, where it is 0.
The action value for each state action pair is initialized to 0.
 \begin{wrapfigure}{r}{3cm}
  \centering
  \includegraphics[scale=0.7]{maze.pdf}
  % \caption{The map of Maze.}
  \label{mazemap}
\end{wrapfigure} 
 The behavior policy is an $\epsilon$-greedy policy.
 $\epsilon$ is initialized to 0.1, and decreases to 0 along with
 episodes.
Compared algorithms include Q-learning, Sarsa, Retrace and Double Q-learning. 
Each algorithm was run 1000 times independently.




The algorithms' learning curves (mean as a line and standard deviation as a shaded region) 
are shown in Figure~\ref{mazecurve}.
We can see that
(i) As expected, every algorithm converges to the optimal policy,
since there is no ``deadly triad''.
(ii) Double Q-learning converges the slowest because it has twice 
as many learning parameters.
(iii) Sarsa converges more slowly than Q-learning because Sarsa is not
an off-policy learning algorithm. 
 Its convergence to the optimal policy is due to the 
 decrease of $\epsilon$ in the behavior policy.
(iv) Q-learning, Retrace, MRetrace and TD-MRetrace perform well with no
significant differences.

\subsubsection{10$\times$10 Tetris}
The Tetris game is used as a challenge for various optimization techniques
\citep{thiery2009improvements}, where value function based reinforcement learning algorithms have performed
extremely poorly, i.e., removing only about 50 lines on average in the
20$\times$10 version of the Tetris game, where the reward is set to one point for
each removed line \citep{gabillon2013approximate}.
It is much harder to learn in the 10$\times$10 version of Tetris.
We learn the afterstate values via linear summation 
with weighted DT9 features \citep{scherrer2015approximate},
which are normalized in [0,1]. 

For the hyperparameter settings, the learning rate $\alpha$ is fixed
  at 0.001 with no decay. The initial $\epsilon$ is set to 0.01 and decays to
  0.0001 with a decay rate of 0.9992.
Compared algorithms include Q-learning, Retrace and Double Q-learning. 
Each algorithm was run 10 times independently.

\begin{figure}
\begin{center}
	\includegraphics[scale=0.63]{tetris.pdf}
	\caption{Comparisons of learning curves in
	the $10\times 10$ Tetris task.}
	\label{tetris}
	\end{center}
\end{figure}


The algorithms' learning curves (mean as a line and standard deviation as a shaded region) 
are shown in Figure~\ref{tetris}, where the average number of removed lines represents
the expected return per episode.
 We can see that 
 (i) On the 10$\times$10 version of Tetris,
  Q-learning and Double Q-learning perform
 poorly but that is consistent with the literature. 
 (ii) Although not reaching the state of the art, MRetrace and TD-MRetrace
 perform much better than the other three algorithms.
To the best of our knowledge, MRetrace and TD-MRetrace are the first two
discounted value function based reinforcement learning algorithms that perform
well on Tetris.

In summary, the experiments verified the convergence of the TD-MRetrace
algorithm. Moreover, in terms of quality testing, it finds a
relatively good policy, although it solves an approximation of the target
policy.


\section{Conclusions and future work}
In this paper, we propose a simple but efficient method by 
introducing modified retrace to correct the return of the target policy,
and guarantee the convergence of the proposed TD-MRetrace algorithm. 
The effectiveness of TD-MRetrace with linear 
value functions is validated in both
evaluation tasks and control tasks.

Future work includes: (i) extending TD-MRetrace(0)
with one-step updates to TD-MRetrace($\lambda$) with multi-step updates;
and (ii) extending the proposed TD-MRetrace algorithm to nonlinear value
functions.

\section*{Acknowledgements}
The authors would like to thank the anonymous reviewers
 for their valuable comments and suggestions.
This paper is partially supported by National Natural Science
 Foundation of China (No.62276142, 62206133, 62202240, 62192783), 
 Science and Technology Innovation 2030 New Generation Artificial
  Intelligence Major Project (No.2018AAA0100905), 
  Primary Research \& Development Plan of Jiangsu Province
   (No.BE2021028), and Shenzhen Fundamental Research Program
    (No.2021Szvup056).
\bibliography{chen_144}
\end{document}
