% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Algorithm box
% \usepackage{graphicx} % Required for inserting images
\usepackage[linesnumbered,ruled,lined]{algorithm2e}
% \usepackage{float} % Fix the location of algorithm box

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{he_35}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsthm} % for theorem
\usepackage{amssymb} % for mathbb
\usepackage{subfigure} % for figures
\hypersetup{hidelinks}

% Extra algorithm2e keywords for the header lines of the pseudocode:
% \KwInput{...} prints the keyword "Input" and \KwInitialize{...} prints
% "Initialize", each followed by its argument.
\SetKw{KwInput}{Input}
\SetKw{KwInitialize}{Initialize}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% For commenting in the PDF
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Inline review annotations that show up in the compiled PDF.
% Every macro wraps its output in \textcolor, so a color package must be
% available (presumably xcolor, pulled in by tikz in this preamble -- confirm).
% The tag is set with \textbf rather than the obsolete LaTeX 2.09 {\bf ...}
% switch, which is not guaranteed to exist in every document class.

% Bare "[TODO] " marker (note the trailing space inside the colored group).
\newcommand{\todotag}{\textcolor{violet}{[\textbf{TODO}] }}
% "[TODO: <text>]" note; #1 is the note text.
\newcommand{\mytodo}[1]{\textcolor{violet}{[\textbf{TODO}: #1]}}
% Alternative definition that hides all TODO notes (swap with the line above):
% \newcommand{\mytodo}[1]{\textcolor{violet}{}}
% Per-author comments, one color per author; #1 is the comment text.
\newcommand{\jiamin}[1]{\textcolor{red}{[\textbf{Jiamin}: #1]}}
\newcommand{\rupam}[1]{\textcolor{blue}{[\textbf{Rupam}: #1]}}
\newcommand{\yi}[1]{\textcolor{orange}{[\textbf{Yi}: #1]}}
\newcommand{\fengdi}[1]{\textcolor{green}{[\textbf{Fengdi}: #1]}}
% Highlights a passage the authors are unsure about; no brackets or tag.
\newcommand{\uncertain}[1]{\textcolor{violet}{#1}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% MATH
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Bold notation for vectors and matrices. Both macros expand to the same
% \boldsymbol{\mathbf{...}} construction (so Latin and Greek arguments both
% come out bold); they are kept separate so the source records intent.
\newcommand{\vect}[1]{\boldsymbol{\mathbf{#1}}}
\newcommand{\mtrx}[1]{\boldsymbol{\mathbf{#1}}}
% Calligraphic set symbols: state space and action space.
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cA}{\mathcal{A}}
% Blackboard-bold symbols: reals, expectation, probability, naturals.
\newcommand{\bR}{\mathbb{R}}
\newcommand{\bE}{\mathbb{E}}
\newcommand{\bP}{\mathbb{P}}
\newcommand{\bN}{\mathbb{N}}
% Norm of #1. Uses the amsmath delimiters \lVert/\rVert (open/close atoms)
% instead of bare \|, which is an ordinary symbol and makes an argument such
% as \cdot pick up binary-operator spacing. A subscript placed after the macro
% (e.g. \norm{x}_{v}) attaches to the closing delimiter as before.
\newcommand{\norm}[1]{\lVert #1 \rVert}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem-like environments (amsthm). Each \theoremstyle applies to the
% \newtheorem declarations that follow it, so the order below matters.
% "theorem" owns the counter and is numbered within sections; every other
% environment passes [theorem] and therefore shares that single counter.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definitions and assumptions use the "definition" style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remarks use the "remark" style.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\title{Loosely Consistent Emphatic Temporal-Difference Learning}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\setcounter{Maxaffil}{2}
\author[1]{Jiamin~He}
\author[1]{Fengdi~Che}
\author[1]{Yi~Wan}
\author[1,2]{A.~Rupam~Mahmood}
% Add affiliations after the authors
\affil[1]{%
    Department of Computing Science,
    University of Alberta
}
\affil[2]{
    CIFAR AI Chair, Alberta Machine Intelligence Institute (Amii)
}
  
\begin{document}
\maketitle

\begin{abstract}
    
    There has been significant interest in searching for off-policy Temporal-Difference (TD) algorithms that find the same solution that would have been obtained in the on-policy regime. An important property of such algorithms is that their expected update has the same fixed point as that of On-policy TD($\lambda$), which we call \emph{loose consistency}. Notably, Full-IS-TD($\lambda$) is the only existing loosely consistent method under general linear function approximation but, unfortunately, has a high variance and is scarcely practical. This notorious high variance issue motivates the introduction of ETD($\lambda$), which tames down the variance but has a biased fixed point. Inspired by these two methods, we propose a new loosely consistent algorithm called \emph{Average Emphatic TD} (AETD($\lambda$)) with a transient bias, which strikes a balance between bias and variance. Further, we unify AETD($\lambda$) with existing methods and obtain a new family of loosely consistent algorithms called \emph{Loosely Consistent Emphatic TD} (LC-ETD($\lambda$, $\beta$, $\nu$)), which can control a smooth bias-variance trade-off by varying the speed at which the transient bias fades. Through experiments on illustrative examples, we show the effectiveness and practicality of LC-ETD($\lambda$, $\beta$, $\nu$).\footnotemark

\end{abstract}
\footnotetext{The Python implementations of the experiments are available at \url{https://github.com/hejm37/LC-ETD}.}

\section{Introduction}\label{sec:intro}

Off-policy learning is a critical area in reinforcement learning (RL). Particularly, off-policy policy evaluation (OPPE), also known as off-policy prediction, is an essential component in model learning, options learning \citep{sutton1999between}, and life-long learning \citep{sutton2022alberta, white2012scaling}. The goal of OPPE is to estimate the value function of a \textit{target policy} with data collected by a different \textit{behavior policy}. We refer to the data collected by the target policy as \textit{on}-policy data and the data collected by the behavior policy as \textit{off}-policy data. In this paper, we consider the problem of OPPE with linear function approximation.

In online RL, where the algorithm makes incremental updates, TD learning is a ubiquitous family of algorithms, and On-policy TD($\lambda$) is an essential approach to on-policy prediction \citep{sutton1988learning}. In OPPE, there have been substantial efforts in obtaining the \textit{on-policy fixed point}, to which On-policy TD($\lambda$) converges with on-policy data \citep{precup2001off,hallak2017consistent,gelada2019off}. There is a good reason for targeting the on-policy fixed point: It produces a good approximation of the target policy's value function \citep{tsitsiklis1996analysis}.

When the ratio between the stationary distributions of the target and behavior policies is available, we can use it to reweight the TD update, allowing for the development of algorithms that converge to the on-policy fixed point \citep{hallak2017consistent}. However, such a ratio, known as the density ratio, is generally not accessible. One potential approach is to learn an approximation of the density ratio, which requires the ratio to be realizable by the features. When such an assumption holds, extensive studies have been conducted for both off-policy policy evaluation and optimization \citep{hallak2017consistent,liu2018breaking,nachum2019dualdice,zhang2020gendice,zhang2020gradientdice,lee2021optidice,zhan2022offline,chen2022offline,huang2023reinforcement}.

Nevertheless, the realizability assumption on the features is quite strong and may not be feasible in practice. In this paper, we consider algorithms with theoretical guarantees that do not require such an assumption and hold under general linear function approximation. Specifically, we search for off-policy TD algorithms whose expected update has the same fixed point as the on-policy fixed point, and we say such algorithms are \textit{loosely consistent}. An important implication of an algorithm's loose consistency is that if the algorithm converges, it is to the on-policy fixed point.

To our knowledge, Full Importance-Sampling TD \citep[Full-IS-TD($\lambda$),][]{precup2001off} is the only loosely consistent off-policy TD algorithm under general linear function approximation. To obtain the on-policy fixed point, Full-IS-TD($\lambda$) reweights the TD update with the full importance-sampling-ratio (IS-ratio) product, the multiplication of the IS ratios at every time step. However, Full-IS-TD($\lambda$) barely works in practice due to the high variance of the full IS-ratio product.

To tame down the variance of Full-IS-TD($\lambda$), Emphatic TD \citep[ETD($\lambda$),][]{sutton2016emphatic} reweights the TD update with the {emphatic weighting}. While the emphatic weighting mitigates the variance issue, it also induces persistent bias, deviating from the on-policy TD fixed point. Further, to obtain a smooth bias-variance trade-off, \citet{hallak2016generalized} proposed ETD($\lambda$, $\beta$), which unifies Off-policy TD($\lambda$) and ETD($\lambda$) with a tunable parameter $\beta$. Yet, ETD($\lambda$, $\beta$) loses the stability guarantee when $\beta$ is smaller than an instance-dependent condition number that is difficult to determine.

In this paper, we first propose Average Emphatic TD (AETD($\lambda$)), a novel loosely consistent algorithm inspired by Full-IS-TD($\lambda$) and ETD($\lambda$), which strikes a better balance between bias and variance. AETD($\lambda$) renovates the idea of ETD($\lambda$), introducing a transient bias to achieve a lower variance than Full-IS-TD($\lambda$) while retaining consistency as the bias fades away over time. Then, to make AETD($\lambda$) more practical, we introduce extra parameters to control a smooth bias-variance trade-off by unifying it with existing algorithms. The resulting new family of loosely consistent algorithms called \textit{Loosely Consistent Emphatic TD} (LC-ETD($\lambda$, $\beta$, $\nu$)) has a more general stability guarantee than ETD($\lambda$, $\beta$), the same fixed point as On-policy TD($\lambda$), and much better performance than Full-IS-TD($\lambda$). Finally, through experiments on didactic examples, we validate the stability and the benefit of loose consistency of LC-ETD($\lambda$, $\beta$, $\nu$). Experiment results on a more complex task with high variance also show LC-ETD($\lambda$, $\beta$, $\nu$)'s faster convergence to the lowest error. To our knowledge, LC-ETD($\lambda$, $\beta$, $\nu$) is the {first practical, loosely consistent} algorithm for off-policy TD learning under general linear function approximation.


\section{Background}\label{sec:background}

We consider an infinite horizon Markov Decision Process (MDP), which is defined as a tuple $\langle\cS, \cA, p, d_0, r, \gamma \rangle$ where $\cS$ is the finite state space, $\cA$ is the finite action space, $p: \cS \times \cA \rightarrow \Delta(\cS)$ is the transition function, $d_0 \in \Delta(\cS)$ is the initial state distribution, $r:\cS \times \cA \rightarrow \bR$ is the reward function, and $0\le\gamma < 1$ is the discount factor. Here, $\Delta(\mathcal{X})$ denotes the set of probability distributions over a finite set $\mathcal{X}$. The policy of the agent is defined as $\pi:\cS \rightarrow \Delta(\cA)$. The discounted value function is defined as
\begin{equation*}
  v_\pi(s) \doteq \bE_\pi\left[\sum_{t=0}^\infty \gamma^{t} r(S_t,A_t) \,\middle|\, S_0=s\right].
\end{equation*}

Particularly, we consider the setting of \textit{online OPPE with linear function approximation}, where the agent needs to estimate the value function of a target policy $\pi$ while interacting with the environment with a behavior policy $\mu$. We assume the observation is parameterized by the feature function $\vect \phi: \cS \rightarrow \bR^d$ or equivalently the feature matrix $\mtrx \Phi \in \bR^{|\cS| \times d}$, where $d$ is the dimension of the feature. At each time step $t$, the agent selects action $A_t$ based on the current state $S_t$ following the behavior policy $\mu$ and observes the next state $S_{t+1}$ and reward $R_{t+1}=r(S_t,A_t)$. The \textit{importance-sampling ratio} at time step $t$ is defined as $\rho_t\doteq\frac{\pi(A_t|S_t)}{\mu(A_t|S_t)}$. With linear function approximation, the agent approximates the value function with $\hat v(s;\vect\theta)=\vect \phi(s)^\top \vect \theta$ or in matrix-vector form, $\vect {\hat v} \doteq \mtrx \Phi \vect \theta$, where $\vect \theta \in \bR^d$ is a parameter vector.

We make a few common assumptions to make the problem more tractable. First, Assumption \ref{assumption:ergodicity} ensures the unique existence of the corresponding stationary distributions, $d_\mu\in\Delta(\cS)$ and $d_\pi\in\Delta(\cS)$, and that $d_\mu(s) > 0$ and $d_\pi(s)>0$ for any $s\in\cS$. Second, Assumption \ref{assumption:coverage} makes sure that $\rho_t$ is well-defined at every time step. Finally, Assumption \ref{assumption:features} ensures that the features are well-behaved, avoiding singularity in the analysis.

\begin{assumption}[Ergodicity]
    \label{assumption:ergodicity}
    The Markov chains induced by the behavior policy $\mu$ and the target policy $\pi$ are ergodic.
\end{assumption}

\begin{assumption}[Coverage]
    \label{assumption:coverage}
    For any $s\in\cS$ and $a\in\cA$, if $\pi(a|s)>0$, then $\mu(a|s)>0$.
\end{assumption}

\begin{assumption}[Independent Features]
    \label{assumption:features}
    The feature matrix $\mtrx \Phi$ has independent columns.
\end{assumption}

Let $\mtrx I\in\bR^{|\cS|\times|\cS|}$ denote the identity matrix, $\mtrx P_\pi\in\bR^{|\cS|\times|\cS|}$ denote the on-policy transition matrix with $[\mtrx P_\pi]_{ss'}\doteq \sum_{a\in\cA} \pi(a|s)p(s'|s,a)$, and $\vect r_\pi\in\bR^{|\cS|}$ denote the on-policy state reward function with $[\vect r_\pi]_s\doteq\sum_{a\in\cA} \pi(a|s)r(s,a)$. Similar to identifying $\hat v$ as $\vect {\hat v}$, we also identify $d_\mu$ as $\vect d_\mu$ and $d_\pi$ as $\vect d_\pi$. Moreover, we define $\mtrx{D_v}\doteq \operatorname{diag}(\vect v)$ for some vector $\vect v$. Specifically, we use $\mtrx D_\pi$ for $\mtrx{D_{d_\pi}}$ and $\mtrx D_\mu$ for $\mtrx{D_{d_\mu}}$. We use $\norm{\cdot}_{\vect v}$ to denote the vector norm induced by $\mtrx{D_v}$ for some vector $\vect v$, i.e., $\norm{\vect x}_{\vect v}\doteq\sqrt{\vect x^\top \mtrx{D_v} \vect x}$.

\paragraph{Stability} We will define the stability of a stochastic algorithm with an update of the following form:
\begin{align*}
    \vect \theta_{t+1} = \vect \theta_t + \alpha (\vect b_t - \mtrx A_t \vect \theta_t),
\end{align*}
where $\alpha>0$ is a scalar step-size parameter, $\{\vect \theta_t\}_{t=0}^\infty$ is the sequence of weight vectors generated by the algorithm, and $\{(\mtrx A_t, \vect b_t)\}_{t=0}^\infty$ is a sequence of random matrices and vectors that depend on the problem and the algorithm. Define $\mtrx A \doteq \lim_{t \to \infty} \bE_\mu [\mtrx A_t]$ and $\vect b \doteq \lim_{t \to \infty} \bE_\mu [\vect b_t]$. Using $\mtrx A$ and $\vect b$, we can form a deterministic algorithm:
\begin{align*}
    \vect{\bar \theta}_{t+1} = \vect{\bar \theta}_{t} + \alpha (\vect b - \mtrx A \vect{\bar \theta}_t),
\end{align*}
which we call the \textit{expected update} of the stochastic algorithm. We use the definition of the stability of a stochastic algorithm from \citet{sutton2016emphatic}: A stochastic algorithm and its expected update are \textit{stable} if the expected update converges to a unique fixed point under any initialization. It turns out that the expected update is stable if and only if the eigenvalues of its $\mtrx A$ matrix all have positive real parts \citep{varga1999matrix}. As discussed in \citet{sutton2016emphatic}, the stability of a stochastic algorithm is essential to its convergence: If a stochastic algorithm is stable, then its parameter vector may converge with probability one with a proper step-size schedule. Besides, if the stochastic algorithm converges, it is to the fixed point of its expected update, $\vect{\bar \theta}=\mtrx A^{-1} \vect b$. For example, under Assumptions \ref{assumption:ergodicity} and \ref{assumption:features}, On-policy TD($\lambda$) can be shown to be stable and converge to the \textit{on-policy fixed point}, $\vect{\bar \theta}_{\text{On}}=\mtrx A^{-1} \vect b$, where
\begin{align*}
    \mtrx A &= \mtrx \Phi^\top \mtrx D_{\pi} (\mtrx I - \lambda\gamma \mtrx P_\pi)^{-1} (\mtrx I - \gamma \mtrx P_\pi) \mtrx \Phi \quad \text{and } \\
    \vect b &= \mtrx \Phi^\top \mtrx D_{\pi} (\mtrx I - \lambda\gamma \mtrx P_\pi)^{-1} \vect r_\pi.
\end{align*}

\paragraph{Loose Consistency} We consider an off-policy TD algorithm to be \textit{loosely consistent} if its expected update converges to the on-policy fixed point under any initialization.\footnote{\citet{hallak2017consistent} referred to a similar property as ``consistency.'' However, to distinguish it from the usage of the word ``consistency'' in statistics, we redefine it as ``loose consistency.'' Following the statistical convention, we can define strong or weak consistencies based on whether the algorithm converges almost surely or in probability.} By definition, loose consistency implies stability. A sufficient condition of loose consistency is that the algorithm has the same expected update (or equivalently, $\mtrx A$ matrix and $\vect b$ vector) as On-policy TD($\lambda$). For simplicity, we will refer to loose consistency as \textit{consistency} and designate a loosely consistent algorithm as a \textit{consistent algorithm} throughout the remainder of the paper.

In our pursuit of consistent off-policy TD algorithms, we next review a line of work that has made progress toward this goal by reweighting the TD update.

\paragraph{Off-Policy TD($\lambda$)} Off-policy TD($\lambda$) \citep{precup2000eligibility} is the earliest effort in this line of work. In the one-step case, Off-policy TD($0$) makes the following update:
\begin{align}
  % \begin{split}
    \vect \theta_{t+1} &= \vect \theta_t + \alpha \rho_t \delta_t \vect \phi_t, \label{math:off_policy_update} \\
    \delta_t &= R_{t+1} + \gamma \vect \phi_{t+1}^\top \vect \theta_t - \vect \phi_{t}^\top \vect \theta_t, \label{math:td_error}
  % \end{split}
\end{align}
where $\vect \phi_t\doteq \vect \phi(S_t)$. Compared to On-policy TD($0$), Off-policy TD($0$) uses the IS ratio $\rho_t$ to correct the probability of selecting action $A_t$ at time step $t$, which allows Off-policy TD($0$) to converge to the true value function $v_\pi$ if the feature representation is tabular. This convergence guarantee is true for any $\lambda\in[0,1]$ in the tabular case. However, Off-policy TD($\lambda$) can be shown to diverge in various counterexamples with general linear features \citep{baird1995residual,sutton2018reinforcement}. This divergence issue arises because the distribution of Update (\ref{math:off_policy_update}) could deviate too much from the on-policy distribution, as explained by \citet{sutton2016emphatic}. What is worse, the \textit{off-policy fixed point}, the fixed point that Off-policy TD($\lambda$) converges to if it does converge, could have an unbounded error in the one-step case \citep{kolter2011fixed}.

\paragraph{Full-IS-TD($\lambda$)} To address the divergence of Off-policy TD($\lambda$), \citet{precup2001off} introduced the idea of correcting the distribution of Update (\ref{math:off_policy_update}) by reweighting it. The algorithm they proposed, Full-IS-TD($\lambda$), reweights the update with the \textit{full} IS-ratio product, the product of all the IS ratios up to the current time step. Before we bring in the update of Full-IS-TD($0$), we introduce a general description of one-step TD algorithms with an unspecified trace $F_t$:
\begin{align}
  \label{math:general_update}
  % \begin{split}
    \vect \theta_{t+1} &= \vect \theta_t + \alpha \rho_t F_t \delta_t \vect \phi_t,
  % \end{split}
\end{align}
where $\delta_t$ is the TD error defined in Eq. (\ref{math:td_error}). For Off-policy TD($0$), $F_t=1$. In the case of Full-IS-TD($0$), $F_t = \rho_{t-1} \rho_{t-2} \cdots \rho_{0}$ or $F_t=\rho_{t-1} F_{t-1}$ with $F_0=1$, which always corrects the distribution back to the on-policy distribution completely \citep{precup2001off}. In general, for Full-IS-TD($\lambda$), $\bE_\mu[F_t|S_t=s]$ is equal to $\frac{\bP_\pi(S_t=s)}{\bP_\mu(S_t=s)}$, which converges to the density ratio $\frac{d_\pi(s)}{d_\mu(s)}$. Consequently, Full-IS-TD($\lambda$) has the same expected update as On-policy TD($\lambda$), implying its consistency. In fact, it is the only consistent method.\footnote{From now on, our discussion will be based exclusively on Assumption \ref{assumption:features} for the features, without any additional assumptions.} However, Full-IS-TD($\lambda$) is scarcely practical due to variance issues, which motivates the next idea in this line.

\paragraph{ETD($\lambda$)} Instead of using the full IS-ratio product to weight the update, \citet{sutton2016emphatic} proposed to use the emphatic weighting:
\begin{align}
\label{math:followon_trace}
    F_t = \gamma \rho_{t-1} F_{t-1} + 1, \text{with } F_0 = 1,
\end{align}
which is termed the \textit{followon trace}, a geometrically weighted sum of IS-ratio products accumulated from different time steps. By introducing \textit{incomplete} IS-ratio products into the weighting, the proposed algorithm, ETD($\lambda$), reduces the variance and remains stable. However, these incomplete IS-ratio products also introduce persistent bias into $F_t$, causing ETD($\lambda$) to converge to a biased fixed point.


\section{Towards Practical, Consistent TD Learning}

\subsection*{Average Followon Trace}

As discussed in the last section, the only consistent method, Full-IS-TD($\lambda$), is not practical due to the high variance issue. On the other hand, ETD($\lambda$), an effective remedy to the variance issue, is biased and deviates from our objective of finding the on-policy fixed point. Then, \textit{can we find a good trade-off point between Full-IS-TD($\lambda$) and ETD($\lambda$)?} Specifically, can we find a method that is consistent and has a milder variance compared to Full-IS-TD($\lambda$)? The answer is, fortunately, yes. Inspired by the idea of using incomplete IS-ratio products to reduce the variance, we propose to use the following average emphatic weighting:
\begin{align}
    \label{math:average_followon_trace}
    F_t = \frac{t}{t+1} \rho_{t-1} F_{t-1} + \frac{1}{t+1}, \text{with } F_0 = 1,
\end{align}
which we term the \textit{average followon trace}. Expanding this trace reveals that it represents the mean of the IS-ratio products. By employing the mean instead of a geometrically weighted sum, we gradually reduce bias by diminishing the emphasis on the new IS-ratio product at each time step. Although the expectation of the average followon trace at time step $t$ typically differs from $\frac{\bP_\pi(S_t=s)}{\bP_\mu(S_t=s)}$, this discrepancy diminishes as $t$ increases, characterizing the average followon trace as displaying a fading or transient bias. Remarkably, the bias of $F_t$ completely vanishes in the limit, rendering the resulting algorithm defined by Update (\ref{math:general_update}) and Eq. (\ref{math:average_followon_trace}) consistent. This algorithm is referred to as \textit{one-step Average Emphatic TD} (AETD($0$)), and its consistency is presented in Theorem \ref{thrm:stability_aetd}. The detailed proof is deferred to the appendix.

\begin{theorem}[Consistency of AETD(0)]
\label{thrm:stability_aetd}
Let Assumptions \ref{assumption:ergodicity}--\ref{assumption:features} hold. If $\lim_{t\to\infty}\bE_\mu[F_t|S_t=s]$ exists for all $s\in\cS$, then AETD($0$) has the same expected update as On-policy TD($0$). As a result, AETD($0$) is stable and consistent.
\end{theorem}

Now, the idea of using a uniformly weighted sum of the IS-ratio products to reweight the TD update is not entirely new. \citet{hallak2016generalized} unified ETD($\lambda$) and Off-policy TD($\lambda$) by introducing a tunable decay parameter, $\beta\in[0,1]$, in the followon trace (Eq. (\ref{math:followon_trace})). The resulting algorithm, ETD($\lambda$, $\beta$), uses the following emphatic weighting:
\begin{align}
\label{math:followon_trace_beta}
    F_t = \beta \rho_{t-1} F_{t-1} + 1, \text{with } F_0 = 1.
\end{align}
When $\beta=0$, this trace degenerates to constant $1$, and ETD($\lambda$, $\beta$) becomes Off-policy TD($\lambda$); when $\beta=\gamma$, this trace recovers Eq. (\ref{math:followon_trace}), and ETD($\lambda$, $\beta$) becomes ETD($\lambda$); when $\beta=1$, this trace will equally weight each IS-ratio product with weight $1$. However, in their case, equally weighting the products is problematic because the expectation of $F_t$ diverges to infinity in the limit.


\subsection*{A Smooth Bias-Variance Trade-Off}\label{sec:general_class}

Despite ETD($\lambda$, $\beta$) not being a consistent algorithm for any value of $\beta$, it presents an interesting strategy to trade off bias and variance: With a small $\beta$, $F_t$ has a low variance but a large bias; with a large $\beta$, $F_t$ has a small bias but a high variance. Consequently, ETD($\lambda$, $\beta$) can trade off the bias of its fixed point and the variance it incurs by varying the value of $\beta$. Then, we wonder, \textit{can we unify AETD($0$) with other algorithms to attain a smooth bias-variance trade-off?} If possible, we also want to retain the consistency of AETD($0$). To achieve this goal, we consider the unification of AETD($0$) with both Off-policy TD($0$) and Full-IS-TD($0$), the one with the least variance but the greatest bias and the one with the least bias but the greatest variance.

We first unify AETD($0$) and Full-IS-TD($0$). To unify the traces that the two methods use, we introduce a tunable parameter, $\beta'\in[0,1]$, to the average followon trace: $F^{(1)}_t= (1-\beta'(t+1)^{-1}) \rho_{t-1} F^{(1)}_{t-1} + \beta'(t+1)^{-1}$ with $F^{(1)}_0=1$. Then, when $\beta'=0$, $F^{(1)}_t$ becomes $F^{(1)}_t=\rho_{t-1} F^{(1)}_{t-1}$, which corresponds to the trace of Full-IS-TD($0$); when $\beta'=1$, $F^{(1)}_t$ becomes the average followon trace.

Similarly, we can unify AETD($0$) and Off-policy TD($0$) with another tunable parameter, $\nu\in[0,1]$, and a new trace: $F^{(2)}_t= (1-(t+1)^{-\nu}) \rho_{t-1} F^{(2)}_{t-1} + (t+1)^{-\nu}$ with $F^{(2)}_0=1$. When $\nu=0$, $F^{(2)}_t$ becomes constant $1$, which corresponds to the trace of Off-policy TD($0$); when $\nu=1$, $F^{(2)}_t$ becomes the average followon trace.

We further unify $F^{(1)}_t$ and $F^{(2)}_t$, leading us to a third trace with two parameters, $\beta'$ and $\nu$: $F^{(3)}_t= (1-\beta'(t+1)^{-\nu}) \rho_{t-1} F^{(3)}_{t-1} + \beta'(t+1)^{-\nu}$ with $F^{(3)}_0=1$. Additionally, we found that when $\nu=0$, the trace becomes $F^{(3)}_t=(1-\beta')\rho_{t-1} F^{(3)}_{t-1} + \beta'$, which is also a geometrically weighted sum of IS-ratio products as in ETD($\lambda$, $\beta$). To obtain the same decay rate in the resulting trace and the followon trace (Eq. (\ref{math:followon_trace_beta})), we replace $\beta'$ with $1-\beta$ in $F^{(3)}_t$, and name the resulting trace the \textit{general followon trace}:
\begin{align}
    \label{math:general_followon_trace}
    F_{t} &= (1-{g(t)}) \rho_{t-1} F_{t-1} + {g(t)}, \text{with } F_0=1,
\end{align}
where $g(t) \doteq (1-\beta)(t+1)^{-\nu}$ with $\beta \in [0,1]$ and $\nu\in [0,1]$. Note that when $\nu=0$, the resulting trace becomes $F_t=\beta\rho_{t-1} F_{t-1} + (1-\beta)$, which we call the \textit{scaled followon trace}. The resulting one-step algorithm is subsequently called \textit{Scaled ETD($0$, $\beta$)}. Although the scaled followon trace has the same decay rate as the original followon trace (Eq. (\ref{math:followon_trace_beta})), it is downscaled by $1-\beta$ (see Table \ref{tab:weightings}). This discrepancy, however, is not a qualitative difference because the constant factor $1-\beta$ can be absorbed into the step-size parameter.\footnote{We can see from Table \ref{tab:weightings} that the coefficient of the full IS-ratio product, $\beta^t$, is not downscaled to $\beta^t(1-\beta)$. However, this minor difference will not prevent Scaled ETD($0$, $\beta$) from sharing the same theoretical and empirical properties as ETD($0$, $\beta$).} Thus, Scaled ETD($0$, $\beta$) can be viewed as a slight variant of ETD($0$, $\beta$).

\begin{table*}[t]
    \centering
    \caption{The coefficients of different IS-ratio products in $F_t$.}
    \label{tab:weightings}
    \resizebox{\textwidth}{!}{  % Resize the box to textwidth
    \begin{tabular}{c|cccccc}
        \toprule
        IS-ratio Product & Off-policy TD($\lambda$) & Scaled ETD($\lambda$, $\beta$) & Full-IS-TD($\lambda$) & AETD($\lambda$) & LC-ETD($\lambda$, $\beta$, $\nu$) & ETD($\lambda$, $\beta$) \\
        \midrule
         1   &   1   &   $1-\beta$   &   0   & $1/(t+1)$&   $g(t)$  &   1 \\
         $\rho_{t-1}$  &   0   &   $\beta(1-\beta)$ &   0   & $1/(t+1)$&   $(1-g(t))g(t-1)$    &   $\beta$ \\
         $\rho_{t-1}\rho_{t-2}$  &   0   &   $\beta^2(1-\beta)$ &   0   & $1/(t+1)$&   $g(t-2)\prod_{k=t-1}^{t}(1-g(k))$    &   $\beta^2$ \\
         \vdots  &   \vdots  &   \vdots  &   \vdots  &   \vdots&   \vdots  &   \vdots \\
         $\prod_{k=2}^{t}\rho_{k-1}$  &   0   &   $\beta^{t-1}(1-\beta)$ &   0   & $1/(t+1)$& $g(1)\prod_{k=2}^{t}(1-g(k))$ &   $\beta^{t-1}$ \\
         $\prod_{k=1}^{t}\rho_{k-1}$  &   0   &   $\beta^{t}$ &   1   & $1/(t+1)$& $\prod_{k=1}^{t}(1-g(k))$ &   $\beta^{t}$ \\
        \bottomrule
    \end{tabular}
    }
\end{table*}

Having settled the relationship between Scaled ETD($0$, $\beta$) and ETD($0$, $\beta$), we are now ready to name the algorithm that unifies AETD($0$), Off-policy TD($0$), Full-IS-TD($0$), and Scaled ETD($0$, $\beta$). We call the resulting algorithm \textit{one-step General Emphatic TD} (GETD($0$, $\beta$, $\nu$)), which is defined by Update (\ref{math:general_update}) and the general followon trace (Eq. (\ref{math:general_followon_trace})).

So far, we have only introduced the one-step form of AETD and GETD. By applying the same idea of uniform averaging and the same strategy of unification to the multi-step bootstrapping case, we can obtain their multi-step versions. Here, we present the unified algorithm with multi-step bootstrapping called \textit{General Emphatic TD} (GETD($\lambda$, $\beta$, $\nu$)), which makes the following update\footnote{For simplicity, we have not included general state-dependent interest, discounting, and bootstrapping functions as in \citet{sutton2016emphatic}. However, GETD($\lambda$, $\beta$, $\nu$) can be extended to those cases.}:
\begin{align}
  \label{math:general_update_lambda}
  \begin{split}
    \vect \theta_{t+1} &= \vect \theta_t + \alpha \delta_t \vect z_t, \\
    \delta_t &= R_{t+1} + \gamma \vect \phi_{t+1}^\top \vect \theta_t - \vect \phi_{t}^\top \vect \theta_t, \\
    \vect z_t &= \rho_t (\gamma \lambda \vect z_{t-1} + M_t \vect \phi_t), \text{with } \vect z_{-1} = \vect 0, \\
    M_t &= \left(1 - \lambda h(t)\right) F_t + \lambda g(t), \\
    F_t &= \left(1 - g(t)\right) \rho_{t-1} F_{t-1} + g(t), \text{with } F_0 = 1,
  \end{split}
\end{align}
where $h(t)$ and $g(t)$ are defined as follows:
\begin{align}
    \label{math:general_g_and_f}
    \begin{split}        
        h(t) \doteq \left(\frac{1-\beta}{t+1}\right)^\nu \  \text{{and}} \  g(t) \doteq \frac{1-\beta}{(t+1)^{\nu}}
    \end{split}
\end{align}
with $\beta\in[0,1]$ and $\nu\in[0,1]$. Similar to the one-step case, GETD($\lambda$, $\beta$, $\nu$) subsumes AETD($\lambda$), Off-policy TD($\lambda$), Full-IS-TD($\lambda$), and Scaled ETD($\lambda$, $\beta$). A list of the updates of all these algorithms is included in the appendix.


\subsection*{Loosely Consistent Emphatic TD}

In this section, we examine the product of the unification. The question here is, \textit{while the introduced decay parameters $\beta$ and $\nu$ offer us a smooth bias-variance trade-off, is the consistency of AETD($0$) retained?} Fortunately, the answer is, again, yes. We name this new class of consistent algorithms with tunable decay parameters as \textit{Loosely Consistent Emphatic TD} (LC-ETD($\lambda$, $\beta$, $\nu$)).\footnote{Recall that we refer to loose consistency as consistency.} Specifically, LC-ETD($\lambda$, $\beta$, $\nu$) is defined by Update (\ref{math:general_update_lambda}) with $\beta\in[0,1)$ and $\nu\in(0,1]$, or $\beta=1$ and $\nu\in[0,1]$.\footnote{We also include Full-IS-TD($\lambda$) in LC-ETD($\lambda$, $\beta$, $\nu$), since it is also consistent. LC-ETD($\lambda$, $\beta$, $\nu$) becomes Full-IS-TD($\lambda$) when $\beta=1$ and $\nu\in[0,1]$.} We provide its pseudocode in Algorithm \ref{alg:template} and present its consistency in Theorem \ref{thrm:stability_cetd}, of which the proof is deferred to the appendix.

\IncMargin{1.5em}
\begin{algorithm}[t]
  \KwInput{MDP $\langle\cS, \cA, p, d_0, r, \gamma \rangle$, feature function $\vect \phi$, behavior policy $\mu$, target policy $\pi$, step size $\alpha\in(0,1]$, bootstrapping parameter $\lambda\in[0,1]$, and decay parameters $\beta\in[0,1)$ and $\nu\in(0,1]$, or $\beta=1$ and $\nu\in[0,1]$}\\
  \KwInitialize{value-function weights $\boldsymbol{\theta}$ arbitrarily, followon trace $F = 1$, and eligibility trace $\boldsymbol{z} = \boldsymbol{0}$} \\
  Draw $S_0$ from $d_0$ \\
  \For{$t=0:\infty$}{
    Take action $A_t\sim \mu(\cdot|S_t)$ \\
    Observe $S_{t+1}\sim p(\cdot|S_t,A_t)$, $R_{t+1}=r(S_t,A_t)$ \\
    $M \gets (1-\lambda h(t)) F + \lambda g(t)$, where $g(t)=(1-\beta)(t+1)^{-\nu}$ and $h(t)=(1-\beta)^{\nu}(t+1)^{-\nu}$ \\
    $\boldsymbol{z} \gets \rho_t (\gamma \lambda \boldsymbol{z} + M \boldsymbol{\phi}(S_{t}))$, where $\rho_t=\frac{\pi(A_t|S_t)}{\mu(A_t|S_t)}$ \\
    $F \gets (1-g(t+1)) \rho_{t} F + g(t+1)$ \\
    $\boldsymbol{\theta} \gets \boldsymbol{\theta} + \alpha [R_{t+1} + \gamma \boldsymbol{\phi}(S_{t+1})^\top\boldsymbol{\theta} - \boldsymbol{\phi}(S_{t})^\top\boldsymbol{\theta}]\boldsymbol{z}$
  }
  \caption{LC-ETD($\lambda$, $\beta$, $\nu$) for online OPPE with linear function approximation}
  \label{alg:template}
\end{algorithm}

\begin{theorem}[Consistency of LC-ETD($\lambda$, $\beta$, $\nu$)]
    \label{thrm:stability_cetd}
    Let Assumptions \ref{assumption:ergodicity}-\ref{assumption:features} hold. For any $\beta\in[0,1)$ and $\nu\in(0,1]$, or $\beta=1$ and $\nu\in[0,1]$, if $\lim_{t\to\infty} \bE_\mu [F_t|S_t=s]$ and $\lim_{t\to\infty} \bE_\mu [\vect z_t|S_t=s]$ exist for all $s\in\cS$, then LC-ETD($\lambda$, $\beta$, $\nu$) has the same expected update as On-policy TD($\lambda$). As a result, LC-ETD($\lambda$, $\beta$, $\nu$) is stable and consistent.
\end{theorem}

\begin{remark}
\label{rmrk:stability}
LC-ETD($\lambda$, $\beta$, $\nu$) is stable for any values of $\beta\in[0,1)$ and $\nu\in(0,1]$, or $\beta=1$ and $\nu\in[0,1]$. This is significantly stronger than ETD($\lambda$, $\beta$) \citep{hallak2016generalized}. In their case, ETD($\lambda$, $\beta$) is stable only with $\beta > \beta_0$ where $\beta_0 \le \gamma$ is an instance-dependent condition number.
\end{remark}

\begin{remark}
\label{rmrk:fixed_point}
LC-ETD($\lambda$, $\beta$, $\nu$) is consistent for any values of $\beta\in[0,1)$ and $\nu\in(0,1]$, or $\beta=1$ and $\nu\in[0,1]$. This is, again, significantly stronger than ETD($\lambda$, $\beta$). For any $\beta\in[0,1)$, ETD($\lambda$, $\beta$) has persistent bias. In particular, the bias will increase as the value of $\beta$ decreases. At the extreme end when $\beta=0$, ETD($\lambda$, $\beta$) becomes Off-policy TD($\lambda$), which could have unbounded bias \citep{kolter2011fixed}.
\end{remark}

Having settled the consistency of LC-ETD($\lambda$, $\beta$, $\nu$), we now discuss the bias-variance trade-off we obtained. Figure \ref{fig:square} plots the landscape of GETD($\lambda$, $\beta$, $\nu$), which illustrates the relationship between LC-ETD($\lambda$, $\beta$, $\nu$) and other algorithms. Starting from AETD($\lambda$), intuitively, as $\nu$ decreases, the algorithm gets closer to Off-policy TD($\lambda$) with the variance decreased, but the bias increases; meanwhile, as $\beta$ increases, the algorithm moves toward Full-IS-TD($\lambda$) with the bias decreased, but the variance increased. More generally, it holds for LC-ETD($\lambda$, $\beta$, $\nu$) that increasing $\beta$ or $\nu$ will reduce the bias and increase the variance, and vice versa.

To better analyze the bias-variance trade-off that $\beta$ and $\nu$ control, we study three instances of LC-ETD($\lambda$, $\beta$, $\nu$), which cover a diagonal line and two edges of LC-ETD($\lambda$, $\beta$, $\nu$) (see Figure \ref{fig:square}). The first instance is LC-ETD1($\lambda$, $\beta$), which corresponds to a diagonal line of LC-ETD($\lambda$, $\beta$, $\nu$). In this diagonal line, the value of $\nu$ is always the same as the value of $\beta$. This line has the special property that it connects Off-policy TD($\lambda$) and Full-IS-TD($\lambda$). The update of LC-ETD1($\lambda$, $\beta$) is the same as Update (\ref{math:general_update_lambda}) but with $h(t)$ and $g(t)$ specified as the following:
\begin{align}
  \label{math:cetd1_g_and_f}
  \begin{split}
        h(t) \doteq \left(\frac{1-\beta}{t+1}\right)^{\beta} \  \text{and} \ 
        g(t) \doteq \frac{1-\beta}{(t+1)^{\beta}}.
  \end{split}
\end{align}

\begin{figure}
  \centering
  \includegraphics[width=0.45\textwidth]{figures/square_v7_ink_lc}
  \caption{The landscape of GETD($\lambda$, $\beta$, $\nu$). The square excluding the left edge and its bottom endpoint represents LC-ETD($\lambda$, $\beta$, $\nu$). The darkness of the color at each point inside the square represents the magnitude of $F_t$'s variance.}
  \label{fig:square}
\end{figure}

The second instance is LC-ETD2($\lambda$, $\nu$), the bottom edge of LC-ETD($\lambda$, $\beta$, $\nu$), which connects Off-policy TD($\lambda$) and AETD($\lambda$). Here, $\beta$ is always $0$. The update of LC-ETD2($\lambda$, $\nu$) is the same as Update (\ref{math:general_update_lambda}) but with $h(t)$ and $g(t)$ set as the following:
\begin{align}
  \label{math:cetd2_g_and_f}
  \begin{split}
        h(t) \doteq {(t+1)^{-\nu}} \  \text{{and}} \  g(t) \doteq {(t+1)^{-\nu}}.
  \end{split}
\end{align}

The third instance is LC-ETD3($\lambda$, $\beta$), the right edge of LC-ETD($\lambda$, $\beta$, $\nu$), which links AETD($\lambda$) and Full-IS-TD($\lambda$). In this edge, $\nu$ is always $1$. The update of LC-ETD3($\lambda$, $\beta$) is the same as Update (\ref{math:general_update_lambda}) but with $h(t)$ and $g(t)$ specified as the following:
\begin{align}
  \label{math:cetd3_g_and_f}
  \begin{split}
        h(t) \doteq \frac{1-\beta}{t+1} \  \text{{and}} \  g(t) \doteq \frac{1-\beta}{t+1}.
  \end{split}
\end{align}


\section{Experiments}\label{sec:experiments}

\begin{figure*}
  \centering
  \subfigure[Best learning curves]{
    \includegraphics[width=0.292\textwidth]{figures/two_state_true/learning_curve_AllTwoState_true3_reLmbda0.0_final}
    \label{fig:exp1_sub_all}
  }
  \subfigure[Sensitivity to $\beta$ or $\nu$]{
    \includegraphics[width=0.292\textwidth]{figures/two_state_true/sensitivity_beta_TwoState_true3_reLmbda0.0_final}
    \label{fig:exp1_sub_sensitivity}
  }
  \subfigure[Best learning curves of ETD($\beta$)]{
    \includegraphics[width=0.292\textwidth]{figures/two_state_true/learning_curve_ETDLBTwoState_true3_reLmbda0.0_final}
    \label{fig:exp1_sub_etdlb}
  }
  \subfigure[Best learning curves of LC-ETD1($\beta$)]{
    \includegraphics[width=0.292\textwidth]{figures/two_state_true/learning_curve_CETDL1TwoState_true3_reLmbda0.0_final}
    \label{fig:exp1_sub_cetd1}
  }
  \subfigure[Best learning curves of LC-ETD2($\nu$)]{
    \includegraphics[width=0.292\textwidth]{figures/two_state_true/learning_curve_CETDL2TwoState_true3_reLmbda0.0_final}
    \label{fig:exp1_sub_cetd2}
  }
  \subfigure[Best learning curves of LC-ETD3($\beta$)]{
    \includegraphics[width=0.292\textwidth]{figures/two_state_true/learning_curve_CETDL3TwoState_true3_reLmbda0.0_final}
    \label{fig:exp1_sub_cetd3}
  }
  \caption{Performance of different algorithms on the Two-state task. The y-axis shows $\overline{\text{RMSVE}}$. The dashed lines from top to bottom in Figure (a) show $\overline{\text{RMSVE}}(\bar \theta_{\text{ETD}}) \approx 1.251$ and $\overline{\text{RMSVE}}(\bar \theta_{\text{On}}) \approx 1.155$, respectively.}
  \label{fig:exp1_curve}
\end{figure*}

In this section, we present experiments that demonstrate the effectiveness of LC-ETD($\lambda$, $\beta$, $\nu$) in the one-step case. The results for the multi-step case exhibit a similar pattern and are provided in the appendix. Additionally, for stability analysis on Baird's \citeyearpar{baird1995residual} counterexample, please refer to the appendix. To maintain simplicity, we omit the $\lambda$ argument from all algorithms. For instance, LC-ETD($\beta$, $\nu$) refers to LC-ETD($0$, $\beta$, $\nu$). We evaluate the quality of the learned $\vect \theta$ using the root-mean-square-value error as our metric:
\begin{align*}
    \overline{\text{RMSVE}}(\vect \theta) = \norm{\vect{\hat{v}}_{\vect \theta} - \vect v_\pi }_{\vect{d}_\pi}.
\end{align*}
For all experiments, we use constant step sizes $\alpha=2^x$ for all algorithms where $x\in\{ -18, -17, \dots, -1, 0 \}$. For \textit{tunable algorithms} with an adjustable decay parameter (ETD($\beta$), LC-ETD1($\beta$), LC-ETD2($\nu$), and LC-ETD3($\beta$)), the decay parameter ($\beta$ or $\nu$) is chosen from $\{ 0.0, 0.2, 0.4, 0.6, 0.8, 1.0 \}$. Note that ETD($\beta$), LC-ETD1($\beta$), and LC-ETD2($\nu$) with $\beta=0.0$ or $\nu=0.0$ are the same as Off-policy TD; LC-ETD1($\beta$) and LC-ETD3($\beta$) with $\beta=1.0$ are the same as Full-IS-TD; ETD($\beta$) with $\beta=1.0$ is an unsound method with a followon trace whose expectation will blow up to infinity in the limit. All results are reported with the best-performing step size, with which the final error is the smallest. We also provide the step-size sensitivity analysis in the appendix. The final error is calculated by averaging the errors in the last $1\%$ of the training steps. Compared to the area under the learning curve (AUC), the final error is favored because it is a better reflection of how the algorithm performs asymptotically.


\subsection*{Consistency of LC-ETD($\beta$, $\nu$)}

\paragraph{Two-State Task} To illustrate the benefit of LC-ETD($\beta$, $\nu$)'s consistency, we designed a didactic task with two states (Figure \ref{fig:two_state}). In this task, the target policy $\pi$ will go to the left state from any state with a probability of $0.6$, while the probability for the behavior policy $\mu$ is $0.4$. The discount factor $\gamma$ is $0.8$. The on-policy fixed point in this task induces an error of $\overline{\text{RMSVE}}(\bar \theta_{\text{On}}) \approx 1.155$, whereas the off-policy fixed point induces an error of $\overline{\text{RMSVE}}(\bar \theta_{\text{Off}}) \approx 1.523$. For ETD (ETD($\beta$) with $\beta=\gamma=0.8$), its fixed point has an error of $\overline{\text{RMSVE}}(\bar \theta_{\text{ETD}}) \approx 1.251$. Thus, consistent algorithms have a theoretical advantage in this task because their fixed point (the on-policy fixed point) has the lowest $\overline{\text{RMSVE}}$. We run each algorithm for $100{,}000$ steps and present the results in Figure \ref{fig:exp1_curve}, which are averaged over $100$ independent runs. The shaded region near each learning curve represents the standard error. Likewise, the standard error is shown as an error bar for each point in the sensitivity plot.

\begin{figure}
  \centering
  \includegraphics[width=0.35\textwidth]{figures/two_state_v3}
  \caption{The Two-state task. The values of the two states are approximated by $\theta$ and $2\theta$, respectively.}
  \label{fig:two_state}
\end{figure}

\begin{figure*}
  \centering
  \subfigure[Best learning curves]{
    \includegraphics[width=0.292\textwidth]{figures/four_room_true/learning_curve_AllFourRoom_true3_30s_150k_reLmbda0.0_final}
    \label{fig:exp2_sub_all}
  }
  \subfigure[Sensitivity to $\beta$ or $\nu$]{
    \includegraphics[width=0.292\textwidth]{figures/four_room_true/sensitivity_beta_FourRoom_true3_30s_150k_reLmbda0.0_final}
    \label{fig:exp2_sub_sensitivity}
  }
  \subfigure[Best learning curves of ETD($\beta$)]{
    \includegraphics[width=0.292\textwidth]{figures/four_room_true/learning_curve_ETDLBFourRoom_true3_30s_150k_reLmbda0.0_final}
    \label{fig:exp2_sub_etdlb}
  }
  \subfigure[Best learning curves of LC-ETD1($\beta$)]{
    \includegraphics[width=0.292\textwidth]{figures/four_room_true/learning_curve_CETDL1FourRoom_true3_30s_150k_reLmbda0.0_final}
    \label{fig:exp2_sub_cetd1}
  }
  \subfigure[Best learning curves of LC-ETD2($\nu$)]{
    \includegraphics[width=0.292\textwidth]{figures/four_room_true/learning_curve_CETDL2FourRoom_true3_30s_150k_reLmbda0.0_final}
    \label{fig:exp2_sub_cetd2}
  }
  \subfigure[Best learning curves of LC-ETD3($\beta$)]{
    \includegraphics[width=0.292\textwidth]{figures/four_room_true/learning_curve_CETDL3FourRoom_true3_30s_150k_reLmbda0.0_final}
    \label{fig:exp2_sub_cetd3}
  }
  \caption{Performance of different algorithms on the Rooms task. The y-axis shows $\overline{\text{RMSVE}}$.}
  \label{fig:exp2_curve}
\end{figure*}

From Figure \ref{fig:exp1_sub_all}, we can see that all LC-ETD instances achieve an error between $\overline{\text{RMSVE}}(\bar \theta_{\text{ETD}}) \approx 1.251$ and $\overline{\text{RMSVE}}(\bar \theta_{\text{On}}) \approx 1.155$ (the dashed lines). They are the best-performing algorithms and significantly improve over the only existing consistent algorithm, Full-IS-TD, which cannot learn due to the high variance issue. On the other hand, ETD (ETD($\beta$) with $\beta=0.8$) is the second-tier algorithm in this task, achieving its theoretical optimal error of $1.251$. For Off-policy TD, it also converges to its fixed point, which induces a significantly larger error of $\overline{\text{RMSVE}}(\bar \theta_{\text{Off}}) \approx 1.523$.

Figures \ref{fig:exp1_sub_etdlb}--\ref{fig:exp1_sub_cetd3} plot the learning curves of tunable algorithms with fixed values of the decay parameter. From Figure \ref{fig:exp1_sub_etdlb}, it is evident that ETD($\beta$) converges to solutions with large biases for most values of $\beta$. When $\beta=1$, its error explodes after some steps, demonstrating the unsoundness of ETD($\beta$) with $\beta=1$ in the infinite horizon case. For LC-ETD instances (Figures \ref{fig:exp1_sub_cetd1}--\ref{fig:exp1_sub_cetd3}), with the decay parameter in the interval $[0.2, 0.8]$, they all converge smoothly to errors at a similar level, which is lower than those of existing algorithms.

Figure \ref{fig:exp1_sub_sensitivity} summarizes these results. We can conclude that all LC-ETD instances consistently enjoy lower errors than existing algorithms, which implies LC-ETD($\beta$, $\nu$)'s consistency across all its decay parameter choices.


\subsection*{Practicality of LC-ETD($\beta$, $\nu$)}

\paragraph{Rooms Task} To further test the performance of LC-ETD($\beta$, $\nu$) in more complex tasks with higher variance, we modified the Rooms task proposed by \citet{ghiassian2021empirical2} to include continuing target policies. The discount factor $\gamma$ is kept at $0.9$. Other task specifications also largely follow from \citet{ghiassian2021empirical2}, and the modifications can be found in the appendix. Compared to the Two-state task, the Rooms task has more states and complex feature representation. Moreover, the differences between the target policies and the behavior policy are larger, inducing much larger variance. We run each algorithm for $150{,}000$ steps and $30$ runs. To better illustrate the advantage of LC-ETD($\beta$, $\nu$), we present the results using Interquartile Mean (IQM) in Figure \ref{fig:exp2_curve}, which are more robust and statistically efficient compared to the mean or median results \citep{agarwal2021deep}. The standard error is presented as a shaded region or an error bar, similar to the Two-state task. However, in this case, the standard error is based only on the middle $50\%$ of the samples since we used IQM instead of the mean. Additionally, we provide a comprehensive performance profile by presenting the mean results (averaged over all runs) in the appendix.

Figure \ref{fig:exp2_sub_all} shows that ETD, ETD($\beta$), and all LC-ETD instances achieve similar final errors. Among them, LC-ETD2($\nu$) and LC-ETD3($\beta$) learn the fastest. As in the Two-state task, Off-policy TD converges quickly to a solution with a large bias, while Full-IS-TD cannot learn.

Figures \ref{fig:exp2_sub_etdlb}--\ref{fig:exp2_sub_cetd3} plot the learning curves of tunable algorithms with fixed values of the decay parameter. From Figure \ref{fig:exp2_sub_etdlb}, we can see that as $\beta$ increases, the bias of the solution found by ETD($\beta$) becomes smaller, and the learning also becomes slower. LC-ETD1($\beta$) and LC-ETD2($\nu$) (Figures \ref{fig:exp2_sub_cetd1} and \ref{fig:exp2_sub_cetd2}) learn faster with larger values of the decay parameter. On the other hand, LC-ETD3($\beta$) is not very sensitive to the value of $\beta$ (Figure \ref{fig:exp2_sub_cetd3}).

Figure \ref{fig:exp2_sub_sensitivity} summarizes the above results. We can see that even in the high variance setting, LC-ETD instances are still better: They converge faster to the lowest error and are less sensitive to the decay parameter compared to ETD($\beta$).


\subsection*{The Bias-Variance Trade-Off}

We now analyze the bias-variance trade-off that $\beta$ and $\nu$ control. We first analyze the bias and variance of the trace $F_t$ in Eq. (\ref{math:general_followon_trace}) for different algorithms. Ideally, the expectation of $F_t$ given $S_t=s$ should be $\frac{\bP_\pi(S_t=s)}{\bP_\mu(S_t=s)}$, which converges to the density ratio in the limit and corrects the distribution of the update back to the on-policy distribution. Full-IS-TD achieves a zero bias but has the highest variance. LC-ETD instances have a relatively lower variance and a non-zero bias that will converge to zero asymptotically. On the other hand, Scaled ETD($\beta$) exhibits an even lower variance but a persistent bias. We conducted experiments on the Two-state task, sampling $100{,}000$ trajectories of length $30$ to estimate the bias and variance of $F_t$, as shown in Figure \ref{fig:exp3_trade_off}. Further details and discussions can be found in the appendix.

Figure \ref{fig:exp3_trade_off} shows that increasing the decay parameter reduces the bias and increases the variance for all algorithms but with different speeds of change. LC-ETD1($\beta$) exhibits symmetric bias and variance curves, with the lowest variance and the highest bias at $\beta=0$ (Off-policy TD), and the lowest bias and the highest variance at $\beta=1$ (Full-IS-TD). Scaled ETD($\beta$) also connects Off-policy TD with Full-IS-TD, but it is not consistent and only becomes less biased as $\beta$ increases. It is also worth mentioning that its bias is persistent, while the bias of LC-ETD instances will fade away as more time steps are given. Additionally, LC-ETD2($\nu$) and LC-ETD3($\beta$) combined also form a polygonal line connecting Off-policy TD and Full-IS-TD. The bias and variance curves of the two algorithms combined form a similar shape to that of LC-ETD1($\beta$) but much wider. As a result, these two algorithms are less sensitive to the decay parameter but risk not achieving the best trade-off. Generally, LC-ETD2($\nu$) holds the best trade-off point in tasks with high variance.

\begin{figure}[t!]
  \centering
    \includegraphics[width=0.39\textwidth]{figures/bias_variance_ideal}
  \caption{Bias-variance trade-off of different algorithms. The y-axis shows the normalized bias and variance of $F_t$.}
  \label{fig:exp3_trade_off}
\end{figure}

We next look at how the updates at the two states are actually weighted in the experiment on the Two-state task. We calculate the ratio of $F_t$'s averages at the two states for every $1{,}000$ steps. Then we compute the absolute error of this ratio to the ratio of the density ratios at the two states as a measure of how effective $F_t$ is in reweighting the update. We refer to this error as the \textit{ratio error} in the remaining text. We use the same data that generates Figure \ref{fig:exp1_curve} and show the resulting ratio errors in logarithmic scale in Figure \ref{fig:exp5_trade_off2}, which are averaged over $100$ runs. The shaded region near each curve represents the standard error, which is unnoticeable.

From Figure \ref{fig:exp5_trade_off2}, we can see that the level of the ratio error has a positive correlation with $\overline{\text{RMSVE}}$ plotted in Figure \ref{fig:exp1_curve}. For LC-ETD instances with the decay parameter in the interval $[0.2, 0.8]$, their ratio errors are among the lowest. For Full-IS-TD (LC-ETD1($\beta$) and LC-ETD3($\beta$) with $\beta=1$), its ratio error is very unstable at the beginning and then quickly settles at the same level as Off-policy TD (LC-ETD1($\beta$) with $\beta=0$ and LC-ETD2($\nu$) with $\nu=0$). This is because its $F_t$, the full IS-ratio product, diminishes to near zero after some steps, resulting in the ratio of $F_t$ remaining at $1$ due to numerical issues. On the other hand, the ratio error of ETD($\beta$) becomes smaller and noisier as $\beta$ increases. Note that although the ratio error of ETD($\beta$) when $\beta=1$ is also among the lowest, its performance is extremely unstable, as shown in Figure \ref{fig:exp1_sub_etdlb}. This is because the magnitude of $F_t$ at both states is enormous despite the small ratio error.

In summary, the analysis illustrates how the decay parameters $\beta$ and $\nu$ affect the bias and variance of $F_t$, providing insights into the properties of the corresponding algorithms.


\section{Conclusions and Discussion}\label{sec:conclusion}

\begin{figure}[t!]
  \centering
    \includegraphics[width=0.39\textwidth]{figures/bias_variance_time}
  \caption{Ratio errors under different states in the experiment on the Two-state task. The y-axis shows the ratio error.}
  \label{fig:exp5_trade_off2}
\end{figure}

In this paper, we first introduced Average Emphatic TD (AETD($\lambda$)), a new consistent off-policy algorithm. To attain a smooth bias-variance trade-off, we unified AETD($\lambda$) with some existing algorithms \citep{precup2000eligibility, precup2001off, sutton2016emphatic, hallak2016generalized}. The resulting unified algorithm contains a new family of consistent algorithms, Loosely Consistent Emphatic TD (LC-ETD($\lambda$, $\beta$, $\nu$)), which has several desired theoretical and empirical properties. Firstly, different from ETD($\lambda$, $\beta$), LC-ETD($\lambda$, $\beta$, $\nu$) is guaranteed to be stable regardless of the values of its parameters. Secondly, while ETD($\lambda$, $\beta$) has a biased fixed point, LC-ETD($\lambda$, $\beta$, $\nu$) has the same fixed point as On-policy TD($\lambda$). Thirdly, the bias-variance trade-off that its parameters control makes LC-ETD($\lambda$, $\beta$, $\nu$) practical, providing an effective remedy to Full-IS-TD($\lambda$), previously the only consistent method. To our knowledge, LC-ETD($\lambda$, $\beta$, $\nu$) is the \textit{first practical, consistent} algorithm for off-policy TD learning under general linear function approximation. By constraining LC-ETD($\lambda$, $\beta$, $\nu$)'s decay parameters, we obtained its three instances with the same number of parameters as ETD($\lambda$, $\beta$). Experimental results on a didactic example and a complex task with high variance showed a competitive performance of the instances, validating the effectiveness and practicality of LC-ETD($\lambda$, $\beta$, $\nu$).

Despite having the ability to control a smooth bias-variance trade-off, LC-ETD($\lambda$, $\beta$, $\nu$) still suffers from high variance to some degree (see the appendix). This issue is inherent to all importance-sampling-based methods including ETD($\lambda$, $\beta$) and Full-IS-TD($\lambda$). Potential treatments include periodically restarting or truncating the followon trace \citep{guan2021per, zhang2022truncated} and learning an expected followon trace \citep{zhang2020provably,jiang2022learning}. Investigating these and new approaches to further reduce the variance is one direction to be explored. Another unanswered question is the convergence of LC-ETD($\lambda$, $\beta$, $\nu$). As with ETD($\lambda$) \citep{sutton2016emphatic}, we have provided the stability guarantee of LC-ETD($\lambda$, $\beta$, $\nu$), which is an important necessary condition of its convergence. Similar to proving the convergence of ETD($\lambda$) \citep{yu2016weak}, significant technical challenges may arise in proving the convergence of LC-ETD($\lambda$, $\beta$, $\nu$). Thus, we leave it for future work.


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    This research is supported by the Canada CIFAR AI Chairs program, the Reinforcement Learning and Artificial Intelligence (RLAI) laboratory, the Alberta Machine Intelligence Institute (Amii), and the Natural Sciences and Engineering Research Council (NSERC) of Canada. Jiamin He also gratefully acknowledges Richard S. Sutton and Huizhen Yu for valuable discussions and thanks the Digital Research Alliance of Canada for providing computational resources.

\end{acknowledgements}

% References
\bibliography{he_35}
\end{document}
 