%%%%%%%%%%%%%%%%%% packages for UAI submission %%%%%%%%%%%%%%%%%%
% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; the optional <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% xr pkg
\usepackage{xr-hyper}
% \addFileDependency{<file>}: record <file> (name including extension) as a
% dependency of this document. It
%   1. writes "(<file>)" to the terminal/log via \typeout,
%   2. appends <file> to the kernel file list via \@addtofilelist, and
%   3. writes "No file <file>." when the file cannot be found.
% Every body line ends with % so that the line breaks do not turn into space
% tokens if the macro is ever invoked outside the preamble.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: calls \externaldocument on <basename> and
% then registers both <basename>.tex and <basename>.aux through
% \addFileDependency.
% NOTE(review): presumably the registration is there so that build tools
% re-run this document when the external one changes -- confirm against the
% build setup.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{xiong_597-supp}

% \usepackage{xcite} 
% \externaldocument{xiong_597-supp-xr}

\usepackage{hyperref}       % hyperlinks

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

%%%%%%%%%%%%%%%%%% additional packages %%%%%%%%%%%%%%%%%
\usepackage{amsmath} % assumes amsmath package installed
\usepackage{amssymb}  % assumes amsmath package installed
\usepackage{amsthm} % begin{proof}
% \usepackage{subcaption}
\usepackage{caption}
\usepackage{comment} 

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{fact}{Fact}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algorithmic}
% \urlstyle{same}
\usepackage{cleveref}
\usepackage{multirow}
\usepackage{hhline}

\usepackage{dirtytalk}
% ---- Notation shortcuts (all intended for math mode) ----
% Probability and expectation operators.
\newcommand{\mP}{\mathbb P}
\newcommand{\mE}{\mathbb E}
% Calligraphic letters.
\newcommand{\mcb}{\mathcal B}
\newcommand{\mcs}{\mathcal S}
\newcommand{\mca}{\mathcal A}
\newcommand{\mf}{\mathcal F}
\newcommand{\mcv}{\mathcal V}
% Upright capital Greek letters. \mathrm replaces the obsolete {\rm ...}
% font switch; the outer braces keep each symbol a single atom.
\newcommand{\mcphi}{{\mathrm{\Phi}}}
\newcommand{\mcxi}{{\mathrm{\Xi}}}
\newcommand{\mcpi}{{\mathrm{\Pi}}}
% Auto-sized norms and absolute value. The "TV" subscript is a label, not a
% product of variables, so it is set upright.
\newcommand{\lTV}[1]{\left\|#1\right\|_{\mathrm{TV}}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\lone}[1]{\left|#1\right|}
\newcommand{\lF}[1]{\left\|#1\right\|_F}
\newcommand{\linf}[1]{\left\|#1\right\|_\infty}
% Auto-sized (...), [...] and {...} around the argument.
\newcommand{\parentheses}[1]{\left(#1\right)}
\newcommand{\brackets}[1]{\left[#1\right]}
\newcommand{\cur}[1]{\left\{#1\right\}}

\allowdisplaybreaks[3]

\title{Deterministic Policy Gradient: Convergence Analysis}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<xiong.309@osu.edu>?Subject=Your UAI 2022 paper}{Huaqing Xiong}\thanks{Equal contribution}{}}
\author[1]{Tengyu Xu$^*$}
\author[2]{Lin Zhao}
\author[1]{Yingbin Liang}
\author[3]{{Wei Zhang}\thanks{Corresponding author}{}}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    The Ohio State University\\
    Columbus, Ohio, USA
}
\affil[2]{%
    Department of Electrical and Computer Engineering\\
    National University of Singapore\\
    Singapore, Republic of Singapore
}
\affil[3]{%
    Department of Mechanical and Energy Engineering\\
    Southern University of Science and Technology (SUSTech)\\
    Shenzhen, Guangdong, China
  }

\begin{document}
\maketitle

\begin{abstract}
  The deterministic policy gradient (DPG) method proposed in \cite{silver2014deterministic} has been demonstrated to exhibit superior performance particularly for applications with multi-dimensional and continuous action spaces. However, it remains unclear whether DPG converges, and if so, how fast it converges and whether it converges as efficiently as other PG methods. In this paper, we provide a theoretical analysis of DPG to answer those questions. We study the single timescale DPG (often the case in practice) in both on-policy and off-policy settings, and show that both algorithms attain an $\epsilon$-accurate stationary policy up to a system error with a sample complexity of $\mathcal{O}(\epsilon^{-2})$. Moreover, we establish the convergence rate for DPG under Gaussian noise exploration, which is widely adopted in practice to improve the performance of DPG. To the best of our knowledge, this is the first non-asymptotic convergence characterization for DPG methods.

\end{abstract}

\section{Introduction}

Reinforcement learning (RL) has achieved tremendous success so far in many applications such as playing video games~\citep{mnih2013playing}, bipedal walking~\citep{castillo2018reinforcement} and online advertising~\citep{pednault2002sequential}, to name a few. The central aim of RL is to learn a policy that maximizes a cumulative reward for a task via the interaction with the environment. To this end, one popular method is to directly parameterize the policy and then optimize over the parameter space via (stochastic) gradient descent, which is referred to as the policy gradient (PG) algorithm \citep{williams1992simple}. More variants of policy gradient have been developed to further improve the performance, including natural policy gradient (NPG) \citep{kakade2002natural}, trust region policy optimization (TRPO) \citep{schulman2015trust}, proximal policy optimization (PPO) \citep{schulman2017proximal}, actor-critic (AC) \citep{konda1999actor,konda2000actor}, Asynchronous Advantage Actor-Critic (A3C) \citep{mnih2016asynchronous}, Soft Actor-Critic (SAC) \citep{haarnoja2018soft}, etc.


%The vanilla PG is known to suffer from high variance and inefficient sampling. To improve its performance, more advanced policy-based RL algorithms have been proposed. Among them, one of the most effective methods is the actor-critic (AC) type of algorithms proposed in \citet{konda1999actor,konda2000actor}. AC uses critic to estimate the (action) value function, and uses actor to update the policies. Such an AC type architecture has shown great improvement in the variance control and convergence speed \citep{konda2000actor}, and thus has motivated more variants such as Asynchronous Advantage Actor-Critic (A3C) \citep{mnih2016asynchronous}, Soft Actor-Critic (SAC) \citep{haarnoja2018soft}, etc.

All the aforementioned PG algorithms adopt stochastic policies where the policy is modeled as a probability distribution over the action space. Rather, many RL applications have multi-dimensional {\bf continuous} action spaces, for which 
%the policy gradient for deterministic policies can usually be estimated more efficiently than stochastic policy gradient \citep{silver2014deterministic}. 
% Accordingly, deterministic actor-critic (DPG) methods
%Accordingly, 
{\bf deterministic} policy gradient (DPG) algorithms have been proposed and demonstrated to significantly outperform stochastic PG algorithms in \citet{silver2014deterministic}.
Motivated by this, \citet{lillicrap2015continuous} combined DPG with DQN and proposed Deep Deterministic Policy Gradient (DDPG), which extends DQN in discrete action space to a continuous setting. 
Later, DDPG has also gained great success in distributional \citep{maron2018distributed} and multi-agent \citep{lowe2017multi} scenarios. Although DPG and its variants exhibit superior performance in practice, the theoretical understanding of its convergence is rather limited. In fact, the only attempt was made in \citet{kumar2020zeroth}, which provided the convergence results for a modified zeroth-order DPG algorithm. However, what is used commonly in practice is the DPG algorithm originally proposed in \citet{silver2014deterministic}, for which the convergence guarantee remains open.

%by  existing theoretical result on deterministic policies is for a zeroth-order DPG algorithm \citep{kumar2020zeroth} which does not use the more practical AC architecture as DPG algorithms proposed in \citet{silver2014deterministic}. Therefore, our question naturally arises.
%\begin{list}{$\bullet$}{\topsep=0.ex \leftmargin=0.3in \rightmargin=0.in \itemsep =0.02in}
%\item Can we provide convergence guarantee for DPG algorithms?
%\end{list}

%To study DPG algorithms, we are interested in both the on-policy and off-policy versions which have been proposed and justified in \citet{silver2014deterministic}. In practice, there are some important implementation techniques to guarantee the success of DPG algorithms, such as mini-batch samples, noised exploration and alternative updates of the critic and actor with constant learning rates.
% (i.e., single timescale architecture).

%In this paper, we focus on addressing the above problem and study the finite-sample performance for DPG using the above practical techniques.

In fact, the convergence theory of DPG does not follow from that for stochastic PG algorithms due to a few unique features that DPG has. (a) The policy gradient in DPG takes a very different form from that in PG, and admits different compatibility for function approximation and consequently different actors to estimate. There is thus no guarantee that such an update rule converges as PG does. (b) Although the policy is deterministic, practical implementations of DPG introduce {\bf stochastic noisy} sampling to improve exploration. There is no previous theory on such a mixed deterministic policy update with noisy sampling for exploration. (c) DPG takes alternative simultaneous updates between critic and actor with constant learning rates for both. Previous analysis for stochastic PG typically requires sufficiently fast update for critic so that its tracking error can be (asymptotically) decoupled from actor's convergence error. Such analysis is not applicable to DPG. 

{\em Thus, the goal of this paper is to develop new tools to address the aforementioned challenges and provide the first finite-sample convergence guarantee for DPG algorithms in \citet{silver2014deterministic}.}

\subsection{Main Contribution}

The main contribution of this work lies in establishing the first finite-sample analysis for both on-policy and off-policy DPG algorithms proposed in \citet{silver2014deterministic}.
%under a practical scheme adopting constant learning rates.

For the on-policy setting, we study DPG-TD, which uses the compatible approximation rule given by \citet{silver2014deterministic} to update actor, and adopts temporal difference (TD) learning with linear function approximation to update critic. We show that DPG-TD finds a stationary point (up to a system error)
%that widely exists in actor-critic type algorithms 
with a sample complexity of $\mathcal{O}(\epsilon^{-2})$.
In addition, we also show that noised DPG-TD (NoiDPG-TD) which uses a mixture of noisy exploration and the deterministic policies can also achieve a sample complexity of $\mathcal{O}(\epsilon^{-2})$.
For the off-policy setting, we study DPG-TDC, which uses TD with gradient correction (TDC) to update critic under off-policy data and show that DPG-TDC also achieves the convergence with a sample complexity of $\mathcal{O}(\epsilon^{-2})$.
%to address the divergence issue of off-policy TD(0) with linear function approximation, we focus on mini-batch DPG-TDC which uses TD with gradient correction (TDC) to update the critic instead. DPG-TDC is also proved to converge in a sample complexity of $\mathcal{O}(\epsilon^{-2})$.

Our results bring the following insights to the understanding of DPG. (a) Our sample complexity of DPG matches the best known actor-critic (AC) type PG in \citet{xu2020improving}. This implies that although the policy gradient of DPG is more challenging to estimate, the compatible function approximation for DPG in \citet{silver2014deterministic} is as efficient as that for PG, which yields the same complexity for DPG. (b) DPG achieves the same sample complexity for more challenging continuous and possibly unbounded action space whereas the known theorems for PG typically require the bounded action space except for Gaussian policy. (c) The noisy exploration does not cause higher sample complexity. (d) The simultaneous updates for actor and critic without sufficient estimation accuracy for critic still yield convergence without causing more sample complexity. 

Technically, our analysis develops the following novel techniques to handle the unique challenges arising due to deterministic policies. (a) We develop a new analysis to bound the estimation error of the Fisher information of deterministic policy arising via the compatibility property, and then further capture how such a metric affects the convergence via its minimum eigenvalue. 
%Our techniques can also effectively handle the unique compatibility estimator for DPG algorithms. 
(b) We develop a new tool to analyze the coupled actor and critic's stochastic approximation processes, due to their simultaneous updates both with constant stepsizes (which are commonly used in practice). Previous analysis of (stochastic) PG-type algorithms including AC algorithms mainly decouples the critic's error from actor's error either by sufficient updates of critic before each actor's update \citep{wang2019neural,kumar2019sample,qiu2019finite,xu2020improving}, or by updating critic much faster than actor via two timescale learning rates \citep{wu2020finite,xu2020non}. Our analysis allows the coupling between the two and develops the idea to cancel the critic's coupling error by the actor's overall positive progress to the stationary policy. 

\subsection{Related Work}

% Due to the extensive theoretical studies of PG and AC, we review only the most relevant work below.

\textbf{Convergence of DPG and its variants}: Since proposed and practically justified in \citet{silver2014deterministic}, DPG has inspired many variants and gained great success. However, there is almost no theoretical study of DPG with only one exception in \citet{kumar2020zeroth}, which provided the convergence guarantee for a zeroth-order DPG rather than the original form of DPG used widely in practice. Our work aims to provide the finite-sample convergence guarantee for DPG in its original practical form \citep{silver2014deterministic}. 
%However, \citet{kumar2020zeroth} studied a zeroth-order deterministic policy gradient algorithm instead of the more practical actor-critic type algorithms. This work aims to explore the open topic on the convergence of DPG algorithms adopting practical schemes.

\textbf{Convergence of stochastic PG}: The vanilla PG is a fundamental policy-based RL algorithm. Its asymptotic convergence has been established in \citet{williams1992simple,baxter2001infinite,sutton2000policy,kakade2002natural,pirotta2015policy,tadic2017asymptotic} via modeling PG as stochastic approximation (SA). 
PG has been shown to find the optimal policy under convex policy function approximation \citep{bhandari2019global} or in some specific applications such as LQR \citep{fazel2018global,malik2018derivative,tu2018gap}. 
Convergence of (N)PG under a more general function approximation has been also provided in \citet{shen2019hessian,papini2017adaptive,papini2018stochastic,papini2019smoothing,xu2019improved,xu2020sample,zhang2019global,agarwal2019theory,karimi2019non,wang2019neural,cen2020fast}. 
% Convergence results has been also established for the variants of PG, such as TRPO/PPO~\citep{shani2019adaptive,liu2019neural}. In this paper, we focus on analyzing another PG variant: actor-critic type algorithms.

\textbf{Convergence of stochastic actor-critic}: The AC algorithm was proposed in \citet{konda1999actor}, and since then has aroused wide interest in understanding its convergence. \citet{konda2000actor,konda_2002,peters2008natural,bhatnagar2008incremental,bhatnagar2009natural,bhatnagar2010actor,castro2010convergent,maei2018convergent} established the asymptotic convergence for (natural) AC. 
The non-asymptotic convergence for (N)AC has been also explored recently. Under a double-loop setting, where critic can run sufficiently many iterations before updating actor, the convergence rate for (N)AC has been characterized in \citet{yang2019global,wang2019neural,kumar2019sample,qiu2019finite,xu2020improving}. Under a two-timescale setting, where critic and actor update simultaneously but undergo different diminishing learning rates, the convergence rate has been provided in \citet{wu2020finite,xu2020non,hong2020two,zhang2020provably,shen2020asynchronous}.
% In practice, the single timescale framework is more popular for AC because of its ease for implementation. The only existing non-asymptotic convergence for single timescale AC was given in \citet{fu2020single}. 
Under the single timescale setting, where critic and actor are simultaneously updated with constant learning rates, the convergence rate was given in \citet{fu2020single}. Differently from the above stochastic PG and stochastic AC, our work studies the deterministic PG algorithms. The update structure of our work is the same as that in \citet{fu2020single}, but we adopt the more practical TD updates for critic rather than LSTD in \citet{fu2020single}, which causes substantial difference in our analysis besides the deterministic policy.

\section{Preliminary}

% In the section, we introduce necessary background.

\subsection{Problem Setup}

We consider the standard RL settings where an agent interacts with a stochastic environment. Such a system is usually modeled as a discrete-time discounted Markov Decision Process (MDP) which is represented by a tuple $(\mcs,\mca,P,r, \gamma)$. Here, $\mcs$ denotes the state space, $\mca$ denotes the action space, $P:\mcs\times \mca \times \mcs\to [0,1]$ denotes the transition kernel for the state transitions, i.e., $P(s'|s, a)$ represents the probability that the system takes the next state $s'\in \mcs$ given the current state $s$ and action $a$; $r: \mcs\times \mca\to[0,R_{\max}]$ is the reward function mapping the state-action pairs to a bounded subset of $\mathbb{R}$, and $\gamma\in (0,1)$ is the discount factor. 

Here, we consider a deterministic policy $\mu_{\theta}$ parameterized by $\theta\in\mathbb R^d$, namely, given the current state $s$, the policy follows a deterministic function mapping to generate an action $a=\mu_{\theta}(s)$. We also assume that the Markov chains generated by the policies are ergodic throughout this paper.

\subsection{On-Policy Deterministic Policy Gradient}

We first consider the on-policy case where the interaction with the environment (i.e., the sampling) can follow the instantaneous target policy $\mu_{\theta}$.

The goal is to maximize the expected cumulative reward in the infinite-horizon case given by
\begin{align}
    J(\mu_{\theta}) = \int_{\mcs}\nu_{\mu_{\theta}}(s) r(s,\mu_{\theta}(s)) ds = \mE_{s\sim\nu_{\mu_{\theta}}} \brackets{r(s,\mu_{\theta}(s))}, \label{eq:lossJ} 
\end{align}
where $\nu_{\mu}(s')=\int_{\mcs}\sum_{t=0}^\infty \gamma^{t} p_0(s)p(s\rightarrow s',t,\mu)ds$ is the (improper) discounted state visitation distribution, $p_0$ denotes the initial state density, and $p(s\rightarrow s',t,\mu)$ denotes the density at state $s'$ after $t$ steps from state $s$ under policy $\mu$.
In the remainder of this paper, we denote $J(\theta):= J(\mu_{\theta})$ and $\nu_{\theta}:=\nu_{\mu_{\theta}}$ for brevity.

One popular method to optimize the loss function defined in \eqref{eq:lossJ} is to use gradient-based algorithms such as stochastic gradient descent (SGD).
To this end, the gradient of the loss function has been given by the so-called deterministic policy gradient theorem~\citep{silver2014deterministic} as follows:
\begin{align}
    \nabla J(\theta) &= \int_{\mcs} \nu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}ds \nonumber\\
    &= \mE_{\nu_{\theta}} \brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)} }. \label{eq:dpgthm}
\end{align}

The policy gradient theorem for deterministic policies suggests a way to estimate the gradient via sampling, and then model-free policy gradient algorithms can be developed by following SGD updates 
%that applies such gradient estimation 
for optimizing over policies.
%we can use sampling to estimate the true gradient, and then directly optimize the policies via SGD. 
The difficulty of estimating the policy gradient $\nabla J(\theta)$ in \eqref{eq:dpgthm} lies in approximating $\nabla_a Q^{\mu_{\theta}}(s,a)$. 
% Typically, a good estimator $Q^w(s,a)$ (where $w$ is the parameter for estimating $Q$-function) of $Q^{\mu_{\theta}}(s,a)$ may not necessarily imply that $\nabla_a Q^w(s,a)$ also serves as a good approximation of $\nabla_a Q^{\mu_{\theta}}(s,a)$.
To address this difficulty, the compatible function approximation was established in \citet{silver2014deterministic} which guarantees that $\nabla_a Q^{\mu_{\theta}}(s,a)$ can be replaced by $\nabla_a Q^w(s,a)$ in the policy gradient. We state such a property below, which is critical for designing the deterministic policy gradient algorithms.
\begin{proposition}\label{prop:compatibility}
(Compatible function approximation \citep{silver2014deterministic}) A function estimator $Q^w(s,a)$ is compatible with a deterministic policy $\mu_{\theta}$, i.e., $\nabla J(\theta)=\mE_{\nu_{\theta}} \brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{w}(s,a)|_{a=\mu_{\theta}(s)} }$, if it satisfies the following two conditions:
\begin{enumerate}
    \item $\nabla_a Q^{w}(s,a)|_{a=\mu_{\theta}(s)}=\nabla_{\theta}\mu_{\theta}(s)^T w$;
    \item $w=w^*_{\xi_{\theta}}$ minimizes the mean square error $\mE_{\nu_{\theta}}\brackets{\xi(s;\theta,w)^T\xi(s;\theta,w)}$, where  $\xi(s;\theta,w) \!=\! \nabla_a Q^{w}(s,a)|_{a=\mu_{\theta}(s)} \!-\! \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}$.
\end{enumerate}
\end{proposition}

Following the compatibility property, the deterministic policy gradient can be rewritten as 
\begin{equation}\label{eq:compDPG}
    \nabla J(\theta) = \mE_{\nu_{\theta}} \brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T w^*_{\xi_{\theta}} },
\end{equation}
where $w^*_{\xi_{\theta}}$ can be approximated easily by solving a regression problem.

\subsection{Off-Policy Deterministic Policy Gradient}

In practice, it is often convenient to estimate the policy gradient via sampling under a behavior policy $\beta$, which is different from the target policy $\mu_{\theta}$.
%is different from the behavior policy $\beta$ which is used to sample trajectories.
%
\citet{silver2014deterministic} has also provided the deterministic policy gradient theorem for such an off-policy case, which is given by
\begin{align}
    \nabla J_{\beta}(\theta) &= \int_{\mcs} \nu_{\beta}(s) \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}ds \nonumber\\
    &= \mE_{\nu_{\beta}} \brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)} }, \label{eq:dpgthmOffPolicy}
\end{align}
where $\nu_{\beta}$ is the state visitation measure of the policy $\beta$. Correspondingly, the compatible form is given by
\begin{equation}\label{eq:compDPGOffPolicy}
    \nabla J_{\beta}(\theta) = \mE_{\nu_{\beta}} \brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T w^*_{\beta,\xi_{\theta}} },
\end{equation}
where $w^*_{\beta,\xi_{\theta}}=\arg\min_w\mE_{\nu_{\beta}}\brackets{\xi(s;\theta,w)^T\xi(s;\theta,w)}$ and $\xi(s;\theta,w)$ holds the same form as in \Cref{prop:compatibility}.

\section{On-Policy DPG Algorithm}


In this section, we first describe the on-policy DPG algorithm proposed in \citet{silver2014deterministic} and then provide the finite-sample convergence result for this algorithm.


\subsection{Algorithm}


In \citet{silver2014deterministic}, a compatible DPG algorithm using TD critic update was proposed, which we call DPG-TD and describe in \Cref{alg:onPolicyDPG}. This algorithm introduces a critic parameter $w$ to estimate the gradient of Q-function based on the compatibility property. At each iteration, $w$ is updated by TD with a linear function approximator $Q^w(s,a)=\phi(s,a)^Tw$ (line 9 of \Cref{alg:onPolicyDPG}).
% based on the samples $\{s_{t,j}\}_{j=0,\dots,M-1}$ generated by the stationary distribution $d_{\theta_t}$ corresponding to the policy $\mu_{\theta_t}$. 
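The algorithm uses $\theta$ as an actor parameter to update the policy (line 13 in \Cref{alg:onPolicyDPG}) based on the compatibility property. Here, $d_{\theta}$ denotes the stationary distribution of the Markov chain induced by the policy $\mu_{\theta}$.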
% an independent batch of states sampled by the state visitation measure. 

\begin{algorithm}[h]
 	\caption{DPG-TD} \label{alg:onPolicyDPG} 
 	\begin{algorithmic}[1]
 		\STATE 	{\bf Input:}   $\alpha_{w}, \alpha_{\theta}, w_0, \theta_0$, batch size $M$.
		\FOR{ $t=0, 1, \ldots, T $}
		\FOR{ $j=0, 1, \ldots, M-1 $}
		\STATE Sample $s_{t,j} \sim d_{\theta_t}$. Generate $a_{t,j} = \mu_{\theta_t}(s_{t,j})$.
		\STATE Sample $s_{t+1,j} \sim P(\cdot|s_{t,j}, a_{t,j}) \text{ and } r_{t,j}$. Generate $a_{t+1,j} = \mu_{\theta_t}(s_{t+1,j})$.
		\STATE Denote $x_{t,j} = (s_{t,j}, a_{t,j})$.
		\STATE $\delta_{t,j} = r_{t,j} + \gamma\phi(x_{t+1,j})^T w_t - \phi(x_{t,j})^T w_t$.
		\ENDFOR
		\STATE $w_{t+1} = w_t + \frac{\alpha_{w}}{M}\sum_{j=0}^{M-1}\delta_{t,j}\phi(x_{t,j})$.
		\FOR{ $j=0, 1, \ldots, M-1 $}
		\STATE Sample $s'_{t,j} \sim \nu_{\theta_t}$. 
		\ENDFOR
		\STATE $\theta_{t+1} = \theta_t + \frac{\alpha_{\theta}}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t $.
		\ENDFOR
 	\end{algorithmic}
\end{algorithm}



\subsection{Technical Assumptions}\label{subsec:assumptions}

Before providing the result, we first introduce technical assumptions, all of which are standard or necessary mild regularity requirements.
\begin{assumption}\label{asp:policy}
For any $\theta_1,\theta_2,\theta \in\mathbb R^d$, there exist positive constants $L_{\mu}, L_{\psi}$ and $\lambda_{\Psi}$, such that
(1) $\norm{\mu_{\theta_1}(s)-\mu_{\theta_2}(s)}\leq L_{\mu}\norm{\theta_1-\theta_2}, \forall s\in\mcs$; 
(2) $\norm{\nabla_{\theta}\mu_{\theta_1}(s)-\nabla_{\theta}\mu_{\theta_2}(s)}\leq L_{\psi}\norm{\theta_1-\theta_2}, \forall s\in\mcs$; (3) the matrix $\Psi_{\theta}:=\mE_{\nu_{\theta}}\brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T}$ (which we call the {\bf Fisher information of deterministic policy}) is non-singular with the minimal eigenvalue uniformly lower-bounded as $\sigma_{\min}(\Psi_{\theta})\geq\lambda_{\Psi}$.
%with some positive constant $\lambda_{\Psi}$.
\end{assumption}

The first two statements can be easily satisfied for properly parameterized policy classes 
%consisting of smooth policies 
such as the linear approximator used in \citet{silver2014deterministic} and the smooth neural network class. The last one ensures that $w^*_{\xi_{\theta}}$ defined in \Cref{prop:compatibility} is solvable and unique. A similar assumption has been also used in \citet{liu2020improved}.

\begin{assumption}\label{asp:environment}
For any $a_1,a_2\in\mca$, there exist positive constants $L_P,L_r$, such that
(1) the transition kernel satisfies $|P(s'|s,a_1)-P(s'|s,a_2)|\leq L_P\norm{a_1-a_2}, \forall s,s'\in\mcs$;
(2) the reward function satisfies $|r(s,a_1)-r(s,a_2)|\leq L_r\norm{a_1-a_2}, \forall s\in\mcs$.
\end{assumption}

In Assumption \ref{asp:environment}, the first statement is standard in the theoretical studies of RL \citep{bertsekas1975convergence,chow1991optimal,dufour2013finite,dufour2015approximation}, where the transition kernel is assumed to be Lipschitz continuous with respect to (w.r.t.) both state and action. \citet{shah2018q} relaxes the Lipschitz continuity to be only w.r.t.\ state when considering a continuous state space. In this paper, we need the Lipschitz continuity to hold only w.r.t.\ action, because DPG algorithms are commonly used for continuous action space. The second statement can be easily satisfied for a properly defined reward function.

\begin{assumption}\label{asp:Qsmooth}
For any $a_1,a_2\in\mca$, there exists a positive constant $L_Q$, such that $\norm{\nabla_a Q^{\mu_{\theta}}(s,a_1) \!-\! \nabla_a Q^{\mu_{\theta}}(s,a_2)}\leq L_Q\norm{a_1-a_2}, \forall \theta\in\mathbb R^d, s\in \mcs$.
\end{assumption}

Assumption \ref{asp:Qsmooth} indicates that the Q-function is smooth over action, which is a standard assumption in deterministic policy related studies \citep{kumar2020zeroth}, and is also known as a principle to mitigate overfitting in the value estimation for actor-critic algorithms \citep{fujimoto2018addressing}.

% \begin{assumption}\label{asp:ergodic}
% The Markov chains generated by any policy are ergodic.
% \end{assumption}

% This assumption is also standard and has been widely used in RL studies \citep{Bhandari2018finite,xu2019twotimescale,xu2020sample,xu2020improving,xiong2020amsgradRL}.

% \begin{assumption}\label{asp:PsiNonsingular}
% The matrix $\Psi_{\theta}\!=\!\mE_{\nu_{\theta}}\brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T}$ is non-singular. Its minimal eigenvalue is uniformly lower bounded, that is, $\sigma_{\min}(\Psi_{\theta})\geq\lambda_{\Psi}$ with some positive constant $\lambda_{\Psi}$.
% \end{assumption}

% Assumption \ref{asp:PsiNonsingular} ensures that $w^*_{\xi_{\theta}}$ defined in \Cref{prop:compatibility} is solvable and unique. 
% It can be regarded as an analogy to the non-singularity assumption for the Fisher matrix in the studies for natural PG/AC \citep{xu2020improving,xu2020non}.

\begin{assumption}\label{asp:phi}
The feature function $\phi:\mcs\times\mca\rightarrow\mathbb{R}^{ d}$ is uniformly bounded, i.e., $\norm{\phi(\cdot,\cdot)} \leq C_{\phi}$ for some positive constant $C_{\phi}$. In addition, we define $A = \mE_{d_{\theta}}\brackets{\phi(x)(\gamma\phi(x')-\phi(x))^T}$ and $ D=\mE_{d_{\theta}}\brackets{\phi(x)\phi(x)^T}$, and assume that $A$ and $D$ are non-singular. We further assume that the absolute values of the eigenvalues of $A$ are uniformly lower bounded, i.e., $|\sigma(A)|\geq\lambda_{A}$ for some positive constant $\lambda_{A}$.
\end{assumption}

Assumption \ref{asp:phi} is standard in the studies of TD learning with linear function approximation \citep{zou2019finite,wu2020finite,xu2020improving,xu2020non}. This assumption guarantees the solvability of TD learning with linear function approximation. To be more specific, it ensures that $\mE_{d_{\theta}}[\delta\phi] = 0$ has a unique root, namely $w^*_{\theta}$, which is also the global optimum of TD learning for a fixed policy $\mu_{\theta}$.  

% \begin{assumption}\label{asp:systemErrorKappa}
% Let $w^*_{\theta}$ be the global optimum of TD learning given a fixed policy $\mu_{\theta}$ (whose existence has been guaranteed under Assumption \ref{asp:phi}). Let $w_{\xi_{\theta}}^*$ be defined in \Cref{prop:compatibility}. We assume the difference between $w_{\theta}^*, w_{\xi_{\theta}}^*$ is uniformly bounded. That is, there exists a positive constant $\kappa$, such that $\norm{w_{\theta}^* - w_{\xi_{\theta}}^*}\leq \kappa, \forall \theta\in\mathbb R ^d$.
% \end{assumption}

% The term $\norm{w_{\theta}^* - w_{\xi_{\theta}}^*}$ is a system error and determined by two parts. The first one is how good  $Q^w(s,a)$ approximates $Q^{\mu_{\theta}}(s,a)$. The second is how the closeness of Q value estimate implies the closeness between their gradients (i.e., $\nabla_a Q^w(s,a)$ and $\nabla_a Q^{\mu_{\theta}}(s,a)$). 
% Such error terms are closely related to the expressive power of the function approximation class. 
% % The second term can be small if ??.
% Existence of a relatively small $\kappa$ in Assumption \ref{asp:systemErrorKappa} is a prerequisite condition that the compatible DPG algorithms in \citet{silver2014deterministic} can work. In fact, similar system error assumptions are widely existent in other studies of stochastic actor-critic algorithms \citep{bhatnagar2009natural,qiu2019finite,xu2020improving,xu2020non}.

\begin{remark}
With a slight abuse of notation, we assume that Assumptions \ref{asp:policy} and \ref{asp:phi} also hold for the off-policy case, with the expectations taken over the behavior stationary and (improper) visitation distributions $d_{\beta}$ and $\nu_{\beta}$. 
%For brevity, we will not state the assumptions again when only distributions on which the expectations are taken are needed to change (e.g. from $d_{\theta},\nu_{\theta}$ to $d_{\beta},\nu_{\beta}$, respectively).
\end{remark}

\subsection{Convergence Result}

In this subsection, we provide the finite-sample convergence analysis for DPG-TD in \Cref{alg:onPolicyDPG}. 

Note that the deterministic policy gradient $\nabla J(\theta)$ in \eqref{eq:dpgthm} has a different and more challenging form to analyze compared with stochastic PG, which requires the development of several new tools. First, we characterize the Lipschitz property for the deterministic policy gradient, which serves as a crucial step in the finite-sample analysis of DPG-TD. The previous study of DPG in \citet{kumar2020zeroth} takes such a property as an assumption. Here, we formally establish such a Lipschitz property with the proof provided in \Cref{app:proofDPGLipschitz} of the appendix, and characterize the dependence of the deterministic policy gradient on the basic parameters of the MDP.
\begin{lemma}\label{lem:dpglipschitz}
Suppose Assumptions \ref{asp:policy}-\ref{asp:Qsmooth} hold. Then the deterministic policy gradient $\nabla J(\theta)$ defined in \eqref{eq:dpgthm} is Lipschitz continuous with the parameter $L_J$, i.e., $\forall \theta_1, \theta_2 \in \mathbb{R}^d$,
\begin{equation}
    \norm{ \nabla J(\theta_1) - \nabla J(\theta_2) }\leq L_J \norm{\theta_1 - \theta_2},
\end{equation}
where $L_J\!=\!\parentheses{\frac{1}{2}L_PL_{\mu}^2 L_{\nu}C_{\nu} \!+\! \frac{L_{\psi}}{1\!-\!\gamma}}\parentheses{L_r \!+\! \frac{\gamma R_{\max}L_P}{1-\gamma}} \!+\! \frac{L_{\mu}}{1-\gamma}\parentheses{L_QL_{\mu} \!+\! \frac{\gamma}{2} L_P^2R_{\max}L_{\mu}C_{\nu} \!+\! \frac{\gamma L_PL_rL_{\mu}}{1-\gamma}}$.
\end{lemma}

%\Cref{lem:dpglipschitz} was given as an assumption in the only existing deterministic policy gradient study \citep{kumar2020zeroth}. In this paper we provide a detailed proof in \Cref{app:proofDPGLipschitz} which is also the first justification for the Lipschitz property of DPG.

% Next, we further introduce a constant 
% \begin{equation}\label{eq:systemErrorKappa}
%     \kappa:=\max_{\theta}\norm{w_{\theta}^* - w_{\xi_{\theta}}^*},
% \end{equation} 
% where $w^*_{\theta}$ is the global optimum of TD learning given a fixed policy $\mu_{\theta}$ and $w_{\xi_{\theta}}^*$ is defined in \Cref{prop:compatibility}. This constant is a uniform upper bound for the system error term $\norm{w_{\theta}^* - w_{\xi_{\theta}}^*}$ which is determined by two parts. The first part is how good  $Q^w(s,a)$ approximates $Q^{\mu_{\theta}}(s,a)$. The second is how the closeness of Q value estimate implies the closeness between their gradients (i.e., $\nabla_a Q^w(s,a)$ and $\nabla_a Q^{\mu_{\theta}}(s,a)$). 
% Such error terms are closely related to the expressive power of the function approximation class. 
% % The second term can be small if ??.
% Existence of a relatively small $\kappa$ in \eqref{eq:systemErrorKappa} is a prerequisite condition that the compatible DPG algorithms in \citet{silver2014deterministic} can work. In fact, similar system error assumptions are widely existent in other studies of stochastic actor-critic algorithms \citep{bhatnagar2009natural,qiu2019finite,xu2020improving,xu2020non}.

In the following, we provide the convergence guarantee for DPG-TD. Our main technical novelty lies in the development of a new framework to analyze the {\bf coupled} actor and critic's stochastic approximation processes, due to their simultaneous updates both with constant stepsizes. Our central idea is to cancel the critic's cumulative coupling error by the overall positive progress of actor's approach to the stationary policy. This is different from the previous analysis of (stochastic) PG-type algorithms which mainly decouples or asymptotically decouples the critic's error from actor's error. We provide a proof sketch in \Cref{sec:proofonpolicy} with the full proof given in \Cref{app:proofThmOnPolicy} of the appendix.
\begin{theorem}\label{thm:onPolicyDPG}
Suppose that Assumptions \ref{asp:policy}-\ref{asp:phi} hold. Let $\alpha_w \leq \frac{\lambda}{2C_{A}^2}; M\geq\frac{48\alpha_w  C_{A}^2}{\lambda}; \alpha_{\theta} \leq \min\cur{\frac{1}{4L_J}, \frac{\lambda\alpha_w}{24L_hL_w}}$. Then the output of DPG-TD in \Cref{alg:onPolicyDPG} satisfies 
\begin{align}
    \underset{t\in [T]}{\min}\mE\norm{\nabla J(\theta_{t})}^2 \leq \frac{c_1}{T} + \frac{c_2}{M} + c_3\kappa^2,\nonumber
\end{align} 
where $c_1 = \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2,
c_2 = \brackets{48\alpha_w^2(C_A^2C_w^2 + C_b^2) + \frac{48L_w^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda\alpha_w} + 36L_{\mu}^4C_{w_{\xi}}^2, 
c_3 = 18L_h^2 + \frac{24L_w^2L_h^2\alpha_{\theta}^2}{\lambda\alpha_w}$
with $C_A=2C_{\phi}^2, C_b=R_{\max}C_{\phi},  C_w=\frac{R_{\max}C_{\phi}}{\lambda_A}, C_{w_{\xi}}=\frac{L_{\mu}C_Q}{\lambda_{\Psi}(1-\gamma)}, L_w=\frac{L_{J}}{\lambda_{\Psi}}  + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\parentheses{L_{\mu}^2L_{\nu} + \frac{2L_{\mu}L_{\psi}}{1-\gamma}},L_h=L_{\mu}^2, C_Q=L_r + L_P\cdot\frac{\gamma R_{\max}}{1-\gamma}, L_{\nu}=\frac{1}{2}C_{\nu}L_PL_{\mu}$, and $L_J$ defined in \Cref{lem:dpglipschitz}, and we define
\begin{align}
    \kappa := \max_{\theta}\norm{w_{\theta}^* - w_{\xi_{\theta}}^*}. \label{eq:systemErrorKappa}
\end{align}
% where
% \begin{align}
%     c_1 &= \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2\nonumber\\
%     c_2 &= \brackets{48\alpha_w^2(C_A^2C_w^2 + C_b^2) + \frac{48L_w^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda\alpha_w}\nonumber\\
%     &\quad + 36L_{\mu}^4C_{w_{\xi}}^2 \nonumber\\
%     c_3 &= 18L_h^2 + \frac{24L_w^2L_h^2\alpha_{\theta}^2}{\lambda\alpha_w}\nonumber\\
%     \kappa &:= \max_{\theta}\norm{w_{\theta}^* - w_{\xi_{\theta}}^*}, \label{eq:systemErrorKappa}
% \end{align}
% with $C_A=2C_{\phi}^2, C_b=R_{\max}C_{\phi},  C_w=\frac{R_{\max}C_{\phi}}{\lambda_A}, C_{w_{\xi}}=\frac{L_{\mu}C_Q}{\lambda_{\Psi}(1-\gamma)}, L_w=\frac{L_{J}}{\lambda_{\Psi}}  + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\parentheses{L_{\mu}^2L_{\nu} + \frac{2L_{\mu}L_{\psi}}{1-\gamma}},L_h=L_{\mu}^2, C_Q=L_r + L_P\cdot\frac{\gamma R_{\max}}{1-\gamma}, L_{\nu}=\frac{1}{2}C_{\nu}L_PL_{\mu}$, and $L_J$ defined in \Cref{lem:dpglipschitz}.
\end{theorem}

\Cref{thm:onPolicyDPG} indicates that the convergence upper bound consists of three parts. The first term captures the convergence rate and vanishes sublinearly with the number of iterations. The second term captures the variance caused by the stochastic sampling and can be controlled by the batch size. The last term captures the system error $\norm{w_{\theta}^* - w_{\xi_{\theta}}^*}$ which is uniformly bounded by some constant $\kappa$. 
Such a system error includes two parts of the approximation errors. The first part is introduced by the difference between the optimal output Q-function of TD learning and the ground truth Q-function. The second part captures the approximation error due to the fact that none of the linear functions in this class satisfies the compatibility property in \Cref{prop:compatibility}.
In practice, the high capacity of the neural network class can significantly help to reduce such an error and achieve better convergence accuracy.
%Existence of a relatively small $\kappa$ in \eqref{eq:systemErrorKappa} is a prerequisite condition that the compatible DPG algorithms in \citet{silver2014deterministic} can work. 
%In fact, similar system error terms are widely existent in other studies of stochastic actor-critic algorithms \citep{bhatnagar2009natural,qiu2019finite,xu2020improving,xu2020non}.

\Cref{thm:onPolicyDPG} also captures how the Fisher information of the deterministic policy $\Psi_{\theta}:=\mE_{\nu_{\theta}}\brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T}$ affects the convergence rate via its minimum eigenvalue bound $\lambda_{\Psi}$. Such a metric arises due to the compatible function approximation and captures how well the actor estimates the deterministic policy gradient. Clearly, a larger $\lambda_{\Psi}$ indicates a better system condition (smaller $C_{w_{\xi}}$ and $L_w$) and hence a faster convergence.

Based on the convergence rate in \Cref{thm:onPolicyDPG}, we provide the sample complexity of the algorithm as follows.
\begin{corollary}\label{cor:onPolicyDPG}
Suppose that the same assumptions as in \Cref{thm:onPolicyDPG} hold. Then the output of DPG-TD in \Cref{alg:onPolicyDPG} satisfies $\underset{t\in [T]}{\min}\mE\norm{\nabla J(\theta_{t})}^2 \leq \epsilon + c_3\kappa^2$,
% \begin{align}
%     \underset{t\in [T]}{\min}\mE\norm{\nabla J(\theta_{t})}^2 \leq \epsilon + c_3\kappa^2,\nonumber
% \end{align} 
by using the total number of samples $2MT = \mathcal{O}\parentheses{1/{\epsilon^2}}$.
%\begin{align*}
%    2MT = \mathcal{O}\parentheses{\frac{1}{\epsilon^2}}.
%\end{align*}
\end{corollary}

\Cref{cor:onPolicyDPG} shows that DPG-TD attains an $\epsilon$-accurate stationary point (up to the system error) with a sample complexity of $\mathcal{O}(\epsilon^{-2})$. 
To the best of our knowledge, this is the first finite-sample characterization for DPG. 

Although the policy gradient of DPG is more challenging to estimate, the sample complexity of DPG in \Cref{cor:onPolicyDPG} matches the best known result for stochastic PG (with AC scheme) in \citet{xu2020improving}. Furthermore, such a result does not require the critic's update in DPG to accurately track the deterministic policy gradient at each step and hence is practically desirable, whereas the sample complexity guarantee in \citet{xu2020improving} for stochastic PG requires sufficient accuracy of tracking the policy gradient at each iteration.

%Compared with the existing results for stochastic AC with similar settings (infinite-horizon MDP + TD(0) type critic + nonconvex policy function approximation), this result outperforms those of \citet{wu2020finite,xu2020non,wang2019neural,kumar2019sample,qiu2019finite}
%% two timescale AC with a complexity of $\tilde{\mathcal{O}}(\epsilon^{-2.5})$ in \citet{wu2020finite,xu2020non} and most of double loop AC with a complexity of $\mathcal{O}(\epsilon^{-4})$ in \citet{wang2019neural,kumar2019sample} and $\tilde{\mathcal{O}}(\epsilon^{-3})$ in \citet{qiu2019finite}, 
%and matches the best known result in \citet{xu2020improving} which also studies a less practical AC algorithm which in fact decouples the analysis of the critic and actor. 
%% More comparison details on the algorithm architectures and results are seen in \Cref{table:comparison}.
%% those in \citet{wang2019neural,kumar2019sample} (both in a complexity of $\mathcal{O}(1/\epsilon^4)$), \citet{qiu2019finite} (in a complexity of $\tilde{\mathcal{O}}(1/\epsilon^3)$), and matches the best known result in \citet{xu2020improving}.

\textbf{DPG with noisy sampling for exploration.} In practice, the deterministic policy used in \Cref{alg:onPolicyDPG} usually suffers from inefficient exploration. To overcome such an issue, \citet{silver2014deterministic} proposed to use noisy sampling for DPG. To be specific, in lines 4--5 of \Cref{alg:onPolicyDPG}, a noisy policy, e.g., $\pi_{\theta_t}(s) = \mu_{\theta_t}(s) + \mathcal{N}(0,\sigma^2)$, is adopted to generate actions $a_{t,j},a_{t+1,j}$. Correspondingly, the states for the critic's updates in lines 4--5 of \Cref{alg:onPolicyDPG} are generated by the stationary distribution $d_{\pi_{\theta_t}}$ associated with the noisy policy $\pi_{\theta_t}$. The rest of \Cref{alg:onPolicyDPG} is unchanged. We refer to such an algorithm as noisy DPG-TD (NoiDPG-TD). 

Following the same techniques as in the proof of \Cref{thm:onPolicyDPG} and replacing the stationary distribution $d_{\theta_t}$ by $d_{\pi_{\theta_t}}$, we readily obtain the convergence result for NoiDPG-TD as follows.
\begin{corollary}\label{cor:onPolicyNoisedDPG}
Suppose that the same assumptions as in \Cref{thm:onPolicyDPG} hold. Then the output of NoiDPG-TD satisfies $\underset{t\in [T]}{\min}\mE\norm{\nabla J(\theta_{t})}^2 \leq \epsilon + \mathcal{O}\parentheses{\kappa^2}$,
% \begin{align}
%     \underset{t\in [T]}{\min}\mE\norm{\nabla J(\theta_{t})}^2 \leq \epsilon + \mathcal{O}\parentheses{\kappa^2},\nonumber
% \end{align} 
by using the total number of samples 
$\mathcal{O}\parentheses{\epsilon^{-2}}$.
\end{corollary}
In \Cref{cor:onPolicyNoisedDPG}, the system error $\kappa$ is determined by the noisy policy. \Cref{cor:onPolicyNoisedDPG} indicates that the noisy sampling for exploration does not cause higher sample complexity compared to DPG-TD.
% , but improves the stability of DPG in practice. 

\subsection{Proof Sketch of Theorem \ref{thm:onPolicyDPG}}\label{sec:proofonpolicy}

In the following, we outline the proof of \Cref{thm:onPolicyDPG} to highlight our new approach to analyzing the {\bf coupled} actor and critic's stochastic approximation processes, due to their simultaneous updates both with constant stepsizes. The central idea is to cancel the critic's cumulative tracking error by the actor's overall positive progress to the stationary policy, which is different from the existing analysis of (stochastic) PG-type algorithms that mainly decouples or asymptotically decouples the critic's error from actor's error. 
Further, we develop a new analysis to bound the estimation error of the Fisher information of deterministic policy arising via the compatibility theorem, and then further capture how such a metric affects the convergence via its minimum eigenvalue.

%We emphasize that our techniques effectively handle the unique compatibility estimator for DPG which is different from stochastic AC. In addition, we also deal with the coupling between the critic and actor parameters, which is usually avoided by most of the existing AC studies. 
% We consider the practical constant learning rates, which is different from the two timescale AC algorithms.

The main proof consists of three steps.
% First, we analyze the error dynamics of tracking a fixed critic target (i.e., fixed tracking error). Second, we analyze the error dynamics of tracking a dynamic critic target (i.e., dynamic tracking error) based on the first step. Last, we couple critic's dynamic tracking error with actor's update and bound the overall convergence gap. 
First, we characterize the error propagation of tracking a dynamic critic target (i.e., dynamic tracking error) based on its coupling with actor's update progress. Second, we bound the critic's cumulative tracking error in terms of actor's update progress via the compatibility properties of DPG.
%the actor penalty (expected policy gradient). 
Last, we establish the overall convergence by canceling out the cumulative tracking error via the actor's overall positive progress towards the stationary policy.

\textbf{Step I: Characterizing dynamics of critic's error via coupling with actor.}
% Relate $\norm{w_{t+1}-w^*_{\theta_t}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$.

% In the first step, we focus on the dynamics of the fixed tracking error. That is, we fix a policy $\mu_{\theta_t}$ and find out the dynamics of the error between the critic parameter and the fixed target $w^*_{\theta_t}$.
In the first step, we characterize the propagation of the critic's dynamic tracking error based on its coupling with the actor's updates. That is, we develop the relationship between $\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$ through their coupling with the actor's updates.

Recall that $w^*_{\theta_t}$ is the global optimum of TD given a fixed policy $\mu_{\theta_t}$, or is equivalently the unique root of $\bar g_{\theta_t}(w_t) := \mE_{d_{\theta_t}} \brackets{\frac{1}{M}\sum_{j=0}^{M-1}\delta_{t,j}\phi(x_{t,j})}=0$. 
% In this step, the key observation is the strong convexity like property for $\bar g_{\theta_t}(w)$, that is, for any policy, $\langle w_{t}-w^*_{\theta_t}, \bar g_{\theta_t}(w_t)\rangle\leq-\lambda\norm{w_{t}-w^*_{\theta_t}}^2$ with some constant $\lambda>0$. This property has been shown and widely used in the analysis of TD learning with linear function approximation \citep{tsitsiklis1997analysis,Bhandari2018finite,xiong2020amsgradRL}. 
We first give the following bound on the update rule of $w_t$ in \Cref{alg:onPolicyDPG} given by the TD learning property \citep{tsitsiklis1997analysis,Bhandari2018finite,xiong2020amsgradRL},
%We first use the strong convexity like property $\langle w_{t}-w^*_{\theta_t}, \bar g_{\theta_t}(w_t)\rangle\leq-\lambda\norm{w_{t}-w^*_{\theta_t}}^2$ with some constant $\lambda>0$ 
%\citep{tsitsiklis1997analysis,Bhandari2018finite,xiong2020amsgradRL}, and the update rule of $w_t$ in \Cref{alg:onPolicyDPG} to have
%\begin{align}
%    \mE\norm{w_{t+1}\!-\!w^*_{\theta_t}}^2 &\leq (1\!-\!2\alpha_{w}\lambda\!+\!2\alpha_{w}^2C_A^2)\mE\norm{w_{t}-w^*_{\theta_t}}^2 \nonumber\\
%    &\quad + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2, \nonumber
%\end{align}
%where $ g_{\theta_t}(w_t, \mcb_t) := \frac{1}{M}\sum_{j=0}^{M-1}\delta_{t,j}\phi(x_{t,j})$ is an unbiased estimate of $\bar g_{\theta_t}(w_t)$.
%To proceed, we use \Cref{lem:minibatchVariance} to bound the variance term $\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2$, and obtain
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \nonumber\\
    % &\leq \parentheses{1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2+\frac{24\alpha_w^2 C_A^2}{M}}\mE\norm{w_{t}\!-\!w^*_{\theta_t}}^2 \nonumber\\
    % &\quad + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} \nonumber\\
    &\leq \parentheses{1-\frac{\alpha_w\lambda}{2}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}, \nonumber
\end{align}
where $\alpha_w \leq \frac{\lambda}{2C_A^2}, M\geq\frac{48\alpha_w  C_A^2}{\lambda}$.
% , we finish this step and have
% \begin{align}
%     &\mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \nonumber\\
%     &\leq \parentheses{1-\frac{\alpha_w\lambda}{2}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}. \nonumber
% \end{align}


% \textbf{Step II}: Relate $\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$.

% The second step aims to build the connection between the dynamic tracking errors along with the time step.

% Based on the result of the last step, it is natural to think of using Young's inequality to achieve our goal:
% \begin{align}
%     &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 \nonumber\\
%     &\leq \parentheses{1\!+\!\frac{1}{c}}\mE\norm{w_{t+1}\!-\!w^*_{\theta_{t}}}^2 + \parentheses{1\!+\!c}\mE\norm{w^*_{\theta_{t}}\!-\!w^*_{\theta_{t+1}}}^2\nonumber.
% \end{align}

% The choice of the constant $c$ needs to guarantee $(1-\alpha_w\lambda/2)\cdot(1+1/c)\leq (1-\alpha_w\lambda/4)$. A qualified option is $c=1/2(2/\lambda\alpha_w-1)$. 

%Then, we apply Young's inequality and the Lipschitz continuity property of $w^*_{\theta_{t}}$ derived in \Cref{lem:wStar}, and obtain

In the previous analysis of (stochastic) AC algorithms, sufficient TD updates of the critic result in a controlled small tracking error before updating the actor, which is hence decoupled from the actor's progress. In contrast, DPG-TD takes alternating updates between critic and actor, so that the critic's tracking error is inherent and non-vanishing. Thus, we take a new approach to characterize the moving dynamics of the tracking error and directly couple it with the actor's update as follows,
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2
    \leq \frac{4L_w^2}{\lambda\alpha_w}\mE\norm{\theta_{t+1}-\theta_{t}}^2 \nonumber\\
    &+ \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}.\nonumber
    % &= \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}\nonumber\\
    % &\quad + \frac{4L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\nonumber,
\end{align}
% where $h_{\theta_t}(w_t, \!\mcb_t) \!=\! \frac{1}{M}\sum_{j=0}^{M\!-\!1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t$.
Clearly, in the above bound, the two tracking errors at times $t+1$ and $t$ have different targets $w^*_{\theta_{t+1}}$ and $w^*_{\theta_{t}}$ due to actor's one update between critic's two consecutive updates. Hence, actor's update is necessarily coupled into the dynamics of the critic's tracking error.

%From the above dynamics, we see that when the critic and actor are updated alternatively as in \Cref{alg:onPolicyDPG}, the dynamic tracking error will be inherited to the next iteration. Such a non-vanishing error is coupled with the actor's update, which is different from the existing analysis that (asymptotically) decouples the critic's error from actor's error.

\textbf{Step II: Bounding cumulative tracking error via compatibility theorem for DPG.}

In this step, we bound the cumulative tracking error based on the dynamics of the tracking error from the last step. 
% penalized by the actor's approaching to stationary points. 

To this end, we first bound the difference between two consecutive actor parameters via DPG's properties. By the update rule of $\theta_t$ in \Cref{alg:onPolicyDPG}, we have $\theta_{t+1}\!-\!\theta_{t} \!=\! \frac{\alpha_{\theta}}{M}\sum_{j=0}^{M\!-\!1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t\!:=\!\alpha_{\theta}h_{\theta_t}(w_t, \!\mcb_t)$.
Since $h_{\theta_t}(w_t,\mcb_t)$ is not an unbiased estimator of the deterministic policy gradient $\nabla J(\theta_t)$, we characterize such a bias by exploiting the compatibility theorem as well as the property of Fisher information of deterministic policy defined in Assumption \ref{asp:policy} and obtain the following bound (see \Cref{lem:hVariance} for the proof)
\begin{align*}
    &\mE\norm{h_{\theta_t}(w_{t},\mcb_t) -\nabla J(\theta_t)}^2 \\
    &\quad\leq 3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}.
\end{align*}

% Next, by observing $\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\leq 2\mE\norm{\nabla J(\theta_t)}^2+2\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2$ and further bounding $\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2$ in \Cref{lem:hVariance},

The above bound then connects the critic's error dynamics from Step I to the policy gradient and yields the following result: 
%Next, we proceed to bound critic's error dynamics from Step I via  and obtain
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 
    % &\leq \parentheses{1-\frac{\lambda\alpha_w}{4}+\frac{24L_h^2L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \nonumber\\
    % &\quad\quad + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
    % &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
    \leq \parentheses{1-\frac{\lambda\alpha_w}{8}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \nonumber\\
    &\quad\quad + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber,
\end{align}
which requires $\alpha_{\theta} \leq \frac{\lambda\alpha_w}{\sqrt{96}L_hL_w}$.

Thus, we obtain the cumulative dynamic tracking error as
\begin{align}
    &\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2\leq \frac{8\norm{w_{0}-w^*_{\theta_{0}}}^2}{\lambda\alpha_w} \nonumber\\
    &+\! \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 \!+\! C_b^2)}{M} \!+\! \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 \!+\! \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}} \nonumber\\
    &\quad \cdot\frac{8T}{\lambda\alpha_w}+ \frac{64L_w^2\alpha_{\theta}^2}{\lambda^2\alpha_w^2}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2. \nonumber
\end{align}
The above bound connects the cumulative dynamic tracking error to the convergence rate of the actor's update via the policy gradient, i.e., such an error depends on how fast the actor's update approaches the stationary point. 

%The above bound indicates that the cumulative dynamic tracking error is determined by how fast actor can approach to the stationary points. 

\textbf{Step III: Overall convergence by canceling tracking error via actor's positive progress.}

In this step, we establish the overall convergence to a stationary policy by a novel cancellation of the above cumulative tracking error via the actor's update progress.

%We first establish the relationship between the progress of value function and the tracking error based on the Lipschitz continuity in \Cref{lem:dpglipschitz}, and have
% use the Lipschitz continuity in \Cref{lem:dpglipschitz} and the same techniques to deal with $\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2$ in the last step to obtain
%\begin{align}
%    &\mE[J(\theta_{t+1})] - \mE[J(\theta_t)] \geq \frac{\alpha_{\theta}}{4}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
%    &\quad - \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}},\nonumber
%\end{align}
%where $\alpha_{\theta} \leq \frac{1}{4L_J}$, 

We first bound the cumulative policy gradient by the cumulative tracking error via the relationship between the progress of the objective function and the tracking error as follows:
\begin{align}
    &\frac{\alpha_{\theta}}{4}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2
    \leq \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2\nonumber\\
    &\quad+ \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T. \nonumber
\end{align}

The previous analysis of (stochastic) AC typically exploits the fact that the above critic's tracking error can decay sufficiently fast by decoupling it from actor's update, which does not hold here. In contrast, we exploit the connection of the cumulative tracking errors and the cumulative policy gradient that we establish in Step II, and show that such a tracking error can ultimately be canceled by the actor's positive progress towards a stationary point. This also explains why the critic's inaccurate estimation does not affect the overall convergence guarantee. Such an idea is captured as follows:
% Our final goal is to bound $\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2$. Since the left hand side of \eqref{eq:scketch1} can be telescoped, it remains to handle $\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2$. To this end, we use the dynamic tracking error from Step II and have
% Next, by taking summation for both sides of the above inequality with telescoping, we have

%Last, the cumulative dynamic tracking error from Step II can be cancelled out by the overall positive progress of actor's approaching to the stationary policy, 
% which eventually leads to the final convergence.
% Then, we plug the cumulative dynamic tracking error from Step II into the above bound, and obtain
%which yields
\begin{align}
    &\parentheses{\frac{\alpha_{\theta}}{4}-\frac{144L_h^2L_w^2\alpha_{\theta}^3}{\lambda^2\alpha_w^2}}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
    &\leq \frac{R_{\max}}{1-\gamma} + \frac{18\alpha_{\theta}L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2\nonumber\\
    &+\! \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 \!+\! C_b^2)}{M} \!+\! \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 \!+\! \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\nonumber\\
    &\quad\cdot\frac{18\alpha_{\theta}L_{h}^2T}{\lambda\alpha_w} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T. \nonumber
\end{align}

Finally, by letting $\alpha_{\theta} \leq \frac{\lambda\alpha_w}{24L_hL_w}$ and rearranging the above terms, we complete the proof.

\section{Off-Policy DPG Algorithm}

In this section, we consider the off-policy setting, where the behavior policy is different from the target policy, and provide the convergence guarantee of such an off-policy DPG algorithm.

\subsection{Algorithm}

The design of the algorithm is based on the compatibility property in \eqref{eq:compDPGOffPolicy}. However, off-policy TD with linear function approximation is known to not necessarily converge \citep{baird1995residual}. 
To overcome such a divergence issue, \citet{silver2014deterministic} adopted TD with gradient correction (TDC) to update the critic parameter. Note that since the critic in DPG estimates the Q-function rather than the value function, there is no need to use importance sampling to adjust the sampling distribution \citep{silver2014deterministic}. We refer to the compatible DPG using TDC updates as DPG-TDC, with its details given in \Cref{alg:offPolicyDPG}.

\begin{algorithm}[h]
 	\caption{DPG-TDC} \label{alg:offPolicyDPG} 
 	\begin{algorithmic}[1]
 		\STATE 	{\bf Input:}   $\alpha_{w}, \eta, \alpha_{\theta}, w_0, u_0, \theta_0$, batch size $M$, behavior policy $\beta$.
		\FOR{ $t=0, 1, \ldots, T $}
		\FOR{ $j=0, 1, \ldots, M-1 $}
		\STATE Sample $x_{t,j} := (s_{t,j}, a_{t,j}) \sim d_{\beta}$.
		\STATE Sample $s_{t+1,j} \sim P(\cdot|s_{t,j}, a_{t,j}) \text{ and } r_{t,j}$. Generate $a_{t+1,j} = \mu_{\theta_t}(s_{t+1,j})$.
		\STATE Denote $\phi_{t,j} = \phi(s_{t,j}, a_{t,j})$ and $\phi_{t+1,j} = \phi(s_{t+1,j}, a_{t+1,j})$.
		\STATE $\delta_{t,j} = r_{t,j} + \gamma\phi_{t+1,j}^T w_t - \phi_{t,j}^T w_t$.
		\ENDFOR
		\STATE $w_{t+1} = w_t + \frac{\alpha_{w}}{M}\sum_{j=0}^{M-1}\brackets{\delta_{t,j}\phi_{t,j}-\gamma\phi_{t+1,j}\phi_{t,j}^Tu_t}$.
		\STATE $u_{t+1} = u_t + \frac{\eta\alpha_{w}}{M}\sum_{j=0}^{M-1}\brackets{\delta_{t,j}\phi_{t,j}-\phi_{t,j}\phi_{t,j}^Tu_t}$.
		\FOR{ $j=0, 1, \ldots, M-1 $}
		\STATE Sample $s'_{t,j} \sim \nu_{\theta_t}$. 
		\ENDFOR
		\STATE $\theta_{t+1} = \theta_t + \frac{\alpha_{\theta}}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t $.
		\ENDFOR
 	\end{algorithmic}
\end{algorithm}

As shown in \Cref{alg:offPolicyDPG}, while the state-action pair at time $t$ is sampled from the stationary distribution under the behavior policy $\beta$, the action corresponding to state $s_{t+1}$ is still generated by the target policy. In addition to the updates of $w_t$ as the critic and $\theta_t$ as the actor, a gradient correction parameter $u_t$ is also updated in line 10 of \Cref{alg:offPolicyDPG}.
% Differently from TD(0) updates in \Cref{alg:onPolicyDPG}, TDC is a provably convergent off-policy TD algorithm and aims to minimize an explicit projected mean square error loss function $\norm{Q^w - \Pi\mathcal{T}Q^w}^2_{d_{\beta}}$ where $\mathcal{T}(\cdot)$ is the Bellman operator.
% In lines 9-10 of \Cref{alg:offPolicyDPG}, $w_t$ still serves as the main critic parameter and $u_t$ is a correction parameter. When $w_t$ converges to its fixed point (e.g., $w^*_{\beta,\theta}$ for a fixed policy $\mu_{\theta}$), $u_t$ approaches to zero.
% Last, we still independently sample a batch of states by the state visitation measure when updating the actor parameter based on the compatible estimation rule introduced in \eqref{eq:compDPGOffPolicy}.



\subsection{Convergence Result}

In this subsection, we characterize the convergence rate and sample complexity for DPG-TDC in \Cref{alg:offPolicyDPG}. Similarly to the analysis of DPG-TD, we first show the Lipschitz continuity property for off-policy DPG-TDC.
\begin{lemma}\label{lem:dpglipschitzOffPolicy}
Suppose Assumptions \ref{asp:policy}--\ref{asp:Qsmooth} hold. Then the deterministic policy gradient $\nabla J_{\beta}(\theta)$ defined in \eqref{eq:dpgthmOffPolicy} is Lipschitz continuous with the parameter $L_{J_{\beta}}$, i.e., $\forall \theta_1, \theta_2 \in \mathbb{R}^d$,
\begin{equation}
    \norm{ \nabla J_{\beta}(\theta_1) - \nabla J_{\beta}(\theta_2) }\leq L_{J_{\beta}} \norm{\theta_1 - \theta_2},
\end{equation}
where $L_{J_{\beta}} \!= \! \frac{L_{\mu}}{1\!-\!\gamma}\!\parentheses{\!L_QL_{\mu} \!+\! \frac{1}{2}\gamma L_P^2R_{\max}L_{\mu}C_{\nu} \!+\! \frac{\gamma L_PL_rL_{\mu}}{1-\gamma}\!} \!+\! \frac{L_{\psi}}{1-\gamma}\parentheses{L_r + \frac{\gamma R_{\max}L_P}{1-\gamma}} $.
\end{lemma}

Compared with \Cref{lem:dpglipschitz} for the on-policy case, the Lipschitz parameter $L_{J_{\beta}}$ in \Cref{lem:dpglipschitzOffPolicy} has the same dependence on $1-\gamma$ as $L_{J}$, but does not have the state visitation error term because the behavior policy does not change as the actor updates the policy.

To analyze the off-policy algorithm DPG-TDC, one of the main challenges lies in the complication arising due to the extra correction parameter $u_t$. To overcome this, we treat the update of critic as a lifted linear system with respect to a grouped state $z_t=[w_t^T u_t^T]^T\in\mathbb R^{2d}$ and then analyze the key properties of such a system matrix.
%, we can use similar techniques in the proof of \Cref{thm:onPolicyDPG} to obtain \Cref{thm:offPolicyDPG}. 
%Based on \Cref{lem:dpglipschitzOffPolicy}, 

We next provide the convergence guarantee for DPG-TDC in the following theorem. The full proof of \Cref{thm:offPolicyDPG} can be found in \Cref{app:proofThmOffPolicy} of the appendix.
\begin{theorem}\label{thm:offPolicyDPG}
Suppose that Assumptions \ref{asp:policy}--\ref{asp:phi} hold. Let $\alpha_w \leq \frac{\lambda'}{2C_{G}^2}; M\geq\frac{48\alpha_w  C_{G}^2}{\lambda'}; \alpha_{\theta} \leq \min\cur{\frac{1}{4L_{J_{\beta}}}, \frac{\lambda'\alpha_w}{24L_hL_{w'}}};\eta>\max\cur{0,\sigma_{\min}\parentheses{D^{-1}\cdot\frac{A+A^T}{2}}}$, where $A,D$ are defined in Assumption \ref{asp:phi}. Then the output of DPG-TDC in \Cref{alg:offPolicyDPG} satisfies 
\begin{align}
    \underset{t\in [T]}{\min}\mE\norm{\nabla J_{\beta}(\theta_{t})}^2 \leq \frac{c_4}{T} + \frac{c_5}{M} + c_6\kappa^2,\nonumber
\end{align} 
where $c_4 = \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda'\alpha_w}\norm{z_{0}-z^*_{\theta_{0}}}^2,
c_5 = \brackets{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2) + \frac{48L_{w'}^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda'\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda'\alpha_w} + 36L_{\mu}^4C_{w_{\xi}}^2,
c_6 = 18L_h^2 + \frac{24L_{w'}^2L_h^2\alpha_{\theta}^2}{\lambda'\alpha_w}$
% \begin{align}
%     c_4 &= \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda'\alpha_w}\norm{z_{0}-z^*_{\theta_{0}}}^2,\nonumber\\
%     c_5 &= \brackets{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2) + \frac{48L_{w'}^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda'\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda'\alpha_w} + 36L_{\mu}^4C_{w_{\xi}}^2, \nonumber\\
%     c_6 &= 18L_h^2 + \frac{24L_{w'}^2L_h^2\alpha_{\theta}^2}{\lambda'\alpha_w},\nonumber
% \end{align}
with $C_G^2=5(1+\eta^2)C_{\phi}^4, C_{\ell}^2=(1+\eta^2)R^2_{\max}C^2_{\phi}, L_{w'}=\frac{L_{J_{\beta}}}{\lambda_{\Psi}}  + \frac{2L_{\mu}^2L_{\psi}C_Q}{\lambda_{\Psi}^2(1-\gamma)^2}$, $\kappa$ given by \eqref{eq:systemErrorKappa}, $L_{J_{\beta}}$ defined in \Cref{lem:dpglipschitzOffPolicy}, and other constants remain the same as those in \Cref{thm:onPolicyDPG}.
% \begin{align}
%     c_4 &= \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda'\alpha_w}\norm{z_{0}-z^*_{\theta_{0}}}^2,\nonumber\\
%     c_5 &= \brackets{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2) + \frac{48L_{w'}^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda'\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda'\alpha_w}\nonumber\\
%     &\quad + 36L_{\mu}^4C_{w_{\xi}}^2, \nonumber\\
%     c_6 &= 18L_h^2 + \frac{24L_{w'}^2L_h^2\alpha_{\theta}^2}{\lambda'\alpha_w},\nonumber
% \end{align}
% with $C_G^2=5(1+\eta^2)C_{\phi}^4, C_{\ell}^2=(1+\eta^2)R^2_{\max}C^2_{\phi}, L_{w'}=\frac{L_{J_{\beta}}}{\lambda_{\Psi}}  + \frac{2L_{\mu}^2L_{\psi}C_Q}{\lambda_{\Psi}^2(1-\gamma)^2}$, $\kappa$ given by \eqref{eq:systemErrorKappa}, $L_{J_{\beta}}$ defined in \Cref{lem:dpglipschitzOffPolicy}, and other constants remain the same as those in \Cref{thm:onPolicyDPG}.
\end{theorem}

\Cref{thm:offPolicyDPG} readily implies the sample complexity for the convergence of DPG-TDC, as given below.
\begin{corollary}\label{cor:offPolicyDPG}
Suppose the conditions in \Cref{thm:offPolicyDPG} still hold. Then the output of DPG-TDC in \Cref{alg:offPolicyDPG} satisfies 
\begin{align}
    \underset{t\in [T]}{\min}\mE\norm{\nabla J_{\beta}(\theta_{t})}^2 \leq \epsilon + c_6\kappa^2,\nonumber
\end{align} 
using a total of $2MT = \mathcal{O}\parentheses{1/{\epsilon^2}}$ samples.
%\begin{align*}
%    2MT = \mathcal{O}\parentheses{\frac{1}{\epsilon^2}}.
%\end{align*}
\end{corollary}

In \Cref{cor:offPolicyDPG}, the system error $\kappa$ is determined by the off-policy distributions, and thus differs from that of the on-policy DPG-TD algorithm. Overall, \Cref{cor:offPolicyDPG} shows that off-policy DPG-TDC achieves the same sample complexity as on-policy DPG-TD in \Cref{cor:onPolicyDPG} (up to a different system error).
To the best of our knowledge, there has been no existing study on off-policy \emph{stochastic} AC, where the critic uses TDC with general nonconvex policy function approximation. Our techniques can be extended to fill such a gap.



% \subsection{Proof Sketch of \Cref{thm:offPolicyDPG}}

% The main proof idea is similar as that of \Cref{thm:onPolicyDPG}. In the following, we only point out the key differences.

% One of the main challenges lies in dealing with the two timescale scheme of the critic updates. To overcome the complication, we regard it as a linear system w.r.t a grouped state $z_t=[w_t^T u_t^T]^T\in\mathbb R^{2d}$. Then the dynamics of critic can be rewritten as 
% \begin{align}
%     z_{t+1} &= z_t + \alpha_w \begin{bmatrix}
% \hat A_t &\hat C_t\\ \eta\hat A_t &\eta \hat D_t
% \end{bmatrix} z_t + \alpha_w \begin{bmatrix}
% \hat b_t\\ \eta \hat b_t
% \end{bmatrix}\nonumber\\
% &:= z_t + \alpha_w \brackets{\hat G_t z_t + \hat \ell_t}\nonumber\\
% &:= z_t + \alpha_w g_{\theta_t}(z_t, \mcb_t), \nonumber
% \end{align}
% where $\hat A_t=\frac{1}{M}\sum_{j=0}^{M-1}\brackets{\phi_{t,j}\parentheses{\gamma\phi_{t+1,j}-\phi_{t,j}}^T}$, $\hat b_t=\frac{1}{M}\sum_{j=0}^{M-1}\brackets{r_{t,j}\phi_{t,j}}$, $\hat C_t=\frac{1}{M}\sum_{j=0}^{M-1}\brackets{-\gamma\phi_{t+1,j}\phi_{t,j}^T}$, and $\hat D_t=\frac{1}{M}\sum_{j=0}^{M-1}\brackets{-\phi_{t,j}\phi_{t,j}^T}$.


% \section{Discussion about Global Convergence of DPG}\label{sec:discussion}


% The global convergence has been established recently for stochastic NPG in \citet{agarwal2019theory} under a broad class of function approximation classes for the policy as well as for stochastic PG in \citet{liu2020improved}. It has also been shown that DPG can be viewed as a limiting method of stochastic PG \cite{silver2014deterministic}, for example, under Gaussian policy as the variance parameter $\sigma$ goes to zero. These two facts together appear to suggest that DPG should also converge to an optimal policy. 

% However, establishing the global convergence for DPG is not as simple as it appears. Consider the Gaussian policy with general nonlinear function approximation for the mean. It is easy to see that such a policy has an unbounded score function, and hence violates the major assumption for PG/NPG to have the global convergence \citet{agarwal2019theory,liu2020improved}. Hence, even stochastic PG/NPG under the Gaussian policy does not converge globally. Moreover, other policies of the same type such as bump function given in \cite{silver2014deterministic} has the same issue as the Gaussian policy.

% In order to still leverage the connection between DPG and stochastic PG/NPG,
% %and exploit the fact that PG/NPG has global convergence, 
% one may naturally modify the Gaussian policy into its truncated version denoted by $\{\pi_{\mu_{\theta},\sigma}\}$, which centers at the deterministic policy $\mu_{\theta}$ with the variance parameter $\sigma$, but has the {\bf bounded} score function, say, bounded by $G_{\sigma}$. Such a bound $G_{\sigma}$ necessarily involves $\sigma$, and converges to infinity quadratically fast as $\sigma$ converges to zero. 
% Further, the difference between the policy gradient of such a truncated Gaussian policy and that of the deterministic policy is bounded by $\epsilon_{\sigma}$, which converges to zero linearly as $\sigma$ goes to zero.
% Now the challenge is that the optimality gap of DPG is bounded by $G_\sigma \cdot \epsilon_{\sigma}$, which can be large for any chosen $\sigma$, and becomes unbounded as $\sigma$ goes to zero. This indicates that the optimality gap is determined by more refined non-asymptotic behavior of the stochastic policy that bridges DPG and stochastic PG/NPG, and may not vanish to yield global convergence.

% Thus, although we obtain substantial understanding about the connection between DPG and PG/NPG, there is still a gap to establish the global convergence guarantee for DPG, which in fact may or may not be the case by nature. We leave this open and challenging problem as the future work.


\section{Conclusion}

This paper provides the first finite-sample analysis for DPG algorithms in both on-policy (DPG-TD) and off-policy (DPG-TDC) settings. Up to the system error that necessarily exists for actor-critic algorithms, we show that both DPG-TD and DPG-TDC can find an $\epsilon$-accurate stationary policy with a sample complexity of $\mathcal{O}(\epsilon^{-2})$. Our results and the analysis techniques can lead to several promising extension directions.
For example, it would be important to explore whether DPG converges to a globally optimal policy as stochastic PG/NPG does. The convergence of more popular algorithms such as DDPG is also interesting to study. 
% \cite{mitrophanov2005sensitivity}
% First, it is interesting to explore whether DPG converges to a globally optimal policy as stochastic PG/NPG. Second, as a popular deterministic policy based algorithm, DDPG combines DPG and DQN. Given the most recent progress in the studies of deep RL with neural network approximation \citep{cai2019neural,liu2019neural,wang2019neural,wai2020provably,fu2020single}, it is promising to analyze DDPG by properly adapting the compatibility condition. 
% Third, the analysis tools we develop in this paper can also be applied to a wide range of practical stochastic actor-critic algorithms where the critic and actor are updated simultaneously in an alternative manner both with constant learning rates.

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    The work of H. Xiong, T. Xu and Y. Liang was supported in part by the U.S. National Science Foundation under Grants CCF-1761506 and CCF-1900145. The work of L. Zhao was supported in part by the Singapore Ministry of Education Academic Research Fund Tier 1 under Grant R-263-000-E60-133. The work of W. Zhang was supported in part by the National Natural Science Foundation of China under Grants 62073159 and 62003155, the Shenzhen Science and Technology Program under Grant JCYJ20200109141601708, and the Science, Technology and Innovation Commission of Shenzhen Municipality under Grant ZDSYS20200811143601004.
\end{acknowledgements}

% \bibliography{xiong_597}

% \appendix
% \onecolumn
% \input{xiong_597-supp}
\input{bib_blb}
\end{document}
