% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022SingleCol} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets the two mandatory arguments in reverse
% order with the separator in between, i.e. <b><sep><a>. The optional
% separator defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%% xr pkg
\usepackage{xr-hyper}
% \usepackage[draft]{hyperref}      % hyperlinks
\makeatletter
% \addFileDependency{<file name with extension>}
%   1. writes "(<file>)" to the terminal/log via \typeout (presumably so that
%      build tools which scan the log for opened files, e.g. latexmk, notice
%      the dependency -- confirm against the build setup),
%   2. appends the file to LaTeX's internal file list (\@addtofilelist),
%   3. emits "No file <file>." if the file cannot be found.
% Every line inside the body ends with % so that the macro does not insert
% spurious space tokens when it is called outside the preamble.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: wrapper around \externaldocument{<basename>}
% that additionally registers <basename>.tex and <basename>.aux through
% \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
\myexternaldocument{xiong_597}

% \usepackage{xr} 
% \externaldocument{xiong_597}
% \usepackage{xcite}
% \externalcitedocument{xiong_597}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

%%%%%%%%%%%%%%%%%% additional packages %%%%%%%%%%%%%%%%%
\usepackage{amsmath} % assumes amsmath package installed
\usepackage{amssymb}  % assumes amsmath package installed
\usepackage{amsthm} % begin{proof}
% \usepackage{subcaption}
\usepackage{caption}
\usepackage{comment} 

% Theorem-like environments. No shared-counter argument is given, so each
% environment below is numbered by its own independent counter.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{fact}{Fact}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algorithmic}
% \urlstyle{same}
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage{multirow}
\usepackage{hhline}

\usepackage{dirtytalk}
%% Notation shortcuts (all intended for math mode)
% Blackboard-bold symbols
\newcommand{\mP}{\mathbb P}
\newcommand{\mE}{\mathbb E}
% Calligraphic symbols
\newcommand{\mcb}{\mathcal B}
\newcommand{\mcs}{\mathcal S}
\newcommand{\mca}{\mathcal A}
\newcommand{\mf}{\mathcal F}
\newcommand{\mcv}{\mathcal V}
% Upright capital Greek letters; \mathrm replaces the obsolete \rm font switch
\newcommand{\mcphi}{\mathrm{\Phi}}
\newcommand{\mcxi}{\mathrm{\Xi}}
\newcommand{\mcpi}{\mathrm{\Pi}}
% Auto-sized norms; the "TV" label is set upright because it is a name,
% not the product of two variables T and V
\newcommand{\lTV}[1]{\left\|#1\right\|_{\mathrm{TV}}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\lone}[1]{\left|#1\right|}
\newcommand{\lF}[1]{\left\|#1\right\|_F}
\newcommand{\linf}[1]{\left\|#1\right\|_\infty}
% Auto-sized delimiters: ( ), [ ], { }
\newcommand{\parentheses}[1]{\left(#1\right)}
\newcommand{\brackets}[1]{\left[#1\right]}
\newcommand{\cur}[1]{\left\{#1\right\}}

\allowdisplaybreaks[3]

\title{Deterministic Policy Gradient: Convergence Analysis (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<xiong.309@osu.edu>?Subject=Your UAI 2022 paper}{Huaqing Xiong}\thanks{equal contribution}{}}
\author[1]{Tengyu Xu$^*$}
\author[2]{Lin Zhao}
\author[1]{Yingbin Liang}
\author[3]{Wei Zhang}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    The Ohio State University\\
    Columbus, Ohio, USA
}
\affil[2]{%
    Department of Electrical and Computer Engineering\\
    National University of Singapore\\
    Singapore, Republic of Singapore
}
\affil[3]{%
    Department of Mechanical and Energy Engineering\\
    Southern University of Science and Technology (SUSTech)\\
    Shenzhen, Guangdong, China
  }

\begin{document}
\maketitle


\section{Proof of Lemma \ref{lem:dpglipschitz} } \label{app:proofDPGLipschitz}


\subsection{Supporting Lemmas}\label{app:lipschitzlemmas}


We first provide some useful lemmas. The first lemma provides the Lipschitz continuity property of the state visitation measure.

\begin{lemma}\label{lem:stateVisitionLipschitz}
Suppose Assumptions \ref{asp:policy} and \ref{asp:environment} hold. We define the total variation norm between two state visitation distributions respectively corresponding to two policies $\mu_{\theta_1}, \mu_{\theta_2}$ as $\lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)}=\int_{\mcs}\lone{\nu_{\theta_1}(ds)-\nu_{\theta_2}(ds)}$. Then there exists some constant $L_{\nu}>0$, such that
\begin{align*}
    \lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} \leq L_{\nu}\norm{\theta_1-\theta_2}.
\end{align*}
\end{lemma}
\begin{proof}
Since we consider ergodic Markov chains, Theorem 3.1 of \cite{mitrophanov2005sensitivity} shows that there exists some constant $C_{\nu}>1$, such that
\begin{equation}\label{eq:nuProof}
    \lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} \leq C_{\nu} \norm{P_{\theta_1}-P_{\theta_2}}_{\text{op}},
\end{equation}
where $P_{\theta}$ denotes the state transition kernel corresponding to a policy $\mu_{\theta}$, and the operator norm $\norm{\cdot}_{\text{op}}$ is given by $\norm{P}_{\text{op}}=\sup_{\lTV{q}=1}\lTV{qP}$. Then we have
\begin{align*}
    \norm{P_{\theta_1}-P_{\theta_2}}_{\text{op}} &= \underset{\lTV{q}=1}{\sup}\lTV{\int_{\mcs} (P_{\theta_1}-P_{\theta_2})(s,\cdot)q(ds)}\\
    &= \frac{1}{2} \underset{\lTV{q}=1}{\sup} \int_{s'}\lone{\int_s \parentheses{P_{\theta_1}(s,ds')-P_{\theta_2}(s,ds')}q(ds) }\\
    &\leq \frac{1}{2} \underset{\lTV{q}=1}{\sup} \int_{s'}\int_s \lone{P_{\theta_1}(s,ds')-P_{\theta_2}(s,ds')}q(ds) \\
    &= \frac{1}{2} \underset{\lTV{q}=1}{\sup} \int_{s'}\int_s \lone{P(ds'|s, \mu_{\theta_1}(s))-P(ds'|s, \mu_{\theta_2}(s))}q(ds)\\
    &\overset{\text{(i)}}{\leq} \frac{1}{2} \underset{\lTV{q}=1}{\sup} \int_s L_P\norm{\mu_{\theta_1}(s)-\mu_{\theta_2}(s)}q(ds)\\
    &\overset{\text{(ii)}}{\leq} \frac{1}{2}L_P L_{\mu}\norm{\theta_1-\theta_2},
\end{align*}
where (i) follows from Assumption \ref{asp:environment}, and (ii) follows from Assumption \ref{asp:policy}. Then, combining the above bound together with \eqref{eq:nuProof} completes the proof.
\end{proof}

Next, we show that the value function of a deterministic policy is Lipschitz continuous.

\begin{lemma}\label{lem:valueFuncLipschitz}
Suppose Assumptions \ref{asp:policy} and \ref{asp:environment} hold. The value function is Lipschitz continuous w.r.t. the policies. That is, for any $\theta_1,\theta_2\in\mathbb R^d, s\in\mcs$, we have
\begin{align*}
    \norm{V^{\mu_{\theta_1}}(s) - V^{\mu_{\theta_2}}(s)} \leq L_V\norm{\theta_1-\theta_2},
\end{align*}
where $L_V=R_{\max}L_{\nu} + \frac{L_rL_{\mu}}{1-\gamma}$.
\end{lemma}
\begin{proof}
By definition, we have $V^{\mu_{\theta}}(s_0)=\int_{\mcs} r(s,\mu_{\theta}(s)) \nu^{s_0}_{\mu_{\theta}}(ds)$, where $\nu^{s_0}_{\mu_{\theta}}(\cdot)$ is the discounted state visitation measure given the initial state, i.e., $\nu^{s_0}_{\mu_{\theta}}(ds)=\sum_{t=0}^\infty \gamma^{t} p(s_0\rightarrow s,t,\mu_{\theta})ds$.
We then derive
\begin{align*}
    &\lone{ V^{\mu_{\theta_1}}(s_0)-V^{\mu_{\theta_2}}(s_0) } \\
    &\quad =  \lone{\int_{\mcs} r(s,\mu_{\theta_1}(s)) \nu^{s_0}_{\mu_{\theta_1}}(ds) - \int_{\mcs} r(s,\mu_{\theta_2}(s)) \nu^{s_0}_{\mu_{\theta_2}}(ds) }\\
    &\quad\leq \lone{\int_{\mcs} r(s,\mu_{\theta_1}(s)) \nu^{s_0}_{\mu_{\theta_1}}(ds) - \int_{\mcs} r(s,\mu_{\theta_1}(s)) \nu^{s_0}_{\mu_{\theta_2}}(ds) } \\
    &\quad\quad + \lone{\int_{\mcs} r(s,\mu_{\theta_1}(s)) \nu^{s_0}_{\mu_{\theta_2}}(ds) - \int_{\mcs} r(s,\mu_{\theta_2}(s)) \nu^{s_0}_{\mu_{\theta_2}}(ds) }\\
    &\quad\leq \int_{\mcs} \lone{r(s,\mu_{\theta_1}(s))}\cdot \lone{\nu^{s_0}_{\mu_{\theta_1}}(ds) - \nu^{s_0}_{\mu_{\theta_2}}(ds) } + \int_{\mcs} \lone{r(s,\mu_{\theta_1}(s))  -  r(s,\mu_{\theta_2}(s))} \nu^{s_0}_{\mu_{\theta_2}}(ds) \\
    &\quad\overset{\text{(i)}}{\leq} R_{\max} \lTV{\nu^{s_0}_{\mu_{\theta_1}}(\cdot) - \nu^{s_0}_{\mu_{\theta_2}}(\cdot) } + L_r\int_{\mcs} \norm{\mu_{\theta_1}(s)-\mu_{\theta_2}(s)} \nu^{s_0}_{\mu_{\theta_2}}(ds) \\
    &\quad\overset{\text{(ii)}}{\leq} R_{\max}L_{\nu} \norm{\theta_1-\theta_2} + L_rL_{\mu}\norm{\theta_1-\theta_2}\int_{\mcs} \nu^{s_0}_{\mu_{\theta_2}}(ds) \\
    &\quad = \parentheses{R_{\max}L_{\nu} + \frac{L_rL_{\mu}}{1-\gamma}} \norm{\theta_1-\theta_2},
\end{align*}
where (i) follows from Assumption \ref{asp:environment}, and (ii) follows from \Cref{lem:stateVisitionLipschitz} and Assumption \ref{asp:policy}.

\end{proof}

The next lemma establishes the boundedness and Lipschitz continuity of the gradient of the Q-function.
\begin{lemma}\label{lem:Qgradient}
Suppose Assumptions \ref{asp:policy}--\ref{asp:Qsmooth} hold. The gradient of the Q-function w.r.t.\ the action is uniformly bounded. That is, for any $(s,a)\in\mcs\times\mca, \theta\in\mathbb R^d$,
\begin{align*}
    \norm{\nabla_a Q^{\mu_{\theta}}(s,a)}\leq C_Q,
\end{align*}
where $C_Q=L_r + L_P\cdot\frac{\gamma R_{\max}}{1-\gamma}$. Furthermore, $\nabla_a Q^{\mu_{\theta}}(s,a_{\theta})$ is Lipschitz continuous w.r.t. $\theta$, that is, for any $\theta_1,\theta_2\in\mathbb R^d$, we have
\begin{align*}
    \norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})} \leq L'_Q\norm{\theta_1-\theta_2},
\end{align*}
where $L'_Q=L_QL_{\mu} + \gamma L_PL_V$.
\end{lemma}
\begin{proof}
For the boundedness property, we have
\begin{align*}
    \norm{\nabla_a Q^{\mu_{\theta}}(s,a)} &= \norm{\nabla_a \parentheses{r(s,a) + \gamma\int_{\mcs} P(s'|s,a)V^{\mu_{\theta}}(s') ds'}}\\
    &\leq \norm{\nabla_a r(s,a)} + \gamma\int_{\mcs} \norm{ \nabla_a P(s'|s,a)}\cdot\lone{V^{\mu_{\theta}}(s')} ds'\\
    &\leq L_r + L_P\cdot\frac{\gamma R_{\max}}{1-\gamma},
\end{align*}
where the last inequality follows from Assumptions \ref{asp:policy}, \ref{asp:environment} and the fact that $\lone{V^{\mu_{\theta}}(s')}\leq\frac{R_{\max}}{1-\gamma}$.

We next show the Lipschitz property as follows.
\begin{align*}
    &\norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})} \\
    &\quad\leq \norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) - \nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_2})} + \norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_2}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})}\\
    &\quad\overset{\text{(i)}}{\leq} L_Q\norm{a_{\theta_1}-a_{\theta_2}} + \norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_2}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})}\\
    &\quad= L_Q\norm{\mu_{\theta_1}(s)-\mu_{\theta_2}(s)} + \norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_2}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})}\\
    &\quad\overset{\text{(ii)}}{\leq} L_Q L_{\mu}\norm{\theta_1-\theta_2} + \norm{\int_{\mcs}\gamma \nabla_a P(s'|s,a)\parentheses{V^{\mu_{\theta_1}}(s') - V^{\mu_{\theta_2}}(s')}ds'}\\
    &\quad\leq L_Q L_{\mu}\norm{\theta_1-\theta_2} + \gamma \int_{\mcs}\norm{\nabla_a P(s'|s,a)}\cdot\lone{V^{\mu_{\theta_1}}(s') - V^{\mu_{\theta_2}}(s')}ds'\\
    &\quad\overset{\text{(iii)}}{\leq} (L_QL_{\mu} + \gamma L_PL_V)\norm{\theta_1-\theta_2},
\end{align*}
where (i) follows from Assumption \ref{asp:Qsmooth}, (ii) follows from Assumption \ref{asp:policy} and (iii) follows from Assumption \ref{asp:environment} and \Cref{lem:valueFuncLipschitz}.

\end{proof}

\subsection{Proof of Lemma \ref{lem:dpglipschitz} }

To simplify the notation, we define $\psi_{\theta}(s) := \nabla_{\theta}\mu_{\theta}(s)$, $a_{\theta}=\mu_{\theta}(s)$ and $\nabla_a Q^{\mu_{\theta}}(s,a_{\theta})=\nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}$ in the following proof. 
%The main proof applies a few supporting lemmas that we present in \Cref{app:lipschitzlemmas}.

We start from the form of the off-policy deterministic policy gradient given in \eqref{eq:dpgthm}, and have
\begin{align}
    &\norm{ \nabla J(\theta_1) - \nabla J(\theta_2) }\nonumber\\
    &\quad= \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_1}(ds) - \int_{\mcs}\psi_{\theta_2}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) }\nonumber\\
    &\quad= \left\lVert \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_1}(ds) - \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_2}(ds) \right.\nonumber\\
    &\quad\quad + \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_2}(ds) - \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) \nonumber\\
    &\quad\quad + \left. \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) - \int_{\mcs}\psi_{\theta_2}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) \right\rVert\nonumber\\
    &\quad\leq \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_1}(ds) - \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_2}(ds) }\nonumber\\
    &\quad\quad + \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\theta_2}(ds) - \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) }\nonumber\\
    &\quad\quad + \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) - \int_{\mcs}\psi_{\theta_2}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\theta_2}(ds) }\nonumber\\
    &\quad\leq \int_{\mcs}\norm{ \psi_{\theta_1}(s) }\cdot\norm{ \nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1})} |\nu_{\theta_1}(ds) -  \nu_{\theta_2}(ds)|\nonumber\\
    &\quad\quad + \int_{\mcs}\norm{ \psi_{\theta_1}(s) }\cdot\norm{ \nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})} \nu_{\theta_2}(ds)\nonumber\\
    &\quad\quad + \int_{\mcs}\norm{ \psi_{\theta_1}(s) - \psi_{\theta_2}(s) }\cdot \norm{ \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})} \nu_{\theta_2}(ds)\nonumber\\
    &\quad\overset{\text{(i)}}{\leq} L_{\mu}C_Q \lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} + L_{\mu}\int_{\mcs}\norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1})-\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})}\nu_{\theta_2}(ds)\nonumber\\
    &\quad\quad + C_Q \int_{\mcs}\norm{\psi_{\theta_1}(s) - \psi_{\theta_2}(s)}\nu_{\theta_2}(ds)\nonumber\\
    &\quad\overset{\text{(ii)}}{\leq} L_{\mu}C_Q \lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} + L_{\mu}L'_Q\norm{\theta_1-\theta_2}\int_{\mcs}\nu_{\theta_2}(ds) + C_QL_{\psi}\norm{\theta_1-\theta_2}\int_{\mcs}\nu_{\theta_2}(ds)\nonumber\\
    &\quad\overset{\text{(iii)}}{=} L_{\mu}C_Q \lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} + \frac{L_{\mu}L'_Q}{1-\gamma}\norm{\theta_1-\theta_2} + \frac{C_QL_{\psi}}{1-\gamma}\norm{\theta_1-\theta_2}\nonumber\\
    &\quad\overset{\text{(iv)}}{\leq} \parentheses{L_{\mu}C_Q L_{\nu} + \frac{L_{\mu}L'_Q}{1-\gamma} + \frac{C_QL_{\psi}}{1-\gamma}}\norm{\theta_1-\theta_2}\nonumber\\
    &\quad:= L_J\norm{\theta_1-\theta_2},\nonumber
\end{align}
where (i) follows because $\norm{\psi_{\theta}(s)}\leq L_{\mu}$ as indicated by Assumption \ref{asp:policy} and $\norm{\nabla_a Q^{\mu_{\theta}}(s,a)}\leq C_Q$ by \Cref{lem:Qgradient}, (ii) follows from Assumption \ref{asp:policy} and \Cref{lem:Qgradient}, (iii) follows because $\int_{\mcs}\nu_{\theta}(ds)=\frac{1}{1-\gamma}$, and (iv) follows from \Cref{lem:stateVisitionLipschitz}.




\section{Proof of Theorem \ref{thm:onPolicyDPG} and Theorem \ref{cor:onPolicyDPG} } \label{app:proofThmOnPolicy}

%We first provide a number of supporting lemmas and their proofs, and then provide the main proof of \Cref{thm:onPolicyDPG}.

% To illustrate our analysis idea in a more clear way, we first outline the proof of \Cref{thm:onPolicyDPG} to highlight our new approach to analyzing the {\bf coupled} actor and critic's stochastic approximation processes, due to their simultaneous updates both with constant stepsizes. Then we will provide the detailed proofs of \Cref{thm:onPolicyDPG} and \Cref{cor:onPolicyDPG}, respectively.

% \subsection{Proof Sketch}\label{sec:proofonpolicy}

% % In the following, we outline the proof of \Cref{thm:onPolicyDPG} to highlight our new approach to analyzing the {\bf coupled} actor and critic's stochastic approximation processes, due to their simultaneous updates both with constant stepsizes. 
% The central idea of this proof is to cancel the critic's cumulative tracking error by the actor's overall positive progress to the stationary policy, which is different from the existing analysis of (stochastic) PG-type algorithms that mainly decouples or asymptotically decouples the critic's error from actor's error. 
% Further, we develop a new analysis to bound the estimation error of the Fisher information of deterministic policy arising via the compatibility theorem, and then further capture how such a metric affects the convergence via its minimum eigenvalue.

% %We emphasize that our techniques effectively handle the unique compatibility estimator for DPG which is different from stochastic AC. In addition, we also deal with the coupling between the critic and actor parameters, which is usually avoided by most of the existing AC studies. 
% % We consider the practical constant learning rates, which is different from the two timescale AC algorithms.

% The main proof consists of three steps.
% % First, we analyze the error dynamics of tracking a fixed critic target (i.e., fixed tracking error). Second, we analyze the error dynamics of tracking a dynamic critic target (i.e., dynamic tracking error) based on the first step. Last, we couple critic's dynamic tracking error with actor's update and bound the overall convergence gap. 
% First, we characterize the error propagation of tracking a dynamic critic target (i.e., dynamic tracking error) based on its coupling with actor's update progress. Second, we bound the critic's cumulative tracking error in terms of actor's update progress via the compatibility properties of DPG.
% %the actor penalty (expected policy gradient). 
% Last, we establish the overall convergence by canceling out the cumulative tracking error via the actor's overall positive progress towards the stationary policy.

% \textbf{Step I: Characterizing dynamics of critic's error via coupling with actor.}
% % Relate $\norm{w_{t+1}-w^*_{\theta_t}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$.

% % In the first step, we focus on the dynamics of the fixed tracking error. That is, we fix a policy $\mu_{\theta_t}$ and find out the dynamics of the error between the critic parameter and the fixed target $w^*_{\theta_t}$.
% In the first step, we characterize the propagation of the dynamics of critic's dynamic tracking error based on its coupling with actor's updates. That is, we develop the relationship between $\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$ by their coupling with actor's updates.

% Recall that $w^*_{\theta_t}$ is the global optimum of TD given a fixed policy $\mu_{\theta_t}$, or is equivalently the unique root of $\bar g_{\theta_t}(w_t) := \mE_{d_{\theta_t}} \brackets{\frac{1}{M}\sum_{j=0}^{M-1}\delta_{t,j}\phi(x_{t,j})}=0$. 
% % In this step, the key observation is the strong convexity like property for $\bar g_{\theta_t}(w)$, that is, for any policy, $\langle w_{t}-w^*_{\theta_t}, \bar g_{\theta_t}(w_t)\rangle\leq-\lambda\norm{w_{t}-w^*_{\theta_t}}^2$ with some constant $\lambda>0$. This property has been shown and widely used in the analysis of TD learning with linear function approximation \citep{tsitsiklis1997analysis,Bhandari2018finite,xiong2020amsgradRL}. 
% We first give the following bound on the update rule of $w_t$ in \Cref{alg:onPolicyDPG} given by the TD learning property \citep{tsitsiklis1997analysis,Bhandari2018finite,xiong2020amsgradRL},
% %We first use the strong convexity like property $\langle w_{t}-w^*_{\theta_t}, \bar g_{\theta_t}(w_t)\rangle\leq-\lambda\norm{w_{t}-w^*_{\theta_t}}^2$ with some constant $\lambda>0$ 
% %\citep{tsitsiklis1997analysis,Bhandari2018finite,xiong2020amsgradRL}, and the update rule of $w_t$ in \Cref{alg:onPolicyDPG} to have
% %\begin{align}
% %    \mE\norm{w_{t+1}\!-\!w^*_{\theta_t}}^2 &\leq (1\!-\!2\alpha_{w}\lambda\!+\!2\alpha_{w}^2C_A^2)\mE\norm{w_{t}-w^*_{\theta_t}}^2 \nonumber\\
% %    &\quad + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2, \nonumber
% %\end{align}
% %where $ g_{\theta_t}(w_t, \mcb_t) := \frac{1}{M}\sum_{j=0}^{M-1}\delta_{t,j}\phi(x_{t,j})$ is an unbiased estimate of $\bar g_{\theta_t}(w_t)$.
% %To proceed, we use \Cref{lem:minibatchVariance} to bound the variance term $\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2$, and obtain
% \begin{align}
%     \mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \leq \parentheses{1-\frac{\alpha_w\lambda}{2}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}, \nonumber
% \end{align}
% where $\alpha_w \leq \frac{\lambda}{2C_A^2}, M\geq\frac{48\alpha_w  C_A^2}{\lambda}$.
% % , we finish this step and have
% % \begin{align}
% %     &\mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \nonumber\\
% %     &\leq \parentheses{1-\frac{\alpha_w\lambda}{2}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}. \nonumber
% % \end{align}


% % \textbf{Step II}: Relate $\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$.

% % The second step aims to build the connection between the dynamic tracking errors along with the time step.

% % Based on the result of the last step, it is natural to think of using Young's inequality to achieve our goal:
% % \begin{align}
% %     &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 \nonumber\\
% %     &\leq \parentheses{1\!+\!\frac{1}{c}}\mE\norm{w_{t+1}\!-\!w^*_{\theta_{t}}}^2 + \parentheses{1\!+\!c}\mE\norm{w^*_{\theta_{t}}\!-\!w^*_{\theta_{t+1}}}^2\nonumber.
% % \end{align}

% % The choice of the constant $c$ needs to guarantee $(1-\alpha_w\lambda/2)\cdot(1+1/c)\leq (1-\alpha_w\lambda/4)$. A qualified option is $c=1/2(2/\lambda\alpha_w-1)$. 

% %Then, we apply Young's inequality and the Lipschitz continuity property of $w^*_{\theta_{t}}$ derived in \Cref{lem:wStar}, and obtain

% In the previous analysis of (stochastic) AC algorithms, sufficient TD updates of critic result in a controlled small tracking error before updating the actor, which is hence decoupled from the actor's progress. In contrast, DPG-TD takes alternative updates between critic and actor, so that the critic's tracking error is inherent and non-vanishing. Thus, we take a new approach to characterize the moving dynamics of the tracking error and directly couple it with the actor's update as follows,
% \begin{align}
%     \mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2
%     \!\leq\! \frac{4L_w^2}{\lambda\alpha_w}\mE\norm{\theta_{t+1}-\theta_{t}}^2 \!+\! \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \!+\! \frac{48\alpha_w^2(C_A^2C_w^2 \!+\! C_b^2)}{M}.\nonumber
%     % &= \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}\nonumber\\
%     % &\quad + \frac{4L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\nonumber,
% \end{align}
% % where $h_{\theta_t}(w_t, \!\mcb_t) \!=\! \frac{1}{M}\sum_{j=0}^{M\!-\!1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t$.
% Clearly, in the above bound, the two tracking errors at times $t+1$ and $t$ have different targets $w^*_{\theta_{t+1}}$ and $w^*_{\theta_{t}}$ due to actor's one update between critic's two consecutive updates. Hence, actor's update is necessarily coupled into the dynamics of the critic's tracking error.

% %From the above dynamics, we see that when the critic and actor are updated alternatively as in \Cref{alg:onPolicyDPG}, the dynamic tracking error will be inherited to the next iteration. Such a non-vanishing error is coupled with the actor's update, which is different from the existing analysis that (asymptotically) decouples the critic's error from actor's error.

% \textbf{Step II: Bounding cumulative tracking error via compatibility theorem for DPG.}

% In this step, we bound the cumulative tracking error based on the dynamics of the tracking error from the last step. 
% % penalized by the actor's approaching to stationary points. 

% To this end, we first bound the difference between two consecutive actor parameters via DPG's properties. By the update rule of $\theta_t$ in \Cref{alg:onPolicyDPG}, we have $\theta_{t+1}\!-\!\theta_{t} \!=\! \frac{\alpha_{\theta}}{M}\sum_{j=0}^{M\!-\!1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t\!:=\!\alpha_{\theta}h_{\theta_t}(w_t, \!\mcb_t)$.
% Since $h_{\theta_t}(w_t,\mcb_t)$ is not an unbiased estimator of the deterministic policy gradient $\nabla J(\theta_t)$, we characterize such a bias by exploiting the compatibility theorem as well as the property of Fisher information of deterministic policy defined in Assumption \ref{asp:policy} and obtain the following bound (see \Cref{lem:hVariance} for the proof)
% \begin{align*}
%     \mE\norm{h_{\theta_t}(w_{t},\mcb_t) -\nabla J(\theta_t)}^2 \leq 3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}.
% \end{align*}

% % Next, by observing $\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\leq 2\mE\norm{\nabla J(\theta_t)}^2+2\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2$ and further bounding $\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2$ in \Cref{lem:hVariance},

% The above bound then connects the critic's error dynamics from Step I to the policy gradient and yields the following result: 
% %Next, we proceed to bound critic's error dynamics from Step I via  and obtain
% \begin{align}
%     \mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 
%     % &\leq \parentheses{1-\frac{\lambda\alpha_w}{4}+\frac{24L_h^2L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \nonumber\\
%     % &\quad\quad + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
%     % &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
%     &\leq \parentheses{1-\frac{\lambda\alpha_w}{8}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \!+\! \frac{48\alpha_w^2(C_A^2C_w^2 \!+\! C_b^2)}{M} \!+\! \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
%     &\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber,
% \end{align}
% where it requires $\alpha_{\theta} \leq \frac{\lambda\alpha_w}{\sqrt{96}L_hL_w}$.

% Thus, we obtain the cumulative dynamic tracking error as
% \begin{align}
%     \sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2
%     &\leq \frac{8\norm{w_{0}-w^*_{\theta_{0}}}^2}{\lambda\alpha_w} + \frac{64L_w^2\alpha_{\theta}^2}{\lambda^2\alpha_w^2}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2  \nonumber\\
%     &\quad + \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 \!+\! C_b^2)}{M} \!+\! \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 \!+\! \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}} \cdot
%     \frac{8T}{\lambda\alpha_w}. \nonumber
% \end{align}
% The above bound connects the cumulative dynamic tracking error to the convergence rate of actor's update via policy gradient, i.e., such an error depends on how fast actor's update approaches to the stationary point. 

% %The above bound indicates that the cumulative dynamic tracking error is determined by how fast actor can approach to the stationary points. 

% \textbf{Step III: Overall convergence by canceling tracking error via actor's positive progress.}

% In this step, we establish the overall convergence to a stationary policy by novel cancellation of the above cumulative tracking error via actor's update progress.

% %We first establish the relationship between the progress of value function and the tracking error based on the Lipschitz continuity in \Cref{lem:dpglipschitz}, and have
% % use the Lipschitz continuity in \Cref{lem:dpglipschitz} and the same techniques to deal with $\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2$ in the last step to obtain
% %\begin{align}
% %    &\mE[J(\theta_{t+1})] - \mE[J(\theta_t)] \geq \frac{\alpha_{\theta}}{4}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
% %    &\quad - \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}},\nonumber
% %\end{align}
% %where $\alpha_{\theta} \leq \frac{1}{4L_J}$, 

% We first bound the cumulative policy gradient by the cumulative tracking error via the relationship between the progress of loss function and the tracking error as follows:
% \begin{align}
%     \frac{\alpha_{\theta}}{4}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2
%     \leq \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T. \nonumber
% \end{align}

% The previous analysis of (stochastic) AC typically exploits the fact that the above critic's tracking error can decay sufficiently fast by decoupling it from actor's update, which does not hold here. In contrast, we exploit the connection of the cumulative tracking errors and the cumulative policy gradient that we establish in Step II, and show that such a tracking error can ultimately be canceled by the actor's positive progress towards a stationary point. This also explains why the critic's inaccurate estimation does not affect the overall convergence guarantee. Such an idea is captured as follows:
% % Our final goal is to bound $\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2$. Since the left hand side of \eqref{eq:scketch1} can be telescoped, it remains to handle $\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2$. To this end, we use the dynamic tracking error from Step II and have
% % Next, by taking summation for both sides of the above inequality with telescoping, we have

% %Last, the cumulative dynamic tracking error from Step II can be cancelled out by the overall positive progress of actor's approaching to the stationary policy, 
% % which eventually leads to the final convergence.
% % Then, we plug the cumulative dynamic tracking error from Step II into the above bound, and obtain
% %which yields
% \begin{align}
%     &\parentheses{\frac{\alpha_{\theta}}{4}-\frac{144L_h^2L_w^2\alpha_{\theta}^3}{\lambda^2\alpha_w^2}}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
%     &\quad\leq \frac{R_{\max}}{1-\gamma} + \frac{18\alpha_{\theta}L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2 + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T \nonumber\\
%     &\quad\quad +\! \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 \!+\! C_b^2)}{M} \!+\! \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 \!+\! \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{18\alpha_{\theta}L_{h}^2T}{\lambda\alpha_w}. \nonumber
% \end{align}

% Finally, by letting $\alpha_{\theta} \leq \frac{\lambda\alpha_w}{24L_hL_w}$ and rearranging the above terms, we complete the proof.


\subsection{Supporting Lemmas}\label{app:theorem1lemmas}

In the following, we provide a few supporting lemmas that are used in the main proof of \Cref{thm:onPolicyDPG}. The first lemma characterizes the properties of mini-batch sampling.
%indicates how the mini-batch sampling can help to bound variance terms.
\begin{lemma}\label{lem:minibatchVariance}
The following two properties hold.
\begin{enumerate}
    \item Let $\hat Y, \bar Y\in \mathbb R^{d_1\times d_2}$ be matrices satisfying $\lF{\hat Y}\leq C_{Y}, \lF{\bar Y}\leq C_{Y}$. If $\hat Y$ is an unbiased estimator of $\bar Y$ and $\{\hat Y_j\}_j$ are i.i.d.\ estimators, then we have
    \begin{align*}
        \mE\lF{\frac{1}{M}\sum_{j=0}^{M-1}\hat Y_j - \bar Y}^2 \leq \frac{4C_Y^2}{M}.
    \end{align*}
    \item Let $\hat y, \bar y\in \mathbb R^d$ be vectors satisfying $\norm{\hat y}\leq C_{y}, \norm{\bar y}\leq C_{y}$. If $\hat y$ is an unbiased estimator of $\bar y$ and $\{\hat y_j\}_j$ are i.i.d.\ estimators, then we have
    \begin{align*}
        \mE\norm{\frac{1}{M}\sum_{j=0}^{M-1}\hat y_j - \bar y}^2 \leq \frac{4C_y^2}{M}.
    \end{align*}
\end{enumerate}
\end{lemma}
\begin{proof}
We first prove the first statement of the matrix case as follows. 
\begin{align*}
    \mE\lF{\frac{1}{M}\sum_{j=0}^{M-1}\hat Y_j - \bar Y}^2 
    &= \frac{1}{M^2} \sum_{c=1}^{d_2} \sum_{i=0}^{M-1} \sum_{j=0}^{M-1}\mE\langle \hat Y_{i}^c - \bar Y^c, \hat Y_{j}^c - \bar Y^c \rangle \\
    &= \frac{1}{M^2}\sum_{j=0}^{M-1}\mE\lF{\hat Y_j - \bar Y}^2 + \frac{1}{M^2} \sum_{c=1}^{d_2} \sum_{i\neq j}\mE\langle \hat Y_{i}^c - \bar Y^c, \hat Y_{j}^c - \bar Y^c \rangle\\
    &= \frac{1}{M^2}\sum_{j=0}^{M-1}\mE\lF{\hat Y_j - \bar Y}^2\\
    &\leq \frac{2}{M^2}\sum_{j=0}^{M-1}\parentheses{\mE\lF{\hat Y_j}^2 + \mE\lF{\bar Y}^2}\\
    &\leq \frac{4C_Y^2}{M},
\end{align*}
where $\hat Y_j^c$ is the $c$-th column of $\hat Y_j$.

We next prove the second statement of the vector case as follows.
\begin{align*}
    \mE\norm{\frac{1}{M}\sum_{j=0}^{M-1}\hat y_j - \bar y}^2 
    &= \frac{1}{M^2} \sum_{i=0}^{M-1} \sum_{j=0}^{M-1}\mE\langle \hat y_{i} - \bar y, \hat y_{j} - \bar y \rangle \\
    &= \frac{1}{M^2}\sum_{j=0}^{M-1}\mE\norm{\hat y_j - \bar y}^2 + \frac{1}{M^2} \sum_{i\neq j}\mE\langle \hat y_{i} - \bar y, \hat y_{j} - \bar y \rangle\\
    &= \frac{1}{M^2}\sum_{j=0}^{M-1}\mE\norm{\hat y_j - \bar y}^2\\
    &\leq \frac{2}{M^2}\sum_{j=0}^{M-1}\parentheses{\mE\norm{\hat y_j}^2 + \mE\norm{\bar y}^2}\\
    &\leq \frac{4C_y^2}{M}.
\end{align*}
\end{proof}

Next, we provide some important properties of $w^*_{\xi_{\theta}}$.

\begin{lemma}\label{lem:wStar}
Let $w^*_{\xi_{\theta}}$ be defined in \Cref{prop:compatibility}. Suppose Assumptions \ref{asp:policy}-\ref{asp:Qsmooth} hold. Then we have
\begin{align*}
    \norm{w^*_{\xi_{\theta}}} \leq C_{w_{\xi}},
\end{align*}
where $C_{w_{\xi}}=\frac{L_{\mu}C_Q}{\lambda_{\Psi}(1-\gamma)}$. Furthermore, for any $\theta_1,\theta_2$, we have
\begin{align*}
    \norm{w^*_{\xi_{\theta_1}} - w^*_{\xi_{\theta_2}}} \leq L_w\norm{\theta_1 - \theta_2},
\end{align*}
where $L_w=\frac{L_{J}}{\lambda_{\Psi}}  + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\parentheses{L_{\mu}^2L_{\nu} + \frac{2L_{\mu}L_{\psi}}{1-\gamma}}$.
\end{lemma}
\begin{proof}
We first show the boundedness of $\norm{\nabla J(\theta)}$.
\begin{align*}
    \norm{\nabla J(\theta)} & = \norm{\int_{\mcs}  \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}\nu_{\theta}(ds)}\\
    &\leq \int_{\mcs}  \norm{\nabla_{\theta}\mu_{\theta}(s)} \norm{\nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}}\nu_{\theta}(ds)\\
    &\overset{\text{(i)}}{\leq} L_{\mu}C_Q \int_{\mcs}\nu_{\theta}(ds) = \frac{L_{\mu}C_Q}{(1-\gamma)},
\end{align*}
where (i) follows from Assumption \ref{asp:policy} and \Cref{lem:Qgradient}.

Recall we define $\Psi_{\theta}=\mE_{\nu_{\mu_{\theta}}}\brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T}$. Assumption \ref{asp:policy} implies that $\Psi_{\theta}$ is non-singular. Then by definition, we have
\begin{align*}
    \norm{w^*_{\xi_{\theta}}} 
    &=  \norm{\Psi_{\theta}^{-1}\nabla J(\theta)}
    \leq \frac{1}{\lambda_{\Psi}} \norm{\nabla J(\theta)}
    \leq \frac{L_{\mu}C_Q}{\lambda_{\Psi}(1-\gamma)}.
\end{align*}

Next, we show the Lipschitz continuity property.
\begin{align*}
    &\norm{w^*_{\xi_{\theta_1}} - w^*_{\xi_{\theta_2}}}\\
    &\quad= \norm{\Psi_{\theta_1}^{-1}\nabla J(\theta_1) - \Psi_{\theta_2}^{-1}\nabla J(\theta_2)}\\
    &\quad= \norm{\Psi_{\theta_1}^{-1}\nabla J(\theta_1) - \Psi_{\theta_1}^{-1}\nabla J(\theta_2) + \Psi_{\theta_1}^{-1}\nabla J(\theta_2) - \Psi_{\theta_2}^{-1}\nabla J(\theta_2)}\\
    &\quad\leq \norm{\Psi_{\theta_1}^{-1}(\nabla J(\theta_1) - \nabla J(\theta_2))} + \norm{\parentheses{\Psi_{\theta_1}^{-1} - \Psi_{\theta_2}^{-1}}\nabla J(\theta_2)}\\
    &\quad\overset{\text{(i)}}{\leq} \frac{L_{J}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \norm{\parentheses{\Psi_{\theta_1}^{-1} - \Psi_{\theta_2}^{-1}}\nabla J(\theta_2)}\\
    &\quad= \frac{L_{J}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \norm{\parentheses{\Psi_{\theta_1}^{-1}\Psi_{\theta_2}\Psi_{\theta_2}^{-1} - \Psi_{\theta_1}^{-1}\Psi_{\theta_1}\Psi_{\theta_2}^{-1}}\nabla J(\theta_2)}\\
    &\quad= \frac{L_{J}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \norm{\Psi_{\theta_1}^{-1}\parentheses{\Psi_{\theta_2}-\Psi_{\theta_1}}\Psi_{\theta_2}^{-1}\nabla J(\theta_2)}\\
    &\quad\leq \frac{L_{J}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \frac{1}{\lambda_{\Psi}^2}\norm{\Psi_{\theta_2}-\Psi_{\theta_1}}\norm{\nabla J(\theta_2)}\\
    &\quad\leq \frac{L_{J}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\norm{\Psi_{\theta_2}-\Psi_{\theta_1}},
\end{align*}
where (i) follows from \Cref{lem:dpglipschitz} and Assumption \ref{asp:policy}.

Observe that 
\begin{align*}
    &\norm{\Psi_{\theta_2}-\Psi_{\theta_1}} \\
    &\quad= \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_2}(s)^T\nu_{\theta_2}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_1}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\theta_1}(ds) }\\
    &\quad\leq \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_2}(s)^T\nu_{\theta_2}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_2}(s)^T\nu_{\theta_1}(ds) }\\
    &\quad\quad + \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_2}(s)^T\nu_{\theta_1}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\theta_1}(ds) }\\
    &\quad\quad + \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\theta_1}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_1}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\theta_1}(ds) }\\
    &\quad\overset{\text{(i)}}{\leq} L_{\mu}^2\lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} + 2L_{\mu}\int_{\mcs}  \norm{\nabla_{\theta}\mu_{\theta_2}(s) - \nabla_{\theta}\mu_{\theta_1}(s)}\nu_{\theta_1}(ds) \\
    &\quad\overset{\text{(ii)}}{\leq} L_{\mu}^2\lTV{\nu_{\theta_1}(\cdot)-\nu_{\theta_2}(\cdot)} + \frac{2L_{\mu}L_{\psi}}{1-\gamma}\norm{\theta_1-\theta_2} \\
    &\quad\overset{\text{(iii)}}{\leq} \parentheses{L_{\mu}^2L_{\nu} + \frac{2L_{\mu}L_{\psi}}{1-\gamma}}\norm{\theta_1-\theta_2},
\end{align*}
where both (i) and (ii) follow from Assumption \ref{asp:policy}, and (iii) follows from \Cref{lem:stateVisitionLipschitz}.

Thus, we have
\begin{align*}
    &\norm{w^*_{\xi_{\theta_1}} - w^*_{\xi_{\theta_2}}}\\
    &\quad\leq \frac{L_{J}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\norm{\Psi_{\theta_2}-\Psi_{\theta_1}}\\
    &\quad\leq \brackets{\frac{L_{J}}{\lambda_{\Psi}}  + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\parentheses{L_{\mu}^2L_{\nu} + \frac{2L_{\mu}L_{\psi}}{1-\gamma}}}\norm{\theta_1 - \theta_2}.
\end{align*}

\end{proof}

The next lemma provides an important bound for the difference between the gradient estimators and the true gradient.
\begin{lemma}\label{lem:hVariance}
Suppose Assumptions \ref{asp:policy}-\ref{asp:Qsmooth} hold. Then we have 
\begin{equation*}
    \mE\norm{h_{\theta_t}(w_{t},\mcb_t) -\nabla J(\theta_t)}^2 \leq 3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M},
\end{equation*}
where $L_h=L_{\mu}^2$ and $\kappa$ is defined in \eqref{eq:systemErrorKappa}.
\end{lemma}
\begin{proof}
By definition, we have
\begin{align}
    &\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2 \nonumber\\
    &\quad= \mE\norm{h_{\theta_t}(w_t,\mcb_t) - h_{\theta_t}(w^*_{\theta_t},\mcb_t) + h_{\theta_t}(w^*_{\theta_t},\mcb_t) -h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t) + h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t) -\nabla J(\theta_t)}^2 \nonumber\\
    &\quad\leq 3\mE\norm{h_{\theta_t}(w_t,\mcb_t) - h_{\theta_t}(w^*_{\theta_t},\mcb_t)}^2 + 3\mE\norm{h_{\theta_t}(w^*_{\theta_t},\mcb_t) -h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t)}^2 \nonumber\\
    &\quad\quad+ 3\mE\norm{h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t) -\nabla J(\theta_t)}^2\nonumber\\
    &\quad\overset{\text{(i)}}{\leq} 3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\mE\norm{w^*_{\theta_{t}}-w^*_{\xi_{\theta_t}}}^2 + 3\mE\norm{h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t) -\nabla J(\theta_t)}^2\nonumber \\
    &\quad\overset{\text{(ii)}}{\leq} 3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + 3\mE\norm{h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t) -\nabla J(\theta_t)}^2\nonumber \\
    &\quad\overset{\text{(iii)}}{\leq} 3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}\nonumber, 
\end{align}
where (i) follows because for any $w_1, w_2, \theta\in\mathbb R^d$, we have
\begin{align*}
    \norm{h_{\theta}(w_1,\mcb_t) - h_{\theta}(w_2,\mcb_t)} 
    & = \norm{ \frac{1}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T (w_1-w_2) }\\
    & \leq L_{\mu}^2\norm{w_1-w_2}
    := L_h\norm{w_1-w_2},
\end{align*}
(ii) follows from \eqref{eq:systemErrorKappa}, and (iii) holds due to the fact that
\begin{align}
    &\mE\norm{h_{\theta_t}(w^*_{\xi_{\theta_t}},\mcb_t) -\nabla J(\theta_t)}^2 \nonumber\\
    &\quad= \mE\norm{ \frac{1}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^Tw^*_{\xi_{\theta_t}} -\nabla J(\theta_t) }^2 \nonumber\\
    &\quad= \frac{1}{M^2}\sum_{i=0}^{M-1}\sum_{j=0}^{M-1}\mE\langle \nabla_{\theta}\mu_{\theta_t}(s'_{t,i})\nabla_{\theta}\mu_{\theta_t}(s'_{t,i})^Tw^*_{\xi_{\theta_t}}-\nabla J(\theta_t), \nonumber\\
    &\quad\qquad \nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^Tw^*_{\xi_{\theta_t}}-\nabla J(\theta_t) \rangle \nonumber\\
    &\quad= \frac{1}{M^2}\sum_{j=0}^{M-1}\mE\norm{ \nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^Tw^*_{\xi_{\theta_t}} -\nabla J(\theta_t) }^2 \nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \frac{1}{M^2}\sum_{j=0}^{M-1} 2L_{\mu}^4C_{w_{\xi}}^2 = \frac{2L_{\mu}^4C_{w_{\xi}}^2}{M}\nonumber,
\end{align}
where 
% (i) follows since the samples are generated in an i.i.d.\ manner, and 
(i) follows from Assumption \ref{asp:policy}, \Cref{lem:minibatchVariance} and \Cref{lem:wStar}.
\end{proof}

\subsection{Proof of Theorem \ref{thm:onPolicyDPG}}

We use the following notations for the clarity of the presentation:
\begin{align*}
    g_{\theta_t}(w_t, \mcb_t) &= \frac{1}{M}\sum_{j=0}^{M-1}\delta_{t,j}\phi(x_{t,j}) = \frac{1}{M}\sum_{j=0}^{M-1}\parentheses{A_{t,j}w_t + b_{t,j}} := \hat{A}_tw_t + \hat{b}_t;\\
    \bar g_{\theta_t}(w_t) &= \mE_{d_{\theta_t}} \brackets{\delta_t\phi(x_t)} = \bar A_tw_t + \bar b_t;\\
    \bar g_{\theta_t}(w^*_{\theta_t}) &= \bar A_tw^*_{\theta_t} + \bar b_t = 0;\\
    h_{\theta_t}(w_t, \mcb_t) &= \frac{1}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t.
\end{align*}

In this proof, we develop a new approach to analyzing the coupled stochastic approximation processes of the actor and the critic, which arise from their simultaneous updates with constant stepsizes. The central idea is to cancel the critic's cumulative tracking error by the actor's overall positive progress towards the stationary policy, which is different from the existing analyses of (stochastic) PG-type algorithms that mainly decouple or asymptotically decouple the critic's error from the actor's error. Further, we develop a new analysis to bound the estimation error of the Fisher information of the deterministic policy arising via the compatibility theorem, and then capture how such a metric affects the convergence via its minimum eigenvalue.

The main proof consists of three steps. 
% The main proof applies a few supporting lemmas that we present in \Cref{app:theorem1lemmas}.
%First, we characterize the error propagation of tracking a dynamic critic target (i.e., dynamic tracking error) based on its coupling with actor's update progress. Second, we bound the critic's cumulative tracking error in terms of actor's update progress via the compatibility properties of DPG. Last, we establish the overall convergence by canceling out the cumulative tracking error via the actor's overall positive progress towards the stationary policy.

\textbf{Step I: Characterizing dynamics of critic's error via coupling with actor.} 

In the first step, we characterize the propagation of the critic's dynamic tracking error based on its coupling with the actor's updates. That is, we develop the relationship between $\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2$ and $\norm{w_{t}-w^*_{\theta_t}}^2$ by their coupling with the actor's updates.

We first use the dynamics of the critic to obtain
\begin{align*}
    &\norm{w_{t+1}-w^*_{\theta_t}}^2 \\
    &\quad= \norm{w_{t}+\alpha_{w}g_{\theta_t}(w_t, \mcb_t)-w^*_{\theta_t}}^2\\
    &\quad= \norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t)\rangle + \alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)}^2\\
    &\quad= \norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, \bar g_{\theta_t}(w_t)\rangle + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle \\
    &\quad\quad+ \alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)}^2\\
    &\quad= \norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}( w_{t}-w^*_{\theta_t})^T\bar A_t ( w_{t}-w^*_{\theta_t}) + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle \\
    &\quad\quad+ \alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)}^2\\
    &\quad\overset{\text{(i)}}{\leq} (1-2\alpha_{w}\lambda)\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle + \alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)}^2\\
    &\quad\leq (1-2\alpha_{w}\lambda)\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle \\
    &\quad\quad+ 2\alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2 + 2\alpha_{w}^2\norm{\bar g_{\theta_t}(w_t)}^2\\
    &\quad= (1-2\alpha_{w}\lambda)\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2 + 2\alpha_{w}^2\norm{\bar g_{\theta_t}(w_t)-\bar g_{\theta_t}(w^*_{\theta_t})}^2\\
    &\quad= (1-2\alpha_{w}\lambda)\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2 + 2\alpha_{w}^2\norm{\bar A_t(w_t - w^*_{\theta_t})}^2\\
    &\quad\leq (1-2\alpha_{w}\lambda)\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2 + 2\alpha_{w}^2\norm{\bar A_t}^2\norm{(w_t - w^*_{\theta_t})}^2\\
    &\quad\overset{\text{(ii)}}{\leq} (1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2)\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2,
\end{align*}
where (i) follows from the property $( w_{t}-w^*_{\theta_t})^T\bar A_t ( w_{t}-w^*_{\theta_t})\leq-\lambda\norm{w_{t}-w^*_{\theta_t}}^2$ with some constant $\lambda>0$ for any policy, which has been proved 
%and widely used in the studies of TD learning with linear function approximation
in \cite{tsitsiklis1997analysis,Bhandari2018finite,tu2018gap,xiong2020amsgradRL}, and (ii) follows because $\norm{A}^2\leq 2(1+\gamma^2)C_{\phi}^4\leq 4C_{\phi}^4:=C_A^2$.

Taking the expectation on both sides yields
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \nonumber\\
    &\leq (1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2)\mE\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}\mE\langle w_{t}-w^*_{\theta_t}, g_{\theta_t}(w_t, \mcb_t) - \bar g_{\theta_t}(w_t)\rangle\nonumber\\
    &\quad + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2 \nonumber\\
    &= (1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2)\mE\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2. \label{eq:thm1Proof1}
\end{align}
% where (i) follows because we use i.i.d.\ samples.

Observe that
\begin{align*}
    &\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2 \\
    &\quad = \mE\norm{ \hat{A}_tw_t + \hat{b}_t - \bar A_tw_t - \bar b_t }^2\\
    &\quad\overset{\text{(i)}}{\leq} 3\mE\norm{ (\hat{A}_t - \bar A_t)(w_t - w^*_{\theta_t}) }^2 + 3\mE\norm{ (\hat{A}_t - \bar A_t) w^*_{\theta_t} }^2 + 3\mE\norm{ \hat{b}_t - \bar b_t }^2 \\
    &\quad\leq 3\mE\lF{ \hat{A}_t - \bar A_t }^2 \norm{ w_t - w^*_{\theta_t} }^2 + 3\mE\lF{ \hat{A}_t - \bar A_t }^2\norm{  w^*_{\theta_t} }^2 + 3\mE\norm{ \hat{b}_t - \bar b_t }^2\\
    &\quad\overset{\text{(ii)}}{\leq} \frac{12C_A^2}{M}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{12(C_A^2\mE\norm{  w^*_{\theta_t} }^2 + C_b^2)}{M}\\
    &\quad\overset{\text{(iii)}}{\leq} \frac{12C_A^2}{M}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{12(C_A^2C_w^2 + C_b^2)}{M},
\end{align*}
where (i) follows because $(x+y+z)^2\leq 3x^2+3y^2+3z^2$, (ii) follows from \Cref{lem:minibatchVariance} and $C_b:=R_{\max}C_{\phi}\geq \norm{b}$, and (iii) follows because $\norm{  w^*_{\theta_t} } = \norm{ \bar A_t^{-1}\bar b_t } \leq C_b/\lambda_A = R_{\max}C_{\phi}/\lambda_A := C_{w}$ by Assumption \ref{asp:phi}, and hence $\norm{  w^*_{\theta_t} }^2\leq C_w^2$.

Substituting the above bound into \eqref{eq:thm1Proof1}, we have
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \nonumber\\
    &\quad\leq (1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2)\mE\norm{w_{t}-w^*_{\theta_t}}^2 + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(w_t, \mcb_t)-\bar g_{\theta_t}(w_t)}^2\nonumber\\
    &\quad\leq \parentheses{1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2+\frac{24\alpha_w^2 C_A^2}{M}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}\nonumber.
\end{align}

Since $\alpha_w \leq \frac{\lambda}{2C_A^2}$ and $M\geq\frac{48\alpha_w  C_A^2}{\lambda}$,
% \begin{equation}\label{eq:paramCondition1OnPolicy}
%     \alpha_w \leq \frac{\lambda}{2C_A^2}; M\geq\frac{48\alpha_w  C_A^2}{\lambda},
% \end{equation}
we further obtain
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_t}}^2 \nonumber\\
    &\quad\leq \parentheses{1-2\alpha_{w}\lambda+2\alpha_{w}^2C_A^2+\frac{24\alpha_w^2C_A^2}{M}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}\nonumber\\
    &\quad\leq \parentheses{1-\frac{\alpha_w\lambda}{2}}\mE\norm{w_{t}-w^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M}.\label{eq:thm1Proof2}
\end{align}

Next, we use Young's inequality, and obtain
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 \nonumber\\
    &\quad\leq \parentheses{1+\frac{1}{2(2/\lambda\alpha_w-1)}}\mE\norm{w_{t+1}-w^*_{\theta_{t}}}^2 + \parentheses{1+2(2/\lambda\alpha_w-1)}\mE\norm{w^*_{\theta_{t}}-w^*_{\theta_{t+1}}}^2\nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{4-\lambda\alpha_w}{4-2\lambda\alpha_w}\cdot \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{4}{\lambda\alpha_w}\mE\norm{w^*_{\theta_{t}}-w^*_{\theta_{t+1}}}^2\nonumber\\
    &\quad\overset{\text{(ii)}}{\leq} \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{4-\lambda\alpha_w}{4-2\lambda\alpha_w}\cdot \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{4L_w^2}{\lambda\alpha_w}\mE\norm{\theta_{t+1}-\theta_{t}}^2, \label{eq:thm1proofDynTrackingError}
    % &\quad= \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{4-\lambda\alpha_w}{4-2\lambda\alpha_w}\cdot \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{4L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\nonumber\\
    % &\quad\leq \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
    % &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2\nonumber,
\end{align}
where (i) follows from the bound derived in \eqref{eq:thm1Proof2}, and (ii) follows from \Cref{lem:wStar}.

\textbf{Step II: Bounding cumulative tracking error via compatibility theorem for DPG.}

In this step, we bound the cumulative tracking error based on the dynamics of the tracking error from the last step. To this end, we need to first bound the difference between two consecutive actor parameters.

Observe that $\theta_{t+1}-\theta_{t} = \alpha_{\theta}h_{\theta_t}(w_t, \mcb_t)$, where $h_{\theta_t}(w_t, \mcb_t)=\frac{1}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t$, and that $\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\leq 2\mE\norm{\nabla J(\theta_t)}^2+2\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2$. We proceed to bound (\ref{eq:thm1proofDynTrackingError}) as follows
\begin{align}
    &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 \nonumber\\
    &\quad\leq \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{4-\lambda\alpha_w}{4-2\lambda\alpha_w}\cdot \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{4L_w^2}{\lambda\alpha_w}\mE\norm{\theta_{t+1}-\theta_{t}}^2\nonumber\\
    % &\quad= \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{4-\lambda\alpha_w}{4-2\lambda\alpha_w}\cdot \frac{24\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{4L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\nonumber\\
    &\quad\leq \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2\nonumber\\
% \end{align}
% Next, we deal with the term $\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2$ in \Cref{lem:hVariance}, which yields
% \begin{align}
%     &\mE\norm{w_{t+1}-w^*_{\theta_{t+1}}}^2 \nonumber\\
%     &\quad\leq \parentheses{1-\frac{\lambda\alpha_w}{4}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
%     &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2\nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \parentheses{1-\frac{\lambda\alpha_w}{4}+\frac{24L_h^2L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
    &\quad\overset{\text{(ii)}}{\leq} \parentheses{1-\frac{\lambda\alpha_w}{8}}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\mE\norm{\nabla J(\theta_t)}^2 + \frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} \nonumber\\
    &\quad\quad + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}},\label{eq:thm1Proof3}
\end{align}
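where (i) follows from \Cref{lem:hVariance}, and (ii) follows because $\alpha_{\theta} \leq \frac{\lambda\alpha_w}{\sqrt{192}L_hL_w}$, which ensures $\frac{24L_h^2L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\leq\frac{\lambda\alpha_w}{8}$.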
% \begin{equation}\label{eq:paramCondition2OnPolicy}
%     \alpha_{\theta} \leq \frac{\lambda\alpha_w}{\sqrt{96}L_hL_w}.
% \end{equation}

We further take the summation over all iterations on both sides of (\ref{eq:thm1Proof3}) and have
\begin{align}
    &\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \nonumber\\
    &\quad\leq \sum_{t=0}^{T-1} \parentheses{1-\frac{\lambda\alpha_w}{8}}^t\norm{w_{0}-w^*_{\theta_{0}}}^2 + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\sum_{t=0}^{T-1}\sum_{i=0}^{t-1}\parentheses{1-\frac{\lambda\alpha_w}{8}}^{t-1-i}\mE\norm{\nabla J(\theta_i)}^2 \nonumber\\
    &\quad\quad + \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\sum_{t=0}^{T-1}\sum_{i=0}^{t-1}\parentheses{1-\frac{\lambda\alpha_w}{8}}^{t-1-i} \nonumber\\
    &\quad\leq \frac{8\norm{w_{0}-w^*_{\theta_{0}}}^2}{\lambda\alpha_w} + \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{8T}{\lambda\alpha_w} \nonumber\\
    &\quad\quad + \frac{64L_w^2\alpha_{\theta}^2}{\lambda^2\alpha_w^2}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2. \label{eq:thm1proofCumulativeError}
\end{align}

\textbf{Step III: Overall convergence by canceling tracking error via actor's positive progress.}

In this step, we establish the overall convergence to a stationary policy by a novel cancellation of the above cumulative tracking error via the actor's update progress. 

Based on \Cref{lem:dpglipschitz}, we have
\begin{align}
    &\mE[J(\theta_{t+1})] - \mE[J(\theta_t)] \nonumber\\
    &\quad\geq \mE\langle \nabla J(\theta_t), \theta_{t+1}-\theta_t \rangle - \frac{L_J}{2}\mE\norm{\theta_{t+1}-\theta_t}^2 \nonumber\\
    &\quad= \alpha_{\theta}\mE\langle \nabla J(\theta_t), h_{\theta_t}(w_t,\mcb_t) \rangle - \frac{L_J\alpha_{\theta}^2}{2}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2 \nonumber\\
    &\quad= \alpha_{\theta}\mE\norm{\nabla J(\theta_t)}^2 + \alpha_{\theta}\mE\langle \nabla J(\theta_t), h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t) \rangle - \frac{L_J\alpha_{\theta}^2}{2}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2 \nonumber\\
    &\quad\overset{\text{(i)}}{\geq} \frac{\alpha_{\theta}}{2}\mE\norm{\nabla J(\theta_t)}^2 - \frac{\alpha_{\theta}}{2}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2 \nonumber\\
    &\quad\quad - \frac{L_J\alpha_{\theta}^2}{2}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)+\nabla J(\theta_t)}^2 \nonumber\\
    &\quad\geq \parentheses{\frac{\alpha_{\theta}}{2}-L_J\alpha_{\theta}^2}\mE\norm{\nabla J(\theta_t)}^2 - \parentheses{\frac{\alpha_{\theta}}{2}+L_J\alpha_{\theta}^2}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J(\theta_t)}^2\nonumber\\
    &\quad\overset{\text{(ii)}}{\geq} \parentheses{\frac{\alpha_{\theta}}{2}-L_J\alpha_{\theta}^2}\mE\norm{\nabla J(\theta_t)}^2 - \parentheses{\frac{\alpha_{\theta}}{2}+L_J\alpha_{\theta}^2}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
    &\quad\overset{\text{(iii)}}{\geq} \frac{\alpha_{\theta}}{4}\mE\norm{\nabla J(\theta_t)}^2 - \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}},\label{eq:thm1Proof4}
\end{align}
where (i) follows because $x^Ty\geq -\frac{1}{2}\norm{x}^2-\frac{1}{2}\norm{y}^2$, (ii) follows from \Cref{lem:hVariance}, and (iii) follows from the condition $\alpha_{\theta} \leq \frac{1}{4L_J}.$
% \begin{equation}\label{eq:paramCondition3OnPolicy}
%     \alpha_{\theta} \leq \frac{1}{4L_J}.
% \end{equation}

We next take the summation over all iterations on both sides of the above bound and obtain
\begin{align}
    &\frac{\alpha_{\theta}}{4}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
    &\quad\leq \mE[J(\theta_{T})] - \mE[J(\theta_0)] + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T + \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 \nonumber\\
    &\quad\leq \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T + \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2. \label{eq:thm1Proof5}
\end{align}

% We then use \eqref{eq:thm1Proof3} to bound the term $\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\theta_{t}}}^2$, and obtain


Substituting the cumulative tracking error bound derived in (\ref{eq:thm1proofCumulativeError}) into \eqref{eq:thm1Proof5} yields
\begin{align}
    &\frac{\alpha_{\theta}}{8}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \parentheses{\frac{\alpha_{\theta}}{4}-\frac{144L_h^2L_w^2\alpha_{\theta}^3}{\lambda^2\alpha_w^2}}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
    &\quad\leq \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T + \frac{18\alpha_{\theta}L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2\nonumber\\
    &\quad\quad + \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{18\alpha_{\theta}L_{h}^2T}{\lambda\alpha_w},\nonumber
\end{align}
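where (i) follows from the condition $\alpha_{\theta} \leq \frac{\lambda\alpha_w}{24\sqrt{2}L_hL_w}$, which ensures $\frac{144L_h^2L_w^2\alpha_{\theta}^3}{\lambda^2\alpha_w^2}\leq\frac{\alpha_{\theta}}{8}$.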
% \begin{equation}\label{eq:paramCondition4OnPolicy}
%     \alpha_{\theta} \leq \frac{\lambda\alpha_w}{24L_hL_w}.
% \end{equation}

Finally, 
% by taking the intersection of the conditions \eqref{eq:paramCondition1OnPolicy},\eqref{eq:paramCondition2OnPolicy},\eqref{eq:paramCondition3OnPolicy} and \eqref{eq:paramCondition4OnPolicy}, 
we have
\begin{align}
    \underset{t\in [T]}{\min}\mE\norm{\nabla J(\theta_{t})}^2 &\leq \frac{1}{T}\sum_{t=0}^{T-1}\mE\norm{\nabla J(\theta_t)}^2 \nonumber\\
    &\leq \parentheses{\frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2 }\cdot\frac{1}{T} + 6\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}} \nonumber\\
    &\quad\quad + \brackets{\frac{48\alpha_w^2(C_A^2C_w^2 + C_b^2)}{M} + \frac{8L_w^2\alpha_{\theta}^2}{\lambda\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{144L_{h}^2}{\lambda\alpha_w} \nonumber\\
    &= \frac{c_1}{T} + \frac{c_2}{M} + c_3\kappa^2,\nonumber
\end{align} 
where 
\begin{align}
    c_1 &= \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda\alpha_w}\norm{w_{0}-w^*_{\theta_{0}}}^2,\label{eq:thmc1}\\
    c_2 &= 36L_{\mu}^4C_{w_{\xi}}^2+\brackets{48\alpha_w^2(C_A^2C_w^2 + C_b^2) + \frac{48L_w^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda\alpha_w},\label{eq:thmc2}\\
    c_3 &= 18L_h^2 + \frac{24L_w^2L_h^2\alpha_{\theta}^2}{\lambda\alpha_w}\cdot\frac{144L_{h}^2}{\lambda\alpha_w}.\label{eq:thmc3}
\end{align}


\subsection{Proof of Corollary \ref{cor:onPolicyDPG}}

Following from the upper bound in \Cref{thm:onPolicyDPG}, we let $\frac{c_1}{T}\leq\frac{\epsilon}{2}$ and $\frac{c_2}{M}\leq \frac{\epsilon}{2}$ to achieve the $\epsilon$-accuracy. Then we obtain $T\geq \frac{2c_1}{\epsilon}$ and $M\geq \frac{2c_2}{\epsilon}$. Further, since we generate $M$ samples in the update steps of both critic and actor in Algorithm \ref{alg:onPolicyDPG}, the total number of samples we use is thus $2MT = \frac{8c_1c_2}{\epsilon^2}$.


\section{Proof of Lemma \ref{lem:dpglipschitzOffPolicy}}


We use the notations $\psi_{\theta}(s) := \nabla_{\theta}\mu_{\theta}(s)$, $a_{\theta}=\mu_{\theta}(s)$ and $\nabla_a Q^{\mu_{\theta}}(s,a_{\theta})=\nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}$ in the following proof.

We start from the form of the deterministic policy gradient given in \eqref{eq:dpgthmOffPolicy}, and have

\begin{align}
    &\norm{ \nabla J_{\beta}(\theta_1) - \nabla J_{\beta}(\theta_2) }\nonumber\\
    &\quad= \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\beta}(ds) - \int_{\mcs}\psi_{\theta_2}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) }\nonumber\\
    &\quad= \left\lVert  \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\beta}(ds) - \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) \right.\nonumber\\
    &\quad\quad + \left. \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) - \int_{\mcs}\psi_{\theta_2}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) \right\rVert\nonumber\\
    &\quad\leq \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) \nu_{\beta}(ds) - \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) }\nonumber\\
    &\quad\quad + \norm{ \int_{\mcs}\psi_{\theta_1}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) - \int_{\mcs}\psi_{\theta_2}(s)\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2}) \nu_{\beta}(ds) }\nonumber\\
    &\quad\leq \int_{\mcs}\norm{ \psi_{\theta_1}(s) }\cdot\norm{ \nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1}) - \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})} \nu_{\beta}(ds)\nonumber\\
    &\quad\quad + \int_{\mcs}\norm{ \psi_{\theta_1}(s) - \psi_{\theta_2}(s) }\cdot \norm{ \nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})} \nu_{\beta}(ds)\nonumber\\
    &\quad\overset{\text{(i)}}{\leq}  L_{\mu}\int_{\mcs}\norm{\nabla_a Q^{\mu_{\theta_1}}(s,a_{\theta_1})-\nabla_a Q^{\mu_{\theta_2}}(s,a_{\theta_2})}\nu_{\beta}(ds) + C_Q \int_{\mcs}\norm{\psi_{\theta_1}(s) - \psi_{\theta_2}(s)}\nu_{\beta}(ds)\nonumber\\
    &\quad\overset{\text{(ii)}}{\leq}  L_{\mu}L'_Q\norm{\theta_1-\theta_2}\int_{\mcs}\nu_{\beta}(ds) + C_QL_{\psi}\norm{\theta_1-\theta_2}\int_{\mcs}\nu_{\beta}(ds)\nonumber\\
    &\quad\overset{\text{(iii)}}{=} \parentheses{ \frac{L_{\mu}L'_Q}{1-\gamma} + \frac{C_QL_{\psi}}{1-\gamma}}\norm{\theta_1-\theta_2}\nonumber\\
    &\quad:= L_{J_\beta}\norm{\theta_1-\theta_2},\nonumber
\end{align}
where (i) follows because $\norm{\psi_{\theta}(s)}\leq L_{\mu}$ as indicated by Assumption \ref{asp:policy} and $\norm{\nabla_a Q^{\mu_{\theta}}(s,a)}\leq C_Q$ by \Cref{lem:Qgradient}, (ii) follows from Assumption \ref{asp:policy} and \Cref{lem:Qgradient}, and (iii) follows because $\int_{\mcs}\nu_{\beta}(ds)=\frac{1}{1-\gamma}$.




\section{Proof of Theorem \ref{thm:offPolicyDPG} and Corollary \ref{cor:offPolicyDPG}} \label{app:proofThmOffPolicy}

%We first provide a useful lemma.

\subsection{Supporting Lemmas}\label{app:theorem2lemmas}


The following lemma provides the important properties of $w^*_{\beta,\xi_{\theta}}$.
\begin{lemma}\label{lem:wStarOffPolicy}
Let $w^*_{\beta,\xi_{\theta}}$ be defined in \eqref{eq:compDPGOffPolicy}. Suppose Assumptions \ref{asp:policy}--\ref{asp:Qsmooth} hold. Then we have
\begin{align*}
    \norm{w^*_{\beta,\xi_{\theta}}} \leq C_{w_{\xi}},
\end{align*}
where $C_{w_{\xi}}=\frac{L_{\mu}C_Q}{\lambda_{\Psi}(1-\gamma)}$. Furthermore, for any $\theta_1,\theta_2$, we have
\begin{align*}
    \norm{w^*_{\beta,\xi_{\theta_1}} - w^*_{\beta,\xi_{\theta_2}}} \leq L_{w'}\norm{\theta_1 - \theta_2},
\end{align*}
where $L_{w'}=\frac{L_{J_{\beta}}}{\lambda_{\Psi}}  + \frac{2L_{\mu}^2L_{\psi}C_Q}{\lambda_{\Psi}^2(1-\gamma)^2}$.
\end{lemma}
\begin{proof}
We first show the boundedness of $\norm{\nabla J_{\beta}(\theta)}$.
\begin{align*}
    \norm{\nabla J_{\beta}(\theta)} & = \norm{\int_{\mcs}  \nabla_{\theta}\mu_{\theta}(s) \nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}\nu_{\beta}(ds)}\\
    &\leq \int_{\mcs}  \norm{\nabla_{\theta}\mu_{\theta}(s)} \norm{\nabla_a Q^{\mu_{\theta}}(s,a)|_{a=\mu_{\theta}(s)}}\nu_{\beta}(ds)\\
    &\overset{\text{(i)}}{\leq} L_{\mu}C_Q \int_{\mcs}\nu_{\beta}(ds) = \frac{L_{\mu}C_Q}{(1-\gamma)},
\end{align*}
where (i) follows from Assumption \ref{asp:policy} and \Cref{lem:Qgradient}.

We define $\Psi_{\beta,\theta}=\mE_{\nu_{\mu_{\beta}}}\brackets{ \nabla_{\theta}\mu_{\theta}(s) \nabla_{\theta}\mu_{\theta}(s)^T}$. Assumption \ref{asp:policy} implies that $\Psi_{\beta,\theta}$ is non-singular. Then by definition, we have
\begin{align*}
    \norm{w^*_{\beta,\xi_{\theta}}} 
    &=  \norm{\Psi_{\beta,\theta}^{-1}\nabla J_{\beta}(\theta)}
    \leq \frac{1}{\lambda_{\Psi}} \norm{\nabla J_{\beta}(\theta)}
    \leq \frac{L_{\mu}C_Q}{\lambda_{\Psi}(1-\gamma)}.
\end{align*}

Next, we show the Lipschitz continuity property.
\begin{align*}
    &\norm{w^*_{\beta,\xi_{\theta_1}} - w^*_{\beta,\xi_{\theta_2}}}\\
    &\quad= \norm{\Psi_{\beta,\theta_1}^{-1}\nabla J_{\beta}(\theta_1) - \Psi_{\beta,\theta_2}^{-1}\nabla J_{\beta}(\theta_2)}\\
    &\quad= \norm{\Psi_{\beta,\theta_1}^{-1}\nabla J_{\beta}(\theta_1) - \Psi_{\beta,\theta_1}^{-1}\nabla J_{\beta}(\theta_2) + \Psi_{\beta,\theta_1}^{-1}\nabla J_{\beta}(\theta_2) - \Psi_{\beta,\theta_2}^{-1}\nabla J_{\beta}(\theta_2)}\\
    &\quad\leq \norm{\Psi_{\beta,\theta_1}^{-1}(\nabla J_{\beta}(\theta_1) - \nabla J_{\beta}(\theta_2))} + \norm{\parentheses{\Psi_{\beta,\theta_1}^{-1} - \Psi_{\beta,\theta_2}^{-1}}\nabla J_{\beta}(\theta_2)}\\
    &\quad\overset{\text{(i)}}{\leq} \frac{L_{J_{\beta}}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \norm{\parentheses{\Psi_{\beta,\theta_1}^{-1} - \Psi_{\beta,\theta_2}^{-1}}\nabla J_{\beta}(\theta_2)}\\
    &\quad= \frac{L_{J_{\beta}}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \norm{\parentheses{\Psi_{\beta,\theta_1}^{-1}\Psi_{\beta,\theta_2}\Psi_{\beta,\theta_2}^{-1} - \Psi_{\beta,\theta_1}^{-1}\Psi_{\beta,\theta_1}\Psi_{\beta,\theta_2}^{-1}}\nabla J_{\beta}(\theta_2)}\\
    &\quad= \frac{L_{J_{\beta}}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \norm{\Psi_{\beta,\theta_1}^{-1}\parentheses{\Psi_{\beta,\theta_2}-\Psi_{\beta,\theta_1}}\Psi_{\beta,\theta_2}^{-1}\nabla J_{\beta}(\theta_2)}\\
    &\quad\leq \frac{L_{J_{\beta}}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \frac{1}{\lambda_{\Psi}^2}\norm{\Psi_{\beta,\theta_2}-\Psi_{\beta,\theta_1}}\norm{\nabla J_{\beta}(\theta_2)}\\
    &\quad\leq \frac{L_{J_{\beta}}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\norm{\Psi_{\beta,\theta_2}-\Psi_{\beta,\theta_1}},
\end{align*}
where (i) follows from \Cref{lem:dpglipschitzOffPolicy} and Assumption \ref{asp:policy}.

We further derive the following bound.
\begin{align*}
    &\norm{\Psi_{\beta,\theta_2}-\Psi_{\beta,\theta_1}} \\
    &\quad= \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_2}(s)^T\nu_{\beta}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_1}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\beta}(ds) }\\
    &\quad\leq \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_2}(s)^T\nu_{\beta}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\beta}(ds) }\\
    &\quad\quad + \norm{ \int_{\mcs}  \nabla_{\theta}\mu_{\theta_2}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\beta}(ds) - \int_{\mcs}  \nabla_{\theta}\mu_{\theta_1}(s)\nabla_{\theta}\mu_{\theta_1}(s)^T\nu_{\beta}(ds) }\\
    &\quad\overset{\text{(i)}}{\leq} 2L_{\mu}\int_{\mcs}  \norm{\nabla_{\theta}\mu_{\theta_2}(s) - \nabla_{\theta}\mu_{\theta_1}(s)}\nu_{\beta}(ds) \\
    &\quad\overset{\text{(ii)}}{\leq} \frac{2L_{\mu}L_{\psi}}{1-\gamma}\norm{\theta_1-\theta_2},
\end{align*}
where both (i) and (ii) follow from Assumption \ref{asp:policy}.

Thus, we have
\begin{align*}
    &\norm{w^*_{\beta,\xi_{\theta_1}} - w^*_{\beta,\xi_{\theta_2}}}\\
    &\quad\leq \frac{L_{J_{\beta}}}{\lambda_{\Psi}} \norm{\theta_1 - \theta_2} + \frac{L_{\mu}C_Q}{\lambda_{\Psi}^2(1-\gamma)}\norm{\Psi_{\beta,\theta_2}-\Psi_{\beta,\theta_1}}\\
    &\quad\leq \parentheses{\frac{L_{J_{\beta}}}{\lambda_{\Psi}}  + \frac{2L_{\mu}^2L_{\psi}C_Q}{\lambda_{\Psi}^2(1-\gamma)^2} }\norm{\theta_1 - \theta_2}.
\end{align*}
\end{proof}


% Before providing the proof details, we remind the reader we will still use capital characters to represent matrices and use lower case characters for vectors. In addition, a character with a hat (e.g., $\hat{A}$) means it is a sampled parameter, while those with a bar (e.g., $\bar{A}$) are the corresponding expected version.

\subsection{Proof of Theorem \ref{thm:offPolicyDPG}}

The main difference here from the proof of \Cref{thm:onPolicyDPG} lies in the fact that
we apply TDC to update the critic in \Cref{alg:offPolicyDPG} due to the off-policy sampling, which introduces an extra correction parameter $u_t$. Thus, we introduce a grouped vector $z_t=[w_t^T u_t^T]^T\in\mathbb R^{2d}$ and rewrite the dynamics of the critic as a lifted linear system:
\begin{align}
    z_{t+1} &= z_t + \alpha_w \begin{bmatrix}
\hat A_t &\hat C_t\\ \eta\hat A_t &\eta \hat D_t
\end{bmatrix} z_t + \alpha_w \begin{bmatrix}
\hat b_t\\ \eta \hat b_t
\end{bmatrix}\nonumber\\
&:= z_t + \alpha_w \brackets{\hat G_t z_t + \hat \ell_t}\nonumber\\
&:= z_t + \alpha_w g_{\theta_t}(z_t, \mcb_t). \nonumber
\end{align}

The TDC algorithm is designed to find the fixed point $w^*_{\beta,\theta}$ satisfying $\bar A w^*_{\beta,\theta} + \bar b = 0$, where $\bar A=\mE_{d_{\beta}}\brackets{\hat A}, \bar b=\mE_{d_{\beta}}\brackets{\hat b}$. Correspondingly, if we let $z^*_{\theta}=[{w^*_{\beta,\theta}}^T \mathbf{0}^T]^T$, then we have 
\begin{equation*}
    \bar g_{\theta}(z^*_{\theta}) = \bar Gz^*_{\theta} + \bar\ell = 0,
\end{equation*}
where $\bar G=\mE_{d_{\beta}}\brackets{\hat G}, \bar \ell=\mE_{d_{\beta}}\brackets{\hat \ell}$.
Based on the above lifted linear system, we proceed with our proof as follows.

The main proof consists of three steps. 
%The main proof applies a few supporting lemmas that we present in \Cref{app:theorem2lemmas}.

\textbf{Step I: Characterizing dynamics of critic's error via coupling with actor.}

In the following, we first characterize the relationship between $\norm{z_{t+1}-z^*_{\theta_{t+1}}}^2$ and $\norm{z_{t}-z^*_{\theta_t}}^2$.

We first use the dynamics of the above linear system to obtain
\begin{align*}
    &\norm{z_{t+1}-z^*_{\theta_t}}^2 \\
    &\quad= \norm{z_{t}+\alpha_{w}g_{\theta_t}(z_t, \mcb_t)-z^*_{\theta_t}}^2\\
    &\quad= \norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t)\rangle + \alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)}^2\\
    &\quad= \norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, \bar g_{\theta_t}(z_t)\rangle + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle\\
    &\quad\quad+ \alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)}^2\\
    &\quad\overset{\text{(i)}}{\leq} (1-2\alpha_{w}\lambda')\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle + \alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)}^2\\
    &\quad\leq (1-2\alpha_{w}\lambda')\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle \\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2 + 2\alpha_{w}^2\norm{\bar g_{\theta_t}(z_t)}^2\\
    &\quad= (1-2\alpha_{w}\lambda')\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2 + 2\alpha_{w}^2\norm{\bar g_{\theta_t}(z_t)-\bar g_{\theta_t}(z^*_{\theta_t})}^2\\
    &\quad= (1-2\alpha_{w}\lambda')\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2 + 2\alpha_{w}^2\norm{\bar G_t(z_t - z^*_{\theta_t})}^2\\
    &\quad\leq (1-2\alpha_{w}\lambda')\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2 + 2\alpha_{w}^2\lF{\bar G_t}^2\norm{(z_t - z^*_{\theta_t})}^2\\
    &\quad\overset{\text{(ii)}}{\leq} (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle\\
    &\quad\quad + 2\alpha_{w}^2\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2,
\end{align*}
where (i) follows from the property $\langle z_{t}-z^*_{\theta_t}, \bar g_{\theta_t}(z_t)\rangle\leq-\lambda'\norm{z_{t}-z^*_{\theta_t}}^2$ with some constant $\lambda'>0$ for any policy which has been proved in Theorem 3 of \cite{maei2011gradient} as long as $\eta>\max\cur{0,\sigma_{\min}\parentheses{D^{-1}\cdot\frac{A+A^T}{2}}}$, and (ii) follows from $\lF{\bar G}^2=(1+\eta^2)\lF{\bar A}^2+\lF{\bar C}^2 + \eta^2\lF{\bar D}^2\leq (1+\eta^2)C_A^2+ C_C^2 + \eta^2C_D^2\leq 5(1+\eta^2)C_{\phi}^4:=C_G^2$.

Taking the expectation on both sides of the above bound yields
\begin{align}
    &\mE\norm{z_{t+1}-z^*_{\theta_t}}^2 \nonumber\\
    &\leq (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\mE\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}\mE\langle z_{t}-z^*_{\theta_t}, g_{\theta_t}(z_t, \mcb_t) - \bar g_{\theta_t}(z_t)\rangle\nonumber\\
    &\quad + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2 \nonumber\\
    &= (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\mE\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}^2\mE\norm{g_{\theta_t}(z_t, \mcb_t)-\bar g_{\theta_t}(z_t)}^2\nonumber\\
    &= (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\mE\norm{z_{t}-z^*_{\theta_t}}^2 + 2\alpha_{w}^2\mE\norm{\hat G_t z_t + \hat \ell_t - \bar G_t z_t - \bar \ell_t}^2\nonumber\\
    &\leq (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\mE\norm{z_{t}-z^*_{\theta_t}}^2\nonumber\\
    &\quad + 6\alpha_{w}^2\parentheses{\mE\norm{ (\hat{G}_t - \bar G_t)(z_t - z^*_{\theta_t}) }^2 + \mE\norm{ (\hat{G}_t - \bar G_t) z^*_{\theta_t} }^2 + \mE\norm{ \hat{\ell}_t - \bar \ell_t }^2}\nonumber\\
    &\leq (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\mE\norm{z_{t}-z^*_{\theta_t}}^2\nonumber\\
    &\quad + 6\alpha_{w}^2\parentheses{\mE\lF{ \hat{G}_t - \bar G_t}^2\norm{z_t - z^*_{\theta_t} }^2 + \mE\lF{ \hat{G}_t - \bar G_t}^2\norm{ z^*_{\theta_t} }^2 \!+\! \mE\norm{ \hat{\ell}_t - \bar \ell_t }^2}\nonumber\\
    &\overset{\text{(i)}}{\leq} (1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2)\mE\norm{z_{t}-z^*_{\theta_t}}^2 + 6\alpha_{w}^2\parentheses{\frac{4C_G^2}{M}\mE\norm{z_{t}-z^*_{\theta_t}}^2 + \frac{4(C_G^2\mE\norm{  z^*_{\theta_t} }^2 + C_{\ell}^2)}{M}}\nonumber\\
    &\overset{\text{(ii)}}{=} \parentheses{1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2+\frac{24\alpha_w^2C_G^2}{M}}\mE\norm{z_{t}-z^*_{\theta_t}}^2 + \frac{24\alpha_w^2\parentheses{C_G^2\mE\norm{  w^*_{\beta,\theta_t} }^2 + C_{\ell}^2}}{M}\nonumber\\
    &\overset{\text{(iii)}}{\leq} \parentheses{1-2\alpha_{w}\lambda'+2\alpha_{w}^2C_G^2+\frac{24\alpha_w^2C_G^2}{M}}\mE\norm{z_{t}-z^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2)}{M}\nonumber\\
    &\overset{\text{(iv)}}{\leq}\parentheses{1-\frac{\alpha_{w}\lambda'}{2}}\mE\norm{z_{t}-z^*_{\theta_t}}^2 + \frac{24\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2)}{M}, \label{eq:thm2Proof1}
\end{align}
where (i) follows from \Cref{lem:minibatchVariance}, (ii) follows from $\norm{  z^*_{\theta_t} }^2=\norm{  w^*_{\beta,\theta_t} }^2$, (iii) follows because $\norm{  w^*_{\beta,\theta_t} } = \norm{ \bar A_t^{-1}\bar b_t } \leq C_b/\lambda_A = R_{\max}C_{\phi}/\lambda_A := C_{w}$ by Assumption \ref{asp:phi}, and (iv) follows from the conditions $\alpha_w \leq \frac{\lambda'}{2C_G^2}$ and $ M\geq\frac{48\alpha_w  C_G^2}{\lambda'}.$
% \begin{equation}\label{eq:paramCondition1OffPolicy}
%     \alpha_w \leq \frac{\lambda'}{2C_G^2}; \qquad M\geq\frac{48\alpha_w  C_G^2}{\lambda'}.
% \end{equation}

% \textbf{Step II}: Relate $\norm{z_{t+1}-z^*_{\theta_{t+1}}}^2$ and $\norm{z_{t}-z^*_{\theta_t}}^2$. 


We further derive that
\begin{align}
    &\mE\norm{z_{t+1}-z^*_{\theta_{t+1}}}^2 \nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \parentheses{1+\frac{1}{2(2/\lambda'\alpha_w-1)}}\mE\norm{z_{t+1}-z^*_{\theta_{t}}}^2 + \parentheses{1+2(2/\lambda'\alpha_w-1)}\mE\norm{z^*_{\theta_{t}}-z^*_{\theta_{t+1}}}^2\nonumber\\
    &\quad\overset{\text{(ii)}}{\leq} \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 +  \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{4}{\lambda'\alpha_w}\mE\norm{z^*_{\theta_{t}}-z^*_{\theta_{t+1}}}^2\nonumber\\
    &\quad\overset{\text{(iii)}}{=} \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 +  \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{4}{\lambda'\alpha_w}\mE\norm{w^*_{\beta,\theta_{t}}-w^*_{\beta,\theta_{t+1}}}^2\nonumber\\
    &\quad\overset{\text{(iv)}}{\leq} \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 +  \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{4L_{w'}^2}{\lambda'\alpha_w}\mE\norm{\theta_{t+1}-\theta_{t}}^2\label{eqw:thm3proofDynTrackingError},
    % &\quad= \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 +  \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{4L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\nonumber\\
    % &\quad\leq \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    % &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    % &\quad\overset{\text{(v)}}{\leq} \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    % &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}},
\end{align}
where (i) follows from Young's inequality, (ii) follows from the bound derived in \eqref{eq:thm2Proof1}, (iii) follows because $\norm{z^*_{\theta_{t}}-z^*_{\theta_{t+1}}}^2=\norm{w^*_{\beta,\theta_{t}}-w^*_{\beta,\theta_{t+1}}}^2$, and (iv) follows from \Cref{lem:wStarOffPolicy}. 

\textbf{Step II: Bounding cumulative tracking error via compatibility theorem for DPG.}

Recall that we define $h_{\theta_t}(w_t, \mcb_t) := \frac{1}{M}\sum_{j=0}^{M-1}\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})\nabla_{\theta}\mu_{\theta_t}(s'_{t,j})^T w_t$. We continue with \eqref{eqw:thm3proofDynTrackingError} and have
\begin{align}
    &\mE\norm{z_{t+1}-z^*_{\theta_{t+1}}}^2 \nonumber\\
    &\quad \leq \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 +  \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{4L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)}^2\nonumber\\
    &\quad\leq \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{h_{\theta_t}(w_t,\mcb_t)-\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\beta,\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
% \end{align}
% Observing that $\norm{w_{t}-w^*_{\theta_{t}}}^2\leq\norm{z_{t}-z^*_{\theta_{t}}}^2$, we further have
% \begin{align}
%     &\mE\norm{z_{t+1}-z^*_{\theta_{t+1}}}^2 \nonumber\\
%     &\quad\leq \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
%     &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
    &\quad\overset{\text{(ii)}}{\leq} \parentheses{1-\frac{\lambda'\alpha_w}{4}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
    &\quad= \parentheses{1-\frac{\lambda'\alpha_w}{4}+\frac{24L_h^2L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M}\nonumber\\
    &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2 + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{ 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\nonumber\\
    &\quad\overset{\text{(iii)}}{\leq} \parentheses{1-\frac{\lambda'\alpha_w}{8}}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 + \frac{48\alpha_w^2( C_G^2C_w^2 +  C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\
    &\quad\quad + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{ 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\label{eq:thm2Proof2},
\end{align}
where (i) follows from \Cref{lem:hVariance}, (ii) follows since $\norm{w_{t}-w^*_{\beta,\theta_{t}}}^2\leq\norm{z_{t}-z^*_{\theta_{t}}}^2$, and (iii) follows because $\alpha_{\theta} \leq \frac{\lambda'\alpha_w}{\sqrt{192}L_hL_{w'}}$, which ensures $\frac{24L_h^2L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\leq\frac{\lambda'\alpha_w}{8}$.
% \begin{equation}\label{eq:paramCondition2OffPolicy}
%     \alpha_{\theta} \leq \frac{\lambda'\alpha_w}{\sqrt{96}L_hL_{w'}}.
% \end{equation}

Then, taking the summation over all iterations on both sides of \eqref{eq:thm2Proof2} yields
\begin{align}
    &\sum_{t=0}^{T-1}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2 \nonumber\\
    &\quad\leq \sum_{t=0}^{T-1} \parentheses{1-\frac{\lambda'\alpha_w}{8}}^t\norm{z_{0}-z^*_{\theta_{0}}}^2 + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\sum_{t=0}^{T-1}\sum_{i=0}^{t-1}\parentheses{1-\frac{\lambda'\alpha_w}{8}}^{t-1-i}\mE\norm{\nabla J_{\beta}(\theta_i)}^2 \nonumber\\
    &\quad\quad + \brackets{\frac{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\sum_{t=0}^{T-1}\sum_{i=0}^{t-1}\parentheses{1-\frac{\lambda'\alpha_w}{8}}^{t-1-i} \nonumber\\
    &\quad\leq \frac{8\norm{z_{0}-z^*_{\theta_{0}}}^2}{\lambda'\alpha_w} + \brackets{\frac{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{8T}{\lambda'\alpha_w} \nonumber\\
    &\quad\quad + \frac{64L_{w'}^2\alpha_{\theta}^2}{\lambda'^2\alpha_w^2}\sum_{t=0}^{T-1}\mE\norm{\nabla J_{\beta}(\theta_t)}^2.\label{eq:thm3proofCumulativeTrackingError}
\end{align}


\textbf{Step III: Overall convergence by canceling tracking error via actor's positive progress.}

Similarly to the on-policy case, we use the Lipschitz continuity property to obtain (see \eqref{eq:thm1Proof4})
\begin{align}
    &\mE[J_{\beta}(\theta_{t+1})] - \mE[J_{\beta}(\theta_t)]\nonumber\\
    &\quad\geq \frac{\alpha_{\theta}}{4}\mE\norm{\nabla J_{\beta}(\theta_t)}^2 - \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\mE\norm{w_{t}-w^*_{\beta,\theta_{t}}}^2 + 3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}},\nonumber
\end{align}
where we use the condition $\alpha_{\theta} \leq \frac{1}{4L_{J_{\beta}}}.$
% \begin{equation}\label{eq:paramCondition3OffPolicy}
%     \alpha_{\theta} \leq \frac{1}{4L_{J_{\beta}}}.
% \end{equation}

Further, we take the summation over all iterations on both sides of the above bound and have
\begin{align}
    &\frac{\alpha_{\theta}}{4}\sum_{t=0}^{T-1}\mE\norm{\nabla J_{\beta}(\theta_t)}^2\nonumber\\ 
    &\leq \mE[J_{\beta}(\theta_{T})] \!-\! \mE[J_{\beta}(\theta_0)] \!+\! \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 \!+\! \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T \!+\! \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{w_{t}\!-\!w^*_{\beta,\theta_{t}}}^2 \nonumber\\
    &\leq \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T + \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{w_{t}-w^*_{\beta,\theta_{t}}}^2\nonumber\\
    &\leq \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T + \frac{9\alpha_{\theta}L_{h}^2}{4}\sum_{t=0}^{T-1}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2.\nonumber
\end{align}

% We then use \eqref{eq:thm2Proof2} to bound the term $\sum_{t=0}^{T-1}\mE\norm{z_{t}-z^*_{\theta_{t}}}^2$, and obtain


Then, we substitute the cumulative error from \eqref{eq:thm3proofCumulativeTrackingError} into the above bound and have
\begin{align}
    &\frac{\alpha_{\theta}}{8}\sum_{t=0}^{T-1}\mE\norm{\nabla J_{\beta}(\theta_t)}^2 \nonumber\\
    &\quad\overset{\text{(i)}}{\leq} \parentheses{\frac{\alpha_{\theta}}{4}-\frac{144L_h^2L_{w'}^2\alpha_{\theta}^3}{\lambda'^2\alpha_w^2}}\sum_{t=0}^{T-1}\mE\norm{\nabla J_{\beta}(\theta_t)}^2 \nonumber\\
    &\quad\leq \frac{R_{\max}}{1-\gamma} + \frac{3\alpha_{\theta}}{4}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}\cdot T + \frac{18\alpha_{\theta}L_{h}^2}{\lambda'\alpha_w}\norm{z_{0}-z^*_{\theta_{0}}}^2\nonumber\\
    &\quad\quad + \brackets{\frac{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{18\alpha_{\theta}L_{h}^2T}{\lambda'\alpha_w},\nonumber
\end{align}
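where (i) follows from the condition $\alpha_{\theta} \leq \frac{\lambda'\alpha_w}{24\sqrt{2}L_hL_{w'}}$, which ensures $\frac{144L_h^2L_{w'}^2\alpha_{\theta}^3}{\lambda'^2\alpha_w^2}\leq\frac{\alpha_{\theta}}{8}$.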
% \begin{equation}\label{eq:paramCondition4OffPolicy}
%     \alpha_{\theta} \leq \frac{\lambda'\alpha_w}{24L_hL_{w'}}.
% \end{equation}


Finally, 
% we take the intersection of \eqref{eq:paramCondition1OffPolicy},\eqref{eq:paramCondition2OffPolicy},\eqref{eq:paramCondition3OffPolicy} and \eqref{eq:paramCondition4OffPolicy} to 
we obtain
\begin{align}
    \underset{t\in [T]}{\min}\mE\norm{\nabla J_{\beta}(\theta_{t})}^2 &\leq \frac{1}{T}\sum_{t=0}^{T-1}\mE\norm{\nabla J_{\beta}(\theta_t)}^2 \nonumber\\
    &\leq \parentheses{\frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda'\alpha_w}\norm{z_{0}-z^*_{\theta_{0}}}^2 }\cdot\frac{1}{T} + 6\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}} \nonumber\\
    &\quad\quad + \brackets{\frac{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2)}{M} + \frac{8L_{w'}^2\alpha_{\theta}^2}{\lambda'\alpha_w}\parentheses{3L_{h}^2\kappa^2 + \frac{6L_{\mu}^4C_{w_{\xi}}^2}{M}}}\cdot\frac{144L_{h}^2}{\lambda'\alpha_w} \nonumber\\
    &= \frac{c_4}{T} + \frac{c_5}{M} + c_6\kappa^2,\nonumber
\end{align} 
where 
\begin{align}
    c_4 &= \frac{8R_{\max}}{\alpha_{\theta}(1-\gamma)} + \frac{144L_{h}^2}{\lambda'\alpha_w}\norm{z_{0}-z^*_{\theta_{0}}}^2,\label{eq:thm2c1}\\
    c_5 &= 36L_{\mu}^4C_{w_{\xi}}^2+\brackets{48\alpha_w^2(C_G^2C_w^2 + C_{\ell}^2) + \frac{48L_{w'}^2L_{\mu}^4C_{w_{\xi}}^2\alpha_{\theta}^2}{\lambda'\alpha_w}}\cdot\frac{144L_{h}^2}{\lambda'\alpha_w},\label{eq:thm2c2}\\
    c_6 &= 18L_h^2 + \frac{24L_{w'}^2L_h^2\alpha_{\theta}^2}{\lambda'\alpha_w}\cdot\frac{144L_{h}^2}{\lambda'\alpha_w}.\label{eq:thm2c3}
\end{align}


\subsection{Proof of Corollary \ref{cor:offPolicyDPG}}

Following from the upper bound in \Cref{thm:offPolicyDPG}, we let $\frac{c_4}{T}\leq\frac{\epsilon}{2}$ and $\frac{c_5}{M}\leq \frac{\epsilon}{2}$ to achieve the target $\epsilon$-accuracy. Then we obtain $T\geq \frac{2c_4}{\epsilon}$ and $M\geq \frac{2c_5}{\epsilon}$. Further, since we generate $M$ samples in the update steps of both critic and actor in Algorithm \ref{alg:offPolicyDPG}, the total number of samples we use is thus $2MT = \frac{8c_4c_5}{\epsilon^2}$.

% \bibliography{xiong_597}

\end{document}