\documentclass[accepted]{uai2025} % for initial submission

\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions

\usepackage{hyperref}       
\usepackage{booktabs}       
\usepackage{amsfonts}       
\usepackage{nicefrac}       
\usepackage{xcolor}
\usepackage{amssymb}



\usepackage{algorithm}
\usepackage{algpseudocode}

%\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{braket}
%\usepackage[utf8]{inputenc}
\usepackage{colortbl}
\usepackage{enumitem}
\usepackage{lipsum}
\usepackage{wrapfig}
\usepackage{subfigure}
\usepackage{pifont}% http://ctan.org/pkg/pifont
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\newcommand{\ignore}[1]{}
\hypersetup{
    colorlinks,
    linkcolor={blue!50!black},
    citecolor={blue!50!black},
    urlcolor={blue!80!black}
}

\makeatletter
\def\blfootnote{\xdef\@thefnmark{}\@footnotetext}
\makeatother

%\algnewcommand{\LineComment}[1]{\State \(\triangleright\) #1}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\R}{\mathbb{R}}
\DeclareMathOperator*{\Rn}{\mathbb{R}^n}
\DeclareMathOperator*{\Rm}{\mathbb{R}^m}
\DeclareMathOperator*{\Rd}{\mathbb{R}^d}
\newcommand{\vz}{\mathbf{0}}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator*{\cO}{\mathcal{O}}
\DeclareMathOperator*{\la}{\langle}
\DeclareMathOperator*{\ra}{\rangle}
\DeclareMathOperator{\cG}{\mathcal{G}}
\DeclareMathOperator*{\cB}{\mathcal{B}}
\DeclareMathOperator*{\cS}{\mathcal{S}}
\DeclareMathOperator*{\cA}{\mathcal{A}}
\DeclareMathOperator*{\cP}{\mathcal{P}}
\DeclareMathOperator*{\cR}{\mathcal{R}}
\DeclareMathOperator{\cV}{\mathcal{V}}
\DeclareMathOperator{\cW}{\mathcal{W}}
\DeclareMathOperator{\g}{\gamma}

\DeclareMathOperator{\bias}{\beta_g}

%\newtheorem{assumption}{Assumption}
\newtheorem{notation}{Notation}
\newtheorem{observation}{Observation}
% \newtheorem{definition}{Definition}
% \newtheorem{theorem}{Theorem}
% \newtheorem{corollary}{Corollary}
% \newtheorem{example}{Example}
% \newtheorem{lemma}{Lemma}
% \newtheorem{remark}{Remark}

\newcommand{\norm}[1]{\left \lVert #1 \right\rVert }
\newcommand{\sqnorm}[1]{\left \lVert #1 \right\rVert^2 }
\newcommand{\rb}[1]{\left ( #1 \right ) }
\newcommand{\cb}[1]{\left \{ #1 \right \} }



%\title[Global Convergence of Neural Actor-Critic]{Order-Optimal Global Convergence for Average Reward Actor-Critic with General Policy and Neural Critic  Parametrization}
\usepackage{times}
% Use \Name{Author Name} to specify the name.
% If the surname contains spaces, enclose the surname
% in braces, e.g. \Name{John {Smith Jones}} similarly
% if the name has a "von" part, e.g \Name{Jane {de Winter}}.
% If the first letter in the forenames is a diacritic
% enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

% Two authors with the same address
% \coltauthor{\Name{Author Name1} \Email{abc@sample.com}\and
%  \Name{Author Name2} \Email{xyz@sample.com}\\
%  \addr Address}

% Three or more authors with the same address:
\author[1,2]{Swetha Ganesh}
\author[3]{Jiayu Chen}
\author[4]{Washim Uddin Mondal}
\author[1]{Vaneet Aggarwal}
% Add affiliations after the authors
\affil[1]{%
Purdue University
}
\affil[2]{%
Indian Institute of Science
}
\affil[3]{%
    Carnegie Mellon University
}
\affil[4]{%
    Indian Institute of Technology, Kanpur
  }

% Authors with different addresses:
\usepackage{amsthm}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\theoremstyle{remark}
\newtheorem{remark}{Remark}

\allowdisplaybreaks


\title{Order-Optimal Global Convergence for Actor-Critic with General Policy and Neural Critic  Parametrization}

\begin{document}

\maketitle


\begin{abstract}
This paper addresses the challenge of achieving optimal sample complexity in reinforcement learning for Markov Decision Processes (MDPs) with general policy parameterization and multi-layer neural network critics. Existing approaches either fail to achieve the optimal rate or require impractical assumptions, such as access to knowledge of mixing times or the linearity of the critic. We introduce the Natural Actor-Critic with Data Drop (NAC-DD) algorithm, which integrates Natural Policy Gradient methods with a Data Drop technique to mitigate statistical dependencies inherent in Markovian sampling. NAC-DD achieves an optimal sample complexity of $\tilde{\mathcal{O}}(1/\epsilon^2)$, marking a significant improvement over the previous state-of-the-art guarantee of $\tilde{\mathcal{O}}(1/\epsilon^3)$. The algorithm employs a multi-layer neural network critic with differentiable activation functions, aligning with real-world applications where tabular policies and linear critics are insufficient. Our work represents the first to achieve order-optimal sample complexity for actor-critic methods with neural function approximation, continuous state and action spaces, and Markovian sampling. Empirical evaluations on benchmark tasks confirm the theoretical findings, demonstrating the practical efficacy of the proposed method.
\end{abstract}

\section{Introduction}



Reinforcement learning (RL) has emerged as a powerful framework with broad applications across various domains such as robotics \citep{Gonzalez2023}, transportation \citep{al2019deeppool}, communication networks \citep{agarwal2022concave}, and healthcare \citep{tamboli2024reinforced}, where autonomous systems learn optimal decision-making strategies through interaction with their environment. However, unlike many machine learning scenarios, the temporal dependence inherent in RL violates the assumption of independent and identically distributed (i.i.d.) samples, complicating theoretical analysis and convergence guarantees. Among RL approaches, actor-critic methods have garnered attention for their scalability and adaptability, yet they generally fall short in achieving optimal convergence rates. This paper aims to address this gap by analyzing sample complexity for discounted reward Markov Decision Processes (MDPs) with general parametrized policies and neural critic. The current state of the art in this area, \citep{gaur2024closing}, reaches a sample complexity of $\tilde{O}(1/\epsilon^3)$ under Markovian sampling. This brings forth a central question:
\newline

\fbox{\begin{minipage}{23em}
{{\em Can we achieve an $\epsilon$-globally optimal solution with a sample complexity of $\tilde{O}(1/\epsilon^2)$ in the Markovian sampling setting, using general parameterized policies and a multi-layer neural network parameterized critic?
}}\end{minipage}}
\newline

In this paper, we answer this question in the affirmative by proposing an algorithm called Natural Actor-Critic with Data Drop (NAC-DD). We observe that the general policy and neural critic parametrization we consider are widely used in practice. In contrast, while tabular policies and linear critics have been extensively studied, they find limited practical application. Our work focuses on neural critics with differentiable activation functions (such as Sigmoid, ELU, and GeLU), which smoothly approximate ReLU and are commonly employed in real-world settings.

\subsection{Related works}






\begin{table*}[ht]
\centering
\caption{ This table summarizes the features of different actor-critic convergence results. Our result is the first to provide order-optimal sample complexity results of AC for an MDP setting with general/multi-layer neural network parametrization for the actor-critic,  continuous state and action space, and Markovian sampling. }
\label{tbl_related1}%
{\begin{tabular}{|c|c|c|c|c|c|}
\hline
References &  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}Global \\ Optimality \end{tabular}} &  \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}} Continuous State \\ Action Space \end{tabular}} &   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}} Multi Layer  \\ NN AC \end{tabular}} &   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}} Markovian \\ Sampling \end{tabular}} &   \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}} Sample \\ Complexity \end{tabular}}   \\ \hline
     \citep{xu2020non}     &                                    \textcolor{green}{\cmark}         &     \textcolor{green}{\cmark}               &                     \textcolor{red}{\xmark}                                                                  &           \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-4})$                                                       \\ 
    \citep{khodadadian2021finite}                &                \textcolor{green}{\cmark}         &     \textcolor{red}{\xmark}               &                     \textcolor{red}{\xmark}                                                                  &            \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-3})$                                                       \\     
 \citep{xu2020improving}                 &                \textcolor{green}{\cmark}         &      \textcolor{green}{\cmark}                &                     \textcolor{red}{\xmark}                                                                  &           \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-3})$                                                       \\  
 \citep{xu2021doubly}                        &                \textcolor{green}{\cmark}         &     \textcolor{green}{\cmark}               &                     \textcolor{red}{\xmark}                                                                               &           \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-4})$                                                       \\  
 \citep{wang2019neural}            &                \textcolor{green}{\cmark}         &     \textcolor{red}{\xmark}               &                     \textcolor{red}{\xmark}                                                                                &           \textcolor{red}{\xmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-4})$                                                       \\
\citep{cayci2022finite}                &                \textcolor{green}{\cmark}         &     \textcolor{red}{\xmark}               &                     \textcolor{red}{\xmark}                                                                                &           \textcolor{red}{\xmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-4})$                                                       \\ 
\citep{fu2020single}                  &                \textcolor{green}{\cmark}         &     \textcolor{red}{\xmark}               &                    \textcolor{green}{\cmark}                                                                               &           \textcolor{red}{\xmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-6})$                                                                           \\
\citep{tian2023convergence}           &                \textcolor{red}{\xmark}         &     \textcolor{red}{\xmark}               &                    \textcolor{green}{\cmark}                                                                               &           \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-2})$                                         \\
 \citep{gaur2024closing}                    &               \textcolor{green}{\cmark}        &     \textcolor{green}{\cmark}               &                    \textcolor{green}{\cmark}                                                                               &           \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-3})$
\\
\textbf{ This work}                      &               \textcolor{green}{\cmark}        &     \textcolor{green}{\cmark}               &                    \textcolor{green}{\cmark}                                                                               &           \textcolor{green}{\cmark}    &       $\tilde{\mathcal{O}}(\epsilon^{-2})$
\\
\hline
\end{tabular}
}
\end{table*}

{\bf Policy Gradient Approaches:} Recent studies have established an optimal sample complexity of $\tilde{O}(1/\epsilon^2)$ for policy gradient approaches with general parameterizations, as seen in \citep{fatkhullin2023stochastic,mondal2024improved}, though these methods rely on independent sampling for gradient estimation. Thus, their approaches for policy gradient estimation are not directly extendable to the actor-critic framework with Markovian sampling. 

{\bf Actor-Critic Approaches:} For actor-critic algorithms, however, no existing work has yet achieved this optimal sample complexity when using multi-layer neural network parameterizations for both actor and critic. Table \ref{tbl_related1} provides a summary of key actor-critic approaches, categorizing algorithms by their achievement of global optimality, compatibility with continuous state and action spaces, use of general/multi-layer neural network parameterizations, and reliance on Markovian sampling. The current state of the art in this area, \citep{gaur2024closing}, reaches a sample complexity of $\tilde{O}(1/\epsilon^3)$ under Markovian sampling. 

\if 0
\textbf{Average MDPs with Actor-Critic}  
Recent work on the average reward actor-critic framework with a linear critic has established an $\mathcal{O}(\epsilon^{-2})$ sample complexity result \citep{ganesh2024orderoptimal}. This approach employs the MLMC-NAC algorithm, where Multilevel Monte Carlo (MLMC) is used to reduce bias in the updates. In our work, we instead utilize Data Drop (DD) as a bias reduction technique. While MLMC is advantageous in that it achieves the same bias reduction as DD without requiring prior knowledge of mixing times, it introduces significant challenges in the neural critic setting. Specifically, the analysis of bias in Natural Policy Gradient (NPG) becomes more complex, as the higher-order moments of MLMC can be unbounded, even when gradient estimates remain absolutely bounded.  
\fi 

\textbf{Neural Policy Evaluation: }  
A recent paper has established an optimal-order result for Q-learning with a fixed sample-generating policy and neural function approximation \citep{ke2024an}. This work also provides valuable intermediate results on Q-value approximation, which we use in our analysis. However, in order to achieve order-optimal global convergence, we also require a bound on the bias of the $Q$-function estimates, which requires a substantially different analysis.




\if 0
In this setup, the key challenge is that we have a Markovian trajectory, which restricts obtaining independent samples for gradient estimation. One key approach that had been used in the past works for model-free algorithms \citep{wei2020model,bai2023regret} is based on the observation that if the trajectory is divided into sub-trajectories that are order of mixing time apart, the sub-trajectories become near independent. However, such an approach requires the knowledge of mixing time. We note that model-based tabular Markov Decision Process (MDP) approaches \citep{jaksch2010near,agrawal2017optimistic,agarwal2023reinforcement} do not have that issue, since transition probabilities can be learned from Markovian trajectories, enabling Martingale-based analyses (e.g., employing Azuma-Hoeffding's inequality), while even the algorithms for linear MDP \citep{wei2020model}  still use either the span of the optimal value function or the mixing time. The values of these are not known in practice for a general MDP. Recently, the authors of \citep{patel2024global} showed that the algorithms that wait for orders of mixing time to have independent sub-trajectories require extremely large time-horizons to even have the sub-linear regret guarantees (even for mixing and hitting times of $10$, the time-horizon needed for the guarantees is $6.6\times 10^9$). In order to alleviate the issue, \citep{patel2024global} proposed an algorithm, called Multi-level Actor Critic (MAC),  where the estimates are obtained directly from the Markovian trajectory rather than obtaining independent samples from sub-trajectories, using the Multi-level Monte-Carlo (MLMC) gradient estimator. However, this approach is shown to achieve a global convergence rate of $\Tilde{\cO}(1/T^{1/4})$, which is not order optimal. This raises the following question, that is addressed in this paper. 

\fbox{\begin{minipage}{40em}
{{\em Is it possible to achieve a global convergence rate of $\Tilde{\mathcal{O}}\left(\frac{1}{\sqrt{T}}\right)$ for a model-free algorithm without the algorithm requiring knowledge of mixing time, in the average-reward setup with general policy parametrization?}}\end{minipage}}
\fi 

\subsection{Main Contributions and Challenges}
In this paper, we propose Natural Actor-Critic with Data Drop (NAC-DD), an algorithm that integrates the Natural Actor-Critic method with a Data Drop technique in which only one out of every $t_{\mathrm{mix}}$ samples is selected for updates, thereby reducing correlation among samples. We show that this approach achieves the optimal sample complexity of $\Tilde{\cO}(\epsilon^{-2})$ (Theorem \ref{thm:main-conv-rate}). 
  
To motivate the use of Data Drop (DD), we first consider the generic recursion
\[
x_{t+1}=x_t-\alpha (g(x_t)+M_{t}),
\]
where $g(x_t)$ is linear in $x_t$ and $\{M_t\}_{t\geq 0}$ is an ergodic Markov chain. Let $x^*$ denote the solution that this recursion is meant to reach. With linear function approximation, $Q$-learning (with a fixed policy) is known to be of this form. It is also known that the \emph{bias} of such an update, $\|\E[x_t]-x^*\|$, can remain of constant order \citep{nagaraj2020neurips}. However, the same paper shows that applying data drop to this update yields an optimal sample complexity.
  
  
  However, we note that when the neural approximations are used instead, the update becomes non-linear and the bias becomes much more difficult to analyze. In the linear case, bias can be characterized through a recursive formulation, facilitating a precise analysis (e.g., (29) in \citep{mondal2024improved}). In contrast, for non-linear critics, such a direct characterization is not feasible. To address this, we adopt a linearized update approach, commonly employed in neural critic analysis. This approximation is justified when the neural network width is sufficiently large, aligning with the popular Neural Tangent Kernel (NTK) theory \citep{NTK}. Nevertheless, ensuring that the error introduced by this linearization does not accumulate requires a refined analytical approach.

A further challenge arises from the use of a projection operator in the critic update. This projection operator is essential when employing NTK-based analysis. Although the use of projections does not typically complicate the proof, we note that our analysis also requires the bias of the critic to decrease sufficiently fast to achieve an improved sample complexity. Standard arguments based on the non-expansiveness property of projections are insufficient to guarantee an order-optimal bias. To overcome this limitation, we provide a careful analysis, which provides sharp bounds for the convergence rate and bias of the critic (Lemmas \ref{lem:critic-second_order} and \ref{lem:critic_bias}).

%{\bf maybe mention where each of the above challenge is resolved. Also, give an itemized list of summary of contributions. }

Finally, we provide empirical evaluations to validate our theoretical findings and demonstrate the practical efficacy of the proposed NAC-DD algorithm (Section \ref{evaluation}).

\section{Setup}

This paper addresses an infinite-horizon, discounted reward reinforcement learning problem formulated as a Markov Decision Process (MDP), represented by the tuple $\mathcal{M} = (\mathcal{S}, \mathcal{A}, r, P, \rho, \gamma)$. In this framework, $\mathcal{S}$ indicates a general state space, $\mathcal{A}$ is the action space, and $r: \mathcal{S} \times \mathcal{A} \rightarrow [0, 1]$ is the reward function. When an agent takes action $a$ in state $s$, it transitions to a subsequent state $s'$ with a probability $P(s'|s, a)$. The initial state distribution is specified by $\rho$, and $\gamma \in [0,1)$ represents the discount factor. A (stationary) policy $\pi: \mathcal{S} \rightarrow \Delta(\mathcal{A})$ defines the distribution over actions given the current state. This induces a transition function $P^{\pi}: \mathcal{S} \rightarrow \Delta(\mathcal{S})$, given by $P^{\pi}(s, s') = \sum_{a \in \mathcal{A}} P(s'|s,a) \pi(a|s)$ for all states $s, s' \in \mathcal{S}$. Under any policy $\pi$, the resulting state sequence forms a Markov chain. We also consider a parameterized family of policies $\Pi$, consisting of all policies $\pi_{\theta}$ with parameters $\theta \in \Theta$, where $\Theta \subset \mathbb{R}^d$.


The objective of the agent is to find a parameter $\theta$ that maximizes the long-term reward function, defined as \( J(\theta) \coloneqq \mathbb{E}_{s_0 \sim \rho} \left[ \sum_{t=0}^{\infty} \gamma^{t} r(s_t, a_t) | \pi_{\theta} \right] \) where the expectation is over the distribution of $\pi_{\theta}$-induced trajectories emanating from the initial distribution, $\rho$. For notational simplicity, we ignore the dependence on $\rho$. This work employs an actor-critic method to optimize \( J(\cdot) \). Before delving into the optimization process, we first introduce several key concepts.

The action-value ($Q^{\pi_\theta}$) function corresponding to $\pi_\theta$ is defined $\forall (s, a)\in\mathcal{S}\times\mathcal{A}$ as 
\begin{align}
    \label{q_function}
    Q^{\pi_{\theta}}(s,a) = \mathbb{E}\Bigg[\sum_{t=0}^{\infty}  \gamma^{t} r(s_t,a_t) \bigg| s_0=s, a_0=a, \pi_{\theta}\Bigg]
\end{align}
We can then further define the state value function as
\begin{align}
    \label{v_function}
    V^{\pi_{\theta}}(s) =\mathbb E_{a \sim \pi_{\theta}(\cdot|s)}[Q^{\pi_{\theta}}(s,a)], ~\forall s\in\mathcal{S}.
\end{align}
For any $Q:\cS\times\cA\to\R$, we define the Bellman operator $\mathcal{T}^{\pi_\theta}$ for all $(s,a)$ as
\begin{align*}
\mathcal{T}^{\pi_\theta} Q(s,a)\coloneqq r(s,a)+\gamma \E_{s'\sim P(\cdot|s,a),a'\sim \pi_{\theta}(\cdot|s')}[Q(s',a')].
\end{align*}
It is known that $\mathcal{T}^{\pi_\theta} $ is a $\gamma$-contraction under the infinity norm and $Q^{\pi_\theta}$ is the unique fixed point.

We assume the following throughout the paper.

\begin{assumption}
    \label{assump:ergodic_mdp}
    The Markov chain $\{s_t\}_{t\geq0}$, induced by an arbitrary policy $\pi\in\Pi$ is ergodic.
\end{assumption}
It is well-established that if $\mathcal{M}$ is ergodic, then $\forall\theta\in\Theta$, there exists a unique stationary $\rho$-independent distribution, denoted as $d^{\pi_{\theta}}\in \Delta(\mathcal{S})$, which obeys
$(P^{\pi_{\theta}})^{\top}d^{\pi_{\theta}}=d^{\pi_{\theta}}$. With this notation in place, we define the mixing time of an MDP.


The mixing time of an MDP $\mathcal{M}$ with respect to a policy parameter $\theta$ is defined as
\begin{align*}
    t_{\mathrm{mix}}^{\theta}\coloneqq \min\left\lbrace t\geq 1\bigg| \|(P^{\pi_{\theta}})^t(s, \cdot) - d^{\pi_\theta}\|_{\mathrm{TV}}\leq \dfrac{1}{4}, \forall s\in\mathcal{S}\right\rbrace
\end{align*} 
where $\|\cdot\|_{\mathrm{TV}}$ denotes the total variation distance. 

%We define $t_{\mathrm{mix}}\coloneqq \sup_{\theta\in\Theta} t^{\theta}_{\mathrm{mix}} $ as the the overall mixing time. This paper assumes  $t_{\mathrm{mix}}$ to be finite.
Let $
\tilde{d}^{\pi_{\theta}}_{\gamma,\rho}(s)
:=
(1-\gamma)\sum_{t=0}^{\infty}\gamma^{t}
\Pr\bigl(s_t = s | s_0\sim\rho,\;\pi_{\theta}\bigr)
$ be the discounted state-visitation frequency under policy $\pi_{\theta}$ with initial distribution $\rho$ and discount factor $\gamma$. Define the modified transition kernel $
\widetilde{P}(s'| s,a)
:=
\gamma\,P(s'| s,a)\;+\;(1-\gamma)\,\rho(s'),
$ which corresponds to sampling $s'\sim P(\cdot| s,a)$ with probability $\gamma$ and $s'\sim\rho$ otherwise.


It is known that if the MDP is ergodic, then $\tilde{d}^{\pi_{\theta}}_{\gamma,\rho}$ is the stationary distribution of the Markov chain with the $\pi_\theta$-induced transition kernel $\widetilde{P}^{\pi_{\theta}}(\cdot|s)\coloneqq \gamma P^{\pi_{\theta}}(\cdot|s)+(1-\gamma)\rho(\cdot)$ \citep{konda-thesis}. We define
\begin{align*}
    \tilde{t}_{\mathrm{mix}}^{\theta}\coloneqq \min\left\lbrace t\geq 1\bigg| \|(\widetilde{P}^{\pi_{\theta}})^t(s, \cdot) - \tilde{d}^{\pi_{\theta}}_{\gamma,\rho}\|_{\mathrm{TV}}\leq \dfrac{1}{4}, \forall s\in\mathcal{S}\right\rbrace.
\end{align*} 

When the state space is finite, $
\tilde t_{\mathrm{mix}}^{\theta} = \mathcal O\bigl((1-\gamma)^{-1}\bigr)
$. To see this, observe that $\widetilde{P}^{\pi_{\theta}}$ is a convex combination of $P^{\pi_{\theta}}$ and a rank-one matrix. The bound follows using Corollary 1 of \citet{nussbaum2003notessecondeigenvaluegoogle} to bound the spectral gap, which provides a bound for the mixing time \citep{levin2017markov}. For convenience, we introduce
$$
t_{\mathrm{mix}} \;=\; \sup_{\theta\in\Theta}\max\bigl\{t_{\mathrm{mix}}^{\theta},\,\tilde t_{\mathrm{mix}}^{\theta}\bigr\},
$$
which serves as a uniform upper bound on both quantities.


\section{Natural Actor-Critic with Data Drop (NAC-DD)}
\label{sec:nacdd}
Policy Gradient (PG)-type algorithms typically maximize the long-term reward function $J(\cdot)$ by updating $\theta$ along the gradient of $J(\cdot)$, which can be expressed in the following form using the well-known policy gradient theorem \citep{sutton1999policy}.  
\begin{align}
    \nabla_{\theta} J(\theta)=\frac{1}{1-\gamma}\mathbb{E}_{s\sim \tilde{d}^{\pi_{\theta}}_{\gamma,\rho},a\sim\pi_{\theta}(\cdot|s)}\!\bigg[ \!Q^{\pi_{\theta}}(s,a)\nabla_{\theta}\log\pi_{\theta}(a|s)\!\bigg]
\end{align}
Natural Policy Gradient (NPG) methods, however, update $\theta$ along the NPG $\omega^*_\theta$ instead, where
    \begin{align} \label{npg}
        \omega^*_{\theta} = F(\theta)^{\dagger} \nabla_{\theta} J(\theta),
    \end{align}
    $\dagger$ denotes the Moore-Penrose pseudo-inverse, and $F(\theta)$ is the Fisher information matrix, defined as
    \begin{align}
        &F(\theta)\nonumber
       = \E_{s\sim \tilde{d}^{\pi_{\theta}}_{\gamma,\rho}}\E_{a\sim\pi_{\theta}(\cdot\vert s)} \left[
              \nabla_{\theta}\log\pi_{\theta}(a|s)(\nabla_{\theta}\log\pi_{\theta}(a|s))^\top \right]
    \end{align}
The preconditioner $F(\theta)^{\dagger}$ accounts for how the parameterized policy changes with respect to $\theta$, thereby preventing overshooting or slow updates of $\theta$. Note that $\omega_\theta^*$ can be written as a minimizer of the function $L_{\pi_\theta}(\cdot, \theta)$, where
\begin{align}
\label{eq_def_L_nu}
   & L_{\pi_\theta}(\omega, \theta)= \dfrac{1}{2}\E_{s \sim \tilde{d}^{\pi_{\theta}}_{\gamma,\rho}(\cdot),a\sim \pi_{\theta}(\cdot|s)}\nonumber\\& \left[\Big(\dfrac{1}{1-\gamma}Q^{\pi_{\theta}}(s,a)-\omega^\top\nabla_{\theta}\log\pi_{\theta}(a| s)\Big)^2\right]
\end{align}
for all $\omega\in\mathbb{R}^{d}$. This is essentially a convex optimization problem that can be solved iteratively using a gradient-based method. One can show that
\begin{align}
\label{eq_washim_def_L_grad}
    \nabla_{\omega}  L_{\pi_\theta}(\omega, \theta) = F(\theta)\omega - \nabla_{\theta} J(\theta)
\end{align}
Note that $\nabla_{\omega}L_{\pi_\theta}(\omega, \theta)$ is not exactly computable since the transition function $P$ and hence the stationary distribution, $\tilde{d}^{\pi_\theta}_{\gamma, \rho}$, and the state-action value function, $Q^{\pi_\theta}(\cdot, \cdot)$ are typically unknown in most practical cases. 

% -------------------------------------------------------------------
% Critic approximation and gradient
% -------------------------------------------------------------------
To estimate the policy gradient, we introduce a parameterized critic 
$Q(\phi(s,a);\zeta)$ in place of the true action-value function 
$Q^{\pi_\theta}(s,a)$.  Here $\phi\colon\mathcal{S}\times\mathcal{A}\to\mathbb{R}^n$ 
is a fixed feature map and $\zeta$ denotes the critic parameters, which are specified below.  

In this paper, we consider a neural temporal difference 
learning method where the action-value function $Q^{\pi_{\theta_k}}(\cdot,\cdot)$ is parameterized by some multi-layer neural network. Let us define a feedforward neural network by the following recursion:
\begin{equation}
\label{eq:nn-layer}
x^{(l)}=\frac{1}{\sqrt{m}} \sigma \left(W_{l}x^{(l-1)}\right), \quad l\in \{1,2,\cdots,L\},
\end{equation}
where $W_1\in\mathbb{R}^{m\times n}$, $W_{l}\in\mathbb{R}^{m\times m}$ for $2\leq l\leq L$ are the weight matrices of the network, $\sigma(\cdot)$ is  
an activation function, and $x^{(0)}=\phi(s,a)\in\mathbb{R}^n$. Using $x^{(L)}$ computed above, the approximate action-value function $Q(\phi(s,a);\zeta)$ can be computed as
\begin{equation}
\label{eq:nn}
Q(\phi(s,a);\zeta)=\frac{1}{\sqrt{m}} b^\top x^{(L)} , 
\end{equation}
where the parameter $\zeta=\left(\mbox{Vec}(W_1);\cdots; \mbox{Vec}(W_L)\right)$ denotes the collection of all weight matrices, and $b$ is given by a random initialization. The parameter $b$ will not be optimized during training. Note that the RHS of the above equation depends on $\zeta$ via $x^{(L)}$. $\mbox{Vec}(\cdot)$ stands for the vectorization operator that reshapes a matrix to a column vector by stacking its columns one by one and the ``;'' separator in $\zeta$ stands for the vertical stacking of the elements. That is, we reshape $\zeta$ to a long column vector for the notational convenience.

\begin{assumption}
\label{assump:critic-activation}
The activation function $\sigma(\cdot)$ is $L_1$-Lipschitz and $L_2$-smooth, i.e., for all $y_1, y_2\in \mathbb{R}$,
$$
\left|\sigma(y_1)-\sigma(y_2)\right| \leq L_1 |y_1-y_2|
$$
and 
$$
\left|\sigma'(y_1)-\sigma'(y_2)\right| \leq L_2 |y_1-y_2|.
$$
\end{assumption}
Assumption \ref{assump:critic-activation} indicates that our results below are not based on the popular ReLU activation function. However, we primarily focus on some twice-differentiable activation functions (such as Sigmoid, ELU, GeLU, etc.), which are smooth approximations of the ReLU function and are frequently utilized in practical problems \citep{devlin2018bert, godfrey2019evaluation}. Such a setup aligns with \citep{liu2020linearity}, and provides a $\mathcal{O}(m^{-\frac{1}{2}})$-smooth property for the neural Q-function.

Let $\zeta_0=\left(\mbox{Vec}(W_1^0);\cdots; \mbox{Vec}(W_L^0)\right)$ be the initial solution. For each $l$, we initialize the weights of $W_l^0$ 
element-wise from a normal distribution $\mathcal{N}(0,1)$ and each element of $b$ is drawn uniformly from $\{-1,+1\}$. For regularity purpose, we would like to restrict the iterations to a bounded set around $\zeta_0$, which is defined as 
\begin{align}
S_R := \Big\{ &\zeta = \big( \text{Vec}(W_1); \dots; \text{Vec}(W_L) \big) \notag \\
&\Big|\; \|\zeta - \zeta_0\|_2 \leq R \Big\},
\end{align}
and denote the projection onto $S_R$ as $\Pi_{R}$.



We now define the local linearization function class of the multi-layer Q network \eqref{eq:nn} at the random initialization $\zeta_0$:
\begin{equation}
    \label{defn:local-linearization}
    \mathcal{F}_{R, m}:=\left\{\widehat{Q}(\cdot\,;\zeta)=Q(\cdot\,;\zeta_0)+\left<\nabla_{\zeta} Q(\cdot\,;\zeta_0), \zeta-\zeta_0\right> \right\}
\end{equation} 
for any $\zeta \in S_R$.


For each policy parameter $\theta$, define the mean-squared error of the linearized critic under the on-policy state--action distribution by
\begin{align*}
&E(\theta,\zeta)\\
&\coloneqq \frac12\sum_{s,a}d^{\pi_\theta}(s)\,\pi_\theta(a| s)\,\bigl[\,Q^{\pi_\theta}(s,a)-\widehat Q\bigl(\phi(s,a);\zeta\bigr)\bigr]^2
\end{align*}
The critic parameter $\zeta_*^\theta$ is then chosen (not necessarily uniquely) to minimize this error:
\[
\zeta_*^\theta\;\in\;\argmin_{\zeta\in S_R}E(\theta,\zeta)\,.
\]

A direct computation of $\zeta^\theta_*$ is infeasible and instead, we perform stochastic gradient descent.  Noting 
\begin{align}
  \label{eq:critic-gradient}
  &\nabla_\zeta
  \tfrac12\bigl[Q^{\pi_\theta}(s,a)-\widehat{Q}(\phi(s,a);\zeta)\bigr]^2
  \nonumber \\
  &= \bigl[\widehat{Q}(\phi(s,a);\zeta)-Q^{\pi_\theta}(s,a)\bigr]\,
    \nabla_\zeta \widehat{Q}(\phi(s,a);\zeta),
\end{align}
we obtain the batch gradient
\begin{align}
 & \nabla_\zeta E(\theta,\zeta)= \sum_{s,a}
    d^{\pi_\theta}(s)\pi_\theta(a| s)\cdot\nonumber \\
    &\quad \bigl[\widehat{Q}(\phi(s,a);\zeta)-Q^{\pi_\theta}(s,a)\bigr]\,
    \nabla_\zeta \widehat{Q}(\phi(s,a);\zeta).
\end{align}
In practice, samples obtained from a contiguous trajectory induced by $\pi_\theta$ are used to form unbiased estimates of this gradient. The details of these estimates are given below.  

% -------------------------------------------------------------------


  

\subsection{Algorithmic description}
We divide each outer epoch into two phases of equal length \(N\).  Within each 
phase we process data in contiguous blocks of size 
\(\displaystyle M=\kappa t_{\mathrm{mix}}\lfloor\log_2 T\rfloor\), where 
\(T=KN\) is the time horizon and \(\kappa\ge1\) is a user‐chosen integer.  
Each phase thus comprises 
\(\displaystyle H=\bigl\lceil N/M\bigr\rceil\) blocks.

\noindent\textbf{Notation.}
Let \(s_{t}\) and \(a_{t}\) denote the state and action at time \(t\).  We write
\[
  x_h^k = \phi\bigl(s^k_{hM}, a^k_{hM}\bigr),
  \quad
  x'_h{}^k = \phi\bigl(s^k_{hM+1}, a^k_{hM+1}\bigr).
\]
The temporal‐difference error on block \(h\) of epoch \(k\) is
\begin{equation}
  \Delta_h^k
  = Q(x_h^k;\zeta_h^k)
    -\bigl[r^k_{hM} + \gamma\,Q(x'_h{}^k;\zeta_h^k)\bigr],
\end{equation}
and its gradient contribution
\[
  g_h^k(x_h^k;\zeta_h^k)
  = \Delta_h^k\,\nabla_\zeta Q(x_h^k;\zeta_h^k).
\]
\(g_h^k\) serves as an estimate of \(\nabla_\zeta E(\theta_k,\zeta_h^k)\).

For the NPG update, let
\(\bar s_h^k=s_{hM}^k\) and \(\bar a_h^k=a_{hM}^k\).  Define
\begin{align}
\label{eq:nabla_L_sample}
  \widehat\nabla_\omega L_h^k
  &= \nabla_\theta\log\pi_{\theta_k}(\bar a_h^k|\bar s_h^k)\,
    \bigl[\nabla_\theta\log\pi_{\theta_k}(\bar a_h^k|\bar s_h^k)\bigr]^\top
    \,\omega_{H+h}^k
  \notag\\
  &\quad -\,Q\bigl(\phi(\bar s_h^k,\bar a_h^k);\zeta_H^k\bigr)\,
    \nabla_\theta\log\pi_{\theta_k}(\bar a_h^k|\bar s_h^k).
\end{align}
It can be seen that \(\widehat\nabla_\omega L_h^k\) is an estimate of \(\nabla_\omega L_h^k\). 

\begin{algorithm}[ht]
  \caption{Natural Actor–Critic with Data Drop (NAC–DD)}
  \label{alg:nacdd}
  \begin{algorithmic}[1]
\State \textbf{Input:} Initial parameters \(\theta_0\), \(\{\omega_H^k\}\), \(\{\zeta_0^k\}\); policy step size \(\alpha\); NPG step size \(\eta\); critic step size \(\beta\); initial state \(s_0\sim\rho\); time horizon $T$; outer loops \(K\); inner loop length \(H\); discount factor \(\gamma\); drop number \(M=\kappa t_{\mathrm{mix}}\lfloor\log_2 T\rfloor\).
\State \textbf{Critic Initialization} $\zeta_0$: Sample each entry of \(W_l^0\sim\mathcal{N}(0,1)\) for \(l=1,\dots,L\), and each entry of \(b\sim\mathrm{Unif}\{-1,+1\}\).
    \For{\(k=0,\dots,K-1\)}
      \State Set \(s^k_0\) to the final state of epoch \(k-1\).
      \For{\(h=0,\dots,H-1\)}       \Comment{Critic phase}
        \For{\(m=0,\dots,M-1\)}
          \State Sample \(a_{hM+m}^k\sim\pi_{\theta_k}(\cdot| s_{hM+m}^k)\)
          \State Sample \(s_{hM+m+1}^k\sim P(\cdot| s_{hM+m}^k,a_{hM+m}^k)\)
        \EndFor
        \State Compute \(\Delta_h^k\) and update
          \(\zeta_{h+1}^k = \Pi_R\bigl(\zeta_h^k - \beta\,g_h^k\bigr)\)
      \EndFor
      \For{\(h=0,\dots,H-1\)}      \Comment{NPG phase}
        \For{\(m=0,\dots,M-1\)}
          \State Sample \(a_{hM+m}^k\sim\pi_{\theta_k}(\cdot| s_{hM+m}^k)\)
          \State Sample \(s_{hM+m+1}^k\sim \widetilde P(\cdot| s_{hM+m}^k,a_{hM+m}^k)\)
        \EndFor
        \State Compute \(\widehat\nabla_\omega L_h^k\)
        \State Update 
          \(\omega_{H+h+1}^k \leftarrow \omega_{H+h}^k - \eta\,\widehat\nabla_\omega L_h^k\)
      \EndFor
      \State Set \(\omega_k\leftarrow \omega_{2H}^k\).
      \State Update \(\theta_{k+1}\leftarrow \theta_k + \alpha\,\omega_k\). \Comment{Policy update}
    \EndFor
  \end{algorithmic}
\end{algorithm}

\noindent\textbf{Block updates.}
Putting it all together, for each block \(h\) in epoch \(k\) we perform the neural TD updates
\begin{align}
\label{eq:td-sample-update}
  \zeta_{h+1}^k = \Pi_R\bigl(\zeta_h^k - \beta\,g_h^k\bigr),
\end{align}
in the first phase followed by the NPG updates
\begin{align}
\label{eq:npg-sample-update}
  \omega_{H+h+1}^k = \omega_{H+h}^k - \eta\,\widehat\nabla_\omega L_h^k,
\end{align}
and then finalize with the policy update 
\(\theta_{k+1} = \theta_k + \alpha\,\omega_{2H}^k\).

\section{Sample Complexity of the proposed algorithm}

We first state some assumptions that we will be using before proceeding to the main result.

\begin{assumption}
\label{assump:critic-error}
    The critic approximation error defined as
%
\begin{equation}
    \epsilon_{\mathrm{app}} \coloneqq \sup_{\theta \in \Theta}  \E[(Q^{\pi_{\theta}}(s,a)-\Pi_{\mathcal{F}_{R,m}}Q^{\pi_{\theta}}(s,a))^2],
\end{equation}
 where the expectation is over $s \sim d^{\pi_{\theta}}, a \sim \pi_{\theta}(\cdot|s)$, is assumed to be finite.
\end{assumption}

\begin{assumption} \label{assum:critic_positive_definite}
    There exists $\lambda_0> 0$ such that, for all $\theta$,
    \begin{align}
       \E[\nabla_{\zeta} &Q(\phi(s,a);\zeta_0)\nabla_{\zeta} Q(\phi(s,a);\zeta_0)^\top]\succcurlyeq \lambda_0 I,\nonumber 
    \end{align}
    where the expectation is over $s \sim d^{\pi_{\theta}}_{\gamma,\rho}, a \sim \pi_{\theta}(\cdot|s)$. We will denote $\E[\nabla_{\zeta} Q(\phi(s,a);\zeta_0)\nabla_{\zeta} Q(\phi(s,a);\zeta_0)^\top]$ as $\Sigma_{\pi_{\theta}}$ henceforth.
\end{assumption}
\if 0
Temporary:
\begin{align}
       \textstyle{\E[\nabla_{\zeta} Q(\phi(s,a);\zeta_0)(\nabla_{\zeta} Q(\phi(s,a);\zeta_0)-\gamma \nabla_{\zeta} Q(\phi(s',a');\zeta_0))^\top]} \succcurlyeq \lambda_0 (1-\gamma) I 
    \end{align}
Average case:
 \begin{align}
       \textstyle{\E[\nabla_{\zeta} V(\phi(s);\zeta_0)\nabla_{\zeta} V(\phi(s);\zeta_0)^\top]} \succcurlyeq \lambda_0 I 
    \end{align}

\begin{align}
       \textstyle{\E[\nabla_{\zeta} V(\phi(s);\zeta_0)(\nabla_{\zeta} V(\phi(s);\zeta_0)-\nabla_{\zeta} V(\phi(s');\zeta_0))^\top]} \succcurlyeq \lambda I 
    \end{align}

$\psi(s)\coloneqq \nabla_{\zeta} V(\phi(s);\zeta_0)$ (feature vector for state $s$)
\fi 

\begin{assumption}
\label{assump:function_approx_error}
For any $\theta$, the \textit{transferred compatible function approximation error}, $L_{\pi^*}(\omega_{\theta}^*; \theta)$, satisfies the following inequality. 
\begin{align}
\label{equ: minimal compatible function approximation error}
L_{\pi^*}(\omega_{\theta}^*; \theta) 
&\coloneqq \E_{s\sim d^{\pi^*}_{\gamma,\rho}, a \sim \pi^*(\cdot|s)}
\Big[
(1-\gamma)A^{\pi_{\theta}}(s,a) \\
&- (\omega_{\theta}^*)^\top 
\nabla_{\theta}\log\pi_{\theta}(a| s)
\Big]^2 \notag \leq \epsilon_{\mathrm{bias}},
\end{align}
where $\pi^*$ is an optimal policy for the discounted MDP $\mathcal{M}$ and $\omega_{\theta}^*$ is the exact NPG direction at $\theta$.
\end{assumption}



\begin{assumption}
    \label{assump:score_func_bounds}
    For all $\theta, \theta_1,\theta_2 \in\Theta$ and $(s,a)\in\mathcal{S}\times\mathcal{A}$, the following statements hold:
    \begin{enumerate}[label=(\alph*)]
        \item $\Vert\nabla_{\theta}\log\pi_\theta(a\vert s)\Vert\leq G_1$
        \item $\Vert \nabla_{\theta}\log\pi_{\theta_1}(a\vert s)-\nabla_\theta\log\pi_{\theta_2}(a\vert s)\Vert\leq G_2\Vert \theta_1-\theta_2\Vert.$
    \end{enumerate}
\end{assumption}


\begin{assumption}[Fisher non-degenerate policy]
    \label{assump:FND_policy}
    There exists a constant $\mu>0$ such that $F(\theta)-\mu I_{d}$ is positive semidefinite where $I_{d}$ denotes an identity matrix.
\end{assumption}

{\bf Comments on Assumptions \ref{assump:critic-error}-\ref{assum:critic_positive_definite}:} Assumption \ref{assump:critic-error} ensures that a class of neural networks can approximate the function obtained by applying the Bellman operator to another neural network within the same class. Similar assumptions have been considered in \citep{fu2020single,wang2019neural,ke2024an,gaur2024closing}. In works such as \citep{cayci2022finite}, even stronger assumptions are made, where the function class used for critic parameterization is assumed to approximate any smooth function.  

Assumption \ref{assum:critic_positive_definite} has been employed in prior works \citep{zou2019finite,pmlr-v119-xu20c} and is closely related to the state regularity assumption, which similarly ensures a strong convexity-type property in the critic update \citep{tian2023convergence,gaur2024closing}. It can also be viewed as a generalization of the positive definite feature covariance matrix assumption in the analysis of linear Q-learning \citep{xu2019sample,ganesh2024orderoptimal}. %Moreover, the matrix \(\Sigma_{\pi}\) is linked to the Gram matrix of the neural network at \(\theta_0\), which is known to be strictly positive definite for sufficiently large network width (see \citep{ke2024an} for details).

{\bf Comments on Assumptions \ref{assump:function_approx_error}-\ref{assump:FND_policy}:}  We would like to highlight that all these assumptions  are commonly found in PG literature \citep{liu2020improved,agarwal2021theory, papini2018stochastic, xu2019sample,fatkhullin2023stochastic}. We elaborate more on these assumptions below. 

    The term $\epsilon_{\mathrm{bias}}$ captures the expressivity of the parameterized policy class. If the policy class is complete such as in the case of softmax parametrization, we have $\epsilon_{\mathrm{bias}}=0$ \citep{agarwal2021theory}. However, for restricted parametrization which may not contain all stochastic policies, we have $\epsilon_{\mathrm{bias}}>0$. It is known that $\epsilon_{\mathrm{bias}}$ is insignificant for rich neural parametrization \citep{wang2019neural}. Assumption \ref{assump:score_func_bounds} requires that the score function is bounded and Lipschitz continuous. This assumption is widely used in the analysis of PG based methods \citep{liu2020improved,agarwal2021theory, papini2018stochastic, xu2019sample,fatkhullin2023stochastic}. Assumption \ref{assump:FND_policy} requires that the eigenvalues of the Fisher information matrix can be bounded from below and is commonly used in obtaining global complexity bounds for PG based methods \citep{liu2020improved,zhang2021on,Bai_Bedi_Agarwal_Koppel_Aggarwal_2022,fatkhullin2023stochastic}. Assumptions \ref{assump:score_func_bounds}-\ref{assump:FND_policy} were shown to hold for various examples recently including Gaussian policies with linearly parameterized means and certain neural parametrizations \citep{liu2020improved, fatkhullin2023stochastic}.


\begin{theorem}
\label{thm:main-conv-rate}
    Consider Algorithm \ref{alg:nacdd} with $K=\frac{1}{\epsilon}$, $H=\frac{1}{2 t_{\mathrm{mix}}\lfloor\log_2(1/\epsilon)\rfloor\epsilon}$ and $M=2 t_{\mathrm{mix}}\lfloor\log_2(1/\epsilon)\rfloor$. If Assumptions \ref{assump:ergodic_mdp}--\ref{assump:FND_policy} hold, then there exists a choice of parameters such that the following holds for sufficiently small $\epsilon$:
    \begin{align*}
&\textstyle{J^* - \frac{1}{K}\sum_{k=0}^{K-1}\E[J(\theta_k)|\zeta_0]}\\
&\textstyle{\le \cO \left(\frac{\sqrt{\epsilon_{\rm bias}}}{1-\gamma}
  + \frac{\sqrt{\epsilon_{\rm app}}}{1-\gamma}
  + \frac{ t_{\mathrm{mix}}\,\log^3 \left(\frac{1}{\epsilon \delta}\right)}{(1-\gamma)^3}\cdot \epsilon
      +\frac{1}{m^{1/4}(1-\gamma)^{1/2}}
    \right)}
\end{align*}
    with probability $1-2\delta-2L\exp(-Cm)$, for some constant $C>0$. Here, $m$ and $L$ denote the width and depth of the critic neural network, respectively.
\end{theorem}



\section{Proof Outline}
We structure our analysis into three parts: policy update, NPG estimation, and critic analysis.

\subsection{Policy update analysis}
We begin with a useful lemma from \citep{mondal2024improved}.

\begin{lemma}
    \label{lemma:local_global}
    Consider any policy update rule of the form
    \begin{align}
        \theta_{k+1} = \theta_k + \alpha \omega_k.
    \end{align}
    If Assumptions \ref{assump:function_approx_error} and \ref{assump:score_func_bounds} hold, then the following inequality is satisfied:
    
    \begin{equation} \label{eq:general_bound}
    \begin{aligned}
        &J^{*}-\frac{1}{K}\sum_{k=0}^{K-1}\E[J(\theta_k)] 
        \leq \frac{\sqrt{\epsilon_{\mathrm{bias}}} }{1-\gamma} 
        \\
        &\quad  +\frac{G_1}{K} \sum_{k=0}^{K-1} \E\big\| \E[\omega_k|\theta_k] - \omega^*_k \big\| +\frac{\alpha G_2}{2K} \sum_{k=0}^{K-1} \E\|\omega_k\|^2 \\
        &\quad +\frac{1}{\alpha K} 
        \E_{s\sim d^{\pi^*}}\big[ \mathrm{KL}(\pi^*(\cdot| s) \| \pi_{\theta_0}(\cdot| s)) \big],
    \end{aligned}
    \end{equation}
    where $\mathrm{KL}(\cdot \|\cdot)$ is the Kullback-Leibler divergence, $\omega^*_k$ is the NPG direction $F(\theta_k)^{-1}\nabla J(\theta_k)$, $\pi^*$ is the optimal policy, and $J^*$ is the optimal value of the function $J(\cdot)$.
\end{lemma}

The last term above is of order $\mathcal{O}(1/K)$ since  
\[
\E_{s\sim d^{\pi^*}}\big[\mathrm{KL}(\pi^*(\cdot\vert s)\Vert\pi_{\theta_0}(\cdot\vert s))\big]
\]  
is constant. The term $\E\|\omega_k\|^2$ is further decomposed as:
\begin{equation}
\label{eq:eq_21}
    \begin{aligned}
        \frac{1}{K} \sum_{k=0}^{K-1} \E\|\omega_k\|^2 
        &\leq \frac{2}{K} \sum_{k=0}^{K-1} \E\|\omega_k - \omega_k^*\|^2  \\
        &\quad + \frac{2}{K} \sum_{k=0}^{K-1} \E\|\omega_k^*\|^2 \\
        &\overset{(a)}{\leq} \frac{2}{K} \sum_{k=0}^{K-1} \E\|\omega_k - \omega_k^*\|^2  \\
        &\quad + \frac{2\mu^{-2}}{K} \sum_{k=0}^{K-1} \E\|\nabla_{\theta} J(\theta_k)\|^2,
    \end{aligned}
\end{equation}
where $(a)$ follows from Assumption \ref{assump:FND_policy} and the definition  
$\omega_k^* = F(\theta_k)^{-1}\nabla_\theta J(\theta_k)$.  

Thus, we can obtain a global convergence bound by bounding the terms  
$\E\|\omega_k-\omega^*_k\|^2$, $\E\|\E[\omega_k|\theta_k] -\omega^*_k\|$, and  
$\E\|\nabla_{\theta} J(\theta_k) \|^2$. The first two terms represent the second-order error and bias of the NPG estimator $\omega_k$, and the third term indicates the local convergence rate.  
Since $\E\|\nabla_{\theta} J(\theta_k) \|^2$ can be expressed in terms of  
$\E\|\omega_k - \omega^*_k\|^2$, we now briefly describe how to bound these terms.

\begin{figure*}[t]
\centering
\subfigure[Hopper-v3]{
\label{fig:1(a)} 
\includegraphics[width=2.1in, height=1.3in]{figs/hp-ab.pdf}}
\subfigure[HalfCheetah-v3]{
\label{fig:1(b)} 
\includegraphics[width=2.1in, height=1.3in]{figs/hc-ab.pdf}}
\subfigure[Walker2d-v3]{
\label{fig:1(c)} 
\includegraphics[width=2.1in, height=1.3in]{figs/wk-ab.pdf}}
\caption{Performance of NAC-DD on MuJoCo locomotion tasks with varying drop numbers ($M$). The results demonstrate that NAC-DD consistently achieves better performance when the drop number exceeds 1.}
\label{fig:1} 
\end{figure*}

\subsection{NPG analysis}
In this section, we derive bounds on the second‐order error and bias of the NPG estimator $\omega_k$. For any policy $\pi_{\theta_k}$, the critic subroutine’s fixed point need not be unique. Let
$$
Z_k = \bigl\{\zeta : \widehat Q(\cdot;\zeta)\text{ is a fixed point of }\Pi_{\mathcal{F}_{R,m}}\mathcal T^{\pi_{\theta_k}}\bigr\},
$$
and let $\zeta_*^k$ be the projection of the initial critic parameter $\zeta_0$ onto $Z_k$. We will show that the algorithm’s iterates closely track $\zeta_*^k$. Finally, denote by $\mathbb{E}_k[\cdot]$ the expectation conditioned on $\theta_k$.
 
\begin{lemma}[Second-order error of NPG estimator]
\label{lem:npg-second_order}
Consider the NPG-finding recursion \eqref{eq:npg-sample-update} with  
$\eta = \frac{2\log H}{\mu H}$. If all assumptions in Theorem \ref{thm:main-conv-rate} hold, then for sufficiently large $H$,  
\begin{align*}
    &\E_k[\|\omega_k-\omega_k^*\|^2|\zeta_0]\leq \cO\Bigg(\frac{G_1^2(C_1')^2\log(H/\delta)}{H\mu^{2}(1-\gamma)^4}+\mu^{-2}m^{-1/2}\\
    &+\mu^{-2}G_1^2\E_k[\|\zeta_H^k-\zeta_*^k\|^2|\zeta_0]+\frac{G_1^2 \epsilon_{\mathrm{app}}}{\mu^2(1-\gamma)^2}\Bigg)
\end{align*}
\end{lemma}

\begin{lemma}[Bias of NPG estimator]
\label{lem:npg-bias}

Consider the NPG-finding recursion \eqref{eq:npg-sample-update} with  
$\eta = \frac{2\log H}{\mu H}$. If all assumptions in Theorem \ref{thm:main-conv-rate} hold, then for sufficiently large $H$, we have the following bound with probability $1-2\delta-2L\exp(-Cm)$, for some constant $C>0$
\begin{align*}
   \| \E_k[\omega_k|\zeta_0]-&\omega_k^*\|^2\leq \cO\Bigg(\frac{G_1^2(C_1')^2G_1^2\log(H/\delta)}{T^\kappa}\\
   &+\|\E_k[\zeta_{H}^k|\zeta_0]-\zeta_*^k\|^2
   +\frac{G_1^2\epsilon_{\mathrm{app}}}{\mu^2(1-\gamma)^2}\Bigg)
\end{align*}
\end{lemma}
% \begin{align}
%     \E_k\|\omega_k-\omega_k^*\|^2\leq \cO\left(\frac{G_1^4}{H\mu^{4}(1-\gamma)^4}+\frac{G_1^2(C_1')^2\log(H/\delta)}{H\mu^{2}(1-\gamma)^4}+\mu^{-2}G_1^2\E\|\zeta_H^k-\zeta_*^k\|^2+\mu^{-2}m^{-1/2}+\frac{G_1^2 \epsilon_{\mathrm{app}}}{\mu^2(1-\gamma)^2}\right)
% \end{align}
% and
% \begin{align}
%    \| \E_k[\omega_k]-\omega_k^*\|^2\leq \cO\left(\frac{G_1^2(C_1')^2G_1^2\log(H/\delta)}{T^\kappa}+\|\E[\zeta_{H}^k]-\zeta_*^k\|^2+\frac{G_1^2\epsilon_{\mathrm{app}}}{\mu^2(1-\gamma)^2}\right)
% \end{align}
The proofs of Lemmas~\ref{lem:npg-second_order} and~\ref{lem:npg-bias} can be found in Appendix~\ref{sec:lem23-proof}. Since the NPG estimator $\omega_k$ uses the critic values, the above bounds depend on the second-order error and bias of the critic estimator. The bounds for these quantities are provided in the next section.
\subsection{Critic analysis}

In this section, we focus on providing bounds for the second-order error and bias of the critic estimator $\zeta^k_H$. A second-order error bound of $\cO(\frac{1}{\epsilon})$ for $Q$-learning with neural approximation was recently studied in \citep{ke2024an}, without requiring strict positive definiteness as in Assumption \ref{assum:critic_positive_definite}. Instead, we present an alternative analysis of this result that enables us to also derive a bound on the critic’s bias. The proof of this result can be found in Appendix \ref{sec:lem4-proof}.


\begin{lemma}[Second-order error of the Critic]
\label{lem:critic-second_order}

Consider Algorithm \ref{alg:nacdd} and let  
$\beta = \frac{2\log H}{\lambda_0 (1-\gamma) H}$.  
If all assumptions of Theorem \ref{thm:main-conv-rate} hold, then for sufficiently large $H$,  
\begin{align*}
    &\E[\|\zeta_{H}^k - \zeta_*^k\|^2|\zeta_0] \leq \cO \bigg( \frac{\E\|\zeta_{0} -\zeta_*^k\|^2}{H^2} + \\
    &\frac{\log^2 (H/\delta)}{\lambda_0^2 (1-\gamma)^2H}+ \frac{\log (H/\delta)}{\lambda_0 (1-\gamma)m^{1/2}} +\frac{1}{(1-\gamma)^4\lambda_0^4 T^{\kappa}}  \bigg)
\end{align*}  
with probability $1-2\delta-2L\exp(-Cm)$, for some constant $C>0$.
\end{lemma}





% \begin{figure}[t]
% \centering

% \begin{minipage}{0.44\textwidth}
%     \centering
%     \includegraphics[width=\linewidth, height=1.3in]{figs/hp-ab.pdf}
%     \caption*{(a) Hopper-v3}
%     \label{fig:1a}
% \end{minipage}
% \hfill
% \begin{minipage}{0.44\textwidth}
%     \centering
%     \includegraphics[width=\linewidth, height=1.3in]{figs/hc-ab.pdf}
%     \caption*{(b) HalfCheetah-v3}
%     \label{fig:1b}
% \end{minipage}
% \hfill
% \begin{minipage}{0.44\textwidth}
%     \centering
%     \includegraphics[width=\linewidth, height=1.3in]{figs/wk-ab.pdf}
%     \caption*{(c) Walker2d-v3}
%     \label{fig:1c}
% \end{minipage}

% \caption{Performance of NAC-DD on MuJoCo locomotion tasks with varying drop numbers ($M$). The results demonstrate that NAC-DD consistently achieves better performance when the drop number exceeds 1.}
% \label{fig:1}

% \end{figure}

\begin{figure*}[t]
    \centering
    \includegraphics[width=6.3in, height=1.45in]{figs/return.pdf}
    \caption{Comparison of NAC-DD and standard policy gradient algorithms on various MuJoCo locomotion tasks. Here, NAC-DD-5 represents NAC-DD with a drop number of 5. Our algorithm achieves the best performance on two out of three tasks and ranks second on the remaining task.}
    \label{fig:2}
\end{figure*}
\begin{table*}[ht]
  \centering
  \caption{Training time (in hours) for each algorithm on MuJoCo benchmarks, measured on a single NVIDIA GeForce RTX 2080 Ti GPU}
  \label{tab:results}
  \begin{tabular}{lccccccc}
    \toprule
    Algorithm   & NAC-DD-1         & NAC-DD-3         & NAC-DD-5         & PG               & NPG              & TRPO             & PPO              \\
    \midrule
    Hopper      & $5.44 \pm 0.39$  & $12.0 \pm 0.37$  & $17.1 \pm 0.51$  & $2.95 \pm 0.02$  & $3.30 \pm 0.15$  & $3.34 \pm 0.03$  & $4.22 \pm 0.09$  \\
    HalfCheetah & $10.5 \pm 0.36$  & $19.1 \pm 0.33$  & $27.7 \pm 0.34$  & $6.68 \pm 0.02$  & $7.00 \pm 0.04$  & $7.49 \pm 0.06$  & $9.82 \pm 0.04$  \\
    Walker2d    & $12.4 \pm 0.31$  & $23.7 \pm 0.30$  & $35.1 \pm 0.41$  & $6.94 \pm 0.08$  & $7.04 \pm 0.13$  & $8.57 \pm 0.14$  & $9.74 \pm 0.35$  \\
    \bottomrule
  \end{tabular}
    \label{table:comp-time}
\end{table*}
Analyzing the bias is a key challenge, owing both to the non-linearity that the neural network introduces into the critic update and to the presence of the projection operator. The proof details of the following result can be found in Appendix~\ref{sec:lem5-proof}.
\begin{lemma}[Bias of the Critic estimator]
\label{lem:critic_bias}

Consider Algorithm \ref{alg:nacdd} and let $\beta = \frac{2\log H}{\lambda_0 (1-\gamma) H}$. If all assumptions of Theorem \ref{thm:main-conv-rate} hold, then the following is true for sufficiently large $H$.
    \begin{align*}
    &\|\E[\zeta_{H}^k|\zeta_0]-\zeta_*^k \|^2 \leq \cO \Bigg(\frac{\|\E[\zeta_0] - \zeta_*^k\|^2}{\lambda_0^2(1-\gamma)^2H^2}  \nonumber\\
    &+  \frac{\log^4 (H/\delta)}{\lambda_0^6 (1-\gamma)^6H^2}+ \frac{\sqrt{\log(H/\delta)}}{ \lambda_0 (1-\gamma) m^{1/2}} +\frac{1}{(1-\gamma)^{10}\lambda_0^{10} T^{2\kappa}}\Bigg)
\end{align*}
with probability $1-2\delta-2L\exp(-Cm)$, for some constant $C>0$.
\end{lemma}

It can be seen that substituting Lemmas \ref{lem:critic-second_order} and \ref{lem:critic_bias} in Lemmas \ref{lem:npg-second_order} and \ref{lem:npg-bias} with the bound on the policy update in Lemma~\ref{lemma:local_global} yields Theorem \ref{thm:main-conv-rate}. Based on Lemmas \ref{lem:npg-second_order}, \ref{lem:npg-bias}, \ref{lem:critic-second_order} and \ref{lem:critic_bias}, we observe that it is sufficient to set $\kappa=2$ to obtain the desired result. 

\input{evaluation}

\section{Conclusions}

In this work, we address the challenge of achieving optimal sample complexity in reinforcement learning for Markov Decision Processes (MDPs) with general policy parameterization and multi-layer neural network critics. Existing methods either fall short of achieving the optimal rate or rely on linear critic approximations. To overcome these limitations, we introduce Natural Actor-Critic with Data Drop (NAC-DD) algorithm, which integrates Natural Policy Gradient methods with a Data Drop technique to mitigate statistical dependencies inherent in Markovian sampling. By achieving an optimal sample complexity of $\tilde{O}(1/\epsilon^2)$, our approach significantly improves upon the previous state-of-the-art guarantee of $\tilde{O}(1/\epsilon^3)$, marking a pivotal advancement in the field.

\section*{Acknowledgement}
This work was supported in part by the Anusandhan National Research Foundation (ANRF), India, through the Overseas Visiting Doctoral Fellowship and the U.S. National Science Foundation under grant CCF-2149588.


\bibliography{references}
\newpage
\onecolumn

\appendix

% \crefalias{section}{appendix} % uncomment if you are using cleveref

\section{Neural Critic Analysis}

Let $\hat{g}(x_h^k;\zeta) \coloneqq (\widehat{Q}(x_h^k;\zeta)-(r_h^k+\gamma \widehat{Q}(x'_h{}^k;\zeta)))\nabla_{\zeta}Q(x_h^k;\zeta_0)$ be the linearization of $g(x_h^k;\zeta)$ at $\zeta_0$. For brevity, we denote $\phi(s_h^k,a_h^k)$ by $x_h^k$. It can be seen that
\begin{align}
    \hat{g}(x_h^k;\zeta) = A(x_h^k) \zeta - b(x_h^k),
\end{align}
where 
\begin{align}
    A(x_h^k)= \nabla_{\zeta}Q(x_h^k;\zeta_0)(\nabla_{\zeta}Q(x_h^k;\zeta_0)-\gamma \nabla_{\zeta}Q(x'_h{}^k;\zeta_0)) ^{\top}
\end{align} 
and 
\begin{align}
    b(x_h^k) = (r_h^k + \gamma Q(x'_h{}^k;\zeta_0) -Q(x_h^k;\zeta_0))\nabla_{\zeta}Q(x_h^k;\zeta_0).
\end{align}

Define $A_k \coloneqq \E_{s \sim d^{\pi_{\theta_k}}, a \sim \pi_{\theta_k}(\cdot|s)}[A(x_h^k)|\zeta_0]$ and $b_k \coloneqq \E_{s \sim d^{\pi_{\theta_k}}, a \sim \pi_{\theta_k}(\cdot|s)}[b(x_h^k)|\zeta_0]$. For notational convenience, we henceforth drop the conditioning on $\zeta_0$ and simply write \(\E[\cdot]\) in place of \(\E[\cdot|\zeta_0]\). %\(\E[\cdot]\), \(\E_{k,h}[\cdot]\) and \(\E_k[\cdot]\) in place of \(\E[\cdot|\zeta_0]\), \(\E_{k,h}[\cdot| \zeta_0]\) and \(\E_k[\cdot| \zeta_0]\), respectively.


We now state a useful lemma summarizing various properties of the $Q$-value estimator below:
 \begin{lemma}
\label{lem:neural_td_supp}
Fix an outer iteration index 
$k$. Let \(\zeta_h^k \in S_R\) for all \(h \in \{0,1, 2, \dots, H\}\), where the radius \(R\) satisfies \(R = \mathcal{O}(1)\). Then, for all \(h \in \{1, 2, \dots, H\}\), there exist positive constants $C$, $C_1'$ and $\{C_i\}_{i=1,2,\cdots,5}$ such that the following statements hold with a probability of at least $1-\delta-2L\exp{(-C m)}$. 
\begin{enumerate}[label=(\alph*)]
    \item $\left\|\nabla_{\zeta}Q(x_h^k;\zeta_h^k)\right\| \leq C_1 $, $|Q(x_h^k;\zeta_h^k)| \leq C_1'\sqrt{\log (H/\delta)} $
    \item  $\|g(x_h^k;\zeta_h^k)-\hat{g}(x_h^k;\zeta_h^k)\|\leq C_2 m^{-\frac{1}{2}}\sqrt{ \log (H/ \delta)}$
    \item $ \left|\left<g\left(x_h^k;\zeta_h^k \right)-\hat{g}\left(x_h^k;\zeta_h^k \right), \zeta_h^k -\zeta_*\right>\right| \leq  C_3 m^{-\frac{1}{2}}\sqrt{ \log (H/ \delta)}$
    \item $|\widehat{Q}(x_h^k;\zeta_h^k)-Q(x_h^k;\zeta_h^k)| \leq  C_4 m^{-\frac{1}{2}}\sqrt{ \log (H/ \delta)}$
    \item $\|\nabla_{\zeta}Q(x_h^k;\zeta_0)-\nabla_{\zeta}Q(x_h^k;\zeta_h^k)\| \leq  C_5 m^{-\frac{1}{2}}\sqrt{ \log (H/ \delta)}$
\end{enumerate}
\end{lemma}
Statements $(a)$-$(e)$ follow from results in \citet{ke2024an}: Statement $(a)$ from Lemmas D.2 and D.3, Statements $(b)$ and $(c)$ from Lemma D.5, and Statements $(d)$ and $(e)$ from Lemma D.4. Building on the above result, we obtain the following bounds.

\begin{lemma}
\label{lem:critic_bounds_supp}
   There exist positive constants \(c_1, c_2 > 0\) such that the following bounds for each $h,k$ hold under the assumptions stated in Theorem \ref{thm:main-conv-rate} with a probability of at least $1-\delta-2L\exp{(-C m)}$. 
    \begin{enumerate}
        \item $\|A(x_h^k)\| \leq c_1$
        \item $\|b(x_h^k)\| \leq c_2\sqrt{ \log (H/ \delta)}$
        \item $\|\E[A(x_h^k)]-A_k\| \leq c_1 T^{-\kappa}$
        \item $\|\E[b(x_h^k)]-b_k\| \leq c_2 \sqrt{ \log (H/ \delta)} T^{-\kappa}$
    \end{enumerate}
\end{lemma}

\begin{proof}
    Note that from Lemma \ref{lem:neural_td_supp}$(a)$
    \begin{align}
        \|A(x_h^k)\| \leq \|\nabla_{\zeta}Q(x_h^k;\zeta_0)-\gamma \nabla_{\zeta}Q(x'_h{}^k;\zeta_0)\|\|\nabla_{\zeta}Q(x_h^k;\zeta_0)\| \leq (1+\gamma)  C_1^2.
    \end{align}
    Statement 1 follows by setting $c_1 = (1+\gamma)  C_1^2  $. Again, from Lemma \ref{lem:neural_td_supp}$(a)$
    \begin{align*}
        \|b(x_h^k)\| \leq |r_h^k+\gamma Q(x'_h{}^k;\zeta_0)-Q(x_h^k;\zeta_0)|\|\nabla_{\zeta}Q(x_h^k;\zeta_0)\| \leq (1+C_1'+\gamma C_1') C_1 \sqrt{ \log (H/ \delta)},
    \end{align*}
    and by setting $c_2= (1+C_1'+\gamma C_1') C_1$,  Statement 2 follows.
    For  Statement 3, observe that
    \begin{align}
        &\E[A(x_h^k)]-A_k= \sum_{x_h^k} A(x_h^k)((P^{\pi_{\theta_k}})^M(s_{(h-1)M}^k,s_{hM}^k)-d^{\pi_{\theta_k}}(s_{hM}^k)) \pi(a_{hM}^k|s_{hM}^k). 
    \end{align}
    Since $M= \kappa t_{\mathrm{mix}}\lceil \log_2 T\rceil$
    \begin{align}
        &\|\E[A(x_h^k)]-A_k\| \leq c_1 \sum_{x_h^k}  |(P^{\pi_{\theta_k}})^M(s_{(h-1)M}^k,s_{hM})-d^{\pi_{\theta_k}}(s_{hM})| \pi(a_{hM}^k|s_{hM})  \leq \frac{c_1}{T^{\kappa}}.
    \end{align}
     Statement 4 follows along similar lines.
\end{proof}

\begin{lemma}
\label{lem:psd-A}
    Fix $k$ and let assumptions in Theorem \ref{thm:main-conv-rate} hold. Then the following holds $\forall \zeta \in \ker (A_k)^\perp$
    \begin{align}
       \zeta^{\top} A_k \zeta \geq (1-\gamma) \lambda_0 \|\zeta\|^2
    \end{align}
\end{lemma}
\begin{proof}
    \begin{align}
    \begin{split}
        \zeta^{\top} A_k \zeta &= \zeta^{\top} \E[\nabla_{\zeta}Q(x_h^k;\zeta_0) \nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top} -\gamma \nabla_{\zeta}Q(x'_h{}^k;\zeta_0) \nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top}] \zeta  \\
        &= \E[(\nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top}\zeta)^2] -\gamma \E[(\nabla_{\zeta}Q(x'_h{}^k;\zeta_0)^\top \zeta) (\nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top}\zeta)] \\ 
        &\overset{(a)}{\geq}  \E[(\nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top}\zeta)^2] -\gamma (\E[(\nabla_{\zeta}Q(x'_h{}^k;\zeta_0)^\top \zeta)^2]\E [(\nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top}\zeta)^2])^{1/2} \\
        &\overset{(b)}{=}   (1-\gamma)\E[(\nabla_{\zeta}Q(x_h^k;\zeta_0)^{\top}\zeta)^2] \\
        &\overset{(c)}{\geq} (1-\gamma)\lambda_0\|\zeta\|^2
    \end{split}
    \end{align}
where $(a)$ follows from the Cauchy--Schwarz inequality, $(b)$ follows since $x_h^k$ and $x'_h{}^k$ have the same marginal distribution and $(c)$ follows from Assumption \ref{assum:critic_positive_definite}.
\end{proof}



\subsection{Proof of Lemma~\ref{lem:critic-second_order}}
\label{sec:lem4-proof}

We begin by introducing some notation. Let $\Lambda_A$, $\Lambda_b$, $\delta_A$, and $\delta_b$ be positive constants such that $\|\mathbb{E}[A(x_h^k)] - A_k\| \leq \delta_A$, $\|\mathbb{E}[b(x_h^k)] - b_k\| \leq \delta_b$, $\|A(x_h^k)\| \leq \Lambda_A$, and $\|b(x_h^k)\| \leq \Lambda_b$. The values of these quantities are provided in Lemma~\ref{lem:critic_bounds_supp}. 



We introduce an auxiliary sequence $\{ \tilde{\zeta}_h^k \}_{h \geq 0}$ that replaces the neural update $g$ with its linear approximation $\hat{g}$. Specifically, define

$$
\tilde{\zeta}_0^k = \zeta_0^k\equiv \zeta_0, \quad \tilde{\zeta}_{h+1}^k = \Pi_R\left( \tilde{\zeta}_h^k - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k) \right),
$$

where $\Pi_R$ denotes the projection onto the ball of radius $R$ centered at $\zeta_0$, and $\beta > 0$ is a step-size parameter.

 Let $\Pi_\perp$ denote the orthogonal projection onto $\ker(A_k)^\perp$. We now bound the expected discrepancy between the auxiliary and original iterates:


\begin{align}
\E\left\| \tilde{\zeta}_{h+1}^k - \zeta_{h+1}^k \right\|^2
&= \E\left\| \Pi_R\left( \zeta_h^k - \beta g(x_h^k; \zeta_h^k) \right) - \Pi_R\left( \tilde{\zeta}_h^k - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k) \right) \right\|^2 \\
&\leq \E\left\| \zeta_h^k - \beta g(x_h^k; \zeta_h^k) - \left( \tilde{\zeta}_h^k - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k) \right) \right\|^2 \\
&\leq \E\left\| \zeta_h^k - \beta \hat{g}(x_h^k; \zeta_h^k) - \left( \tilde{\zeta}_h^k - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k) \right) \right\|^2 + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&= \E\left\| \left( \zeta_h^k - \tilde{\zeta}_h^k \right) - \beta A(x_h^k) \left( \zeta_h^k - \tilde{\zeta}_h^k \right) \right\|^2 + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&= \E\left\| \zeta_h^k - \tilde{\zeta}_h^k \right\|^2 - 2\beta \E\left\langle \zeta_h^k - \tilde{\zeta}_h^k, A(x_h^k)(\zeta_h^k - \tilde{\zeta}_h^k) \right\rangle \\
&\quad + \beta^2 \E\left\| A(x_h^k)(\zeta_h^k - \tilde{\zeta}_h^k) \right\|^2 + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&\leq \E\left\| \zeta_h^k - \tilde{\zeta}_h^k \right\|^2 - 2\beta \E\left\langle \zeta_h^k - \tilde{\zeta}_h^k, A_k(\zeta_h^k - \tilde{\zeta}_h^k) \right\rangle \\
&\quad + 2\beta \delta_A \E\left\| \zeta_h^k - \tilde{\zeta}_h^k \right\|^2 + \beta^2 \E\left\| A(x_h^k)(\zeta_h^k - \tilde{\zeta}_h^k) \right\|^2 + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&\leq \E\left\| \zeta_h^k - \tilde{\zeta}_h^k \right\|^2 - 2\beta \E\left\langle \Pi_\perp(\zeta_h^k - \tilde{\zeta}_h^k), A_k \Pi_\perp(\zeta_h^k - \tilde{\zeta}_h^k) \right\rangle \\
&\quad + 2\beta R^2 \delta_A + \beta^2 \Lambda_A^2 \E\left\| \Pi_\perp(\zeta_h^k - \tilde{\zeta}_h^k)\right\|^2 + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&\leq \E\left\| \zeta_h^k - \tilde{\zeta}_h^k \right\|^2 + (\beta^2 \Lambda_A^2 - 2\beta \mu_A) \E\left\| \Pi_\perp(\zeta_h^k - \tilde{\zeta}_h^k) \right\|^2 \\
&\quad + 2\beta R^2 \delta_A + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&\leq \E\left\| \zeta_h^k - \tilde{\zeta}_h^k \right\|^2 + 2\beta R^2 \delta_A + \beta C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&\leq \E\left\| \zeta_0^k - \tilde{\zeta}_0^k \right\|^2 + 2\beta (h+1) R^2 \delta_A + \beta (h+1) C_2 m^{-1/2} \sqrt{\log(H/ \delta)} \\
&= 2(h+1)\beta R^2 \delta_A + \beta (h+1) C_2 m^{-1/2} \sqrt{\log(H/ \delta)}\\
&\leq 2(h+1)\beta R^2 c_1 T^{-\kappa} + \beta (h+1) C_2 m^{-1/2} \sqrt{\log(H/ \delta)},
\end{align}


where we used the non-expansiveness of $\Pi_R$ and the approximation error bound between $g$ and $\hat{g}$. Substituting $\beta=\frac{2\log H}{\lambda_0 (1-\gamma) H}$, we obtain the following result for all $h\in \{1,2,\cdots,H\}$:
$$
\E\left\| \tilde{\zeta}_{h}^k - \zeta_{h}^k \right\|^2 
\leq  \cO\left(  \frac{R^2 c_1\log H}{\lambda_0 (1-\gamma)T^{\kappa}} + \frac{C_2 \log H \sqrt{\log(H/ \delta)}}{\lambda_0 (1-\gamma) m^{1/2}} \right).
$$






\begin{lemma}
\label{lem:critic_fixed_point}
Let $Z_k := \{ z \in \mathbb{R}^d : z = A_k^\dagger b_k + v, \ v \in \ker(A_k) \}$ denote the set of least-squares solutions to $A_k z \approx b_k$, and let $\zeta_*^k$ be the projection of a fixed point $\zeta_0 \in \mathbb{R}^d$ onto $Z_k$. Then, under the update rule

$$
\tilde{\zeta}_h^k = \Pi_R\left( \tilde{\zeta}_{h-1}^k - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k) \right),
$$

where $\hat{g}(x_h^k; \tilde{\zeta}_h^k) \in \ker(A_k)^\perp$, it holds that

$$
\tilde{\zeta}_h^k - \zeta_*^k \in \ker(A_k)^\perp, \quad \text{for all } h \geq 0.
$$

\end{lemma}

\begin{proof}

The set $Z_k = A_k^\dagger b_k + \ker(A_k)$ is an affine subspace, and $\zeta_*^k$ is the projection of $\zeta_0$ onto $Z_k$. By the projection theorem for affine spaces, we have:

$$
\zeta_0 - \zeta_*^k \in \ker(A_k)^\perp.
$$

We proceed by induction on $h$.

Base case ($h = 0$): By initialization, $\tilde{\zeta}_0^k = \zeta_0$, hence

$$
\tilde{\zeta}_0^k - \zeta_*^k = \zeta_0 - \zeta_*^k \in \ker(A_k)^\perp.
$$

Inductive step: Assume that $\tilde{\zeta}_{h-1}^k - \zeta_*^k \in \ker(A_k)^\perp$. Define the intermediate iterate:

$$
\hat{\zeta}_h^k := \tilde{\zeta}_{h-1}^k - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k).
$$

Since $\hat{g}(x_h^k; \tilde{\zeta}_h^k) \in \ker(A_k)^\perp$ and $\tilde{\zeta}_{h-1}^k - \zeta_*^k \in \ker(A_k)^\perp$ by the inductive hypothesis, we conclude:

$$
\hat{\zeta}_h^k - \zeta_*^k = (\tilde{\zeta}_{h-1}^k - \zeta_*^k) - \beta \hat{g}(x_h^k; \tilde{\zeta}_h^k) \in \ker(A_k)^\perp.
$$

Now consider the projection operator $\Pi_R$, defined as:

$$
\Pi_R(\zeta) =
\begin{cases}
\zeta, & \text{if } \|\zeta - \zeta_0\| \leq R, \\
\zeta_0 + R \cdot \dfrac{\zeta - \zeta_0}{\|\zeta - \zeta_0\|}, & \text{otherwise}.
\end{cases}
$$

This operation returns a point on the line segment between $\zeta_0$ and $\zeta$, and since both $\zeta_0 - \zeta_*^k$ and $\hat{\zeta}_h^k - \zeta_*^k$ lie in $\ker(A_k)^\perp$, which is a linear subspace (and hence convex), we have:

$$
\Pi_R(\hat{\zeta}_h^k) - \zeta_*^k \in \ker(A_k)^\perp.
$$

Therefore, by definition of the update:

$$
\tilde{\zeta}_h^k = \Pi_R(\hat{\zeta}_h^k),
$$

we conclude that:

$$
\tilde{\zeta}_h^k - \zeta_*^k \in \ker(A_k)^\perp.
$$

This completes the proof.

\end{proof}

To derive the second-order error bound, we first note the following relations.
\begin{align*}
    \|&\tilde{\zeta}_{h+1}^k-\tilde{\zeta}_*^k\|^2\\
    &= \|\Pi_{R}(\tilde{\zeta}_{h}^k - \beta g(x_h^k;\tilde{\zeta}_h^k) ) - \Pi_R(\tilde{\zeta}_*^k)\|^2\\
    &\leq \|\tilde{\zeta}_{h}^k - \beta g(x_h^k;\tilde{\zeta}_h^k)  - \tilde{\zeta}_*^k\|^2\\
    &= \|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2- 2\beta \langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, g(x_h^k;\tilde{\zeta}_h^k)  \rangle +\beta^2\| g(x_h^k;\tilde{\zeta}_h^k) \|^2\\
    &= \|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2- 2\beta \langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, \hat{g}(x_h^k;\tilde{\zeta}_h^k)  \rangle- 2\beta\langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, g(x_h^k;\tilde{\zeta}_h^k) - \hat{g}(x_h^k;\tilde{\zeta}_h^k)\rangle +\beta^2\| g(x_h^k;\tilde{\zeta}_h^k) \|^2\\
    &\leq \|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2- 2\beta \langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, \hat{g}(x_h^k;\tilde{\zeta}_h^k)  \rangle- 2\beta\langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, g(x_h^k;\tilde{\zeta}_h^k) - \hat{g}(x_h^k;\tilde{\zeta}_h^k)\rangle +\beta^2 C_1^2 \log(H/\delta)\\
    &\overset{(a)}{\leq} \|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2 - 2\beta \langle \tilde{\zeta}_{h} ^k-\tilde{\zeta}_*^k, A_k(\tilde{\zeta}_h^k - \tilde{\zeta}_*^k) \rangle - 2\beta \langle \tilde{\zeta}_{h} ^k-\tilde{\zeta}_*^k, \hat{g}(x_h^k;\tilde{\zeta}_h^k)  - A_k(\tilde{\zeta}_h^k - \tilde{\zeta}_*^k) \rangle \\
    &\quad + 2\beta C_3 m^{-1/2} \log (H/\delta) + \beta^2 C_1^2 \log(H/\delta)\\
    &\overset{(b)}{\leq} \|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2 - 2\beta\lambda_0 (1-\gamma) \| \tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2 - 2\beta \langle \tilde{\zeta}_{h} ^k-\tilde{\zeta}_*^k, \hat{g}(x_h^k;\tilde{\zeta}_h^k)  - A_k(\tilde{\zeta}_h^k - \tilde{\zeta}_*^k) \rangle\\
    &\quad + 2\beta C_3 m^{-1/2} \log (H/\delta) + \beta^2 C_1^2 \log(H/\delta)
\end{align*}
where $(a)$ follows from Lemma \ref{lem:neural_td_supp}(a), $(b)$ follows from the fact that $A_k \succcurlyeq \lambda_0 (1-\gamma)I$ and Lemma \ref{lem:neural_td_supp} (a). Taking conditional expectation $\E_h$ on both sides, we obtain
\begin{align}
\label{eq:mid-exp-td}
    \E_h\left[\left\|\tilde{\zeta}_{h+1}^k-\tilde{\zeta}_*^k\right\|^2\right] &\leq (1- 2\beta\lambda_0 (1-\gamma) )\|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2 - 2\beta \langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, \E_h \left[\hat{g}(x_h^k;\tilde{\zeta}_h^k) - A_k(\tilde{\zeta}_h^k - \tilde{\zeta}_*^k)\right] \rangle \nonumber\\
    &\quad + 2\beta C_3 m^{-1/2} \log (H/\delta) + \beta^2 C_1^2 \log(H/\delta)
\end{align}

The second term in \eqref{eq:mid-exp-td} can be bounded as
\begin{align}
    -&\langle \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k, \E_h \left[\hat{g}(x_h^k;\tilde{\zeta}_h^k) - A_k(\tilde{\zeta}_h^k - \tilde{\zeta}_*^k)\right] \rangle \nonumber\\
    &\leq \frac{\lambda_0 (1-\gamma)}{4} \| \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k\|^2 + \frac{1}{\lambda_0 (1-\gamma)}\left\|\E_h [\hat{g}(x_h^k;\tilde{\zeta}_h^k) - A_k(\tilde{\zeta}_h^k - \tilde{\zeta}_*^k)]\right\|^2 \nonumber\\
    &\leq \frac{\lambda_0 (1-\gamma)}{4} \| \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k\|^2 + \dfrac{1}{\lambda_0 (1-\gamma)}\left\|\left\{\E_h[A(x_h^k)]-A_k\right\}\tilde{\zeta}_h^k + \bigg\{b_k-\E_h\left[b(x_h^k)\right]\bigg\}\right\|^2\nonumber\\
    &\leq \frac{\lambda_0 (1-\gamma)}{4} \| \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k\|^2 + \frac{2\delta_A^2\|\tilde{\zeta}_h^k\|^2+2\delta_b^2}{\lambda_0 (1-\gamma)}\nonumber\\
    &\leq \frac{\lambda_0 (1-\gamma)}{4} \| \tilde{\zeta}_{h}^k - \tilde{\zeta}_*^k\|^2 + \frac{4\delta_A^2\|\tilde{\zeta}_h^k-\tilde{\zeta}_*^k\|^2+4\delta_A^2\lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2 + 2\delta_b^2}{\lambda_0 (1-\gamma)}
\end{align}
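where the last inequality follows from $\|\tilde{\zeta}_h^k\|^2\leq 2\|\tilde{\zeta}_h^k-\tilde{\zeta}_*^k\|^2+2\|\tilde{\zeta}_*^k\|^2$ and $\|\tilde{\zeta}_*^k\|^2=\|A_k^{-1}b_k\|^2\leq \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2$. Substituting the above bounds in \eqref{eq:mid-exp-td},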
\begin{align*}
    &\E_h\left[\|\tilde{\zeta}_{h+1}^k-\tilde{\zeta}_*^k\|^2\right] \nonumber\\ 
    &\leq \left(1- \frac{3\beta\lambda_0 (1-\gamma)}{2} +\dfrac{8\beta\delta_A^2}{\lambda_0 (1-\gamma)}\right)\|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2  
    +\dfrac{4\beta}{\lambda_0 (1-\gamma)}\left[2\delta_A^2 \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2+\delta_b^2\right]\\
    &\quad + 2\beta C_3 m^{-1/2} \log (H/\delta) + \beta^2 C_1^2 \log(H/\delta)
\end{align*}
For $\delta_A \leq \lambda_0 (1-\gamma)/4$, we can modify the above inequality to the following. 
\begin{align*}
    &\E_h[\|\tilde{\zeta}_{h+1}^k-\tilde{\zeta}_*^k\|^2]  \\
    &\leq \left(1-{\beta \lambda_0 (1-\gamma)}\right)\|\tilde{\zeta}_{h}^k -\tilde{\zeta}_*^k\|^2 +\dfrac{4\beta}{\lambda_0 (1-\gamma)}\left[2\delta_A^2 \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2+\delta_b^2\right]+ 2 \beta C_3 m^{-1/2} \log (H/\delta) \\
    &+ \beta^2 C_1^2 \log(H/\delta)
\end{align*}
Taking expectation on both sides and unrolling the recursion yields
\begin{align*}
    &\E[\|\tilde{\zeta}_{H}^k - \tilde{\zeta}_*^k\|^2] \\
    &\leq \left(1-{\beta \lambda_0 (1-\gamma)}\right)^H\E\|\tilde{\zeta}_{0} -\tilde{\zeta}_*^k\|^2 \\
    &+ \sum_{h=0}^{H-1} \left(1-{\beta \lambda_0 (1-\gamma)}\right)^{h}\bigg\{\dfrac{4\beta}{\lambda_0 (1-\gamma)}\left[2\delta_A^2 \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2+\delta_b^2\right]+2 \beta C_3 m^{-1/2} \log (H/\delta) \\
    &+ \beta^2 C_1^2 \log(H/\delta)\bigg\}\\
    &\leq \exp\left(-{H\beta \lambda_0 (1-\gamma)}\right)\E\|\tilde{\zeta}_{0} -\tilde{\zeta}_*^k\|^2 \\
    &+ \frac{1}{\beta \lambda_0 (1-\gamma)}\left\{\dfrac{4\beta}{\lambda_0 (1-\gamma)}\left[2\delta_A^2 \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2+\delta_b^2\right]+2 \beta C_3 m^{-1/2} \log (H/\delta) + \beta^2 C_1^2 \log(H/\delta)\right\}\\
    &= \exp\left(-{H\beta \lambda_0 (1-\gamma)}\right)\E\|\tilde{\zeta}_{0} -\tilde{\zeta}_*^k\|^2 \\
    &+ \bigg\{4\lambda_0^{-2} (1-\gamma)^{-2}\left[2\delta_A^2 \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2+\delta_b^2\right]+ 2 \lambda_0^{-1} (1-\gamma)^{-1} C_3 m^{-1/2} \log (H/\delta) \\
    + &\beta \lambda_0^{-1} (1-\gamma)^{-1} C_1^2 \log(H/\delta)\bigg\}
\end{align*}
Substituting $\beta=\frac{2\log H}{\lambda_0 (1-\gamma) H}$ and using Lemma \ref{lem:critic_bounds_supp} yields
\begin{align*}
    &\E[\|\zeta_{H}^k - \zeta_*^k\|^2] \leq \cO \bigg( \frac{\E\|\tilde{\zeta}_{0} -\tilde{\zeta}_*^k\|^2}{H^2} +  \frac{\log^2 (H/\delta)}{\lambda_0^2 (1-\gamma)^2H}+ \frac{\log (H/\delta)}{\lambda_0 (1-\gamma)m^{1/2}} +\frac{1}{(1-\gamma)^4\lambda_0^4 T^{\kappa}}  \bigg)
\end{align*}


\subsection{Proof of Lemma 5}
\label{sec:lem5-proof}
Consider the critic update
\begin{align}
    \zeta_{h+1}^k = \Pi_{R}(\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k))
\end{align}
This can be rewritten as 
\begin{align}
\label{eq:bias-critic-recursion}
    \zeta_{h+1}^k = \zeta_{h}^k - \beta g(x_h^k;\zeta_h^k) + \epsilon_h^k
\end{align}
where $\epsilon_h^k = \Pi_R(\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k)) - (\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k))$. Note that if $\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k) \in S_R$, then $\epsilon_h^k = 0$, and if $\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k) \notin S_R$, we have the following:
\begin{align}
\|\epsilon_h^k\| = \|\Pi_R(\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k)) - (\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k))\| \overset{(a)}{\leq} \|\zeta_h^k - (\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k))\| \leq \beta \| g(x_h^k;\zeta_h^k)\|,
\end{align}

where $(a)$ follows since $\Pi_R(\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k))$ is the closest point to $\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k)$ in the set $S_R$ and $\zeta_h^k \in S_R$. This yields,
\begin{align}
    \|\epsilon_h^k\| \leq \beta \|g(x_h^k;\zeta_h^k)\| \mathbf{1}_{\{\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k) \notin S_R\}},
\end{align}
where $\mathbf{1}_A$ denotes the indicator function for event $A$. Taking expectation on both sides gives us
\begin{align}
    \E\|\epsilon_h^k\| \leq \beta C_1 \sqrt{\log (H/\delta)} \Pr (\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k) \notin S_R) \overset{(a)}{\leq} \beta C_1 \sqrt{\log (H/\delta)} \Pr \left(\zeta_{h}^k \notin S_{R-\beta C_1 \sqrt{\log (H/\delta)}}\right),
\end{align}

where $(a)$ follows from the fact that the event $\{\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k) \notin S_R\}$ is contained in the event $\{\zeta_{h}^k \notin S_{R-\beta C_1 \sqrt{\log (H/\delta)}}\}$. To see this, observe that if $\zeta_{h}^k - \beta g(x_h^k;\zeta_h^k)  \notin S_R$, then 
\begin{align}
    \|\zeta_{h}^k -\zeta_0\| \geq \|\zeta_h^k -\zeta_0 - \beta g(x_h^k;\zeta_h^k)\|-\|\beta g(x_h^k;\zeta_h^k)\| \geq R -\beta C_1 \sqrt{\log (H/\delta)}
\end{align}
We now bound $\Pr \left(\zeta_{h}^k \notin S_{R-\beta C_1 \sqrt{\log (H/\delta)}}\right)$ using Markov's inequality combined with the bound on $\E\|\zeta_h^k-\zeta_*^k\|^2$ obtained earlier.
\begin{align}
    \Pr\left(\|\zeta_h^k-\zeta_0\|\geq R-\beta C_1 \sqrt{\log (H/\delta)}\right)  &\leq \Pr\left(\|\zeta_h^k -\zeta_*^k\|+\|\zeta_0-\zeta_*^k\|\geq R-\beta C_1 \sqrt{\log (H/\delta)}\right) \nonumber\\
    &\leq \Pr(\|\zeta_h^k-\zeta_*^k\|\geq \bar{R})
\end{align}
where $\bar{R}\coloneqq (R/2)-\beta C_1 \sqrt{\log (H/\delta)}$ and $R$ is chosen such that $\zeta_*^k \in S_{R/2}$. Since $\Pr(\|\zeta_h^k-\zeta_*^k\|\geq \bar{R}) \leq \frac{1}{\bar{R}^2}\cdot\E\|\zeta_h^k-\zeta_*^k\|^2$, it follows that
\begin{align}
    \|\E[\epsilon_h^k]\| \leq \E\|\epsilon_h^k\| \leq \cO \bigg( \frac{\beta \E\|\zeta_{0} -\zeta_*^k\|^2}{H^2} +  \frac{\beta \log^2 (H/\delta)}{\lambda_0^2 (1-\gamma)^2H}+ \frac{\beta \log (H/\delta)}{\lambda_0 (1-\gamma)m^{1/2}} +\frac{\beta }{(1-\gamma)^4\lambda_0^4 T^{\kappa}} \bigg)
\end{align}
The term $\epsilon_h^k$ arising from the projection operator can now be viewed as a small error term. Taking the expectation given the policy parameter $\theta$ and the square norm in \eqref{eq:bias-critic-recursion}, we obtain
\begin{align}
    &\|\E[\zeta_{h+1}^k] - \zeta_*^k\|^2 \nonumber\\
    &=  \|\E[\zeta_{h}^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)]\|^2 + \|\E[\epsilon_h^k]\|^2 + 2\langle \E[\zeta_{h}^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)],\E[\epsilon_h^k] \rangle
\end{align}

Note that
\begin{align}
    2\langle \E[\zeta_{h}^k]-&\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)],\E[\epsilon_h^k] \rangle \nonumber \\
    &\leq  \frac{\beta \lambda_0 (1-\gamma)}{2} \|\E[\zeta_{h}^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)]\|^2+\frac{2}{\beta \lambda_0 (1-\gamma)}\|\E[\epsilon_h^k]\|^2
\end{align}

Thus, combining the above inequalities
\begin{align}
\label{eq:nu_h_recursion_mid}
    &\|\E[\zeta_{h+1}^k] - \zeta_*^k\|^2 \nonumber \\
    \leq & \left(1+\frac{\beta \lambda_0 (1-\gamma) }{2}\right)\|\E[\zeta_{h}^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)]\|^2 + \left(1+\frac{2}{\beta \lambda_0 (1-\gamma)}\right)\|\E[\epsilon_h^k]\|^2 
\end{align}

We now focus on bounding $\|\E[\zeta_{h}^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)]\|^2$. Observe the following
\begin{align}
\label{eq_appndx_lemma_exp_x_h_recursion}
\begin{split}
    &\|\E[\zeta_{h}^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)]\|^2 \\
    &= \|\E[\zeta_{h}^k] -\zeta_*^k\|^2- 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[g(x_h^k;\zeta_h^k)] \rangle +\beta^2\| \E[g(x_h^k;\zeta_h^k)] \|^2\\
    &= \|\E[\zeta_{h}^k] -\zeta_*^k\|^2- 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[\hat{g}(x_h^k;\zeta_h^k)] \rangle - 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[g(x_h^k;\zeta_h^k)]-\E[\hat{g}(x_h^k;\zeta_h^k)] \rangle \\
    &\quad+ \beta^2 \|\E[g(x_h^k;\zeta_h^k)]-\E[\hat{g}(x_h^k;\zeta_h^k)]+\E[\hat{g}(x_h^k;\zeta_h^k)]\|^2 \\
    &\leq \|\E[\zeta_{h}^k] -\zeta_*^k\|^2- 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[\hat{g}(x_h^k;\zeta_h^k)] \rangle - 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[g(x_h^k;\zeta_h^k)]-\E[\hat{g}(x_h^k;\zeta_h^k)] \rangle \\
    &\quad+ 2\beta^2 \|\E[g(x_h^k;\zeta_h^k)]-\E[\hat{g}(x_h^k;\zeta_h^k)]\|^2+2\beta^2\|\E[\hat{g}(x_h^k;\zeta_h^k)]\|^2\\
    &\overset{(a)}{\leq} \|\E[\zeta_{h}^k] -\zeta_*^k\|^2- 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[\hat{g}(x_h^k;\zeta_h^k)] \rangle + 2\beta C_3  \sqrt{\log(H/\delta)} m^{-1/2} \\
    &\quad+ 2\beta^2 C_2^2 \log(H/\delta) m^{-1}+2\beta^2\|\E[\hat{g}(x_h^k;\zeta_h^k)]\|^2\\
    &\leq \|\E[\zeta_{h}^k] -\zeta_*^k\|^2- 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, A_k(\E[\zeta_h^k] - \zeta_*^k) \rangle - 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[\hat{g}(x_h^k;\zeta_h^k)]-A_k(\E[\zeta_h^k] - \zeta_*^k) \rangle \\
    &\quad+ 2\beta C_3  \sqrt{\log(H/\delta)} m^{-1/2}  + 2\beta^2 C_2^2 \log(H/\delta) m^{-1}+2\beta^2\|A_k(\E[\zeta_h^k] - \zeta_*^k)\|^2\\
    &\quad +2\beta^2\|\E[\hat{g}(x_h^k;\zeta_h^k)]- A_k(\E[\zeta_h^k] - \zeta_*^k)\|^2\\
    &\overset{(b)}{\leq} (1-2\beta\lambda_0 (1-\gamma)+2\Lambda_A^2\beta^2)\|\E[\zeta_{h}^k] -\zeta_*^k\|^2- 2\beta \langle \E[\zeta_{h}^k] - \zeta_*^k, \E[\hat{g}(x_h^k;\zeta_h^k)]-A_k(\E[\zeta_h^k] - \zeta_*^k) \rangle  \\
    &\quad+  2\beta C_3  \sqrt{\log(H/\delta)} m^{-1/2}+ 2\beta^2 C_2^2 \log(H/\delta) m^{-1}+2\beta^2\|\E[\hat{g}(x_h^k;\zeta_h^k)]- A_k(\E[\zeta_h^k] - \zeta_*^k)\|^2\\
\end{split}
\end{align}
where $(a)$ follows from Lemma \ref{lem:neural_td_supp}, while $(b)$ follows from the fact that $\|A_k\|\leq \Lambda_A$ and $A_k \succcurlyeq \lambda_0 (1-\gamma) I$. The last term in the last line of \eqref{eq_appndx_lemma_exp_x_h_recursion} can be bounded as follows.
\begin{align*}
    \| &\E[\hat{g}(x_h^k;\zeta_h^k)] - (A_k\E[\zeta_h^k] - b_k)\|^2 \\
    &= \left\| \E\left[(\E[A(x_h^k)] - A_k)(\zeta_h^k - \zeta_*^k)\right] + (\E[A(x_h^k)] - A_k)\zeta_*^k + (b_k-\E[b(x_h^k)])\right\|^2\\
    &\leq 3\E\left[\|\E[A(x_h^k)] - A_k\|^2 \|\zeta_h^k - \zeta_*^k\|^2\right] + 3\E\left[\|\E[A(x_h^k)] - A_k\|^2 \right]\|\zeta_*^k\|^2 + 3\left\|b_k-\E[b(x_h^k)]\right\|^2\\
    &\leq 3\delta_{A}^2\E\left[\|\zeta_h^k- \zeta_*^k\|^2\right] + 3\lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2\delta_{A}^2 + 3\bar{\delta}_b^2
\end{align*}
The second term in the last line of \eqref{eq_appndx_lemma_exp_x_h_recursion} can be bounded as follows.
\begin{align*}
    -\langle& \E[\zeta_h^k] - \zeta_*^k, \E_h \left[\E[\hat{g}(x_h^k;\zeta_h^k)] - A_k(\E[\zeta_h^k] - \zeta_*^k)\right] \rangle  \nonumber\\
    &\quad\leq \frac{\lambda_0 (1-\gamma)}{4} \| \E[\zeta_h^k] - \zeta_*^k\|^2 + \frac{1}{\lambda_0 (1-\gamma)}\left\|\E[\hat{g}(x_h^k;\zeta_h^k)] - A_k(\E[\zeta_h^k] - \zeta_*^k)\right\|^2 \nonumber\\
    &\quad\leq \frac{\lambda_0 (1-\gamma)}{4} \|\E[\zeta_h^k] - \zeta_*^k\|^2 + \dfrac{3}{\lambda_0 (1-\gamma)}\left[\delta_{A}^2\E\|\zeta_h^k- \zeta_*^k\|^2 + \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2\delta_{A}^2 + \bar{\delta}_b^2\right]\nonumber
\end{align*}

Substituting the above bounds in \eqref{eq_appndx_lemma_exp_x_h_recursion}, we obtain the following bound
\begin{align*}
    &\|\E[\zeta_h^k]-\zeta_*^k - \beta \E[g(x_h^k;\zeta_h^k)]\|^2 \\
    &\leq \left(1-\dfrac{3\beta\lambda_0 (1-\gamma)}{2}+2\Lambda_A^2\beta^2\right)\|\E[\zeta_h^k] - \zeta_*^k\|^2 + 2\beta C_3  \sqrt{\log(H/\delta)} m^{-1/2}\\
    &+ 2\beta^2 C_2^2 \log(H/\delta) m^{-1}+6\beta\left(\beta+\dfrac{1}{\lambda_0 (1-\gamma)}\right)\left[\delta_{A}^2\E\|\zeta_h^k- \zeta_*^k\|^2 + \lambda_0^{-2} (1-\gamma)^{-2}\Lambda_b^2\delta_{A}^2 + \bar{\delta}_b^2\right]
\end{align*}

Combining \eqref{eq:nu_h_recursion_mid} with the above bound yields the following result.
\begin{align*}
    &\|\E[\zeta_{h+1}^k]-\zeta_*^k \|^2 \\
    &\leq \left(1-\dfrac{3\beta\lambda_0 (1-\gamma)}{2}+2\Lambda_A^2\beta^2\right)\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\|\E[\zeta_h^k] - \zeta_*^k\|^2 \\
    &+ \left(2\beta+\beta^2\lambda_0 (1-\gamma)\right)C_3  \sqrt{\log(H/\delta)} m^{-1/2}\\
    &\quad+6\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\left(\beta^2+\dfrac{\beta}{\lambda_0 (1-\gamma)}\right)\bigg[\delta_{A}^2\E\|\zeta_h^k- \zeta_*^k\|^2 \\
    &+ \lambda_0^{-2} (1-\gamma)^{-2}\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\Lambda_b^2\delta_{A}^2 + \bar{\delta}_b^2\bigg]\\
    &\quad+\left(1+\frac{2}{\beta \lambda_0 (1-\gamma)}\right)\|\E[\epsilon_h^k]\|^2+ 2\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\beta^2 C_2^2 \log(H/\delta) m^{-1}\\
    &\leq \left(1-\beta\lambda_0 (1-\gamma)+\Lambda_A^2 \lambda_0 (1-\gamma) \beta^3\right)\|\E[\zeta_h^k] - \zeta_*^k\|^2 + 2\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\beta C_3  \sqrt{\log(H/\delta)} m^{-1/2}\\
    &\quad+6\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\beta\left(\beta+\dfrac{1}{\lambda_0 (1-\gamma)}\right)\left[\delta_{A}^2 R^2 + \lambda_0^{-2} (1-\gamma)^{-2}\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\Lambda_b^2\delta_{A}^2 + \bar{\delta}_b^2\right]\\
    &\quad+\left(1+\frac{2}{\beta \lambda_0 (1-\gamma)}\right)\|\E[\epsilon_h^k]\|^2+ 2\left(1+\dfrac{\beta\lambda_0 (1-\gamma)}{2}\right)\beta^2 C_2^2 \log(H/\delta) m^{-1}\\
    &\coloneqq \left(1-\beta\lambda_0 (1-\gamma)+\Lambda_A^2 \lambda_0 (1-\gamma) \beta^3\right)\|\E[\zeta_h^k] - \zeta_*^k\|^2 + \Delta_h^k
\end{align*}

If $\beta<1/(2\Lambda_A)$, the above bound implies the following.
\begin{align*}
    \|\E[\zeta_{h+1}^k]-\zeta_*^k \|^2
    \leq \left(1-\frac{\beta\lambda_0 (1-\gamma)}{2}\right)\|\E[\zeta_h^k] - \zeta_*^k\|^2 + \Delta_h^k
\end{align*}

Unrolling the recursion, we obtain the following result.
\begin{align*}
    &\|\E[\zeta_{H}^k]-\zeta_*^k \|^2\\
    &\leq \left(1-\frac{\beta\lambda_0 (1-\gamma)}{2}\right)^H\|\E[\zeta_0] - \zeta_*^k\|^2 + \sum_{h=0}^{H-1}\left(1-\frac{\beta\lambda_0 (1-\gamma)}{2}\right)^{H-1-h}\Delta_h^k \\ 
    &\leq \left(1-\frac{\beta\lambda_0 (1-\gamma)}{2}\right)^H\|\E[\zeta_0] - \zeta_*^k\|^2 + \frac{2}{\beta \lambda_0 (1-\gamma)}\Delta_h^k \\ 
    &\leq \exp \left(-\frac{\beta\lambda_0 (1-\gamma) H}{2}\right)\|\E[\zeta_0] - \zeta_*^k\|^2 + \frac{2}{\beta \lambda_0 (1-\gamma)}\Delta_h^k \\
    &\leq \exp \left(-\frac{\beta\lambda_0 (1-\gamma) H}{2}\right)\|\E[\zeta_0] - \zeta_*^k\|^2 + \cO \left(\frac{2}{\beta \lambda_0 (1-\gamma)}\Delta_h^k \right)
\end{align*}
Substituting $\beta=\frac{2\log H}{\lambda_0 (1-\gamma)H}$, it follows that
\begin{align}
    \|\E[\zeta_{h+1}^k]-\zeta_*^k \|^2 \leq \cO \left(\frac{\|\E[\zeta_0] - \zeta_*^k\|^2}{\lambda_0^2(1-\gamma)^2H^2} +  \frac{\log^4 (H/\delta)}{\lambda_0^6 (1-\gamma)^6H^2}+ \frac{\sqrt{\log(H/\delta)}}{ \lambda_0 (1-\gamma) m^{1/2}} +\frac{1}{(1-\gamma)^{10}\lambda_0^{10} T^{2\kappa}}\right)
\end{align}

\section{Proof of Lemmas 2 and 3}
\label{sec:lem23-proof}

Recall that the block‐indexed NAC‐DD algorithm (Algorithm~\ref{alg:nacdd})
uses the NPG estimator evaluated once every \(M\) transitions. The NPG updates can be written as follows
\begin{align}
\omega_{H+h+1}^k = \omega_{H+h}^k - \eta\bigl(X(x_h^k)\,\omega_{H+h}^k - y(x_h^k)\bigr),
\label{eq:xt-block}
\end{align}
where
\begin{align}
X(x_h^k)
&\coloneqq
\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k|\bar{s}_h^k)\bigl(\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k|\bar{s}_h^k)\bigr)^{\!\top},
&
y(x_h^k)
&\coloneqq
Q\bigl(\phi(\bar{s}_h^k,\bar{a}_h^k);\zeta_H^k\bigr)\,\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k|\bar{s}_h^k),
\end{align}
with \(x_h^k = \phi(\bar{s}_h^k,\bar{a}_h^k)\).  Throughout, the
conditional expectation \(\mathbb{E}_{k,h}[\cdot]\) is over all the randomness from the \(h^{th}\) block in epoch \(k\) given the entire history prior to this block. In contrast, \(\mathbb{E}_{k}[\cdot]\) denotes the expectation given \(\theta_k\). For notational convenience, we henceforth denote \(\E[\cdot]\), \(\E_{k,h}[\cdot]\) and \(\E_k[\cdot]\) in place of \(\E[\cdot|\zeta_0]\), \(\E_{k,h}[\cdot| \zeta_0]\) and \(\E_k[\cdot| \zeta_0]\), respectively. Recall that $X(x_h^k)$ serves as an estimate of $F(\theta_k)$ and $y(x_h^k)$ serves as an estimate of $\nabla_\theta J(\theta_k)$.

The Fisher information matrix satisfies
$\mu I\preccurlyeq F(\theta_k)$ and $\|F(\theta_k)\|\leq G_1^2$ from Assumptions \ref{assump:FND_policy} and \ref{assump:score_func_bounds}, respectively. 
Furthermore, the norm of the policy gradient is bounded by $\|\nabla_\theta J(\theta_k)\|\le G_1/(1-\gamma)^2$ \citep{liu2020improved}.

In this section, we establish that, for each block $h$, there exist positive constants $\sigma_X^2$, $\delta_X^2$, $\sigma_y^2$, $\delta_y^2$, $\bar\delta_y^2$, $\Lambda_X$, $\Lambda_y$ such that the following bounds hold:

$$
\E_{k,h}\bigl\|X(x_h^k)-F(\theta_k)\bigr\|^2 \le \sigma_X^2,
\quad
\bigl\|\E_{k,h}[X(x_h^k)]-F(\theta_k)\bigr\|^2 \le \delta_X^2,\quad \|X(x_h^k)\|\leq \Lambda_X, \quad \|y(x_h^k)\|\leq \Lambda_y
$$

$$
\E_{k,h}\bigl\|y(x_h^k)-\nabla_\theta J(\theta_k)\bigr\|^2 \le \sigma_y^2,
\quad
\bigl\|\E_{k,h}[y(x_h^k)]-\nabla_\theta J(\theta_k)\bigr\|^2 \le \delta_y^2,
\quad
\bigl\|\E_k[y(x_h^k)]-\nabla_\theta J(\theta_k)\bigr\|^2 \le \bar\delta_y^2.
$$



Using these bounds, Theorem 2 of \cite{ganesh2024orderoptimal} can then be applied with

$$
P = F(\theta_k), 
\quad
q = \nabla_\theta J(\theta_k), 
\quad
\hat P_h = X(x_h^k), 
\quad
\hat q_h = y(x_h^k),
$$

and step size $\eta = \tfrac{2\log H}{\mu H}$,
yielding the desired mean‐square and bias guarantees for the iterates $\omega_{H}^k$.


From Assumption \ref{assump:FND_policy}, the eigenvalues of $F(\theta_k)$ are bounded below by $\mu$. The following lemma bounds the norm and the bias of the estimate $X(x_h^k)$.

\begin{lemma}
\label{lem:fisher_bounds_supp}
Under the assumptions of Theorem~\ref{thm:main-conv-rate}, for every epoch \(k\)
and block \(h\):
\begin{enumerate}
  \item \(\|X(x_h^k)\|\le G_1^2\).
  \item \(\|\mathbb{E}_{k,h}[X(x_h^k)] - F(\theta_k)\|^2\le G_1^4\,T^{-2\kappa}\).
\end{enumerate}
\end{lemma}
\begin{proof}
Part (1) follows directly from Assumption~\ref{assump:score_func_bounds}.
Part (2) follows by bounding the bias as in
Lemma~\ref{lem:critic_bounds_supp}.
\end{proof}

Since $\|X(x_h^k)\|,\|F(\theta_k)\|\le G_1^2$, it follows that $\E_{k,h} \|X(x_h^k)-F(\theta_k)\|^2\le 2G_1^4$. Separately, using Lemma \ref{lem:neural_td_supp}, we obtain \(\|y(x_h^k)\|\leq C_1'G_1\sqrt{\log(H/\delta)}\). Next, we bound the bias and second-order error of \(y(x_h^k)\), which also carries the
critic approximation error.

\begin{lemma}
\label{lemma:washim_2}
Fix epoch \(k\).  Under the assumptions of Theorem~\ref{thm:main-conv-rate}, for
each block \(h\):
\begin{align*}
\text{(a)}\quad
&\bigl\|\mathbb{E}_{k,h}[\,y(x_h^k)\,]-\nabla_{\theta}J(\theta_k)\bigr\|^2
\;\le\;
\tilde{\cO}\Bigl(\tfrac{\sigma_y^2}{T^{\kappa}}+\delta_y^2\Bigr),
\\[4pt]
\text{(b)}\quad
&\mathbb{E}_{k,h}\bigl[\|y(x_h^k)-\nabla_{\theta}J(\theta_k)\|^2\bigr]
\;\le\;
\tilde{\cO}\Bigl(\sigma_y^2+G_1^2(C_1')^2\log(H/\delta)\Bigr),
\end{align*}
where
\[
\sigma_y^2 = \tilde{\cO}\Bigl(\tfrac{G_1^2}{(1-\gamma)^4}\Bigr),\quad
\delta_y^2 = \tilde{\cO}\Bigl(G_1^2\|\zeta_H^k-\zeta^k_*\|^2 + m^{-1/2}
                + \tfrac{G_1^2\epsilon_{\mathrm{app}}}{(1-\gamma)^2}\Bigr).
\]
Moreover,
\begin{align*}
\text{(c)}\quad
\bigl\|\mathbb{E}_k[\,y(x_h^k)\,]-\nabla_{\theta}J(\theta_k)\bigr\|^2
\;\le\;
\tilde{\cO}\Bigl(\tfrac{\sigma_y^2}{T^{\kappa}}
      + \bar\delta_y^2\Bigr),
\end{align*}
with
\(\bar\delta_y^2=\tilde{\cO}\Bigl(G_1^2\|\mathbb{E}_k[\zeta^k_H]-\zeta^k_*\|^2
              +m^{-1/2} + G_1^2\epsilon_{\mathrm{app}}/(1-\gamma)^2\Bigr)\).
\end{lemma}
\begin{proof}
We expand
\begin{align*}
&\E_{k,h}[y(x_h^k)] - \nabla_{\theta}J(\theta_k)\\
&=
\E_{k,h}[Q(x_h^k;\zeta^k_H)\,\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]
- \nabla_{\theta}J(\theta_k)\\
&=\E_{k,h}[(Q(x_h^k;\zeta^k_H)-Q(\bar{s}_h^k,\bar{a}_h^k))\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]+\E_{k,h}[Q(\bar{s}_h^k,\bar{a}_h^k)\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]
- \nabla_{\theta}J(\theta_k),
\end{align*}
and decompose
\[
Q(x_h^k;\zeta^k_H)-Q(\bar{s}_h^k,\bar{a}_h^k)
= T_0 + T_1 + T_2 + T_3,
\]
where
\[
\begin{aligned}
T_0 &= Q(x_h^k;\zeta^k_*)-Q(\bar{s}_h^k,\bar{a}_h^k), &
T_1 &= Q(x_h^k;\zeta^k_H)-\hat Q(x_h^k;\zeta^k_H),\\
T_2 &= \hat Q(x_h^k;\zeta^k_*)-Q(x_h^k;\zeta^k_*), &
T_3 &= \hat Q(x_h^k;\zeta^k_H)-\hat Q(x_h^k;\zeta^k_*).
\end{aligned}
\]

We have \(\|T_1\|,\|T_2\|=\cO(m^{-1/2})\), which follows from the bounds on the linearization error, while
\(\|T_3\|=\cO(C_1\|\zeta_H^k-\zeta^k_*\|)\). 

To bound \(\|T_0\|\), first note that
\begin{align}
    (Q(x_h^k;\zeta^k_*)-Q(\bar{s}_h^k,\bar{a}_h^k))^2 \leq 2(Q(x_h^k;\zeta^k_*)-\widehat{Q}(x_h^k;\zeta^k_*))^2+2(\widehat{Q}(x_h^k;\zeta^k_*)-Q(\bar{s}_h^k,\bar{a}_h^k))^2
\end{align}
We have \((Q(x_h^k;\zeta^k_*)-\widehat{Q}(x_h^k;\zeta^k_*))^2 \leq \cO(m^{-1})\). Furthermore, from Appendix A.3 in \cite{ke2024an}, we have
% \begin{align}
% \begin{split}
%     \|\widehat{Q}(x_h^k;\zeta^k_*)-Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty} &\leq \|\widehat{Q}(x_h^k;\zeta^k_*)-\Pi_{\mathcal{F}_{R,m}}Q(\bar{s}_h^k,\bar{a}_h^k)+\Pi_{\mathcal{F}_{R,m}}Q(\bar{s}_h^k,\bar{a}_h^k)-Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty} \\
%     &\overset{(a)}{=} \|\Pi_{\mathcal{F}_{R,m}}\cT^{\pi_{\theta_k}}\widehat{Q}(x_h^k;\zeta^k_*)-\Pi_{\mathcal{F}_{R,m}}\cT^{\pi_{\theta_k}}Q(\bar{s}_h^k,\bar{a}_h^k)+\Pi_{\mathcal{F}_{R,m}}Q(\bar{s}_h^k,\bar{a}_h^k)-Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty}\\
%     &\leq \|\Pi_{\mathcal{F}_{R,m}}\cT^{\pi_{\theta_k}}\widehat{Q}(x_h^k;\zeta^k_*)-\Pi_{\mathcal{F}_{R,m}}\cT^{\pi_{\theta_k}}Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty}+\|\Pi_{\mathcal{F}_{R,m}}Q(\bar{s}_h^k,\bar{a}_h^k)-Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty}\\
%      &\overset{(b)}{\leq} \gamma\|\widehat{Q}(x_h^k;\zeta^k_*)-Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty}+\|\Pi_{\mathcal{F}_{R,m}}Q(\bar{s}_h^k,\bar{a}_h^k)-Q(\bar{s}_h^k,\bar{a}_h^k)\|_{\infty},
% \end{split}
% \end{align}
% where \((a)\) holds since \(\widehat{Q}(x_h^k;\zeta_*^k)\) is the fixed point of the projected Bellman operator \(\Pi_{\mathcal{F}_{R,m}}\cT^{\pi_{\theta_k}}\) and \(Q(\bar{s}_h^k,\bar{a}_h^k)\) is the fixed point of the Bellman operator \(\cT^{\pi_{\theta_k}}\). Separately, \((b)\) holds since \(\Pi_{\mathcal{F}_{R,m}}\cT^{\pi_{\theta_k}}\) is \(\gamma\)-contractive on \(\|\cdot\|_{\infty}\). Re-arranging, we obtain
\begin{align}
    \E\|\widehat{Q}(x_h^k;\zeta^k_*)-Q(\bar{s}_h^k,\bar{a}_h^k)\|^2&\leq \frac{1}{(1-\gamma)^2}\E\|\Pi_{\mathcal{F}_{R,m}}Q(\bar{s}_h^k,\bar{a}_h^k)-Q(\bar{s}_h^k,\bar{a}_h^k)\|^2 \leq \frac{\epsilon_{\mathrm{app}}}{(1-\gamma)^2}
\end{align}

Using arguments as in Lemma \ref{lem:critic_bounds_supp}, we obtain 
\(\|\mathbb{E}_{k,h}[Q(\bar{s}_h^k,\bar{a}_h^k)\nabla\log\pi_{\theta_k}(\bar{a}_h^k|\bar{s}_h^k)]-\nabla_\theta J(\theta_k)\|\le G_1((1-\gamma)T^{\kappa})^{-1}\), which  
yields part (a).  Part (b) follows easily using the bounds on \(\|\nabla_\theta J(\theta_k)\|\) and $\|y(x_h^k)\|$. For part (c), note that 
\begin{align*}
&\E_{k}[y(x_h^k)] - \nabla_{\theta}J(\theta_k)\\
&=\E_{k}[(T_0+T_1+T_2+T_3)\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]+\E_{k}[Q(\bar{s}_h^k,\bar{a}_h^k)\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]
- \nabla_{\theta}J(\theta_k)
\end{align*}
and \(\|\E_{k}[T_3\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]\|=\|\E_{k,h}[\E_k[\hat Q(x_h^k;\zeta^k_H)-\hat Q(x_h^k;\zeta^k_*)]\nabla_{\theta}\log\pi_{\theta_k}(\bar{a}_h^k| \bar{s}_h^k)]\|\leq C_1G_1\|\E_k[\zeta_H^k]-\zeta_*^k\|\). The bounds for the remaining terms follow from the bounds in part (a). 
\end{proof}

We can now invoke Theorem 2 in \cite{ganesh2024orderoptimal} to obtain
\begin{align}
    \E_k\|\omega_k-\omega_k^*\|^2\leq \cO\left(\frac{G_1^4}{H\mu^{4}(1-\gamma)^4}+\frac{G_1^2(C_1')^2\log(H/\delta)}{H\mu^{2}(1-\gamma)^4}+\mu^{-2}G_1^2\E\|\zeta_H^k-\zeta_*^k\|^2+\mu^{-2}m^{-1/2}+\frac{G_1^2 \epsilon_{\mathrm{app}}}{\mu^2(1-\gamma)^2}\right)
\end{align}
and
\begin{align}
   \| \E_k[\omega_k]-\omega_k^*\|^2\leq \cO\left(\frac{G_1^2(C_1')^2G_1^2\log(H/\delta)}{T^\kappa}+\|\E[\zeta_{H}^k]-\zeta_*^k\|^2+\frac{G_1^2\epsilon_{\mathrm{app}}}{\mu^2(1-\gamma)^2}\right)
\end{align}



\section{Proof of Theorem \ref{thm:main-conv-rate}}
\label{sec:final-bound}
Recall that the global convergence of any update of the form $\theta_{k+1}=\theta_{k}+\alpha \omega_k$ can be bounded as
\begin{equation}
\label{eq:useful-lem}
			\begin{split}
			J^{*}-\frac{1}{K}\sum_{k=0}^{K-1}&\E[J(\theta_k)]\leq \frac{\sqrt{\epsilon_{\mathrm{bias}}}}{1-\gamma} +\frac{G_1}{K}\sum_{k=0}^{K-1}\E\Vert(\E\left[\omega_k|\theta_k\right]-\omega^*_k)\Vert+\dfrac{\alpha G_2}{K}\sum_{k=0}^{K-1}\E\Vert \omega_k -\omega_k^*\Vert^2\\
            & + \dfrac{\alpha \mu^{-2}}{ K}\sum_{k=0}^{K-1}\E\Vert \nabla_{\theta} J(\theta_k) \Vert^2
            +\frac{1}{\alpha K}\E_{s\sim d^{\pi^*}}[\mathrm{KL}(\pi^*(\cdot\vert s)\Vert\pi_{\theta_0}(\cdot\vert s))].		\end{split}
		\end{equation}
 


We note that our algorithm updates \(\theta\) at each iteration \(k\) using \(\omega_H\) and \(\zeta_H\) obtained after \(H\) iterations of the NPG and critic estimation inner loops. Therefore, we use \(\omega_H\) and \(\zeta_H\) instead of \(\omega_k\) and \(\zeta^k_H\). We begin by deriving a bound for \(\textstyle{\frac{1}{K}\sum_{k=0}^{K-1}\Vert\nabla_\theta J(\theta_k)\Vert^2}\). It is known that \(J\) is \(L_J\)-smooth with $L_J\coloneqq\frac{G_2}{(1-\gamma)^2}+\frac{2G_1^2}{(1-\gamma)^3}$ \citep[Lemma~B.1]{liu2020improved}. With this, we obtain:
\begin{align}
\label{eq:eq_26}
\begin{split}
    &J(\theta_{k+1})\\
    &\geq J(\theta_k)+\left<\nabla_\theta J(\theta_k),\theta_{k+1}-\theta_k\right>-\frac{L_J}{2}\Vert\theta_{k+1}-\theta_k\Vert^2\\
    &=J(\theta_k)+\alpha\left<\nabla_\theta J(\theta_k),\omega_k\right>-\frac{\alpha^2 L_J}{2}\Vert \omega_k\Vert^2\\
    &=J(\theta_k)+\alpha\left<\nabla_\theta J(\theta_k),\omega_k^*\right>+\alpha\left<\nabla_\theta J(\theta_k),\omega_k-\omega_k^*\right>-\frac{\alpha^2 L_J}{2}\Vert \omega_k-\omega_k^*+\omega_k^*\Vert^2\\
    &\overset{(a)}{\geq} J(\theta_k) +\alpha\left<\nabla_\theta J(\theta_k),F(\theta_k)^{-1}\nabla_\theta J(\theta_k)\right> +\alpha\left<\nabla_\theta J(\theta_k),\omega_k-\omega_k^*\right> \\
    &\quad-\alpha^2 L_J \Vert \omega_k-\omega_k^*\Vert^2 - \alpha^2L_J\Vert \omega_k^*\Vert^2\\
    &\overset{(b)}{\geq} J(\theta_k) +\dfrac{\alpha}{G_1^2}\Vert\nabla_\theta J(\theta_k)\Vert^2 +\alpha\left<\nabla_\theta J(\theta_k),\omega_k-\omega_k^*\right> -\alpha^2 L_J \Vert \omega_k-\omega_k^*\Vert^2 - \alpha^2L_J\Vert \omega_k^*\Vert^2\\
    &= J(\theta_k) +\dfrac{\alpha}{2G_1^2}\Vert\nabla_\theta J(\theta_k)\Vert^2 + \dfrac{\alpha}{2G_1^2}\left[\Vert\nabla_\theta J(\theta_k)\Vert^2 +2G_1^2 \left<\nabla_\theta J(\theta_k),\omega_k-\omega_k^*\right>+G_1^4\Vert \omega_k-\omega_k^*\Vert^2\right] \\
    &\quad-\left(\dfrac{\alpha G_1^2}{2}+\alpha^2 L_J\right) \Vert \omega_k-\omega_k^*\Vert^2 - \alpha^2L_J\Vert \omega_k^*\Vert^2 \\
    &=J(\theta_k) +\dfrac{\alpha}{2G_1^2}\Vert\nabla_\theta J(\theta_k)\Vert^2 + \dfrac{\alpha}{2G_1^2}\Vert\nabla_\theta J(\theta_k)+G_1^2(\omega_k-\omega_k^*)\Vert^2-\left(\dfrac{\alpha G_1^2}{2}+\alpha^2 L_J\right) \Vert \omega_k-\omega_k^*\Vert^2 \\
    &\quad- \alpha^2L_J\Vert \omega_k^*\Vert^2 \\
    &\geq J(\theta_k) +\dfrac{\alpha}{2G_1^2}\Vert\nabla_\theta J(\theta_k)\Vert^2 -\left(\dfrac{\alpha G_1^2}{2}+\alpha^2 L_J\right) \Vert \omega_k-\omega_k^*\Vert^2 - \alpha^2L_J\Vert F(\theta_k)^{-1}\nabla_\theta J(\theta_k)\Vert^2 \\
    &\overset{(c)}{\geq} J(\theta_k) +\left(\dfrac{\alpha}{2G_1^2}-\dfrac{\alpha^2 L_J}{\mu^2}\right)\Vert\nabla_\theta J(\theta_k)\Vert^2 -\left(\dfrac{\alpha G_1^2}{2}+\alpha^2 L_J\right) \Vert \omega_k-\omega_k^*\Vert^2  
\end{split}
\end{align}
 where $(a)$ uses the inequality $\Vert u+v\Vert^2\leq 2\Vert u\Vert^2+2\Vert v\Vert^2$ (a consequence of the Cauchy--Schwarz inequality) and the definition $\omega_k^*=F(\theta_k)^{-1}\nabla_\theta J(\theta_k)$. Inequalities $(b)$ and $(c)$ follow from Assumptions~\ref{assump:score_func_bounds}(a) and~\ref{assump:FND_policy}, respectively. We take the above inequality, sum over $k=0,\dots, K-1$, rearrange the terms, and substitute $\alpha = \frac{\mu^2}{4G_1^2L_J}$ to obtain:
 \begin{align}
 \label{eq:local-temp}
 \begin{split}
     \dfrac{\mu^2}{16G_1^4 L_J}\left(\dfrac{1}{K}\sum_{k=0}^{K-1}\Vert\nabla_\theta J(\theta_k)\Vert^2\right)&\leq \dfrac{J(\theta_K)-J(\theta_0)}{K} + \left(\dfrac{\mu^2}{8L_J}+\dfrac{\mu^4}{16G_1^4 L_J}\right)\left(\dfrac{1}{K}\sum_{k=0}^{K-1}\Vert \omega_k-\omega_k^*\Vert^2\right)\\
     &\overset{(a)}{\leq} \dfrac{2}{(1-\gamma)K}+\left(\dfrac{\mu^2}{8L_J}+\dfrac{\mu^4}{16G_1^4 L_J}\right)\left(\dfrac{1}{K}\sum_{k=0}^{K-1}\Vert \omega_k-\omega_k^*\Vert^2\right)
 \end{split}
 \end{align}
where $(a)$ uses the fact that $\lvert J(\cdot)\rvert$ is bounded above by $(1-\gamma)^{-1}$. Multiplying both sides of \eqref{eq:local-temp} by $16G_1^4L_J/\mu^{4}$, we obtain
\begin{align}
\label{eq:bound-2}
 \begin{split}
     \dfrac{\mu^{-2}}{K}\left(\sum_{k=0}^{K-1}\Vert\nabla_\theta J(\theta_k)\Vert^2\right)&\leq  \dfrac{32L_JG_1^4}{\mu^{4}(1-\gamma) K}+\left(\dfrac{2G_1^4}{\mu^2}+1\right)\left(\dfrac{1}{K}\sum_{k=0}^{K-1}\Vert \omega_k-\omega_k^*\Vert^2\right).
 \end{split}
 \end{align}

Substituting Lemma~\ref{lem:critic-second_order} into Lemma~\ref{lem:npg-second_order}, we obtain
\begin{align}
\label{eq:bound-3}
\begin{split}
\E_k\bigl\|\omega_k-\omega_k^*\bigr\|^2 
&\le \cO\Biggl(\,
\frac{G_1^4}{H\,\mu^{4}(1-\gamma)^4}
\;+\;
\frac{G_1^2(C_1')^2\,\log(H/\delta)}{H\,\mu^{2}(1-\gamma)^4}
\;+\;
\frac{1}{\mu^2\,m^{1/2}}
\;+\;
\frac{G_1^2\,\epsilon_{\mathrm{app}}}{\mu^2\,(1-\gamma)^2}
\\
&\qquad\quad
+\;\frac{G_1^2}{\mu^2}\,\frac{\E\|\tilde\zeta_0-\tilde\zeta_*^k\|^2}{H^2}
\;+\;
\frac{G_1^2}{\mu^2}\,\frac{\log^2(H/\delta)}{\lambda_0^2(1-\gamma)^2\,H}
\;+\;
\frac{G_1^2}{\mu^2}\,\frac{\log(H/\delta)}{\lambda_0(1-\gamma)\,m^{1/2}}
\\
&\qquad\quad
+\;
\frac{G_1^2}{\mu^2}\,\frac{1}{\lambda_0^4(1-\gamma)^4\,T^{\kappa}}
\Biggr)\,,
\end{split}
\end{align}
and
\begin{align}
\label{eq:bound-4}
\begin{split}
\bigl\|\E_k[\omega_k]-\omega_k^*\bigr\|^2
&\le \cO\Biggl(\,
\frac{G_1^4\,(C_1')^2\,\log(H/\delta)}{T^{\kappa}}
\;+\;
\frac{\|\E[\zeta_0]-\zeta_*^k\|^2}{\lambda_0^2(1-\gamma)^2\,H^2}
\;+\;
\frac{\log^4(H/\delta)}{\lambda_0^6(1-\gamma)^6\,H^2}
\\
&\qquad\quad
+\;
\frac{\sqrt{\log(H/\delta)}}{\lambda_0(1-\gamma)\,m^{1/2}}
\;+\;
\frac{1}{\lambda_0^{10}(1-\gamma)^{10}\,T^{2\kappa}}
\;+\;
\frac{G_1^2\,\epsilon_{\mathrm{app}}}{\mu^2\,(1-\gamma)^2}
\Biggr)\,. 
\end{split}
\end{align}



 Now combining \eqref{eq:bound-2}, \eqref{eq:bound-3} and \eqref{eq:bound-4} with \eqref{eq:useful-lem}, and substituting $K=\sqrt{T}$, $M=2t_{\mathrm{mix}}\lfloor\log T\rfloor$ and $H=(\sqrt{T})/M$, we obtain the following bound
\begin{align*}
J^* - \frac{1}{K}\sum_{k=0}^{K-1}\E[J(\theta_k)]
&\le \frac{\sqrt{\epsilon_{\mathrm{bias}}}}{1-\gamma}
  + \frac{G_1^2 \sqrt{\epsilon_{\mathrm{app}}}}{\mu(1-\gamma)}
  + \cO \left(\frac{t_{\mathrm{mix}}\,\log^3 (T/\delta)}{(1-\gamma)^3\sqrt T}
      +\frac{1}{m^{1/4}(1-\gamma)^{1/2}}
      +\frac{\E_{s\sim d^{\pi^*}}\bigl[\mathrm{KL}(\pi^*\|\pi_{\theta_0})\bigr]}{\sqrt T}
    \right).
\end{align*}



\end{document}
