\begin{thebibliography}{65}
% Fallback definitions for macros used by the entries below. \providecommand
% only defines a macro that is not already defined, so existing definitions win.
% \natexlab: passes its argument through unchanged; the entries use it for
% year suffixes such as 2020{\natexlab{a}}.
\providecommand{\natexlab}[1]{#1}
% \url: fallback that typesets its argument in typewriter type.
\providecommand{\url}[1]{\texttt{#1}}
% \doi: two variants, chosen by whether \urlstyle is defined.
% (\csname urlstyle\endcsname is \relax when \urlstyle is undefined.)
%   - undefined: \doi takes one argument and prints it after "doi: ".
%   - defined:   \doi prints "doi: ", opens a group, selects \urlstyle{rm} and
%                hands over to \Url. NOTE(review): presumably \Url reads the
%                argument and closes the group -- confirm against the url package.
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Agarwal et~al.(2019)Agarwal, Kakade, Lee, and
  Mahajan]{agarwal2019theory}
Alekh Agarwal, Sham~M. Kakade, Jason~D. Lee, and Gaurav Mahajan.
\newblock On the theory of policy gradient methods: Optimality, approximation,
  and distribution shift.
\newblock \emph{arXiv preprint arXiv:1908.00261}, 2019.

\bibitem[Baird(1995)]{baird1995residual}
Leemon Baird.
\newblock Residual algorithms: Reinforcement learning with function
  approximation.
\newblock In \emph{Machine Learning Proceedings 1995}, pages 30--37. Elsevier,
  1995.

\bibitem[Barth-Maron et~al.(2018)Barth-Maron, Hoffman, Budden, Dabney, Horgan,
  TB, Muldal, Heess, and Lillicrap]{maron2018distributed}
Gabriel Barth-Maron, Matthew~W. Hoffman, David Budden, Will Dabney, Dan Horgan,
  Dhruva TB, Alistair Muldal, Nicolas Heess, and Timothy Lillicrap.
\newblock Distributed distributional deterministic policy gradients.
\newblock In \emph{International Conference on Learning Representations
  (ICLR)}, 2018.

\bibitem[Baxter and Bartlett(2001)]{baxter2001infinite}
Jonathan Baxter and Peter~L Bartlett.
\newblock Infinite-horizon policy-gradient estimation.
\newblock \emph{Journal of Artificial Intelligence Research}, 15:\penalty0
  319--350, 2001.

\bibitem[Bertsekas(1975)]{bertsekas1975convergence}
Dimitri Bertsekas.
\newblock Convergence of discretization procedures in dynamic programming.
\newblock \emph{IEEE Transactions on Automatic Control}, 20\penalty0
  (3):\penalty0 415--419, 1975.

\bibitem[Bhandari and Russo(2019)]{bhandari2019global}
Jalaj Bhandari and Daniel Russo.
\newblock Global optimality guarantees for policy gradient methods.
\newblock \emph{arXiv preprint arXiv:1906.01786}, 2019.

\bibitem[Bhandari et~al.(2018)Bhandari, Russo, and Singal]{Bhandari2018finite}
Jalaj Bhandari, Daniel Russo, and Raghav Singal.
\newblock A finite time analysis of temporal difference learning with linear
  function approximation.
\newblock In \emph{Conference on Learning Theory (COLT)}, 2018.

\bibitem[Bhatnagar et~al.(2008)Bhatnagar, Ghavamzadeh, Lee, and
  Sutton]{bhatnagar2008incremental}
S.~Bhatnagar, M.~Ghavamzadeh, M.~Lee, and R.~S. Sutton.
\newblock Incremental natural actor-critic algorithms.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 105--112, 2008.

\bibitem[Bhatnagar(2010)]{bhatnagar2010actor}
Shalabh Bhatnagar.
\newblock An actor-critic algorithm with function approximation for discounted
  cost constrained {M}arkov decision processes.
\newblock \emph{Systems \& Control Letters}, 59\penalty0 (12):\penalty0
  760--766, 2010.

\bibitem[Bhatnagar et~al.(2009)Bhatnagar, Sutton, Ghavamzadeh, and
  Lee]{bhatnagar2009natural}
Shalabh Bhatnagar, Richard~S Sutton, Mohammad Ghavamzadeh, and Mark Lee.
\newblock Natural actor-critic algorithms.
\newblock \emph{Automatica}, 45\penalty0 (11):\penalty0 2471--2482, 2009.

\bibitem[Castillo et~al.(2019)Castillo, Weng, Hereid, Wang, and
  Zhang]{castillo2018reinforcement}
Guillermo~A Castillo, Bowen Weng, Ayonga Hereid, Zheng Wang, and Wei Zhang.
\newblock Reinforcement learning meets hybrid zero dynamics: A case study for
  {RABBIT}.
\newblock In \emph{2019 International Conference on Robotics and Automation
  (ICRA)}, pages 284--290, 2019.

\bibitem[Di~Castro and Meir(2010)]{castro2010convergent}
Dotan Di~Castro and Ron Meir.
\newblock A convergent online single time scale actor critic algorithm.
\newblock \emph{The Journal of Machine Learning Research}, 11:\penalty0
  367--410, 2010.

\bibitem[Cen et~al.(2020)Cen, Cheng, Chen, Wei, and Chi]{cen2020fast}
Shicong Cen, Chen Cheng, Yuxin Chen, Yuting Wei, and Yuejie Chi.
\newblock Fast global convergence of natural policy gradient methods with
  entropy regularization.
\newblock \emph{arXiv preprint arXiv:2007.06558}, 2020.

\bibitem[Chow and Tsitsiklis(1991)]{chow1991optimal}
Chee-Seng Chow and John~N Tsitsiklis.
\newblock An optimal one-way multigrid algorithm for discrete-time stochastic
  control.
\newblock \emph{IEEE Transactions on Automatic Control}, 36\penalty0
  (8):\penalty0 898--914, 1991.

\bibitem[Dufour and Prieto-Rumeau(2013)]{dufour2013finite}
Francois Dufour and Tomas Prieto-Rumeau.
\newblock Finite linear programming approximations of constrained discounted
  {M}arkov decision processes.
\newblock \emph{SIAM Journal on Control and Optimization}, 51\penalty0
  (2):\penalty0 1298--1324, 2013.

\bibitem[Dufour and Prieto-Rumeau(2015)]{dufour2015approximation}
Francois Dufour and Tomas Prieto-Rumeau.
\newblock Approximation of average cost {M}arkov decision processes using
  empirical distributions and concentration inequalities.
\newblock \emph{Stochastics: An International Journal of Probability and
  Stochastic Processes}, 87\penalty0 (2):\penalty0 273--307, 2015.

\bibitem[Fazel et~al.(2018)Fazel, Ge, Kakade, and Mesbahi]{fazel2018global}
Maryam Fazel, Rong Ge, Sham Kakade, and Mehran Mesbahi.
\newblock Global convergence of policy gradient methods for the linear
  quadratic regulator.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pages
  1467--1476, 2018.

\bibitem[Fu et~al.(2021)Fu, Yang, and Wang]{fu2020single}
Zeyue Fu, Zhuoran Yang, and Zhaoran Wang.
\newblock Single-timescale actor-critic provably finds globally optimal policy.
\newblock In \emph{International Conference on Learning Representations
  (ICLR)}, 2021.

\bibitem[Fujimoto et~al.(2018)Fujimoto, van Hoof, and
  Meger]{fujimoto2018addressing}
Scott Fujimoto, Herke van Hoof, and David Meger.
\newblock Addressing function approximation error in actor-critic methods.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pages
  1587--1596, 2018.

\bibitem[Haarnoja et~al.(2018)Haarnoja, Zhou, Abbeel, and
  Levine]{haarnoja2018soft}
Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine.
\newblock Soft actor-critic: Off-policy maximum entropy deep reinforcement
  learning with a stochastic actor.
\newblock \emph{arXiv preprint arXiv:1801.01290}, 2018.

\bibitem[Hong et~al.(2020)Hong, Wai, Wang, and Yang]{hong2020two}
Mingyi Hong, Hoi-To Wai, Zhaoran Wang, and Zhuoran Yang.
\newblock A two-timescale framework for bilevel optimization: Complexity
  analysis and application to actor-critic.
\newblock \emph{arXiv preprint arXiv:2007.05170}, 2020.

\bibitem[Kakade(2002)]{kakade2002natural}
Sham~M Kakade.
\newblock A natural policy gradient.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 1531--1538, 2002.

\bibitem[Karimi et~al.(2019)Karimi, Miasojedow, Moulines, and
  Wai]{karimi2019non}
Belhal Karimi, Blazej Miasojedow, Eric Moulines, and Hoi-To Wai.
\newblock Non-asymptotic analysis of biased stochastic approximation scheme.
\newblock In \emph{Conference on Learning Theory (COLT)}, 2019.

\bibitem[Konda and Tsitsiklis(2000)]{konda2000actor}
Vijay~R Konda and John~N Tsitsiklis.
\newblock Actor-critic algorithms.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 1008--1014, 2000.

\bibitem[Konda(2002)]{konda_2002}
Vijaymohan Konda.
\newblock \emph{Actor-critic algorithms}.
\newblock PhD thesis, Massachusetts Institute of Technology, 2002.

\bibitem[Konda and Borkar(1999)]{konda1999actor}
Vijaymohan~R Konda and Vivek~S Borkar.
\newblock Actor-critic--type learning algorithms for {M}arkov decision processes.
\newblock \emph{SIAM Journal on Control and Optimization}, 38\penalty0
  (1):\penalty0 94--123, 1999.

\bibitem[Kumar et~al.(2019)Kumar, Koppel, and Ribeiro]{kumar2019sample}
Harshat Kumar, Alec Koppel, and Alejandro Ribeiro.
\newblock On the sample complexity of actor-critic method for reinforcement
  learning with function approximation.
\newblock \emph{arXiv preprint arXiv:1910.08412}, 2019.

\bibitem[Kumar et~al.(2020)Kumar, Kalogerias, Pappas, and
  Ribeiro]{kumar2020zeroth}
Harshat Kumar, Dionysios~S. Kalogerias, George~J. Pappas, and Alejandro
  Ribeiro.
\newblock Zeroth-order deterministic policy gradient.
\newblock \emph{arXiv preprint arXiv:2006.07314}, 2020.

\bibitem[Lillicrap et~al.(2016)Lillicrap, Hunt, Pritzel, Heess, Erez, Tassa,
  Silver, and Wierstra]{lillicrap2015continuous}
Timothy~P. Lillicrap, Jonathan~J. Hunt, Alexander Pritzel, Nicolas Heess, Tom
  Erez, Yuval Tassa, David Silver, and Daan Wierstra.
\newblock Continuous control with deep reinforcement learning.
\newblock In \emph{International Conference on Learning Representations
  (ICLR)}, 2016.

\bibitem[Liu et~al.(2020)Liu, Zhang, Ba{\c{s}}ar, and Yin]{liu2020improved}
Yanli Liu, Kaiqing Zhang, Tamer Ba{\c{s}}ar, and Wotao Yin.
\newblock An improved analysis of (variance-reduced) policy gradient and
  natural policy gradient methods.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, 2020.

\bibitem[Lowe et~al.(2017)Lowe, Wu, Tamar, Harb, Abbeel, and
  Mordatch]{lowe2017multi}
Ryan Lowe, Yi~I Wu, Aviv Tamar, Jean Harb, Pieter Abbeel, and Igor
  Mordatch.
\newblock Multi-agent actor-critic for mixed cooperative-competitive
  environments.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 6379--6390, 2017.

\bibitem[Maei(2018)]{maei2018convergent}
Hamid~Reza Maei.
\newblock Convergent actor-critic algorithms under off-policy training and
  function approximation.
\newblock \emph{arXiv preprint arXiv:1802.07842}, 2018.

\bibitem[Malik et~al.(2019)Malik, Pananjady, Bhatia, Khamaru, Bartlett, and
  Wainwright]{malik2018derivative}
Dhruv Malik, Ashwin Pananjady, Kush Bhatia, Koulik Khamaru, Peter Bartlett, and
  Martin Wainwright.
\newblock Derivative-free methods for policy optimization: Guarantees for
  linear quadratic systems.
\newblock In \emph{International Conference on Artificial Intelligence and
  Statistics (AISTATS)}, pages 2916--2925, 2019.

\bibitem[Mitrophanov(2005)]{mitrophanov2005sensitivity}
A~Yu Mitrophanov.
\newblock Sensitivity and convergence of uniformly ergodic {M}arkov chains.
\newblock \emph{Journal of Applied Probability}, 42\penalty0 (4):\penalty0
  1003--1014, 2005.

\bibitem[Mnih et~al.(2013)Mnih, Kavukcuoglu, Silver, Graves, Antonoglou,
  Wierstra, and Riedmiller]{mnih2013playing}
Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis
  Antonoglou, Daan Wierstra, and Martin Riedmiller.
\newblock Playing {A}tari with deep reinforcement learning.
\newblock \emph{arXiv preprint arXiv:1312.5602}, 2013.

\bibitem[Mnih et~al.(2016)Mnih, Badia, Mirza, Graves, Lillicrap, Harley,
  Silver, and Kavukcuoglu]{mnih2016asynchronous}
Volodymyr Mnih, Adria~Puigdomenech Badia, Mehdi Mirza, Alex Graves, Timothy
  Lillicrap, Tim Harley, David Silver, and Koray Kavukcuoglu.
\newblock Asynchronous methods for deep reinforcement learning.
\newblock In \emph{Proceedings of The 33rd International Conference on Machine
  Learning}, volume~48, pages 1928--1937. PMLR, 20--22 Jun 2016.

\bibitem[Papini et~al.(2017)Papini, Pirotta, and Restelli]{papini2017adaptive}
Matteo Papini, Matteo Pirotta, and Marcello Restelli.
\newblock Adaptive batch size for safe policy gradients.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 3591--3600, 2017.

\bibitem[Papini et~al.(2018)Papini, Binaghi, Canonaco, Pirotta, and
  Restelli]{papini2018stochastic}
Matteo Papini, Damiano Binaghi, Giuseppe Canonaco, Matteo Pirotta, and Marcello
  Restelli.
\newblock Stochastic variance-reduced policy gradient.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pages
  4026--4035, 2018.

\bibitem[Papini et~al.(2019)Papini, Pirotta, and Restelli]{papini2019smoothing}
Matteo Papini, Matteo Pirotta, and Marcello Restelli.
\newblock Smoothing policies and safe policy gradients.
\newblock \emph{arXiv preprint arXiv:1905.03231}, 2019.

\bibitem[Pednault et~al.(2002)Pednault, Abe, and
  Zadrozny]{pednault2002sequential}
Edwin Pednault, Naoki Abe, and Bianca Zadrozny.
\newblock Sequential cost-sensitive decision making with reinforcement
  learning.
\newblock In \emph{Proceedings of the eighth ACM SIGKDD International
  conference on Knowledge Discovery and Data Mining}, pages 259--268, 2002.

\bibitem[Peters and Schaal(2008)]{peters2008natural}
Jan Peters and Stefan Schaal.
\newblock Natural actor-critic.
\newblock \emph{Neurocomputing}, 71\penalty0 (7--9):\penalty0 1180--1190, 2008.

\bibitem[Pirotta et~al.(2015)Pirotta, Restelli, and
  Bascetta]{pirotta2015policy}
Matteo Pirotta, Marcello Restelli, and Luca Bascetta.
\newblock Policy gradient in {L}ipschitz {M}arkov decision processes.
\newblock \emph{Machine Learning}, 100\penalty0 (2--3):\penalty0 255--283, 2015.

\bibitem[Qiu et~al.(2019)Qiu, Yang, Ye, and Wang]{qiu2019finite}
Shuang Qiu, Zhuoran Yang, Jieping Ye, and Zhaoran Wang.
\newblock On the finite-time convergence of actor-critic algorithm.
\newblock In \emph{Optimization Foundations for Reinforcement Learning Workshop
  at Advances in Neural Information Processing Systems}, 2019.

\bibitem[Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and
  Moritz]{schulman2015trust}
John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp
  Moritz.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pages
  1889--1897, 2015.

\bibitem[Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and
  Klimov]{schulman2017proximal}
John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}, 2017.

\bibitem[Shah and Xie(2018)]{shah2018q}
Devavrat Shah and Qiaomin Xie.
\newblock {Q}-learning with nearest neighbors.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 3111--3121, 2018.

\bibitem[Shen et~al.(2020)Shen, Zhang, Hong, and Chen]{shen2020asynchronous}
Han Shen, Kaiqing Zhang, Mingyi Hong, and Tianyi Chen.
\newblock Asynchronous advantage actor critic: Non-asymptotic analysis and
  linear speedup.
\newblock \emph{arXiv preprint arXiv:2012.15511}, 2020.

\bibitem[Shen et~al.(2019)Shen, Ribeiro, Hassani, Qian, and
  Mi]{shen2019hessian}
Zebang Shen, Alejandro Ribeiro, Hamed Hassani, Hui Qian, and Chao Mi.
\newblock Hessian aided policy gradient.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pages
  5729--5738, 2019.

\bibitem[Silver et~al.(2014)Silver, Lever, Heess, Degris, Wierstra, and
  Riedmiller]{silver2014deterministic}
David Silver, Guy Lever, Nicolas Heess, Thomas Degris, Daan Wierstra, and
  Martin Riedmiller.
\newblock Deterministic policy gradient algorithms.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pages
  387--395, 2014.

\bibitem[Sutton et~al.(2000)Sutton, McAllester, Singh, and
  Mansour]{sutton2000policy}
Richard~S Sutton, David~A McAllester, Satinder~P Singh, and Yishay Mansour.
\newblock Policy gradient methods for reinforcement learning with function
  approximation.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 1057--1063, 2000.

\bibitem[Tadi{\'c} and Doucet(2017)]{tadic2017asymptotic}
Vladislav~B Tadi{\'c} and Arnaud Doucet.
\newblock Asymptotic bias of stochastic gradient search.
\newblock \emph{The Annals of Applied Probability}, 27\penalty0 (6):\penalty0
  3255--3304, 2017.

\bibitem[Tsitsiklis and Van~Roy(1997)]{tsitsiklis1997analysis}
John~N. Tsitsiklis and Benjamin Van~Roy.
\newblock An analysis of temporal-difference learning with function
  approximation.
\newblock \emph{IEEE Transactions on Automatic Control}, 42\penalty0
  (5):\penalty0 674--690, 1997.

\bibitem[Tu and Recht(2019)]{tu2018gap}
Stephen Tu and Benjamin Recht.
\newblock The gap between model-based and model-free methods on the linear
  quadratic regulator: An asymptotic viewpoint.
\newblock In \emph{Conference on Learning Theory (COLT)}, pages 3036--3083,
  2019.

\bibitem[Wang et~al.(2020)Wang, Cai, Yang, and Wang]{wang2019neural}
Lingxiao Wang, Qi~Cai, Zhuoran Yang, and Zhaoran Wang.
\newblock Neural policy gradient methods: Global optimality and rates of
  convergence.
\newblock In \emph{International Conference on Learning Representations
  (ICLR)}, 2020.

\bibitem[Williams(1992)]{williams1992simple}
Ronald~J Williams.
\newblock Simple statistical gradient-following algorithms for connectionist
  reinforcement learning.
\newblock \emph{Machine Learning}, 8\penalty0 (3--4):\penalty0 229--256, 1992.

\bibitem[Wu et~al.(2020)Wu, Zhang, Xu, and Gu]{wu2020finite}
Yue Wu, Weitong Zhang, Pan Xu, and Quanquan Gu.
\newblock A finite time analysis of two time-scale actor critic methods.
\newblock \emph{arXiv preprint arXiv:2005.01350}, 2020.

\bibitem[Xiong et~al.(2020)Xiong, Xu, Liang, and Zhang]{xiong2020amsgradRL}
Huaqing Xiong, Tengyu Xu, Yingbin Liang, and Wei Zhang.
\newblock Non-asymptotic convergence of {A}dam-type reinforcement learning
  algorithms under {M}arkovian sampling.
\newblock \emph{arXiv preprint arXiv:2002.06286}, 2020.

\bibitem[Xu et~al.(2019)Xu, Gao, and Gu]{xu2019improved}
Pan Xu, Felicia Gao, and Quanquan Gu.
\newblock An improved convergence analysis of stochastic variance-reduced
  policy gradient.
\newblock In \emph{International Conference on Uncertainty in Artificial
  Intelligence (UAI)}, 2019.

\bibitem[Xu et~al.(2020{\natexlab{a}})Xu, Gao, and Gu]{xu2020sample}
Pan Xu, Felicia Gao, and Quanquan Gu.
\newblock Sample efficient policy gradient methods with recursive variance
  reduction.
\newblock In \emph{International Conference on Learning Representations
  (ICLR)}, 2020{\natexlab{a}}.

\bibitem[Xu et~al.(2020{\natexlab{b}})Xu, Wang, and Liang]{xu2020improving}
Tengyu Xu, Zhe Wang, and Yingbin Liang.
\newblock Improving sample complexity bounds for actor-critic algorithms.
\newblock \emph{arXiv preprint arXiv:2004.12956}, 2020{\natexlab{b}}.

\bibitem[Xu et~al.(2020{\natexlab{c}})Xu, Wang, and Liang]{xu2020non}
Tengyu Xu, Zhe Wang, and Yingbin Liang.
\newblock Non-asymptotic convergence analysis of two time-scale (natural)
  actor-critic algorithms.
\newblock \emph{arXiv preprint arXiv:2005.03557}, 2020{\natexlab{c}}.

\bibitem[Yang et~al.(2019)Yang, Chen, Hong, and Wang]{yang2019global}
Zhuoran Yang, Yongxin Chen, Mingyi Hong, and Zhaoran Wang.
\newblock On the global convergence of actor-critic: A case for linear
  quadratic regulator with ergodic cost.
\newblock \emph{arXiv preprint arXiv:1907.06246}, 2019.

\bibitem[Zhang et~al.(2019)Zhang, Koppel, Zhu, and
  Ba{\c{s}}ar]{zhang2019global}
Kaiqing Zhang, Alec Koppel, Hao Zhu, and Tamer Ba{\c{s}}ar.
\newblock Global convergence of policy gradient methods to (almost) locally
  optimal policies.
\newblock \emph{arXiv preprint arXiv:1906.08383}, 2019.

\bibitem[Zhang et~al.(2020)Zhang, Liu, Yao, and Whiteson]{zhang2020provably}
Shangtong Zhang, Bo~Liu, Hengshuai Yao, and Shimon Whiteson.
\newblock Provably convergent two-timescale off-policy actor-critic with
  function approximation.
\newblock In \emph{International Conference on Machine Learning}, pages
  11204--11213. PMLR, 2020.

\bibitem[Zou et~al.(2019)Zou, Xu, and Liang]{zou2019finite}
Shaofeng Zou, Tengyu Xu, and Yingbin Liang.
\newblock Finite-sample analysis for {SARSA} with linear function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems
  (NeurIPS)}, pages 8665--8675, 2019.

\end{thebibliography}
