
\section{A more comprehensive literature review }



\textbf{First-order methods.} For the deterministic setting, \cite{polyak1963gradient} and \cite{karimi2016linear} prove that under the P\L{} condition ($\alpha=2$), the gradient descent algorithm finds an $\epsilon$-optimal solution in $\mathcal{O}(\log({1}/{\epsilon}))$ gradient evaluations. \citet{yue2022lower} then show that this convergence rate is optimal among first-order algorithms. For the stochastic setting, \cite{khaled2022better} show that stochastic gradient descent (SGD) with time-varying stepsize converges to an $\epsilon$-optimal point with a sample complexity of $\mathcal{O}(1/\epsilon)$ under the P\L{} condition. It is verified in \citet{nguyen2019tight} that the dependency of the sample complexity of SGD on $\epsilon$ is optimal. For general gradient-dominated functions with $\alpha \in [1,2]$, \cite{fontaine2021convergence} obtain a sample complexity of $\mathcal{O}( \epsilon^{-4/\alpha + 1})$ for SGD. Moreover, under the weak gradient dominance property with $\alpha = 1$, which is typically observed in reinforcement learning, it has been shown that stochastic policy gradient converges to the global optimal solution with a sample complexity of $\tilde{\mathcal{O}}(\epsilon^{-3})$~\citep{yuan2022general}.


\section{Proof Sketch}
In this section, we give a sketch of the proof of \cref{thm:hsodm}. The proof of the variance-reduced version, \cref{thm:vr}, is similar.\par 
We first analyze \cref{algo:ls} and show that it can output a solution such that $ \|d_k\| = \mathcal{O}(\lambda_k + \epsilon ) $ in $ \mathcal{O}(\log((1+\|g_k\|)/\epsilon_{\text{ls}}\epsilon_{\text{eig}}))$ iterations, where $d_k = x_k - x_{k+1}$ (\cref{lemma:ls_err}). \par 
Then, we prove \cref{thm:hsodm}. The key of the proof is to estimate the one-step progress of SHSODM (\cref{lemma:descent}). \cref{lemma:descent} analyzes the progress of objective value 
\begin{equation}\label{eq:sket_f}
    F(x_k) - F(x_{k+1}) \geq \mathcal{O}(\|d_k\|^3- \epsilon),
\end{equation}
where $\epsilon$ is some small error caused by perturbation and randomness, and gives an upper bound on the gradient norm $\|g_{k+1}\|$,
\begin{equation} \label{eq:sket_g}
    \|g_{k+1}\| \leq \mathcal{O}(\|d_k\|^2+\lambda_k\|d_k\|+ \epsilon).
\end{equation}
Note that $ \epsilon$ may not be the same in \cref{eq:sket_f} and \cref{eq:sket_g}, but we omit the constants and use $ \epsilon$ to represent a generic small error. \cref{eq:sket_g} means that if the step size $ \| d_k \| $ is small, the current point is close to the optimal point. The line search (\cref{algo:ls}) guarantees $\lambda_k = \mathcal{O}(\|d_k\|+\epsilon)$ (\cref{lemma:ls_err}). Combining these three relations, we get an estimate of the gradient norm 
\begin{equation*}
    \|g_{k+1}\| \leq \mathcal{O}(\|d_k\|^2+\epsilon )\leq \mathcal{O}((F(x_{k})-F(x_{k+1}))^{2/3}+\epsilon).
\end{equation*}
Now, we use the gradient dominance property. 
\begin{equation} \label{eq:sket_iter}
    F(x_{k+1})-F^*\leq\mathcal{O}(\|g_{k+1}\|^\alpha)\leq \mathcal{O}((F(x_{k})-F(x_{k+1}))^{2\alpha/3}+\epsilon)
\end{equation}
\cref{eq:sket_iter} establishes a recursive relationship of the objective value. Let $\Delta_k = F(x_{k})-F^* $, $\Delta_k$ satisfies 
\begin{equation}\label{eq:sket_delta}
\Delta _{k+1} \leq  C_\Delta( \Delta _{k} -\Delta _{k+1})^{2\alpha /3},
\end{equation}
where $ C_\Delta $ is a constant. \cref{lemma:seq} analyzes the convergence rate of $\Delta_k$ that satisfies \cref{eq:sket_delta} for different $\alpha$. Combining these results, we are able to obtain the desired conclusion.


\section{Technical Results for Lemma \ref{lemma:ls_err} on the Adaptive Search of $\delta_k$}


Before we delve into a discussion of \cref{lemma:ls_err}, we introduce some auxiliary lemmas on the two subroutines \cref{algo:per} and \cref{algo:ls}. \textbf{Since it is understood at iteration $k$, we omit subscript $k$ for conciseness}. Recall that
\begin{equation*}
    A( \delta ) \coloneqq \begin{bmatrix}
        H              & g       \\
        g^T & -\delta
    \end{bmatrix}
\end{equation*}
and
\[ \lambda (\delta ) = -\lambda_{\min}(A(\delta)). \]
\begin{lemma}\label{prop:property}
    Suppose that $ A( \delta )$ and $\lambda ( \delta )$ are defined as above, then the following statements hold,
    \begin{enumerate}
        \item $ \lambda ( \delta )$ is non-decreasing in $\delta $ and $ 1$-Lipschitz.
        \item Let $ \lambda _{\min}( H)$ be the smallest eigenvalue of $ H$.
                $$ \lim _{\delta \rightarrow +\infty } \lambda ( \delta ) =+\infty, ~ \lim _{\delta \rightarrow -\infty } \lambda ( \delta ) =-\lambda _{\min}( H).$$
        \item Let $ E_{\lambda_{\min}}$ be the eigenspace of $ H$ corresponding to its smallest eigenvalue $ \lambda_{\min}(H) $. If $ g$ is not orthogonal to $E_{\lambda_{\min}}$, that is,
                \[\mathcal{P}_{\mathcal S_{\min}}( g) \neq 0,\] then for any $ C_e >0$, there exists $ \delta _{C_e}$ such that\begin{equation}
                    \lambda ( \delta _{C_e}) =C_e\left\Vert ( H+\lambda ( \delta _{C_e}) I)^{-1} g\right\Vert .\label{eq:lambda_delta}
                \end{equation}
    \end{enumerate}
\end{lemma}

\begin{proof}
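    (1) Throughout the proof, $-\lambda(\delta)$ denotes the smallest eigenvalue of $A(\delta)$. Then $ \lambda ( \delta )$ is a non-decreasing function of $ \delta $ since 
\begin{equation*}
A( \delta _{2}) -A( \delta _{1}) \succeq 0, \quad \delta_1 >\delta_2 .
\end{equation*}
Moreover, let $ \delta _{1}  >\delta _{2}$ and let $v$ be an eigenvector of $ A( \delta _{1})$ associated with its smallest eigenvalue, so that $v^{\mathsf{T}} A( \delta _{1}) v=-\lambda ( \delta _{1})\Vert v\Vert ^{2}$ and $v^{\mathsf{T}} A( \delta _{2}) v\geq -\lambda ( \delta _{2})\Vert v\Vert ^{2}$. Then \begin{equation*}
( \lambda ( \delta _{1}) -\lambda ( \delta _{2}))\Vert v\Vert ^{2} \leq  v^{\mathsf{T}}( A( \delta _{2}) -A( \delta _{1})) v\leq  ( \delta _{1} -\delta _{2})\Vert v\Vert ^{2} ,
\end{equation*}
which shows that $\lambda(\delta)$ is $1$-Lipschitz.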

(2) $ \lim _{\delta \rightarrow +\infty } \lambda ( \delta ) =+\infty $ since $ \lambda ( \delta ) \geq  \delta $. By Schur complement, \begin{align*}
H+\lambda ( \delta ) I-\frac{1}{\lambda ( \delta ) -\delta } gg^{\mathsf{T}} & \succeq 0.
\end{align*}For any $ \epsilon  >0$, and sufficiently small $ \delta $,
\begin{align*}
H+( -\lambda _{\min}( H) +\epsilon ) I-\frac{1}{-\lambda _{\min}( H) +\epsilon -\delta } gg^{\mathsf{T}} & \succ 0.
\end{align*}Thus, $ -\lambda _{\min}( H) \leq  \lim _{\delta \rightarrow -\infty } \lambda ( \delta ) \leq  -\lambda _{\min}( H) +\epsilon $, for any $ \epsilon  >0$, which implies $ \lim _{\delta \rightarrow -\infty } \lambda ( \delta ) =-\lambda _{\min}( H)$. Furthermore, we can get 
\begin{equation}
\lambda ( \delta ) \leq  -\lambda _{\min}( H) +\epsilon ,\forall \delta \leq  -\frac{\Vert g\Vert }{\epsilon } -\lambda _{\min}( H) .\label{eq:est_lambda}
\end{equation}
(3) Let $ h( \lambda ) =\lambda -C_e\left\Vert ( H+\lambda I)^{-1} g\right\Vert $. Since $ g$ is not orthogonal to $ E_{\lambda _{\min}}$, $ \lim _{\lambda \rightarrow -\lambda _{\min}( H)} h( \lambda ) =-\infty $ and $ \lim _{\lambda \rightarrow +\infty } h( \lambda ) =+\infty $. Hence, by continuity, $h$ has a root $\lambda^{*} > -\lambda_{\min}(H)$. By the monotonicity and continuity of $ \lambda ( \delta )$ together with statement (2), there exists $ \delta_{C_e} \in \mathbb{R}$ with $\lambda(\delta_{C_e}) = \lambda^{*}$, which therefore satisfies (\ref{eq:lambda_delta}).
\end{proof}

The following result gives a uniform upper bound and lower bound of the solution of (\ref{eq:lambda_delta})
% \textbf{To ensure the second condtion}: compute $ \mathcal{P}_{\mathcal S_{\min}}( g)$
% \begin{itemize}
% \item If $ \Vert \mathcal{P}_{\mathcal S_{\min}}( g)\Vert \geq  \epsilon _{\text{eig}}$, do nothing.
% \item Otherwise let $ g'=g+\epsilon _{\text{eig}} \cdotp \frac{\mathcal{P}_{\mathcal S_{\min}}( g)}{\Vert \mathcal{P}_{\mathcal S_{\min}}( g)\Vert }$. Use $ g'$ to compute the homogenous direction.
% \item If $ \Vert \mathcal{P}_{\mathcal S_{\min}}( g)\Vert =0$, $ g'=g+\epsilon _{\text{eig}} \cdotp \frac{d}{\Vert d\Vert }$.
% \end{itemize}

\begin{lemma}\label{lemma:ls_interval}
    Suppose that $ \Vert \mathcal{P}_{\mathcal S_{\min}}( g)\Vert \geq  \epsilon _{\text{eig}} $, $ \Vert H \Vert_2 \leq C_H$. Then $ \delta_{C_e} $ and $ \lambda_{C_e} = \lambda(\delta_{C_e}) $ in \cref{prop:property} satisfy 
    \begin{equation*}
        \lambda _{C_e} \in \left[\frac{-\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}}}{2} ,C_e\|g\| +C_{H} +1\right],
        \end{equation*}
    and 
    \begin{equation*}
        \delta _{C_e} \in \left[ -\frac{\|g\|\left( C_{H} +\sqrt{C_H+\epsilon _{\text{eig}}}\right)}{\epsilon _{\text{eig}}} - C_H ,C_e\|g\| +C_{H} +1\right] .
        \end{equation*}
\end{lemma}
\begin{proof}
    Let $h(\lambda )=\lambda -C_{e} \| (H+\lambda I)^{-1} g\| $, then $ h( \lambda )$ is nondecreasing. If $ \lambda  >C_{e} \|g\| +C_{H} +1$,
    \begin{equation*}
    h( \lambda ) =\lambda -C_{e} \| (H+\lambda I)^{-1} g\| \geqslant \lambda -C_{e} \|g\| /( \lambda -C_{H})  >0.
    \end{equation*}Thus, $ \lambda _{C_{e}} \leqslant C_{e} \|g\| +C_{H} +1$. On the other hand, if $ \lambda \leqslant \frac{-\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}}}{2}$,
    \begin{equation*}
    h( \lambda ) \leqslant \lambda -C_{e} \epsilon _{\text{eig}} /( \lambda +\lambda _{\min}( H)) < 0.
    \end{equation*}Thus, $ \lambda _{C_{e}} \geqslant \frac{-\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}}}{2} .$ Notice that 
    \begin{equation*}
    \delta_{C_e} \leqslant \lambda ( \delta_{C_e} ) \leqslant C_{e} \|g\| +C_{H} +1.
    \end{equation*}
   Let  $\epsilon = \frac{\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}}}{2}$  in (\ref{eq:est_lambda}), we get 
   $$\delta_{C_e} \geqslant -\frac{\|g\|\left( -\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}} \right)}{\epsilon _{\text{eig}}} - \lambda_{\min}(H)\geqslant -\frac{\|g\|\left( C_{H} +\sqrt{C_H+\epsilon _{\text{eig}}}\right)}{\epsilon _{\text{eig}}} - C_H.$$
\end{proof}
% The following two several concrete examples from ~.
% \begin{example}
%     Consider one dimensional function $f(x) = c \cdot |x|^q$ where $q >  1$ and $c>0$.
% \end{example}

% \begin{example}
%     Consider $f(x_1, x_2) = \cosh{x_1} + 8  \cosh{\left(\sin{x_1}\right)} + \frac{1}{2} \cosh{x_2} + \frac{5}{2} \cosh{ \left( \sin{x_2} \right) }  - 12 $.
% \end{example}
\begin{proposition}\label{prop:perturb}
The output of \cref{algo:per} satisfies $ \Vert \mathcal{P}_{\ E_{\lambda _{\min}}}( g')\Vert \geqslant \epsilon _{\text{eig}}$ and $ \| g'-g\| \leqslant \epsilon _{\text{eig}}$.
\end{proposition}
\begin{proof}

If $ \Vert \mathcal{P}_{\ E_{\lambda _{\min}}}( g) \Vert \geqslant \epsilon _{\text{eig}}$, the algorithm outputs $ g'=g$ and the two inequalities hold trivially. If $ \Vert \mathcal{P}_{\ E_{\lambda _{\min}}}( g) \Vert < \epsilon _{\text{eig}}$, 

\begin{equation*}
g'=g+\epsilon _{\text{eig}}\frac{\mathcal{P}_{\ E_{\lambda _{\min}}}( g)}{\| \mathcal{P}_{\ E_{\lambda _{\min}}}( g) \| } .
\end{equation*}We have $ \| g-g'\| =\epsilon _{\text{eig}}$ and $ \| \mathcal{P}_{\ E_{\lambda _{\min}}}( g') \| =\| \mathcal{P}_{\ E_{\lambda _{\min}}}( g)\left( 1+\frac{\epsilon _{\text{eig}}}{\| \mathcal{P}_{\ E_{\lambda _{\min}}}( g) \| }\right) \| \geqslant \epsilon _{\text{eig}}$.
    
\end{proof}

\subsubsection{Proof of \cref{lemma:ls_err}}
Finally, we are ready to prove \cref{lemma:ls_err}.

\begin{proof} 
If we compute a $ \hat{\delta }_{C_e}$ such that 
\begin{equation}\label{eq:lambda_err}
|\lambda ( \delta _{C_e}) -\lambda (\hat{\delta }_{C_e}) |\leqslant \epsilon ,
\end{equation}
then
\begin{align*}
\left\Vert ( H+\lambda ( \delta _{C_e}) I)^{-1} g\right\Vert -\left\Vert ( H+\lambda (\hat{\delta }_{C_e}) I)^{-1} g\right\Vert  & \leqslant \left\Vert \left(( H+\lambda ( \delta _{C_e}) I)^{-1} -( H+\lambda (\hat{\delta }_{C_e}) I)^{-1}\right) g\right\Vert \\
 & \leqslant \epsilon\left\Vert ( H+\lambda (\hat{\delta }_{C_e}) I)^{-1}\right\Vert \left\Vert ( H+\lambda ( \delta _{C_e}) I)^{-1}\right\Vert \Vert g\Vert \\
 & \leqslant \frac{4\epsilon \Vert g\Vert }{\left( -\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}}\right)^{2}}.
\end{align*}
Taking 
\begin{equation*}
\epsilon \leqslant \epsilon _{\text{ls}}\min\left\{1/2,\left( -\lambda _{\min}( H) +\sqrt{\lambda _{\min}^{2}( H) +4\epsilon _{\text{eig}}}\right)^{2} /( 8C_e\Vert g\Vert )\right\} ,
\end{equation*}then (\ref{eq:lambda_err}) guarantees
\begin{equation*}
|\lambda ( \delta _{C_e}) -\lambda (\hat{\delta }_{C_e}) |\leqslant \epsilon _{\text{ls}} /2,\ C_e\left| \left\Vert ( H+\lambda ( \delta _{C_e}) I)^{-1} g\right\Vert -\left\Vert ( H+\lambda (\hat{\delta }_{C_e}) I)^{-1} g\right\Vert \right| \leqslant \epsilon _{\text{ls}} /2
\end{equation*}
and 
\begin{equation*}
|h(\lambda (\hat{\delta }_{C_e})) |\leqslant \epsilon _{\text{ls}} .
\end{equation*}
By \cref{prop:property}, $ \lambda ( \delta )$ is $ 1$-Lipschitz continuous. By \cref{assum:smooth}, the gradients of $f(x,\xi)$ are Lipschitz continuous. Thus, the sample Hessians are bounded. Together with \cref{prop:perturb}, the conditions in \cref{lemma:ls_interval} are satisfied. \cref{lemma:ls_interval} tells us $ \delta _{C_e}$ lies in an interval of length $ O\left( 1 + \| g\| \epsilon _{\text{eig}}^{-1/2}\right)$. Using bisection, we can compute a proper $ \hat{\delta }_{C_e}$ such that (\ref{eq:lambda_err}) holds in $ \mathcal{O}(\log( (1 + \| g\| )/\epsilon _{\text{ls}} \epsilon _{\text{eig}}))$ iterations.
\end{proof}


\section{Proof of Sample Complexity Results of Algorithm \ref{algo:hsodm}}\label{sec:pf}
\begin{lemma}\label{lemma:seq}
Suppose a non-negative sequence $ \{\Delta _{k}\}$ satisfies 
\begin{equation*}
\Delta _{k+1} \leq  C_\Delta( \Delta _{k} -\Delta _{k+1})^{2\alpha /3} ,
\end{equation*}
where $ C_\Delta$ is a constant. For any $ \epsilon  >0$, \ 
\begin{enumerate}
    \item  If $ \alpha =3/2$, $\forall k\geq  \mathcal{O}(\log (1/\epsilon) )$, we have $ \Delta _{k} \leq  \epsilon$. 
    \item If $ \alpha \in [ 1,3/2)$, $\forall k \geq  \mathcal{O}\left( \epsilon ^{1-3/(2\alpha) }\right)$, we have $\Delta_{k} \leq  \epsilon $.
    \item If $ \alpha  >3/2$, $ \forall k \geq  \mathcal{O}(\log\log (1/\epsilon) )$, we have $ \Delta _{k} \leq  \epsilon $.
\end{enumerate}
\end{lemma}


\begin{proof}
    Let $ \beta =2\alpha /3$.
    
    (1) If $ \beta =1$, rearranging $ \Delta _{k+1} \leq  C_\Delta( \Delta _{k} -\Delta _{k+1})$ shows that $ \Delta _{k}$ converges to $ 0$ linearly, since \begin{equation*}
        \Delta _{k+1} \leq  \frac{C_\Delta}{1+C_\Delta} \Delta _{k} .
        \end{equation*}
        
        If $ \beta \neq 1$, let $ D_{k} =\Delta _{k} /C_\Delta^{1/( 1-\beta )}$, then 
        \begin{equation*}
        D_{k+1} \leq  ( D_{k} -D_{k+1})^{\beta }
        \end{equation*}
        (2) If $ \beta < 1$, let $ h( x) =x^{1-1/\beta }$, \begin{align*}
        h( D_{k+1}) -h( D_{k}) & \geq  ( 1/\beta -1) D_{k}^{-1/\beta }( D_{k} -D_{k+1})\\
         & \geq  ( 1/\beta -1)( D_{k+1} /D_{k})^{1/\beta } .
        \end{align*}
        \textbf{Case I.} $ D_{k+1} < \frac{1}{2} D_{k}$. This case will happen at most $ \log( D_{0} /\epsilon )$ times.
        
        \textbf{Case II.} $ D_{k+1} \geq  \frac{1}{2} D_{k}$. $ h( D_{k+1}) -h( D_{k}) \geq  ( 1/\beta -1) /2^{1/\beta }$. Thus, \begin{equation*}
        D_{n} < \epsilon ,\forall n\geq  N_{\epsilon } =\ \log( D_{0} /\epsilon ) /\log 2+\frac{\epsilon ^{1-1/\beta } -h( D_{0})}{( 1/\beta -1) /2^{1/\beta }} =O\left( \epsilon ^{1-1/\beta }\right) .
        \end{equation*}We obtain sublinear rate.
        
        (3) If $ \beta  >1$, let $ N_{0} =\inf\{n:D_{n} < 1/2\}$. Then for all $ n< N_{0} -1,$\begin{equation*}
        1/2\leq  D_{n+1} \leq  ( D_{n} -D_{n+1})^{\beta } .
        \end{equation*}Thus $ N_{0} \leq  2^{1/\beta } \lceil D_{0} \rceil +1$. For all $ n \geq N_{0}$, \begin{equation*}
        D_{n+1} \leq  D_{n}^{\beta } \leq  D_{N_{0}}^{\beta ^{n+1-N_{0}}} .
        \end{equation*}Combining the two phases, we have \begin{equation*}
        D_{n} < \epsilon ,\forall n\geq  N_{\epsilon } =\ 2^{1/\beta } \lceil D_{0} \rceil +1+(\log\log( 1/\epsilon ) -\log\log 2) /\log \beta =\mathcal{O}(\log\log( 1/\epsilon )) .
        \end{equation*}We obtain a superlinear rate.
\end{proof}
	
\begin{lemma}\label{lemma:descent}
 For \cref{algo:hsodm}, we have 
\begin{equation}\label{eq:f_descent}
\Vert d_{k}\Vert ^{3} \leq  \frac{6}{{L_H}}\left( F\left( x_k\right) -F(\xkn) +\epsilon {_{\text{eig}}}^{3/2} +\epsilon _{\text{ls}}^{3} +\Vert H_{k} -\hat{H}_{k}\Vert ^{3} +\Vert g_{k} -\hat{g}_{k}\Vert ^{3/2}\right) .
\end{equation}
and 
\begin{equation} \label{eq:g_descent}
\Vert g_{k+1}\Vert \leq \frac{7L_{H} +19}{6}\Vert d_{k}\Vert ^{2} +\epsilon _{\text{ls}}^{2} +\epsilon _{\text{eig}} +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2}.
\end{equation}
\end{lemma}

\begin{proof}
Suppose that $ ( \hat{H}_{k} + \lambda_{k} I ) d_{k} = \hat{g} '_{k}$, where $ \hat{H}_{k} =\frac{1}{N_{H}}\sum _{i=1}^{N_{H}} H_{k,i} ,\ \hat{g}_{k} =\frac{1}{N_{g}}\sum _{i=1}^{N_{g}} g_{k,i}$ is the estimation of Hessian and gradient, and $ \hat{g} '_{k}$ is the perturbation of $ \hat{g}_{k}$. By \cref{prop:perturb}, $\|\hat{g}_k - \hat{g}_k'\|\leq \epsilon_{\text{eig}}$. Then we have, 
\begin{align}
F(\xkn) -F\left( x_k\right) & \leq  -\langle g_{k} ,d_{k} \rangle +\frac{1}{2} d{_{k}}^{T} H_{k} d_{k} +\frac{{L_H}}{6}\Vert d_{k}\Vert ^{3} \notag\\
 & =-\langle \hat{g} '_{k} ,d_{k} \rangle +\frac{1}{2} d{_{k}}^{T}\hat{H}_{k} d_{k} +\frac{{L_H}}{6}\Vert d_{k}\Vert ^{3} -\langle g_{k} -\hat{g} '_{k} ,d_{k} \rangle +\frac{1}{2} d{_{k}}^{\mathsf{T}}( H_{k} -\hat{H}_{k}) d_{k}\notag\\
 & \leq  -\frac{\lambda _{k}}{2}\Vert d_{k}\Vert ^{2} +\frac{2+{L_H}}{6}\Vert d_{k}\Vert ^{3} +\Vert g_{k} -\hat{g} '_{k}\Vert \Vert d_{k}\Vert +\frac{1}{2}\Vert d_{k}\Vert ^{2}\Vert H_{k} -\hat{H}_{k}\Vert . \label{eq:f_descent1}
\end{align}
By Young's inequality, 
\begin{equation*}
\Vert g_{k} -\hat{g}_{k}\Vert \Vert d_{k}\Vert \leq  \frac{2}{3}\Vert g_{k} -\hat{g}_{k}\Vert ^{3/2} +\frac{1}{3}\Vert d_{k}\Vert ^{3} ,\ \Vert d_{k}\Vert ^{2}\Vert H_{k} -\hat{H}_{k}\Vert \leq  \frac{2}{3}\Vert d_{k}\Vert ^{3} +\frac{1}{3}\Vert H_{k} -\hat{H}_{k}\Vert ^{3} .
\end{equation*}
Thus,
\begin{align*}
F(\xkn) -F\left( x_k\right) & \leq  -\frac{\lambda _{k}}{2}\Vert d_{k}\Vert ^{2} +\frac{6+{L_H}}{6}\Vert d_{k}\Vert ^{3} +\frac{2}{3} \epsilon {_{\text{eig}}}^{3/2} +\frac{1}{6}\Vert H_{k} -\hat{H}_{k}\Vert ^{3} +\frac{2}{3}\Vert g_{k} -\hat{g}_{k}\Vert ^{3/2}
\end{align*}
Thus, let $ C_e=\frac{2({L_H}+4)}{3}$ in \cref{algo:ls}, by  \cref{lemma:ls_err}, we have 
\begin{equation}
    \lambda_k  \geq  \frac{2({L_H}+4)}{3} \| d_k \| - \epsilon _{\text{ls}}. \label{eq:ls_err}
\end{equation}
By Young's inequality, $ \epsilon _{\text{ls}}\Vert d_{k}\Vert ^{2} \leq  \frac{2}{3}\Vert d_{k}\Vert ^{3} +\frac{1}{3} \epsilon _{\text{ls}}^{3}$,
\begin{align*}
\frac{{L_H}}{6}\Vert d_{k}\Vert ^{3} -\frac{\epsilon _{\text{ls}}^{3}}{6} & \leq  \frac{{L_H}+2}{6}\Vert d_{k}\Vert ^{3} -\frac{\epsilon _{\text{ls}}}{2}\Vert d_{k}\Vert ^{2} \\
 & \leq \frac{{L_H}+2}{6}\Vert d_{k}\Vert ^{3} + \frac{\lambda_k}{2} \|d_k\|^2 - \frac{L_H+4}{3}\|d_k\|^3 \\
 & \leq  F\left( x_k\right) -F(\xkn) +\frac{2}{3} \epsilon {_{\text{eig}}}^{3/2} +\frac{1}{6}\Vert H_{k} -\hat{H}_{k}\Vert ^{3} +\frac{2}{3}\Vert g_{k} -\hat{g}_{k}\Vert ^{3/2},
\end{align*}
where we use \cref{eq:ls_err} in the second inequality and \cref{eq:f_descent1} in the last inequality. We get 
\begin{equation}
\Vert d_{k}\Vert ^{3} \leq  \frac{6}{{L_H}}\left( F\left( x_k\right) -F(\xkn) +\epsilon {_{\text{eig}}}^{3/2} +\epsilon _{\text{ls}}^{3} +\Vert H_{k} -\hat{H}_{k}\Vert ^{3} +\Vert g_{k} -\hat{g}_{k}\Vert ^{3/2}\right) .
\end{equation}
For gradient, 
\begin{align*}
\Vert g_{k+1}\Vert  & \leq  \Vert g_{k+1} -g_{k} -H_{k} d_{k}\Vert +\Vert g_{k} +H_{k} d_{k}\Vert \\
 & \leq  \Vert g_{k+1} -g_{k} -H_{k} d_{k}\Vert +\Vert \hat{g}_{k} +\hat{H}_{k} d_{k}\Vert +\Vert g_{k} -\hat{g}_{k}\Vert +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert +\Vert \hat{H}_{k} -H_{k}\Vert \Vert d_{k}\Vert \\
 & \leq  \frac{{L_H}+1}{2}\Vert d_{k}\Vert ^{2} +\lambda _{k}\Vert d_{k}\Vert +\epsilon _{\text{eig}} +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2}\\
&\leqslant \frac{L_{H} +1}{2}\Vert d_{k}\Vert ^{2} +\frac{2( L_{H} +4)}{3}\Vert d_{k}\Vert ^{2} +\epsilon _{\text{ls}}\Vert d_{k}\Vert +\epsilon _{\text{eig}} +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2}\\
&\leqslant \frac{7L_{H} +19}{6}\Vert d_{k}\Vert ^{2} +\epsilon _{\text{ls}}^{2} +\epsilon _{\text{eig}} +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2}
\end{align*}
where we have used $\lambda_k  \leq  \frac{2({L_H}+4)}{3} \| d_k \| + \epsilon _{\text{ls}} $ in the fourth inequality. This completes the proof.
\end{proof}

\subsection{Proof of Theorem \ref{thm:hsodm}}
Now we use the previous results to complete the proof regarding the convergence rate.
\begin{theorem}\label{thm:rate}
    Suppose that $ F( x)$ satisfies the gradient dominance assumption with index $ \alpha $, \cref{assum:smooth}. The output of \cref{algo:hsodm} with parameters $ \epsilon _{\text{eig}} ,\epsilon _{\text{ls}} ,\epsilon _{\text{noise}}$ and $K$, satisfies the following statements,
    \begin{itemize}
        \item  If \ $ \alpha \in [ 1,3/2)$, $ F( x_K) -F^* \leq  O\left( K^{\frac{-2\alpha }{3-2\alpha }} +\epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +2\epsilon _{\text{noise}}^{\alpha }\right) .$
        \item     If \ $ \alpha =3/2$, $ F( x_K) -F^* \leq  O\left(\exp( -K) +\epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +2\epsilon _{\text{noise}}^{\alpha }\right) .$
        \item     
        If \ $ \alpha \in ( 3/2,2]$, $ F( x_K) -F^* \leq  O\left(\exp(-\exp( K)) +\epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +2\epsilon _{\text{noise}}^{\alpha }\right).$
    \end{itemize}
\end{theorem}

\begin{proof}
By the gradient dominance assumption and \cref{eq:g_descent},
\begin{align*}
F\left( x^{k+1}\right) -F^{*} & \leqslant C_{\text{gd}}\Vert g_{k+1}\Vert ^{\alpha }\\
 & \leqslant C_{\text{gd}}\left(\frac{7L_{H} +19}{6}\Vert d_{k}\Vert ^{2} +\epsilon _{\text{ls}}^{2} +\epsilon _{\text{eig}} +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2}\right)^{\alpha }
\end{align*}
Note that for any $\displaystyle x,y >0$, we have 
\begin{align*}
\ ( x+y)^{r} \leqslant  & \begin{cases}
x^{r} +y^{r} & ,r\in ( 0,1) ,\\
2^{r-1}\left( x^{r} +y^{r}\right) & ,r\geqslant 1.
\end{cases}
\end{align*}
Thus, $\displaystyle \ ( x+y)^{r} =O\left( x^{r} +y^{r}\right)$ and
\begin{align*}
F\left( x^{k+1}\right) -F^{*} &  \leqslant O\left(\Vert d_{k}\Vert ^{2\alpha } +\epsilon _{\text{ls}}^{2\alpha } +\epsilon _{\text{eig}}^{\alpha } +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert ^{\alpha } +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2\alpha }\right)\\
 & \leqslant O\left( F\left( x^{k}\right) -F\left( x^{k+1}\right) +\epsilon {_{\text{eig}}}^{3/2} +\epsilon _{\text{ls}}^{3} +\Vert H_{k} -\hat{H}_{k}\Vert ^{3} +\Vert g_{k} -\hat{g}_{k}\Vert ^{3/2}\right)^{2\alpha /3}\\
 & \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ +O\left( \epsilon _{\text{ls}}^{2\alpha } +\epsilon _{\text{eig}}^{\alpha } +\Vert \hat{g}_{k} -\hat{g} '_{k}\Vert ^{\alpha } +\frac{1}{2}\Vert \hat{H}_{k} -H_{k}\Vert ^{2\alpha }\right)\\
 & =O\left(\left( F\left( x^{k}\right) -F\left( x^{k+1}\right)\right)^{2\alpha /3} +\epsilon {_{\text{eig}}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +\Vert H_{k} -\hat{H}_{k}\Vert ^{2\alpha } +\Vert g_{k} -\hat{g}_{k}\Vert ^{\alpha }\right) ,
\end{align*}where we have used \cref{eq:f_descent} in the inequality. Take expectation, we get 
\begin{align}\label{eq:iter}
\mathbb{E}\left[ F\left( x^{k+1}\right) -F^{*}\right] & \leqslant O\left(\left(\mathbb{E}\left[ F\left( x^{k}\right) -F\left( x^{k+1}\right)\right]\right)^{2\alpha /3} +\epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +\left(\mathbb{E}\left[\Vert g_{k} -\hat{g}_{k}\Vert ^{2}\right]\right)^{\alpha /2} +\mathbb{E}\left[\Vert \hat{H}_{k} -H_{k}\Vert ^{2\alpha }\right]\right)
\end{align}
Suppose we take $ N_{H} =O\left( \epsilon _{\text{noise}}^{-1}\right) ,N_{g} =O\left( \epsilon _{\text{noise}}^{-2}\right)$, by Lemma 3 in \cite{masiha2022stochastic} and \cref{asp:var}, 
\begin{equation*}
\mathbb{E}\left[\Vert g_{k} -\hat{g}_{k}\Vert ^{2}\right] \leq  \epsilon _{\text{noise}}^{2} ,\ \mathbb{E}\left[\Vert \hat{H}_{k} -H_{k}\Vert ^{2\alpha}\right] \leq  \epsilon _{\text{noise}}^\alpha,
\end{equation*}
Let $ \Delta _{k} = \mathbb{E}[F\left( x_k\right)] -F^* -\left( \epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +2\epsilon _{\text{noise}}^{\alpha }\right) ,$ there exist $ C_\Delta $ such that
\begin{align*}
\Delta _{k+1} & \leq  C_\Delta( \Delta _{k} -\Delta _{k+1})^{2\alpha /3} .
\end{align*}
Applying \cref{lemma:seq}, we obtain the result. 
\end{proof}


\begin{proof}[Proof of \cref{thm:hsodm}]
Finally, we prove the sample complexity result in \cref{thm:hsodm}. For a given tolerance $ \epsilon $, let $ \epsilon _{\text{eig}} =O\left( \epsilon ^{1/\alpha }\right) ,\epsilon _{\text{ls}} =O\left( \epsilon ^{1/(2\alpha) }\right) ,\epsilon _{\text{noise}} =O\left( \epsilon ^{1/\alpha }\right)$ in \cref{thm:rate}. We need $ O\left( \epsilon ^{-2/\alpha }\right)$ samples in each iteration. Thus, by \cref{lemma:seq}, we have the following sample complexity results.\par  
For time complexity, note that in each iteration, by \cref{lemma:ls_err}, we need $\mathcal{O}(\log( (1 + \| g\| )/\epsilon _{\text{ls}} \epsilon _{\text{eig}}))$ steps in \cref{algo:ls}. From \cref{eq:g_descent} and \cref{thm:rate}, we know 
$\mathbb{E} \| g_{k} \| \leqslant \mathcal{O}(\epsilon ^{1/\alpha } +K^{-\frac{4\alpha }{3-2\alpha }} )$. Therefore, $ \mathbb{E}[\mathcal{O}(\log( (1 + \| g\| )/\epsilon _{\text{ls}} \epsilon _{\text{eig}}))] \leq \mathcal{O}(\log(1/\epsilon))$ . 
\begin{table}[H]
\centering 
\begin{tabular}{|p{0.2\textwidth}|p{0.4\textwidth}|p{0.25\textwidth}|}
\hline 
 $ \alpha $ & Expected Time Complexity & Sample Complexity \\
\hline 
 $ \alpha \in [1,3/2)$ & $ O\left( \log( 1/\epsilon ) \epsilon ^{-7/( 2\alpha ) +3/4}\right)$ & $ O\left( \epsilon ^{-7/( 2\alpha ) +1}\right)$ \\
\hline 
 $ \alpha =3/2$ & $ O\left( \log^{2}( 1/\epsilon ) \epsilon ^{-2/\alpha -1/4}\right)$ & $ O\left(\log( 1/\epsilon ) \epsilon ^{-2/\alpha }\right)$ \\
\hline 
 $ \alpha \in ( 3/2,2]$ & $ O\left( \log( 1/\epsilon )\log\log( 1/\epsilon ) \epsilon ^{-2/\alpha -1/4}\right)$ & $ O\left(\log\log( 1/\epsilon ) \epsilon ^{-2/\alpha }\right)$ \\
 \hline
\end{tabular}
\end{table}
\end{proof}
\subsection{Proof of Corollary \ref{cor:rl}}
If $ F $ only satisfies the weak gradient dominance property (\ref{asp:gd_wdom}), notice that \cref{lemma:descent} still holds. Following the proof of \cref{thm:rate}, we can get a variant of \cref{eq:iter},
    \begin{align*}
\mathbb{E}\left[ F\left( x^{k+1}\right) -F^{*}\right] & \leqslant O\left(\left(\mathbb{E}\left[ F\left( x^{k}\right) -F\left( x^{k+1}\right)\right]\right)^{2\alpha /3} +\epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +\left(\mathbb{E}\left[\Vert g_{k} -\hat{g}_{k}\Vert ^{2}\right]\right)^{\alpha /2} +\mathbb{E}\left[\Vert \hat{H}_{k} -H_{k}\Vert ^{2\alpha }\right] + \epsilon _{\text{weak}}\right)
\end{align*}
        Letting $ \Delta'_{k} =\mathbb{E}[F\left( x_k\right)] -F^* -\left( \epsilon _{\text{eig}}^{\alpha } +\epsilon _{\text{ls}}^{2\alpha } +2\epsilon _{\text{noise}}^{\alpha } + \epsilon_{\text{weak}}\right)  ,$ there exists a constant $ C'_{\Delta} $ such that
        \begin{align*}
        \Delta' _{k+1} & \leq  C'_\Delta( \Delta' _{k} -\Delta' _{k+1})^{2\alpha /3} .
        \end{align*}
        Applying \cref{lemma:seq}, we get the result.

\section{Proof of Sample Complexity Results of Algorithm \ref{algo:hsodm_vr}}\label{sec:pf_vr}

In this section, we strengthen the sample complexity result using variance reduction techniques. (\ref{eq:iter}) implies that the estimation errors $\mathbb{E}[\|g_k - \hat{g}_k\|^2]$ and $\mathbb{E}[\|H_k-\hat{H}_k\|^{2\alpha}]$ directly influence the convergence rate. We need to use the samples more cleverly to reduce the sample complexity. By \cref{assum:smooth}, we have $ \Vert H_{k} -\hat{H}_{k}\Vert \leqslant {C_H} = 2L_g$, and hence 
\begin{equation*}
\ \mathbb{E}\left[\Vert H_{k} -\hat{H}_{k}\Vert ^{2\alpha }\right] \leqslant C_H^{2\alpha }\mathbb{E}\left[\Vert H_{k} -\hat{H}_{k}\Vert ^{2}/{C_H}^2\right] \leqslant C_H^{\alpha}\mathbb{E}\left[\Vert H_{k} -\hat{H}_{k}\Vert ^{2}\right]^{\alpha /2}.
\end{equation*}
Thus, the noise error caused by Hessian estimation is of the same order as gradient estimation and we only need to analyze the impact of applying variance reduction techniques to the gradient. A similar analysis applies to Hessian too. \par 
We mainly use the variance reduction technique from \cite{fang2018spider}. Recall that we use gradient estimation in \cref{algo:hsodm_vr}, 
\begin{align*}
v_{k} & =\begin{cases}
\nabla_{S_{k}} F( x_{k}) , & k\ \bmod K_{C} =0,\\
\nabla_{S_{k}} F( x_{k}) -\nabla_{S_{k}} F( x_{k-1}) +v_{k-1} , & k\ \bmod K_{C} \neq 0,
\end{cases}
\end{align*}
where $ \nabla_{S_{k}} F( x_{k}) =\frac{1}{|S_{k} |}\sum _{i=1}^{|S_{k} |} \nabla F( x_{k} ,\xi _{i}) .$ Let 
\begin{align*}
e_{k} & =\begin{cases}
\nabla_{S_{k}} F( x_{k}) -\nabla F( x_{k}) , & k\ \bmod K_{C} =0,\\
\nabla_{S_{k}} F( x_{k}) -\nabla_{S_{k}} F( x_{k-1}) -( \nabla F( x_{k}) -\nabla F( x_{k-1})) , & k\ \bmod K_{C} \neq 0,
\end{cases}
\end{align*}
we have the error decomposition  $ v_{k} -\nabla F( x_{k}) =\sum _{i=\lfloor k/K_{C} \rfloor K_{C}}^{k} e_{i} .$ We define the sigma field $ \mathcal{F}_{k} =\sigma ( x_{0} ,\cdots ,x_{k} ,e_{1} ,\cdots ,e_{k-1}) .$ The following lemma analyzes the error $e_k$.

\begin{lemma}\label{lemma:vr_err}
    \begin{equation*}
    \mathbb{E}\left[\Vert e_{k}\Vert ^{2} |\mathcal{F}_{k}\right] \leqslant \frac{4L_g}{n_{k,g}}\Vert d_{k}\Vert ^{2},\quad\quad k \bmod K_{C} \neq 0
    \end{equation*}
    where $ d_k = x_k - x_{k-1} $.
    \end{lemma}
    \begin{proof}

     We have 
    \begin{align*}
    \mathbb{E}\left[\Vert e_{k}\Vert ^{2} |\mathcal{F}_{k}\right] & =\mathbb{E}\left[\left\Vert \frac{1}{n_{k,g}}\sum _{i=1}^{n_{k,g}} \nabla F( x_{k} ,\xi _{i}) -\nabla F( x_{k-1} ,\xi _{i}) -( \nabla F( x_{k}) -\nabla F( x_{k-1}))\right\Vert ^{2}\right]\\
     & =\frac{1}{n_{k,g}}\mathbb{E}\left[\Vert \nabla F( x_{k} ,\xi _{1}) -\nabla F( x_{k-1} ,\xi _{1}) -( \nabla F( x_{k}) -\nabla F( x_{k-1}))\Vert ^{2}\right]\\
     & \leqslant \frac{2}{n_{k,g}}\mathbb{E}\left[\Vert \nabla F( x_{k} ,\xi _{1}) -\nabla F( x_{k-1} ,\xi _{1})\Vert ^{2}\right] +\frac{2}{n_{k,g}}\mathbb{E}\left[\Vert \nabla F( x_{k}) -\nabla F( x_{k-1})\Vert ^{2}\right]\\
     & \leqslant \frac{4L_g}{n_{k,g}}\Vert x_{k} -x_{k-1}\Vert ^{2} =\frac{4L_g}{n_{k,g}}\Vert d_{k}\Vert ^{2} .
    \end{align*}
            
    \end{proof}
    \begin{proof}[Proof of \cref{thm:vr}]
    Using \cref{lemma:vr_err}, we can estimate the variance of the gradient estimation,
    \begin{align*}
    \mathbb{E}\left[\Vert \nabla F( x_{k}) -v_{k}\Vert ^{2}\right] & =\mathbb{E}\left[\left\Vert \sum _{i=\lfloor k/K_{C} \rfloor K_{C}}^{k} e_{i}\right\Vert ^{2}\right] =\sum _{i=\lfloor k/K_{C} \rfloor K_{C}}^{k}\mathbb{E}\left[\Vert e_{i}\Vert ^{2}\right]\\
     & \leqslant \frac{\sigma ^{2}}{n_{\lfloor k/K_{C} \rfloor K_{C} ,g}} +4L_g\sum _{i=\lfloor k/K_{C} \rfloor K_{C} +1}^{k}\mathbb{E}\left[\frac{\Vert d_{i}\Vert ^{2}}{n_{i,g}}\right]
    \end{align*}
    For $ \alpha \in [ 1,3/2)$, if we take 
    \begin{equation*}
    n_{k,g} =\begin{cases}
    O\left( k^{4/( 3-2\alpha )}\right) & k\ \bmod K_C=0,\\
    O\left(\Vert d_{k}\Vert ^{2} K_{C}( \lfloor k/K_{C} \rfloor K_{C})^{4/( 3-2\alpha )}\right) & k\ \bmod K_C\neq 0,
    \end{cases}
    \end{equation*}
    we have $ \mathbb{E}\left[\Vert \nabla F( x_{k}) -v_{k}\Vert ^{2}\right]^{\alpha /2} \leqslant O\left( 1/k^{1/( 3/( 2\alpha ) -1)}\right) .$ Following the proof of \cref{thm:rate}, we get  $ F\left( x^{kK_{C}}\right) -F^* =O\left(( kK_{C})^{-2\alpha /( 3-2\alpha )}\right) $.
    
    Next, we estimate the expected sample complexity. By \cref{lemma:descent} we have,
    \begin{align*}
    \sum _{i=( k-1) K_{C}}^{kK_{C} -1}\Vert d_{i}\Vert ^{2} & \leqslant \sum _{i=( k-1) K_{C}}^{kK_{C} -1}\left(\frac{6}{{L_H}}\left( F\left( x^{i}\right) -F\left( x^{i+1}\right) +\epsilon {_{\text{eig}}}^{3/2} +\epsilon _{\text{ls}}^{3} +\Vert H_{i} -\hat{H}_{i}\Vert ^{3} +\Vert g_{i} -\hat{g}_{i}\Vert ^{3/2}\right)\right)^{2/3}\\
     & \leqslant O\left(\mathbb{E}\left[\sum _{i=( k-1) K_{C}}^{kK_{C} -1}\left( F\left( x^{i}\right) -F\left( x^{i+1}\right)\right)^{2/3} +\epsilon _{\text{eig}} +\epsilon _{\text{ls}}^{2} +\Vert H_{i} -\hat{H}_{i}\Vert ^{2} +\Vert g_{i} -\hat{g}_{i}\Vert \right]\right)\\
     & \leqslant O\left( K_{C}^{1/3}\left( F\left( x^{( k-1) K_{C}}\right) -F\left( x^{kK_{C}}\right)\right)^{2/3} +K_{C}( kK_{C})^{-2/( 3-2\alpha )}\right)\\
     & =O\left( K_{C}^{1/3} \cdotp ( kK_{C})^{-4\alpha /( 9-6\alpha )} +K_{C}( kK_{C})^{-2/( 3-2\alpha )}\right) ,
    \end{align*}
    where we use $ F\left( x^{kK_{C}}\right) -F^* =O\left(( kK_{C})^{-2\alpha /( 3-2\alpha )}\right) $ by \cref{thm:rate}. We get
    \begin{equation*}
    \mathbb{E}\left[\sum _{i=( k-1) K_{C}}^{kK_{C} -1}\Vert d_{i}\Vert ^{2}\right] =O\left( K_{C}^{1/3}( K_{C} k)^{\frac{-4\alpha }{3( 3-2\alpha )}}\right) .
    \end{equation*}
     Thus,
    \begin{align*}
    \mathbb{E}\left[\sum _{k=1}^{K} n_{k,g}\right] & =O\left(\sum _{k=1}^{\lceil K/K_{C} \rceil }( K_{C} k)^{\frac{4}{3-2\alpha }} +K{_{C}}^{4/3}\sum _{k=1}^{\lceil K/K_{C} \rceil }\frac{( kK_{C})^{4/( 3-2\alpha )}}{( K_{C} k)^{4\alpha /( 3( 3-2\alpha ))}}\right)\\
     & =O\left( K^{1+4/( 3-2\alpha )} /K_{C} +K{_{C}}^{1/3} K^{1+( 12-4\alpha ) /( 3( 3-2\alpha ))}\right) .
    \end{align*}
    Taking $ K_{C} =\mathcal{O}(K)$ and using $ \mathcal{O}(K) =O\left( \epsilon ^{1-3/( 2\alpha )}\right) $, we get the result: we need $ O\left( \epsilon ^{1-3/( 2\alpha )}\right)$ iterations to reach an $ \epsilon $-approximate point, and the expected sample complexity is $ O\left(K^{4/(3-2\alpha)}\right) =O\left( \epsilon ^{-2 /\alpha}\right)$.

       
    \end{proof}
\section{A Brief Introduction of Reinforcement Learning}\label{sec:rl}
An MDP $ \mathcal{M}$ is specified by a tuple $(\mathcal{S}, \mathcal{A}, P, r, \gamma, \rho)$, where $\mathcal{S}$ is the state space; $\mathcal{A}$ is the action space; $P : \mathcal{S} \times \mathcal{A} \mapsto \Delta(\mathcal{S})$ is the transition function with $\Delta(\mathcal{S})$ the space of probability distributions over $\mathcal{S}$, and $P(s' \mid s, a)$ denotes the probability of
transitioning into state $s'$ upon taking action $a$ in state $s$; $r: \mathcal{S} \times \mathcal{A} \mapsto [0,1]$ is the reward function, and $r(s, a)$ is the immediate reward associated with taking action $a$ in state $s$; $\gamma \in (0, 1)$ is the discount factor; $\rho \in \Delta(\mathcal{S})$ is the initial state distribution. At the $ k$-th step, the agent is at state $ s_{k}$ and picks an action $ a_{k} \in \mathcal{A}$ from the action space. Then, the environment gives reward $ r_{k}$ to the agent and transits to the next state $ s_{k+1}$ with probability $ P ( s_{k+1} \mid s_{k} ,a_{k})$.

The parametric policy $\pi_\theta$ is a probability distribution over $\mathcal{S} \times \mathcal{A}$ with parameter $\theta \in \mathbb{R}^d$, and $\pi_\theta(a \mid s)$ denotes the probability of taking action $a$ at a given state $s$. For example, one may consider the softmax policy, given by 
\begin{equation*}
    \pi_{\theta}(a \mid s) = \frac{ \exp \left( \theta_{s, a}\right)}{\sum_{a' \in \mathcal{A}} \exp \left( \theta_{s, a'}\right)}
\end{equation*}
where the parameter space is $\theta \in \mathbb{R}^{|\mathcal{S}||\mathcal{A}|}$. Let $\tau=\left\{s_t, a_t\right\}_{t \geq 0} $ be the trajectory generated by the policy $\pi_\theta$, and $p(\tau \mid \pi_{\theta})$ be the probability of the trajectory $\tau$ being sampled from $\pi_{\theta}$. Then, we have
\begin{equation*}
    p(\tau \mid \pi_{\theta})=\rho\left(s_0\right) \prod_{t=0}^{\infty} \pi_{\theta}\left(a_t \mid s_t\right)P\left(s_{t+1} \mid s_t, a_t\right), 
\end{equation*}
and the expected return of $\pi_{\theta}$ is 
\begin{equation*}
    J\left(\pi_\theta\right):=\mathbb{E}_{\tau \sim p\left(\cdot \mid \pi_\theta\right)}\left[\sum_{t=0}^{\infty} \gamma^t r\left(s_t, a_t\right)\right].
\end{equation*}
Assume that $\pi_{\theta}$ is differentiable with respect to $\theta$, and denote $J(\theta) = J(\pi_{\theta})$ for simplicity. The goal of reinforcement learning is to find 
\begin{equation*}
    \theta^* = \arg\max_{\theta} J(\theta).
\end{equation*}
However, although $J(\theta)$ is differentiable, it is non-concave in general, which makes it difficult to find the global optimal solution. With two common assumptions, namely the non-degenerate Fisher matrix \citep{agarwal2021theory, yuan2022general} and the transferred compatible function approximation error \citep{agarwal2021theory}, one can prove that 
\begin{equation*}
    J^* - J(\theta) \leq \tau \|\nabla J(\theta)\| + \epsilon'
\end{equation*}
where $J^* \coloneqq \max J(\theta)$. Hence, $J(\theta)$ satisfies the weak gradient dominance property with $\alpha = 1$, as well as \cref{assum:smooth} and \cref{asp:var}. We list the two assumptions here for completeness, and the proof can be found in \citet{masiha2022stochastic, yuan2022general}.
\begin{assumption}[Fisher-non-degenerate]
    For all $\theta \in \mathbb{R}^d$, there exists $\mu_F>0$ such that the Fisher information matrix $F_\rho(\theta)$ induced by policy $\pi_\theta$ and initial distribution $\rho$ satisfies
\begin{equation*}
F_\rho(\theta):=\mathbb{E}_{(s, a) \sim v_\rho^{\pi_\theta}}\left[\nabla_\theta \log \pi_\theta(a \mid s) \nabla_\theta \log \pi_\theta(a \mid s)^T\right] \succeq \mu_F I_{d \times d},
\end{equation*}
where $v_\rho^{\pi_\theta}(s, a):=(1-\gamma) \mathbb{E}_{s_0 \sim \rho} \sum_{t=0}^{\infty} \gamma^t \mathbb{P}\left(s_t=s, a_t=a \mid s_0, \pi_\theta\right)$ is the state-action visitation measure.
\end{assumption}
\begin{assumption}[Transferred compatible function approximation error]
For all $\theta \in \mathbb{R}^d$, there exists $\epsilon_{\text {bias }}>0$ such that the transferred compatible function approximation error with $(s, a) \sim v_\rho^{\pi_{\theta^*}}$ satisfies
\begin{equation*}
\mathbb{E}\left[\left(A^{\pi_\theta}(s, a)-(1-\gamma) u^{* T} \nabla_\theta \log \pi_\theta(a \mid s)\right)^2\right] \leq \epsilon_{\text {bias }},
\end{equation*}
where $v_\rho^{\pi_{\theta^*}}$ is the state-action distribution induced by an optimal policy, and $u^*=\left(F_\rho(\theta)\right)^{\dagger} \nabla J(\theta)$.
\end{assumption}
We remark that \cite{masiha2022stochastic} also uses this reinforcement learning setting to study the performance of SCRN for the gradient-dominated function with $\alpha = 1$, which shows that our numerical experiment's setting is standard. Practically, we cannot compute $J(\theta)$ due to the infinite horizon length. Hence, we resort to truncated trajectories with the horizon length $H$, and focus on 
\begin{equation*}
    \max_{\theta} \,\, J_{H}(\theta)  \coloneqq \mathbb{E}_{\tau \sim p\left(\cdot \mid \pi_\theta\right)}\left[\sum_{t=0}^{H-1} \gamma^t r\left(s_t, a_t\right)\right].
\end{equation*}
We seek a stationary point $\hat{\theta}$, that is, 
\begin{equation*}
    \| \nabla J_{H}(\hat{\theta})\| \leq \epsilon.
\end{equation*}
Note that
\begin{equation*}
    \nabla J_H(\theta)=\mathbb{E}_{\tau \sim p\left(\cdot \mid \pi_\theta\right)} \left[\sum_{h=0}^{H-1} \Psi_h(\tau) \nabla \log \pi_\theta\left(a_h \mid s_h\right)\right]
\end{equation*}
where $\Psi_h(\tau) = \sum_{t=h}^{H-1} \gamma^t r(s_t, a_t) $. In practice, we cannot compute the full gradient because it is intractable to average over all possible trajectories $\tau$. Thus, we construct an empirical estimate by sampling different trajectories. Suppose that we sample $m$ trajectories $\tau^i=\left\{s_t^i, a_t^i\right\}_{0 \leq t \leq H}, 1 \leq i \leq m$. Then the resulting unbiased gradient estimator is 
\begin{equation*}
    \hat{\nabla} J_{H}(\theta) = \frac{1}{m} \sum_{i=1}^m \sum_{h=0}^{H-1} \Psi_h\left(\tau^i\right) \nabla \log \pi_\theta\left(a_h^i \mid s_h^i\right).
\end{equation*}
 The vanilla policy gradient (VPG) updates $\theta$ by $\theta \leftarrow \theta + \eta \hat{\nabla} J_{H}(\theta)$, where $\eta > 0 $ is the step size.  

When considering the second-order information, we have 
\begin{equation*}
    \nabla^2 J_{H}(\theta) = \mathbb{E}_{\tau \sim p\left(\cdot \mid \pi_\theta\right)} \left[\nabla \Phi(\theta ; \tau) \nabla \log p\left(\tau \mid \pi_\theta\right)^T+\nabla^2 \Phi(\theta ; \tau)\right],
\end{equation*}
where $\Phi(\theta ; \tau)=\sum_{h=0}^{H-1} \sum_{t=h}^{H-1} \gamma^t r\left(s_t, a_t\right) \log \pi_\theta\left(a_h \mid s_h\right)$. As a result, for trajectories $\tau^i=\left\{s_t^i, a_t^i\right\}_{0 \leq t \leq H}, 1 \leq i \leq m$, we have the following unbiased estimator of the Hessian matrix $\nabla^2 J_{H}(\theta)$:
\begin{equation*}
    \hat{\nabla}^2 J_{H}(\theta)=\frac{1}{m} \sum_{i=1}^m \left[\nabla \Phi\left(\theta ; \tau^i\right) \nabla \log p\left(\tau^i \mid \pi_\theta\right)^T+\nabla^2 \Phi\left(\theta ; \tau^i\right)\right].
\end{equation*}
With $ \hat{\nabla} J_{H} (\theta)$ and $ \hat{\nabla}^2 J_{H}(\theta) $ in hand, we can use our stochastic HSODM to find $\hat{\theta}$.

\begin{figure}
    \centering
    \includegraphics[scale=0.4]{figs/additional/halfcheeta_compare_ppo.png} 
    \includegraphics[scale=0.4]{figs/additional/walker_compare_ppo.png}
    \caption{The $x$-axis and $y$-axis represent respectively system probes and the average return. The solid curves depict the mean values of five independent simulations, while the shaded areas correspond to the standard deviation.}
    \label{fig:addi-rl}
\end{figure}

\section{Additional Experiments}

In this section, we further compare the performance of SHSODM with PPO in several RL tasks (\cref{fig:addi-rl}). For a clean demonstration, we only include TRPO as another benchmark. When running PPO, we let its key parameter \texttt{lc\_clip\_range} take values in $\{ 0.1, 0.2, 0.4, 0.6, 0.8 \}$, and plot the one with the best performance on average return. It is clear that our SHSODM even has better final performance than PPO, although it improves more slowly than PPO at the initial stage. One possible explanation is that our SHSODM uses the second-order information of the objective function, while PPO and TRPO both use the second-order information of the constraint (TRPO involves the constraint that two consecutive policies should not be far away from each other, and PPO penalizes this constraint in the objective function). The information of the objective function may be more useful and can induce better policies.

When comparing the time spent on calculating the update direction, we include TRPO, rather than its variant PPO, as a benchmark. The reason is that PPO uses a first-order optimizer, while TRPO can be seen as a ``second-order'' method. As shown in \cref{fig:addi-time}, our SHSODM updates faster than TRPO over the tested environments and is consistently better than SCRN.
\begin{figure}[H]
    \centering
    \includegraphics[scale=0.5]{figs/additional/time.png}
    \caption{The $x$-axis represents three different tested environments, including \texttt{HalfCheetah-v2}, \texttt{Hopper-v2}, and \texttt{Walker2d-v2}. The $y$-axis presents the total time required to obtain the update direction in $10^3$ epochs.}
    \label{fig:addi-time}
\end{figure}