\documentclass[accepted]{uai2023} % after acceptance, for a revised
\usepackage[american]{babel}
\usepackage{xr}
\externaldocument[app-]{ganesh_669-supp}
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsthm}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{lemmma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{corollary}[theorem]{Corollary}
\usepackage{bm}
\usepackage{multirow}
\usepackage{caption} 
\captionsetup[table]{skip=10pt}
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{array}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage[]{color-edits}
\addauthor{rd}{purple}
\addauthor{sg}{teal}
\addauthor{gt}{blue}
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\input{notation}

\usepackage{chngcntr}
\usepackage{booktabs}
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{tabularx}
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{amsmath}
\usepackage{amsmath}
\usepackage{float}
\usepackage{graphicx}
\usepackage{subfig}
\usepackage{amssymb}
\usepackage{bbm}

% Shorthand for the "+"/"-" subscripted symbols mu, w and v.
\newcommand{\mup}{\mu_+}
\newcommand{\mun}{\mu_-}
% NOTE: \renewcommand is used here, so \wp must already exist at this point;
% after this line \wp expands to w_+ rather than its earlier meaning.
\renewcommand{\wp}{w_+}
\newcommand{\wn}{w_-}
\newcommand{\vp}{v_+}
\newcommand{\vn}{v_-}
% Upright "diag". \renewcommand requires \diag to be defined already
% (presumably by the \input{notation} above -- TODO confirm).
\renewcommand{\diag}{\textnormal{diag}}

% Theorem-like environments whose printed name and number are supplied by hand.
% `innercustomgeneric` is the underlying amsthm environment; its heading text is
% whatever \customgenericname currently expands to (empty by default).
\newtheorem{innercustomgeneric}{\customgenericname}
\providecommand{\customgenericname}{}
% \newcustomtheorem{<env>}{<Name>} defines an environment <env> that takes one
% mandatory argument. On entry it sets the heading name to <Name>, overrides the
% printed counter \theinnercustomgeneric with that argument, and then opens
% `innercustomgeneric`. Usage: \begin{customthm}{2.1} ... \end{customthm}.
\newcommand{\newcustomtheorem}[2]{%
  \newenvironment{#1}[1]
  {%
   \renewcommand\customgenericname{#2}%
   \renewcommand\theinnercustomgeneric{##1}%
   \innercustomgeneric
  }
  {\endinnercustomgeneric}
}

% Concrete instances: manually numbered "Theorem" and "Lemma" environments.
\newcustomtheorem{customthm}{Theorem}
\newcustomtheorem{customlemma}{Lemma}

% \vertiii{<expr>}: wraps <expr> in triple vertical bars that auto-size via
% \left/\right; the -0.25ex kerns pull the three bars closer together.
\newcommand{\vertiii}[1]{{\left\vert\kern-0.25ex\left\vert\kern-0.25ex\left\vert #1 
    \right\vert\kern-0.25ex\right\vert\kern-0.25ex\right\vert}}


\title{Does Momentum Help in Stochastic Optimization? \\A Sample Complexity Analysis.}

\author[1]{ Swetha Ganesh \thanks{Equal Contribution}}
\author[2]{Rohan Deb$^*$\thanks{Part of the research was done while RD was a Project Associate with GT at the Indian Institute of Science, Bangalore.}}
\author[1]{Gugan Thoppe}
\author[3]{Amarjit Budhiraja}
\affil[1]{%
Department of Computer Science and Automation\\
Indian Institute of Science, Bengaluru, India
}
\affil[2]{%
Department of Computer Science\\
University of Illinois Urbana-Champaign, USA
}
\affil[3]{%
Department of Statistics and Operations Research\\
University of North Carolina at Chapel Hill, USA
}


\begin{document}
\maketitle

\begin{abstract}
 Stochastic Heavy Ball (SHB) and Nesterov's Accelerated Stochastic Gradient (ASG) are popular momentum methods in optimization. While the benefits of these acceleration ideas in deterministic settings are well understood, their advantages in stochastic optimization are unclear. Several works have recently claimed that SHB and ASG always help in stochastic optimization. Our work shows that i.) these claims are either flawed or one-sided (e.g., consider only the bias term but not the variance), and ii.) when both these terms are accounted for, SHB and ASG do not always help. Specifically, for \textit{any} quadratic optimization, we obtain a lower bound on the sample complexity of SHB and ASG, accounting for both bias and variance, and show that the vanilla SGD can achieve the same bound.
\end{abstract}

\section{Introduction}
\label{sec_introduction}

In deterministic convex optimization (when one has access to exact gradients), Gradient Descent (GD) is a popular optimization algorithm \citep{cauchy}. In practice, though, exact gradients are not available and one has to rely on their noisy estimates. This brings forth the idea of Stochastic Gradient Descent (SGD). 
Two classic momentum methods used to accelerate GD are Heavy Ball (HB) \citep{polyak_heavy_ball, polyakbook, qian} and Nesterov's Accelerated Gradient (NAG) \citep{nesterov, nesterovbook, Nesterov05}. Naturally, these momentum-based methods and their variants have also gained significant interest in stochastic settings \citep{sutskever13, Nitanda2014, Chonghai}. However, our work shows that the stochastic variants of HB and NAG, i.e., the Stochastic Heavy Ball (SHB) and Nesterov's Accelerated Stochastic Gradient (ASG), are not always better than the vanilla SGD for any quadratic optimization. Specifically, we provide conditions for which the sample complexities of SHB and ASG are never better than that of SGD\footnote{Sample complexity refers to the number of iterations required to reach an $\epsilon$-ball around the solution. Our statement holds for all sufficiently small $\epsilon.$}.

We elaborate on the discussion above. The benefit of using momentum in (deterministic) quadratic optimization is the following. Suppose the driving matrix has condition number $\kappa.$ Then, for any $\epsilon > 0,$ GD with an optimal constant stepsize\footnote{Throughout, we only consider algorithms with constant stepsizes, which are widely popular in practice.} % and ensure faster convergence.} 
converges to an $\epsilon$-close solution in $\mathcal{O}(\kappa\log\frac{1}{\epsilon})$ iterations. In contrast, both HB and NAG with optimal stepsize and momentum parameters only need $\mathcal{O}(\sqrt{\kappa}\log\frac{1}{\epsilon})$ steps; see, e.g.,  \citep{recht2010cs726}.
\begin{figure*}[!bt]
    \centering
    \subfloat[Stepsize SGD,  SHB, ASG: 0.0025
    ]
    {
    \label{sub-figure(a)}
    {\includegraphics[width=5.5cm]{plot_a_v2.png}}
    }
    \subfloat[
    Stepsize SGD: 0.02; SHB, ASG: 0.0025
    ]{
    \label{sub-figure(b)}
    {\includegraphics[width=5.5cm]{plot_b_v2.png}}
    }%
     %\quad
    \subfloat[
     Stepsize - SGD, SHB, ASG: 0.02
    ]{
    \label{sub-figure(c)}
    {\includegraphics[width=5.5cm]{plot_c_v2.png}}
    }%
    \caption{Comparison of SGD, SHB, and ASG's performances for a 2D quadratic optimization problem (see Section~\ref{app-s:Simulation.Details} for details) for the different stepsize choices given above and $\epsilon$-threshold = 5 (denoted by the black horizontal line). 
    }
    \label{figure}%
\end{figure*}
Our main claim here is that momentum does not lead to similar advantages in stochastic settings. We use Figure~\ref{figure} to provide an intuitive justification for this claim. The setup is as follows.  We consider a quadratic optimization problem (see Section~\ref{app-s:Simulation.Details} for the details) and ensure that only a noisy estimate of its gradient is available in each iteration. This problem is solved using SGD, SHB, and ASG and the three panels show how the Mean Squared Error (MSE) decays for different stepsize and momentum parameter choices. Note that these parameters, once chosen, are fixed, i.e., they do not change from one iteration to the other. 

In stochastic settings, the MSE at any time instant for each of SGD, SHB, and ASG can be broken down into two components: bias and variance. The bias dictates how fast the distance of the initial estimate to the solution is forgotten, while the variance represents a cumulative effect of the noise seen so far. When constant stepsize and momentum parameters are used, the bias decays exponentially fast while the variance converges to some positive constant; this implies the MSE also converges to this constant. Both the rate at which the bias decays and the constant to which the variance converges are influenced by the stepsize and momentum parameter choices. 

With the above picture in mind, Figure~\ref{figure} illustrates how SHB and ASG's performance can be matched by SGD. Figure~\ref{sub-figure(a)} corresponds to the case where the same stepsize is used in all three algorithms. In this case, the MSE for the momentum-based methods (SHB, ASG) decreases faster initially, but eventually settles at a higher limiting value. Accordingly, one may conjecture that SHB and ASG {would} have a better sample complexity if the $\epsilon$-threshold for the MSE is set above this limit (one such choice of $\epsilon$ in this example is {$5$}). However, Figure~\ref{sub-figure(b)} shows that SGD enjoys a similar performance for a larger stepsize choice. This time one may conjecture that SHB and ASG's performance can be improved if their stepsizes are also increased similarly. Figure~\ref{sub-figure(c)} shows the case where the stepsize for the momentum-based methods is increased to match the new stepsize for SGD. Unfortunately, while the MSE for the momentum-based methods does decrease faster initially, it also settles at a value that is higher than the threshold that we had set before, i.e., {5}. 

\textbf{Related Works:} Some recent results \citep{Loizou, MJ, Assran, zhu2} 
%look at this question in the constant stepsize setup and
claim that SHB and ASG methods are better than SGD in quadratic or least-squares settings. 
However, \citet{Loizou} needs a strong assumption on noise, which \citet[Section 6]{rahul} claim is information-theoretically impossible even in the simple least squares regression problem. The other results either are based on a one-sided analysis \citep{zhu2}\footnote{This work only considers bias, while ignoring variance}  or have a flaw \citep{MJ, Assran}; see Appendix \ref{app-App_A}. 

On the other hand, there are also a few recent negative results on these momentum methods.
{\citet{devolder2014first} make a similar conclusion to ours in the context of (deterministic) proximal gradient methods and their accelerated variants for smooth convex optimization, when the function can be estimated only up to some (non-random) fixed inaccuracy.}
\citet{ucla} 
show that SHB and ASG are equivalent to SGD with a rescaled stepsize. 
However, this result requires that the stepsize be sufficiently small and the momentum parameter be away from $1.$ 
{ \citet{Kangqiao_Liu} obtain an expression for the asymptotic variance of SHB and show that it can be matched by that of vanilla SGD with a rescaled stepsize. However, this discussion holds only in an asymptotic sense: it compares the final size of the ball in which the iterates with or without momentum settle, but not the number of iterations needed to reach such a ball. In fact, the asymptotic variance estimate does not provide any information about the sample complexity.}
In \citep{rahul, liu}, for one specific instance of the least squares regression with {\em vanishing noise}, it is shown that the performance of SHB and ASG cannot be better than that of SGD.  
Finally, \cite{nqm} consider SHB for quadratic objectives in the same noisy setting as our work and provide upper bounds on the rate at which the objective function decreases. They also argue that rescaled SGD performs as well as SHB and demonstrate it empirically, but fall short of rigorously establishing a lower bound that supports their claim. 

SHB and ASG have also been studied in the decreasing stepsize setting. \cite{ghadimi15} had given the first global convergence of SHB for quadratic objectives while \cite{yang,gadat,Orvieto} gave a.s. convergence rates for convex objectives. In \citep{sebbouh21}, improved bounds on both SGD and SHB have been provided, as compared to previously known bounds. %Yet their bound on SHB is not shown to be better than that on SGD.
\cite{Hu,ghadimi,xio} study Nesterov's momentum under a decreasing stepsize setting and show that though the momentum scheme accelerates the convergence of the iterates in the initial part, the acceleration is lost in the asymptotic regime. \citet{vaswani} study ASG with a decreasing momentum parameter and show a linear convergence to the optimal point. However, the noise at any stationary point vanishes to zero in their setting.
Finally, we also note that other momentum methods have been studied in \citep{katyusha,proximal,defazio14,Johnson,roux} that can provably be shown to have a better performance than SGD.

The current literature can thus be summarized as follows. 

\textbf{Research Gap}: Existing works on SHB and ASG fall into two groups: i.) positive -  where the results claim advantages of these methods over SGD and ii.) negative - where the results claim the opposite. Results in the positive group either have a one-sided or a flawed analysis, while the ones in the negative  apply only in some  restricted settings. 

\textbf{Key Contribution:} Our work belongs to the negative group: SHB and ASG do not have an advantage over SGD. Specifically, for {\em all quadratic optimization problems} with persistent noise (noise variance is sufficiently bounded away from zero) and any sufficiently small $\epsilon > 0,$ we show that the number of iterations needed by SHB and ASG to find an $\epsilon$-optimal solution is not better than that needed by SGD. More technically, we obtain a lower bound on the sample complexities of SHB and ASG (Theorem \ref{Lower_bound}) and show that it is of the same order as the corresponding upper bound for SGD (Proposition \ref{Theorem_OTS_mom}). 
Our proof techniques are also significantly different from those used in existing lower bounds such as \citep{rahul,liu}. This is because, under non-vanishing noise, the expected error contains an additional term that cannot be accounted for from their analyses (see Remark \ref{R3}).


\section{Main Results}
\label{sec:main}

We state our main results here that provide lower and upper bounds on the sample complexities of SHB and ASG. We use these bounds along with those of SGD to show that all these methods need a similar effort to find an $\epsilon$-optimal solution. 

Throughout, we consider minimizing
\begin{equation}
    \label{e:obj.fn}
    f(x) = \frac{1}{2}x^TAx - b^Tx + c,
\end{equation}
%
where $A$ is some symmetric $d \times d$ matrix, $b \in \bR^d,$ and $c \in \bR.$ The update rules for standard algorithms such as SHB, ASG, and SGD for solving this problem can be jointly expressed as 
%
\begin{align}
    \label{sgd-m}
    x_{n} = {} & x_{n-1} + \alpha(b - Ax_{n-1} + M_{n}) \nonumber \\
    & +\eta(I_d-\alpha\beta A)(x_{n-1} - x_{n-2}) \\
    \label{sgd-m-2}
    = {} & x_{n-1} + \alpha(b - A(x_{n-1}+\eta \beta (x_{n-1} -x_{n-2})) + M_{n})\nonumber\\ 
    & + \eta (x_{n-1} - x_{n-2})
\end{align}
%
with $x_{-1}= x_0$. The notation $I_d$ is the $d\times d$ identity matrix, and $M_{n} \in \mathbb{R}^d$ is noise. Henceforth, we will refer to the above generic algorithm as Linear Stochastic Approximation with Momentum (LSA-M). 
Note that LSA-M is equivalent to SGD (if $\eta = 0$ in \eqref{sgd-m}), to SHB (if $\beta = 0$ in \eqref{sgd-m}), and to ASG (if $\beta=1$ in \eqref{sgd-m-2}).

%
 

We make the following assumption on the driving matrix.
\begin{assumption}[\textbf{Driving matrix property}]
\label{A1}
$A$ is real symmetric and all its eigenvalues are positive.
\end{assumption}
We also denote the eigenvalues of $A$ by $\lambda_{\max} = \lambda_1 \geq \lambda_{2} \geq \ldots \geq \lambda_{d} = \lambda_{\min}$. 
Under the above assumption, one would expect the iterates in \eqref{sgd-m} to go to a neighborhood of $x^* := A^{-1} b.$

We next state two assumptions on the noise sequence $(M_n)$: the first is used in Theorem~\ref{Lower_bound}, while the other is used in Proposition~\ref{Theorem_OTS_mom} and Corollary~\ref{Upper_bound_cor}. The notation $A \succeq B$ means $A-B$ is positive semi-definite.

\begin{assumption}[\textbf{Noise attributes for Theorem~\ref{Lower_bound}}]
\label{A2}
$(M_{n})$ is a martingale difference sequence with respect to the filtration $(\mathcal{F}_{n})$, where $\mathcal{F}_{n} = \sigma(x_{m},M_{m};m\leq n)$. Further, $\exists K > 0$ such that $\mathbb{E}[M_{n+1} M_{n+1}^T|\mathcal{F}_{n}]\succeq K I_d$ a.s. $\forall n \geq 0.$
\end{assumption}
\begin{assumption}[\textbf{Noise attributes for Proposition~\ref{Theorem_OTS_mom}}]
\label{A3}
$(M_n)$ is a martingale difference sequence with respect to the filtration $(\mathcal{F}_{n})$, where $\mathcal{F}_{n} = \sigma(x_{m},M_{m};m\leq n)$. Further, $\exists K \geq 0$ such that  $\mathbb{E}[\|M_{n+1}\|^{2}|\mathcal{F}_{n}] \leq K(1+\|x_{n}-x^{*}\|^2)$ a.s. $\forall n \geq 0.$
\end{assumption}
Assumptions~\ref{A2} and \ref{A3} are standard \citep{mandt, jas, cheng, Borkar_Book}. The first of these holds if and only if all the eigenvalues of $\E[M_{n + 1} M_{n + 1}^T |\cF_n]$ are bounded from below by $K,$ i.e., noise is persistent (or non-vanishing) in all directions. On the other hand, Assumption~\ref{A3} requires that the trace of $\E[M_{n + 1} M_{n + 1}^T |\cF_n]$ be bounded from above. This bound can scale with $\|x_n - x^*\|$ and need not vanish near $x^*.$

Next, we define sample complexity to quantify the effort required by LSA-M to obtain an $\epsilon$-close solution to $x^*.$

\begin{definition}[\textbf{Sample Complexity}]
The sample complexity of \eqref{sgd-m} is the minimum number of iterations $n_0$ such that the expected error $\mathbb{E}[\|x_n-x^*\|^2] \leq \epsilon$,  $ \forall n \geq n_0$. 
%
\end{definition}
\begin{table*}[t]
\begin{center}
\begin{tabular}{ |c|c|c|c| }
\hline
 Method &$\beta$ & $\eta$ & $\alpha$ \\ \hline
\multirow{3}{*}{SGD}
 & & & \\
 & - & 0 & $\min\Big(\frac{\lambda_{\min}}{\frac{3}{4}\lambda_{\min}^{2} + C^{2}K}, \frac{\epsilon\lambda_{\min}}{4C^2K}, \frac{2}{\lambda_{\max}+\lambda_{\min}} \Big) $ \\
 & & & \\

 \hline
\multirow{1}{*}{SHB}
 & 0 & $ \left(1-\frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2} $ &$ \min\Big((\frac{\lambda_{\min}^{3/2}}{\frac{3}{8}\lambda_{\min}^{2} + 25C^2K})^{2}, (\frac{\epsilon(\lambda_{\min})^{3/2}}{200 C^2 K})^2,$ \\ 
 & &  &  $(\frac{2}{\sqrt{\lambda_{\min}}+\sqrt{\lambda_{\max}}})^2\Big) $ \\
 & & & \\
  
 \hline
 \multirow{1}{*}{ASG}
 & 1 & $\frac{ \left(1-\frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2}}{(1-\alpha\lambda_{\min})}$ & $\min\Big((\frac{\lambda_{\min}^{3/2}}{\frac{3}{8}\lambda_{\min}^{2} + 25C^2K})^{2}, (\frac{\epsilon(\lambda_{\min})^{3/2}}{200 C^2 K})^2,\frac{1}{\lambda_{\max}}\Big)$\\
 & &  &   \\
  
 \hline
\end{tabular}
\\
\caption{Parameter choices for Proposition
\ref{Theorem_OTS_mom}. Here $C=1$ when the matrix $A$ is symmetric and $C = \frac{\sqrt{d}}{\sigma_{\min}(S)\sigma_{\min}(S^{-1})}$ when $A$ is not symmetric, where $\sigma_{\min}(\cdot)$ denotes the smallest singular value and $S$ is the matrix that diagonalizes $A$, i.e., $S^{-1}AS = D$, a diagonal matrix. {When $A$ is symmetric, indeed the three parameter choices correspond to SGD, SHB and ASG.} We stick to the same naming convention even when the driving matrix $A$ is not symmetric.}
\label{table}
\end{center}
\end{table*}

To enable easy comparison between different algorithms, we shall look at the order of their sample complexities. Towards that, we shall use the notation $n_0 \in \Theta(t)$ to imply that there exist constants $c_1$ and $c_2$ (independent of $t$) such that $c_1 t \leq n_0 \leq c_2 t $. 
The notation $\tilde{\Theta}(t)$ has a similar meaning but hides the dependence on logarithmic terms.
{Further,  $n_0 \in \Omega(t)$ implies there exists $c_1$ such that $n_0 \geq c_1 t$ and $n_0 \in \mathcal{O}(t)$ implies there exists $c_2$ such that $n_0 \leq c_2 t$.}

\begin{theorem}
    \label{Lower_bound}
    \textbf{(Lower bound on sample complexity).} Consider the LSA-M update rule \eqref{sgd-m}, and suppose Assumptions \ref{A1} and \ref{A2} hold. Then there exists an $\epsilon' > 0$ such that, for any $\epsilon \in (0,\epsilon')$
    and for any choice of $\alpha > 0,$ $\beta \in [0,1],$ and $\eta \in [0,1]$, the expected error $\mathbb{E}[\|x_{n_{0}} - x^{*}\|^2]\geq\epsilon$ for $n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_{\min}^2}\right).$ The constant $K$ here is the one from Assumption~\ref{A2}.
    % \rddelete{Thus, $n_0$ is a lower bound on the sample complexity of LSA-M.}
\end{theorem}
See Section~\ref{sec:proof_lower_bound} for the proof of the above Theorem.
\begin{remark}
    \label{R1}
    As stated below \eqref{sgd-m-2}, LSA-M includes SHB and Nesterov's ASG method as special cases and, hence, the above result directly applies to them. In fact, this is the first lower bound on SHB and ASG's sample complexities in quadratic optimization.
\end{remark}

\begin{remark}
\label{R3}
    The lower bounds in \cite{rahul} and \cite{liu} are obtained by viewing the expected error in SHB and ASG iterates for least squares as update rules of the form $z_{n + 1} = P z_n$ for some matrix $P$ (see \cite[Appendix~A, p.~16]{rahul} and \cite[Appendix~C, p.~12]{liu}). In particular, they obtain bounds on the eigenvalues of $P$ to get the desired claim. In contrast, the error relations for SHB and ASG methods in our setup (quadratic optimization with persistent noise) have the form $z_{n + 1} = P z_n + \alpha W_n$ for some matrix $P$ and vector $W_n$ (cf.~\eqref{recursion}). This forces us to develop a new proof technique that jointly looks at both these terms and shows that at least one of them remains larger than $\epsilon$ for the choice of $n_0$ given in Theorem~\ref{Lower_bound}. 
\end{remark}
%

We next state our upper bound on the sample complexity of \eqref{sgd-m} in Proposition~\ref{Theorem_OTS_mom} and Corollary~\ref{Upper_bound_cor}. Similar bounds already exist in the literature when $A$ is assumed to be symmetric and the noise is assumed to be iid with variance bounded by a constant \citep{zhu2,nqm}. Here, we show that a similar upper bound holds under more general settings: i.) $A$ is not symmetric but is diagonalizable and has real positive eigenvalues, and ii.) the noise is a martingale difference sequence satisfying Assumption~\ref{A3}. 

\begin{proposition}
\label{Theorem_OTS_mom} Consider the LSA-M update rule \eqref{sgd-m}, and suppose $A$ is a (not necessarily symmetric) real diagonalizable matrix with real positive eigenvalues\footnote{When $A$ is not symmetric, LSA-M \emph{cannot} be viewed as a gradient-based algorithm for minimizing \eqref{e:obj.fn}. However, the update rule still makes sense, and it can be seen as one that is useful for solving $Ax = b.$}. Further suppose Assumption~\ref{A3} holds. 
Then, $\forall \epsilon > 0$, there exists a choice of
$\alpha$, $\beta$ and $\eta$ (see Table~\ref{table} for exact values) such that the expected error $\mathbb{E}[\| x_{n}-x^*\|^2] \leq \epsilon$, $\forall n > n_{0}$, where
%
\begin{enumerate}
    \item[(i)] $n_{0} \in \Tilde{\Theta}(\frac{1}{{\alpha\lambda_{\min}}}),$ when $\eta = 0,$ and 
    \item[(ii)] $n_{0} \in \Tilde{\Theta}(\frac{1}{\sqrt{\alpha\lambda_{\min}}}),$ when $\eta > 0$.
\end{enumerate}
\end{proposition}

For the proof see Appendix~\ref{app-sec:appendix_proof_upper_bound}.

From Table \ref{table}, we see that $\alpha$ is a minimum of three terms in each case. The first term arises due to the unbounded noise (Assumption \ref{A3}), the second due to the target neighborhood $\epsilon$  and the third from the optimal choice of stepsize in the deterministic (no noise scenario) case. Since the bound on $n_0$ provided in Proposition \ref{Theorem_OTS_mom} is in terms of $\alpha$, the minimum of the three terms dictates the sample complexity. Note that $\epsilon$ only influences the middle term in all the choices of $\alpha$ given in Table~\ref{table}. 

Let $\bar{\epsilon}> 0$ be such that, for any $\epsilon \in (0, \bar{\epsilon}),$ the value of $\alpha$ equals the middle term in each of the three cases in Table~\ref{table}. Then the following result is immediate. 

%
\begin{corollary}[\textbf{Upper bound on sample complexity}]
    \label{Upper_bound_cor}
    Consider the LSA-M update rule \eqref{sgd-m}, and suppose $A$ is as in Proposition~\ref{Theorem_OTS_mom}. Further, suppose Assumption \ref{A3} holds. Then, for the choice of parameters in Table~\ref{table}, and any $\epsilon \in (0, \bar{\epsilon}),$ $\exists n_0 \in \Tilde{\Theta}\left(\frac{K}{\epsilon\lambda_{\min}^2}\right)$ such that $\mathbb{E}[\| x_{n}-x^*\|^2] \leq \epsilon$, $\forall n \geq n_{0}$. The constant $K$ here is the one from Assumption~\ref{A3}.
\end{corollary}

\begin{remark}
    \label{R4}
    From Corollary \ref{Upper_bound_cor}, we see that the upper bounds on the sample complexities of SGD, SHB, and ASG match the lower bound given in Theorem \ref{Lower_bound} for small enough $\epsilon>0.$ In particular, since an upper bound on the sample complexity of SGD matches a lower bound for SHB and ASG, these latter methods do not always outperform SGD from a sample complexity perspective.
\end{remark}


\begin{remark}
\label{R5_new}
    Consider $\epsilon$  small enough such that the minimum in choice of $\alpha$ is achieved by the second term in Table \ref{table}. For SGD, the stepsize $\alpha \in \Theta(\frac{\epsilon\lambda_{\min}}{K})$ is larger than the choice of stepsize for SHB and ASG, $\alpha \in \Theta(\frac{\epsilon^2\lambda_{\min}^3}{K^2})$. Observe that SGD chooses a larger stepsize than SHB and ASG to reach the $\epsilon$ ball. Therefore, although momentum methods appear to have a better performance than SGD if the same stepsize is chosen, SGD can match this performance by re-scaling its stepsize (see Figure \ref{figure}).
\end{remark}

\begin{remark}
    \label{R5}
    When the noise is assumed to be bounded by a constant, i.e., $\mathbb{E}[\|M_{n+1}\|^{2}|\mathcal{F}_{n}] \leq K$ a.s. in Assumption~\ref{A3}, the first term in the choice of $\alpha$ in Table~\ref{table} does not appear for all three methods. Under such an assumption, if $\epsilon$ is large enough or $K$ is small enough such that the third term in the choice of $\alpha$ is the minimum, then the sample complexity of both SHB and ASG is better than SGD. We emphasize that such improvements are lost when the noise variance is large or the neighbourhood under consideration is small.
\end{remark}

\section{Proof of the Lower Bound (Theorem \ref{Lower_bound})}\label{sec:proof_lower_bound}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
We begin by defining the transformed iterates $\Tilde{x}_{n} = x_{n} - x^*$ and rewriting \eqref{sgd-m} as
%
\begin{equation}
\label{recursion}
    \Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n},
\end{equation}
where
\(\Tilde{X}_{n} \triangleq
\begin{pmatrix}
    \Tilde{x}_{n} \\
    \Tilde{x}_{n-1}      
\end{pmatrix},
W_{n} \triangleq
\begin{pmatrix}
    M_{n}\\
    0
\end{pmatrix}\)
and
\[
P \triangleq
\begin{pmatrix}
    I_{d} - \alpha A + \eta(I_{d}-\alpha\beta A) & -\eta(I_{d}-\alpha\beta A)   \\
    I_{d} & 0      
\end{pmatrix}. 
\]
%
We derive the bound in Theorem \ref{Lower_bound} by obtaining a lower bound for the error expression $\E[\|\tilde{X}_{n}\|^2]$.

The proof can be summarized by the following key steps.
%
\begin{enumerate}
    \item[{1.}] Transform $\tilde{X}_n$ to obtain $\tilde{Y}_n$ (see \eqref{e:tildeYn_tildeWn.Defn}). Decompose the $2d$-dimensional update rule for $\tilde{Y}_n$ (see  \eqref{eq:y_tilde}) into $d$ separate two-dimensional update rules (see \eqref{eq:y_tilde_i}) using a block diagonalization argument. 
    
    
    % Decompose the original 2d-dimensional update rule for $\tilde{X}_n$ in \eqref{recursion} into $d$ separate two-dimensional update rules for $\tilde{Y}_n$ in \eqref{eq:y_tilde_i} using a block diagonalization argument. 
    
    \item[{2.}] For each of the two-dimensional components of $\tilde{Y}_n$ (denoted $\tilde{Y}_n^{(i)},$ $i = 1, \ldots, d),$ obtain a lower bound on the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$. We do this using the following three steps. 
    %
    \begin{enumerate}
        \item[{(a)}] Decompose the error into two components: one that captures the impact of the initialization (\emph{bias}), and the other that concerns the effect of the cumulative noise (\emph{variance}); {see Lemma~\ref{bias-variance-lemma}}. 
        
        \item[{(b)}] Use the above decomposition to derive a lower bound on $\bE \|\tilde{Y}_n^{(i)}\|^2$ for the special case of $\beta = 0$. The core idea is to show that the \emph{bias} and the \emph{variance} in $\tilde{Y}^{(i)}_n$ cannot be simultaneously small;  {see Lemma~\ref{lower_main_lemma}}.
       
        \item[{(c)}] Generalize the result to $\beta \in [0, 1]$ case by showing that it can be reduced to the former case.     
    \end{enumerate}
    \item[{3.}] Use the lower bound on $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$ from Step 2 to obtain a lower bound on the original error $\mathbb{E}\| \tilde{x}_n\|^2.$ This proves  the desired result for SHB with $\beta = 0$ and ASG with $\beta = 1.$   
\end{enumerate}
Next we describe the technical results involved in each of the above steps.
\begin{enumerate}
\item[\textbf{1.}]
\textbf{Reducing the $2d$-dimensional updates into $d$ separate two-dimensional updates.}

We follow a block diagonalization argument as in \citep{MJ} to transform the update rule \eqref{recursion} below.

\begin{lemma}
    \label{lemma_step_1}
    There exists a transformation matrix $Z$ and a block diagonal matrix $B = \diag(B_i),$ where $B_i \in \R^{2 \times 2},$ so that
    %
    \begin{equation}
    \label{e:tildeYn_tildeWn.Defn}
        \tilde{Y}_n = Z \tilde{X}_n \qquad \text{and} \qquad \tilde{W}_n = Z W_n
    \end{equation} 
    %
    satisfy
    %
    \begin{align}
    \label{eq:y_tilde}
        \tilde{Y}_n = B \tilde{Y}_{n-1} + \alpha \tilde{W}_{n}.
    \end{align}
    %
    In particular, if we break $\tilde{Y}_n$ into $d$ disjoint components of $2$-dimensional vectors, then the $i$-th component
    %
    \begin{align}
    \label{eq:y_tilde_i}
        \tilde{Y}_{n}^{(i)} \!\!=\!\! \begin{pmatrix}
        1  - \alpha \lambda_{i} + \eta' & -\eta'\\
        1 & 0
    \end{pmatrix} \!\Tilde{Y}_{n-1}^{(i)} \! + \! \alpha \tilde{W}_n^{(i)}
    \end{align}
    %
    where $\eta' = \eta(1-\alpha\lambda_i\beta)$.
    %
\end{lemma}

See Section~\ref{d_reduction} for the proof. Notice that the driving matrix $B$ in the transformed update rule \eqref{eq:y_tilde} is a block diagonal matrix unlike the driving matrix $P$ in \eqref{recursion}. In the next step we exploit this structure to lower bound $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$.

    \item[\textbf{2.}] \textbf{Bounding the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$.}
    
    We consider the two dimensional decoupled update given in \eqref{eq:y_tilde_i} for a specific $i$ and express the lower bound on the sample complexity with respect to $\lambda_i$. 
    
    \item[\textbf{(a)}] \textbf{Decompose the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$ as a sum of bias and variance.} 

    First observe that the update from Lemma \ref{lemma_step_1} can be re-written as
\begin{gather}
\label{decomposed_recursion}
    \Tilde{Y}_{n}^{(i)} = B_i^n \Tilde{Y}_{0}^{(i)} + \alpha \sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}_{j+1}^{(i)}.
\end{gather}
Taking the square of the norm on both sides of the above equation we get
\begin{equation}
\label{norm-sq}
    \begin{split}
        &\|\Tilde{Y}_{n}^{(i)}\|^{2} = \underbrace{\|B_i^{n}\Tilde{Y}_{0}^{(i)}\|^{2}}_{I} \\
        & + \underbrace{2\alpha \left(B_i^{n}\Tilde{Y}_{0}^{(i)}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}^{(i)}_{j+1}\right)}_{II}\\
        & + \underbrace{\alpha^{2} \left(\sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}^{(i)}_{j+1}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}^{(i)}_{j+1}\right)}_{III}.\\
    \end{split}
\end{equation}
Using the fact that $(\tilde{W}_n) = (Z{W}_n)$ is a martingale difference sequence, it can be shown that expectation of term $II$ is 0 and that of term $III$ is $\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n-1} \|B_{i}^{n-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]$ (See Section \ref{proof_lemma_bias-variance-lemma} for details). This leads to the following lemma.
\begin{lemma}
    \label{bias-variance-lemma}
    For the update in \eqref{decomposed_recursion} the error can be decomposed as follows:
    \begin{align}
    \label{eq:bias_variance_y_tilde}
        \mathbb{E}\| \tilde{Y}_n^{(i)}\|^2 \nonumber
        &= \underbrace{\| B_{i}^n  \Tilde{Y}_0^{(i)} \|^2}_{Bias} \\
        & \qquad + \underbrace{\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n-1} \|B_{i}^{n-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]}_{Variance}.
        %&\geq \| P^n  \Tilde{X}_{0} \|^2 + \alpha^2  K \sum_{j=0}^{n-1} \|P^{j} e_1 \|^2,
    \end{align}
\end{lemma}
See Section~\ref{proof_lemma_bias-variance-lemma} for the proof.
The \emph{bias} and \emph{variance} here correspond to that of the $i$-th block of the transformed iterates in \eqref{eq:y_tilde_i}.

\item [\textbf{(b)}] \textbf{Bounding the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$ for $\boldsymbol{\beta=0}$.}

Using the fact that $\eta' = \eta$ when $\beta = 0$, the update in \eqref{eq:y_tilde_i} reduces to
\begin{align*}
    \tilde{Y}_{n}^{(i)} = 
     \begin{pmatrix}
         1  - \alpha \lambda_{i} + \eta & -\eta\\
         1 & 0
    \end{pmatrix} \Tilde{Y}_{n-1}^{(i)}+ 
    \alpha 
    \tilde{W}_n^{(i)}.
\end{align*}
We show that for every sufficiently small $\epsilon > 0$ there exists some $n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_i^2}\right)$ at which either the \emph{bias} or the \emph{variance} is larger than $\epsilon$. This is established in the following key lemma.

\begin{lemma}
\label{lower_main_lemma}
Let $\epsilon'_i = \min\left(\frac{K}{32 \lambda_i^2},\frac{(\tilde{x}_0^{(i)})^2}{72}\right)$. Then for any $\epsilon \in (0,\epsilon'_i)$,
 and any $\alpha > 0,$ $\beta = 0$, $\eta \in [0,1],$ there exists 
\( n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_i^2}\right),\) such that at least one of the following statements holds:
%
\begin{enumerate}
    \item $\| B_{i}^{n_0}  \tilde{Y}_0^{(i)} \|^2 > \epsilon$

    \item $\displaystyle\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]  > \epsilon.$
\end{enumerate}
%
\end{lemma}

See Section~\ref{proof_lower_main_lemma} for the proof. Lemma \ref{lower_main_lemma} along with Lemma \ref{bias-variance-lemma} immediately provides a lower bound on the error, i.e., $\E\|\tilde{Y}_{n_0}^{(i)}\|^2 > \epsilon$ for $\beta=0$. 
Lemma \ref{lower_main_lemma} is the core of the lower bound analysis and the proof is provided in Section \ref{proof_lower_main_lemma}. 


\item[\textbf{(c)}] \textbf{Extending (b) to the case $\boldsymbol{\beta \in (0,1]}$.}

We complete Step 2 by extending Lemma~\ref{lower_main_lemma} to the case when $\beta\in[0,1]$ as formalized below.
\begin{lemma}
\label{lower_main_lemma_general_beta}
Let $\epsilon'_i$ be defined as in Lemma~\ref{lower_main_lemma}. Then for any $\epsilon \in (0,\epsilon'_i)$,
 and any $\alpha > 0,$ $\beta \in [0,1]$, $\eta \in [0,1],$ there exists 
\( n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_i^2}\right),\) such that at least one of the following statements holds:
%
\begin{enumerate}
    \item $\| B_{i}^{n_0}  \tilde{Y}_0^{(i)} \|^2 > \epsilon$

    \item $\displaystyle\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]  > \epsilon.$
\end{enumerate}
%
\end{lemma} 
See Section~\ref{lemma_b_general} for the proof. Note that the general $\beta \in [0,1]$ update rule in \eqref{eq:y_tilde_i} is equivalent to the $\beta=0$ update with $\eta$ redefined as $\eta'$ and therefore we can re-use Lemma~\ref{lower_main_lemma} if we can ensure $\eta' \in [0,1]$. We show this holds when $\alpha\lambda_i\leq1$. For the case $\alpha\lambda_i > 1$, we show that the \emph{variance} term is greater than $\epsilon$ thus implying the conclusion of Lemma~\ref{lower_main_lemma}.

\item[\textbf{3.}]\textbf{Bounding the original error $\mathbb{E}[\|\tilde{X}_{n}\|^2]$}.

{Recall that the original update rule is given by}
\begin{align*}
    \Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n}.
\end{align*}
To provide a bound on the error $\mathbb{E}[\|\tilde{X}_{n}\|^2] $, we invoke Lemma~\ref{lower_main_lemma_general_beta} together with Lemma~\ref{bias-variance-lemma} for $i = d$ and $\lambda_{d} = \lambda_{\min}$ and use the fact that $Z$ is an orthogonal matrix. We have
\begin{align*}
   \mathbb{E}[\|\tilde{X}_{n_0}\|^2] &= \E[\|Z^{-1}\tilde{Y}_{n_0}\|^2]\nonumber\\
   &= \mathbb{E}[\|\tilde{Y}_{n_0}\|^2]\geq \mathbb{E}[\|\tilde{Y}_{n_0}^{(d)}\|^2]\geq \epsilon
\end{align*}
for all $\epsilon \in (0,\epsilon'_d)$ and for
{$n_0$ as defined in Lemma~\ref{lower_main_lemma} with $\lambda_i$ substituted with $\lambda_{\min}$}.

Now to obtain a bound for $\mathbb{E} \|\tilde{x}_n\|^2$ from $\mathbb{E} \|\tilde{X}_n\|^2$, we note that 
\begin{align*}
    2\max{(\|\tilde{x}_n\|^2,\|\tilde{x}_{n-1}\|^2)} &\geq \|\tilde{x}_n\|^2 + \|\tilde{x}_{n-1}\|^2 \\
    &= \|\tilde{X}_{n}\|^2.
\end{align*}
Therefore the lower bound on $\E[\|\tilde{X}_{n}\|^2]$ is enough to prove Theorem \ref{Lower_bound}. Choosing $\epsilon' = \epsilon_d'$ and noting that $n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_{\min}^2}\right)$ completes the proof of Theorem~\ref{Lower_bound}.
 \end{enumerate}

\subsection{Proof of Lemma \ref{lemma_step_1}}
\label{d_reduction}
%
We first discuss how the update rule for $\tilde{Y}_n$ in \eqref{eq:y_tilde} can be obtained using that of $\tilde{X}_n$ in \eqref{recursion}. Towards this, we define
$D = \text{diag}(\lambda_i)_{i=1}^d.$ 
% \begin{align*}
%     B &:= \begin{pmatrix}
%         B_1 & 0 &\ldots & 0\\
%         0 & B_2 & \ddots & \vdots\\
%         \vdots & \ddots & \ddots & 0 \\
%         0 & \ldots & 0 & B_d
%     \end{pmatrix} \\
% \end{align*}
% where $B_{i} = 
%     \begin{pmatrix}
%         1 + \eta - \alpha \lambda_{i} & -\eta\\
%         1 & 0
%     \end{pmatrix}.$
Since $A$ is real symmetric (see Assumption~\ref{A1}), it has a spectral decomposition of the form $A = S^{-1} D S$ with $S$ orthogonal, so that $S A S^{-1} = D$. We define the transformation matrix $Z$ as
\begin{align}
    \label{Z_tranform}
    Z=E_{2d\times 2d}
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix}
\end{align}
where $E_{2d\times 2d}$ is the permutation matrix that changes the order $(1, 2, \ldots , 2d)$ into $ (1, d+1, 2, d+2, \ldots , d, 2d)$.

Since $\Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n},$ we get
\begin{align*}
    \tilde{Y}_{n} &=Z\Tilde{X}_{n} = Z P \Tilde{X}_{n-1} + \alpha Z W_{n} \\
    &= Z P Z^{-1}\Tilde{Y}_{n-1}+ \alpha Z W_{n}
    = B \Tilde{Y}_{n-1}+ \alpha Z W_{n} \\
    %
    & = B \Tilde{Y}_{n-1}+ \alpha \tilde{W}_{n},
\end{align*}
%
as desired. The last but one equality follows because $ZPZ^{-1}=B,$ which itself holds since
%
\begin{gather*}
    Z P Z^{-1} = E_{2d \times 2d} \begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} P 
\begin{pmatrix}
    S^{-1} & 0 \\
    0 & S^{-1}
\end{pmatrix} E_{2d \times 2d}^{-1}\\
\stackrel{(a)}{=}  E_{2d \times 2d} \underbrace{\begin{pmatrix}
        I_{d \times d}-\alpha D +\eta I_{d \times d} & -\eta I_{d \times d} \\
        I_{d \times d} & 0_{d \times d}
    \end{pmatrix}}_{\Gamma} E_{2d \times 2d}^{-1} \\
    \stackrel{(b)}{=}  B.
\end{gather*}
Here $(a)$ follows from the spectral decomposition of $A$, which diagonalizes each $d \times d$ block of $P$.
Further $(b)$ follows because the left multiplication of $E_{2d \times 2d}$ to $\Gamma$ changes the order of rows from $(1, 2, \ldots , 2d)$ to $ (1, d+1, 2, d+2, \ldots , d, 2d)$ and the right multiplication of $E_{2d \times 2d}^{-1} = E_{2d \times 2d}^{T}$ changes the order of columns from $(1, 2, \ldots , 2d)$ to $ (1, d+1, 2, d+2, \ldots , d, 2d)$ which exactly results in $B$. 

% Therefore we have
% \begin{align*}
%     \tilde{Y}_n = B\tilde{Y}_{n-1} + \alpha \tilde{W}_n, 
% \end{align*}
% where $\tilde{W}_n = Z {W}_n$, which completes the proof.

To see why \eqref{eq:y_tilde_i} holds, let

$\tilde{Y}_n = \begin{pmatrix}
    \tilde{Y}_n^{(1)} \\
    \tilde{Y}_n^{(2)} \\
    \vdots \\
    \tilde{Y}_n^{(d)} \\
\end{pmatrix}$ and 
$\tilde{M}_{n} =\begin{pmatrix}
    \tilde{M}_{n,1} \\
    \tilde{M}_{n,2} \\
    \vdots \\
    \tilde{M}_{n,d} \\
\end{pmatrix} = S M_{n}$ ,
where $\tilde{Y}_{n} \in \R^{2d},\tilde{Y}_n^{(i)} \in \mathbb{R}^2$, $\tilde{M}_{n} \in \R^{d}, \tilde{M}_{n,i} \in \mathbb{R}$. Now notice that 
\begin{align}
\label{eq:ZWn}
    ZW_n &= E_{2d\times 2d}
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} \begin{pmatrix}
    M_n\\
    0
\end{pmatrix}\\
\nonumber
&= E_{2d\times 2d} \begin{pmatrix}
    \tilde{M}_n\\
    0
\end{pmatrix} = \begin{pmatrix}
    \tilde{M}_{n,1} \\
    0\\
    \tilde{M}_{n,2} \\
    0\\
    \vdots \\
    \tilde{M}_{n,d} \\
    0\\
\end{pmatrix},
\end{align}
where the last equality follows because the left multiplication of $E_{2d\times 2d}$ changes the order of rows from $(1, 2, \ldots , 2d)$ to $ (1, d+1, 2, d+2, \ldots , d, 2d)$. Therefore,  $\forall i \in [d],$
$$\tilde{Y}_{n}^{(i)} = B_i \Tilde{Y}_{n-1}^{(i)}+ \alpha \tilde{W}_n^{(i)}$$
where $\tilde{W}_n^{(i)} = \begin{pmatrix}
    \tilde{M}_{n,i} \\
    0
\end{pmatrix}$.
\subsection{Proof of Lemma \ref{bias-variance-lemma}}
\label{proof_lemma_bias-variance-lemma}
Recall the error expression from \eqref{norm-sq}: 
%
\begin{align*}
        &\|\Tilde{Y}_{n}^{(i)}\|^{2} = \underbrace{\|B_i^{n}\Tilde{Y}_{0}^{(i)}\|^{2}}_{I} \\
        & + \underbrace{2\alpha \left(B_i^{n}\Tilde{Y}_{0}^{(i)}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)}_{II}\\
        & + \underbrace{\alpha^{2} \left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)}_{III}.\\
\end{align*}
%
Since $\tilde{W}_n = Z{W}_n,$  it follows that $(\tilde{W}_n)$ is also a martingale difference sequence w.r.t. the filtration $(\mathcal{F}_n),$ where $\mathcal{F}_n$ is as in Assumption~\ref{A2}. In particular, since $\mathbb{E}[\tilde{W}_{n}^{(i)}] = 0$ for each $n,$ we get that the expectation of Term $II$ is $0.$ With regards to Term $III,$ we have
%
\begin{align*}
    &\alpha^{2} \left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right) \\
    &= \alpha^2 \sum_{j,k} (\tilde{W}^{(i)}_{j+1})^T (B_i^{(n-1-j)})^{T} B_i^{(n-1-k)}\tilde{W}^{(i)}_{k+1}\\
    &= \underbrace{\alpha^2 \sum_{j \neq k} (\tilde{W}^{(i)}_{j+1})^T (B_i^{(n-1-j)})^{T} B_i^{(n-1-k)}\tilde{W}^{(i)}_{k+1}}_{III(a)}\\
    &\qquad \qquad + \underbrace{\alpha^2\sum_{j}\| B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1} \|^2}_{III(b)}
\end{align*}
 We now show that the expectation of $III(a)$ is 0. Without loss of generality, suppose $j<k$. Then,
\begin{equation*}
    \begin{split}
        & \mathbb{E}\left[(\tilde{W}^{(i)}_{j+1})^{T}(B_i^{(n-1-j)})^T B_i^{(n-1-k)} \tilde{W}^{(i)}_{k+1}\right]\\
        & =\mathbb{E}\left[\mathbb{E}\left[(\tilde{W}^{(i)}_{j+1})^{T}(B_i^{(n-1-j)})^T B_i^{(n-1-k)} \tilde{W}^{(i)}_{k+1}\vert\mathcal{F}_{k}\right]\right]\\
        & = \mathbb{E}\left[(\tilde{W}^{(i)}_{j+1})^{T}(B_i^{(n-1-j)})^{T}B_i^{(n-1-k)}\mathbb{E}[\tilde{W}^{(i)}_{k+1}|\mathcal{F}_{k}]\right]
        = 0.
    \end{split}
\end{equation*}

Therefore, taking expectation on both sides of \eqref{norm-sq} gives
\begin{align}
    \label{bias_variance}
    \mathbb{E}\| \tilde{Y}_n^{(i)}\|^2 \nonumber
    &= \underbrace{\| B_i^n  \Tilde{Y}_0^{(i)} \|^2}_{I} \\
    &+  \mathbb{E}\Big[ \underbrace{\alpha^2\sum_{j=0}^{n-1} \|B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1} \|^2}_{III(b)} \Big].
\end{align}

\subsection{Proof of Lemma \ref{lower_main_lemma}}
\label{proof_lower_main_lemma}
This is the key result in the lower bound proof. Here we outline the main steps involved in proving the result. The detailed proofs of all the auxiliary lemmas are deferred to Appendix~\ref{app-App_thm3}.

Before we proceed with the main proof, we provide a lower bound on the \emph{variance} term in the following lemma.
\begin{lemma}
\label{variance_lower_bound}
    Under Assumption \ref{A2} and $n_0$ as in Lemma~\ref{lower_main_lemma}, the variance term in \eqref{eq:bias_variance_y_tilde} can be lower bounded as follows:
    \begin{align*}
        \alpha^2\mathbb{E}\left[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \right] \geq \alpha^2K \sum_{j=0}^{n_0-1}  \|B_{i}^{j}e_1 \|^2
    \end{align*}
    where $e_1 = \begin{pmatrix}
    1\\ 0
\end{pmatrix}$ and $K$ is as in Assumption~\ref{A2}.
\end{lemma}

For convenience we redefine the term in the right hand side of the above inequality as the \emph{variance}. If $\alpha$ and $\eta$ are such that $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 > \epsilon$, then Lemma~\ref{lower_main_lemma} immediately follows for this choice of $\alpha$ and $\eta$. We now consider the case where $\alpha$ and $\eta$ are such that $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$. Now we show that for this choice of $\alpha$ and $\eta$, the \emph{variance} is necessarily greater than $\epsilon$. Let $\mu^{(i)}_{+}$ and $\mu^{(i)}_{-}$ be the eigenvalues of $B_i$. It is easy to check that
\begin{equation}
\begin{aligned}
\label{mu_def}
    \mu_{+}^{(i)} = \frac{1}{2}\left((1 - \alpha\lambda_{i} + \eta) + \Delta^{(i)}\right)\\
    \mu_{-}^{(i)} = \frac{1}{2}\left((1 - \alpha\lambda_{i} + \eta) - \Delta^{(i)}\right)
\end{aligned}
\end{equation}
where $\Delta^{(i)} = \sqrt{(1 - \alpha\lambda_{i} + \eta)^2 - 4\eta}$.

Recall that $\epsilon \in (0,\epsilon_i')$ in Lemma~\ref{lower_main_lemma} and therefore $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$ implies $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 < \epsilon_i'$.
The following lemma provides a lower bound on the \emph{variance} in terms of the eigenvalues of $B_i$ and the momentum parameter $\eta$, assuming the \emph{bias} is less than $\epsilon_i'$.
\begin{lemma}[]
\label{Aux_lemma_1}
    Let $\alpha>0$ and $\eta \in [0,1]$ be such that $\|B_{i}^{n_0} \tilde{Y}_0^{(i)} \|^2 < \epsilon'_i.$ Then 
    % we have the following bound, 
    \begin{align*}
        \alpha^2 K \sum_{j=0}^{n_0-1} \|B_i^{j}e_{1}\|^2 \geq \frac{\alpha^2 K}{2(1-\mu_+)(1-\mu_-)(1-\eta)}.
    \end{align*}
\end{lemma}

It can be shown from \eqref{mu_def} that $(1-\mu_+)(1-\mu_-) = 1 - (\mu_+ + \mu_-) + \mu_+\mu_- = \alpha\lambda_{i}$ and therefore the RHS in the above expression reduces to $\frac{\alpha K}{2\lambda_i(1-\eta)}$. We define the following function
\begin{align*}
    Q(\eta;\alpha,\lambda_{i}) \equiv \frac{\alpha K}{2\lambda_{i}(1-\eta)}\frac{1}{(1-\rho(B_i))}
\end{align*}
where $\rho(B_{i}) = |\mu_{+}^{(i)}|$ is the spectral radius of $B_i$. 
Note that $\rho(B_i)$ depends on $\eta$ (see \eqref{mu_def}). 
Now to obtain a further lower bound on the \emph{variance} we optimize over the choice of $\eta$ and show that 
\begin{align*}
    Q(\eta;\alpha,\lambda_{i}) \geq \frac{K}{16 \lambda_{i}^2}
\end{align*}
Combining this with the definition of $Q$ and Lemma \ref{Aux_lemma_1} gives the following bound:
\begin{align*}
    \alpha^2 K \sum_{j=0}^{n_0-1} \Big\|B_i^{j}e_{1}\Big\|^2 \geq \frac{K}{16 \lambda_i^2}(1-\rho(B_i))
\end{align*}
The following lemma proves all these above claims.
\begin{lemma}
\label{Aux_lemma_2}
    Let $\alpha>0$ and $\eta \in [0,1]$ be such that $\|B_{i}^{n_0} \tilde{Y}_0^{(i)} \|^2 < \epsilon'_i.$ Then we have the following bound
    $\alpha^2 K \sum_{j=0}^{n_0-1} \|B_i^{j}e_{1}\|^2 \geq \frac{K}{16 \lambda_{i}^2}(1-\rho(B_i)).$
\end{lemma}
Lastly, to show that the \emph{variance} is lower bounded by $\epsilon \in (0,\epsilon'_i)$, we need to show that $(1-\rho(B_i)) \geq \frac{16\lambda_{i}^2}{K}\epsilon$. The choice of $n_0$ and the fact that we assumed $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$ ensure exactly that. The following lemma proves this claim.

\begin{lemma}
\label{Aux_lemma_3}
For any $\epsilon \in (0,\epsilon_i')$, if $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$, then $1-\rho(B_i) \geq \frac{16\lambda_i^2}{K}\epsilon $.
\end{lemma}
This completes the proof of Lemma \ref{lower_main_lemma}.

\subsection{Proof of Lemma~\ref{lower_main_lemma_general_beta}}
\label{lemma_b_general}
We handle the cases $\alpha \lambda_i \leq 1$ and $\alpha \lambda_i > 1$ separately. 

\textbf{Case 1 ($\boldsymbol{\alpha \lambda_i \leq 1}$):} 
Observe that the general $\beta$ update rule in \eqref{eq:y_tilde_i} is equivalent to the $\beta=0$ update with $\eta$ redefined as $\eta'$. Moreover in this case $\eta' \in [0,1]$. To see this first observe that 
\begin{align*}
    \eta' = \eta(1-\alpha\lambda_{i}\beta) \geq \eta (1-\beta) \geq 0.
\end{align*}
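Here the first inequality follows because $\alpha\lambda_i \leq 1$ and the second inequality follows because $\beta,\eta \in [0,1]$. Moreover, since $\alpha\lambda_i\beta \geq 0$, we also have $\eta' \leq \eta \leq 1$, and hence $\eta' \in [0,1]$.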

Therefore in this case Lemma~\ref{lower_main_lemma} holds with $\eta$ redefined as $\eta'$.

\textbf{Case 2 ($\boldsymbol{\alpha \lambda_i > 1}$):} 
In this case we show that the variance term is greater than $\epsilon$. This follows as shown below
\begin{align*}
 \alpha^2 \E\left[\sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \right] &\stackrel{(A)}{\geq}  \alpha^2 K \sum_{j=0}^{n_0-1}\|B_{i}^{j}e_{1}\|^2  \\
    &\stackrel{(B)}{\geq} \alpha^2 K \stackrel{(C)}{>}  \frac{K}{\lambda_{i}^2}  \stackrel{(D)}{>} \epsilon.
\end{align*}
Here $(A)$ follows from Lemma~\ref{variance_lower_bound}, $(B)$ follows from non-negativity of the norm and lower bounding the sum with the $j=0$ term (which equals $\|e_1\|^2 = 1$), and $(C)$ follows since $\alpha\lambda_i > 1$. Finally, $(D)$ follows because $\epsilon < \epsilon_i' \leq \frac{K}{32\lambda_{i}^2} < \frac{K}{\lambda_{i}^2}$, where $\epsilon_i'$ is as defined in Lemma~\ref{lower_main_lemma}. 

\section{Concluding Remarks}
\label{sec:conclusion}
In this work, we analyze the sample complexity of SHB and ASG and provide matching lower and upper bounds up to constants and logarithmic terms.
More importantly, we show that the same sample complexity bound can be obtained by standard SGD. Our work also calls into question some of the recent positive results in favour of SHB and ASG in the stochastic regime. We show that such improvements do not take into account all the terms involved in the error decomposition, or have major flaws. 
We emphasize that our results hold  specifically for SHB and ASG. Other momentum methods could offer provable improvements over SGD \citep{pmlr-v75-jain18a, liu}. 

% Although some other results do question the superiority of SHB and ASG in the stochastic regime, the assumptions and the setting that these works look at do not correspond to those in the positive results. We also emphasize that the negative results either only consider small stepsizes and momentum parameters not close to 1 or provide such results for specific instances in linear regression. In contrast, our work shows that SHB and ASG cannot obtain an improvement in sample complexity (for small neighbourhoods) over SGD for the entire family of quadratic optimization and holds for all stepsizes and momentum parameters in $[0,1]$.  

\section*{Acknowledgements}
SG was supported by the Prime Minister's Research Fellowship (PMRF).
{RD was supported in part by grants from the National Science Foundation (NSF) through awards IIS 21-31335, OAC 21-30835, DBI 20-21898, as well as a C3.ai research award.}
{GT was supported in part by DST-SERB's Core Research Grant CRG/2021/00833, in part by IISc Start-up grants SG/MHRD-19-0054 and SR/MHRD-19-0040, and in part by the ``Pratiksha Trust Young
Investigator'' award.}
{AB was supported in part by the NSF (DMS-2152577, DMS-2134107)}.

\bibliography{ganesh_669}

\end{document}
