\documentclass[accepted]{uai2023} % after acceptance, for a revised
\usepackage[american]{babel}
\usepackage{xr}
\externaldocument[app-]{ganesh_669-supp}
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsthm}
% Theorem-like environments (amsthm). `theorem` is numbered within each
% section; every other environment below shares the `theorem` counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{remark}[theorem]{Remark}
% NOTE(review): `lemmma` (three m's) duplicates `lemma` above -- same printed
% name, same counter. Looks like a typo; confirm it is unused before removing.
\newtheorem{lemmma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{corollary}[theorem]{Corollary}
\usepackage{bm}
\usepackage{multirow}
\usepackage{caption} 
\captionsetup[table]{skip=10pt}
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{array}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage[]{color-edits}
\addauthor{rd}{purple}
\addauthor{sg}{teal}
\addauthor{gt}{blue}
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; the optional <sep> defaults to "-".
% NOTE(review): looks like a leftover from the style template -- confirm it is unused.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\input{notation}

\usepackage{chngcntr}
\usepackage{booktabs}
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{tabularx}
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{amsmath}
\usepackage{amsmath}
\usepackage{float}
\usepackage{graphicx}
\usepackage{subfig}
\usepackage{amssymb}
\usepackage{bbm}

% Math-mode shorthands for "+" / "-" subscripted symbols (mu, w, v).
\newcommand{\mup}{\mu_+}
\newcommand{\mun}{\mu_-}
% \wp is a standard LaTeX symbol (Weierstrass p); it is deliberately overridden here.
\renewcommand{\wp}{w_+}
\newcommand{\wn}{w_-}
\newcommand{\vp}{v_+}
\newcommand{\vn}{v_-}
% Upright "diag" operator name. \renewcommand requires \diag to exist already;
% presumably it is defined in notation.tex (\input above) -- TODO confirm.
\renewcommand{\diag}{\textnormal{diag}}

% Manually numbered theorem-like environments.
%   \newcustomtheorem{<env>}{<Printed name>}
% defines an environment <env> taking one mandatory argument that is used
% verbatim as the displayed number, e.g. \begin{customthm}{2.1} ... \end{customthm}.
% Implementation: a single hidden amsthm environment `innercustomgeneric` is
% reused; on entry, each wrapper sets the printed name (\customgenericname) and
% overrides the printed counter (\theinnercustomgeneric) with the user's argument.
\newtheorem{innercustomgeneric}{\customgenericname}
\providecommand{\customgenericname}{}
\newcommand{\newcustomtheorem}[2]{%
  \newenvironment{#1}[1]
  {%
   \renewcommand\customgenericname{#2}%
   \renewcommand\theinnercustomgeneric{##1}%
   \innercustomgeneric
  }
  {\endinnercustomgeneric}
}

% Instances: explicitly numbered "Theorem" and "Lemma" environments.
\newcustomtheorem{customthm}{Theorem}
\newcustomtheorem{customlemma}{Lemma}

% \vertiii{<expr>}: wraps <expr> in three auto-sized vertical bars on each side
% (triple-bar norm notation); the negative kerns pull adjacent bars together.
\newcommand{\vertiii}[1]{{\left\vert\kern-0.25ex\left\vert\kern-0.25ex\left\vert #1 
    \right\vert\kern-0.25ex\right\vert\kern-0.25ex\right\vert}}


\title{Does Momentum Help in Stochastic Optimization? \\A Sample Complexity Analysis. (Supplementary material)}

\author[1]{ Swetha Ganesh \thanks{Equal Contribution}}
\author[2]{Rohan Deb$^*$\thanks{Part of the research was done while RD was a Project Associate with GT at the Indian Institute of Science, Bangalore.}}
\author[1]{Gugan Thoppe}
\author[3]{Amarjit Budhiraja}
\affil[1]{%
Department of Computer Science and Automation\\
Indian Institute of Science, Bengaluru, India
}
\affil[2]{%
Department of Computer Science\\
University of Illinois Urbana-Champaign, USA
}
\affil[3]{%
Department of Statistics and Operations Research\\
University of North Carolina at Chapel Hill, USA
}


\begin{document}
\maketitle

\begin{abstract}
 Stochastic Heavy Ball (SHB) and Nesterov's Accelerated Stochastic Gradient (ASG) are popular momentum methods in optimization. While the benefits of these acceleration ideas in deterministic settings are well understood, their advantages in stochastic optimization are unclear. Several works have recently claimed that SHB and ASG always help in stochastic optimization. Our work shows that i.) these claims are either flawed or one-sided (e.g., consider only the bias term but not the variance), and ii.) when both these terms are accounted for, SHB and ASG do not always help. Specifically, for \textit{any} quadratic optimization, we obtain a lower bound on the sample complexity of SHB and ASG, accounting for both bias and variance, and show that the vanilla SGD can achieve the same bound.
\end{abstract}

\section{Introduction}
\label{sec_introduction}

In deterministic convex optimization (when one has access to exact gradients), Gradient Descent (GD) is a popular optimization algorithm \citep{cauchy}. In practice, though, exact gradients are not available and one has to rely on their noisy estimates. This brings forth the idea of Stochastic Gradient Descent (SGD). 
Two classic momentum methods used to accelerate GD are Heavy Ball (HB) \citep{polyak_heavy_ball, polyakbook, qian} and Nesterov's Accelerated Gradient (NAG) \citep{nesterov, nesterovbook, Nesterov05}. Naturally, these momentum-based methods and their variants have also gained significant interest in stochastic settings \citep{sutskever13, Nitanda2014, Chonghai}. However, our work shows that the stochastic variants of HB and NAG, i.e., the Stochastic Heavy Ball (SHB) and Nesterov's Accelerated Stochastic Gradient (ASG), are not always better than the vanilla SGD for any quadratic optimization. Specifically, we provide conditions for which the sample complexities of SHB and ASG are never better than that of SGD\footnote{Sample complexity refers to the number of iterations required to reach an $\epsilon$-ball around the solution. Our statement holds for all sufficiently small $\epsilon.$}.

We elaborate on the discussion above. The benefit of using momentum in (deterministic) quadratic optimization is the following. Suppose the driving matrix has condition number $\kappa.$ Then, for any $\epsilon > 0,$ GD with an optimal constant stepsize\footnote{Throughout, we only consider algorithms with constant stepsizes, which are widely popular in practice.} % and ensure faster convergence.} 
converges to an $\epsilon$-close solution in $\mathcal{O}(\kappa\log\frac{1}{\epsilon})$ iterations. In contrast, both HB and NAG with optimal stepsize and momentum parameters only need $\mathcal{O}(\sqrt{\kappa}\log\frac{1}{\epsilon})$ steps; see, e.g.,  \citep{recht2010cs726}.
\begin{figure*}[!bt]
    \centering
    \subfloat[Stepsize SGD,  SHB, ASG: 0.0025
    ]
    {
    \label{sub-figure(a)}
    {\includegraphics[width=5.5cm]{plot_a_v2.png}}
    }
    \subfloat[
    Stepsize SGD: 0.02; SHB, ASG: 0.0025
    ]{
    \label{sub-figure(b)}
    {\includegraphics[width=5.5cm]{plot_b_v2.png}}
    }%
     %\quad
    \subfloat[
     Stepsize - SGD, SHB, ASG: 0.02
    ]{
    \label{sub-figure(c)}
    {\includegraphics[width=5.5cm]{plot_c_v2.png}}
    }%
    \caption{Comparison of SGD, SHB, and ASG's performances for a 2D quadratic optimization problem (see Section~\ref{app-s:Simulation.Details} for details) for the different stepsize choices given above and $\epsilon$-threshold = 5 (denoted by the black horizontal line). 
    }
    \label{figure}%
\end{figure*}
Our main claim here is that momentum does not lead to similar advantages in stochastic settings. We use Figure~\ref{figure} to provide an intuitive justification for this claim. The setup is as follows.  We consider a quadratic optimization problem (see Section~\ref{app-s:Simulation.Details} for the details) and ensure that only a noisy estimate of its gradient is available in each iteration. This problem is solved using SGD, SHB, and ASG and the three panels show how the Mean Squared Error (MSE) decays for different stepsize and momentum parameter choices. Note that these parameters, once chosen, are fixed, i.e., they do not change from one iteration to the other. 

In stochastic settings, the MSE at any time instance for each of SGD, SHB, and ASG can be broken down into two components: bias and variance. The bias dictates how fast the distance of the initial estimate to the solution is forgotten, while the variance represents a cumulative effect of the noise seen so far. When constant stepsize and momentum parameters are used, the bias decays exponentially fast while the variance converges to some (non-zero) positive constant; this implies the MSE also converges to this constant. Both the rate at which the bias decays and the constant to which the variance converges are influenced by the stepsize and momentum parameter choices.

With the above picture in mind, Figure~\ref{figure} illustrates how SHB and ASG's performance can be matched by SGD. Figure~\ref{sub-figure(a)} corresponds to the case where the same stepsize is used in all three algorithms. In this case, the MSE for the momentum-based methods (SHB, ASG) decreases faster initially, but settles at a higher limiting value eventually. Accordingly, one may conjecture that SHB and ASG {would} have a better sample complexity if the $\epsilon$-threshold for the MSE is set above this limit (one such choice of $\epsilon$ in this example is {$5$}).  However, Figure~\ref{sub-figure(b)} shows that SGD enjoys a similar performance for a larger stepsize choice. This time one may conjecture that SHB and ASG's performance can be improved if their stepsizes are also increased similarly. Figure~\ref{sub-figure(c)} shows the case where the stepsize for the momentum methods is increased to match the new stepsize for SGD. Unfortunately, while the MSE for the momentum-based methods does decrease faster initially, it also settles at a value that is higher than the threshold that we had set before, i.e., {$5$}.

\textbf{Related Works:} Some recent results \citep{Loizou, MJ, Assran, zhu2} 
%look at this question in the constant stepsize setup and
claim that SHB and ASG methods are better than SGD in quadratic or least-squares settings. 
However, \citet{Loizou} needs a strong assumption on noise, which \citet[Section 6]{rahul} claim is information-theoretically impossible even in the simple least squares regression problem. The other results either are based on a one-sided analysis \citep{zhu2}\footnote{This work only considers bias, while ignoring variance}  or have a flaw \citep{MJ, Assran}; see Appendix \ref{app-App_A}. 

On the other hand, there are also a few recent negative results on these momentum methods.
{\citet{devolder2014first} make a similar conclusion to ours in the context of (deterministic) proximal gradient methods and their accelerated variants for smooth convex optimization, when the function can be estimated only up to some (non-random) fixed inaccuracy.}
\citet{ucla} 
show that SHB and ASG are equivalent to SGD with a rescaled stepsize. 
However, this result requires that the stepsize be sufficiently small and the momentum parameter be away from $1.$ 
{ \citet{Kangqiao_Liu} obtain an expression  for the asymptotic variance for SHB and show that it can be matched by that of vanilla SGD with a rescaled stepsize. However, this discussion is only from an asymptotic sense and compares the final size of the ball where the iterates with or without momentum settle, but not the number of iterations needed to reach such a ball. In fact, the asymptotic variance estimate does not provide any information about the sample complexity.}
In \citep{rahul, liu}, for one specific instance of the least squares regression with {\em vanishing noise}, it is shown that the performance of SHB and ASG cannot be better than that of SGD.  
Finally, \cite{nqm} consider SHB for quadratic objectives in the same noisy setting as our work and provide upper bounds on the rate at which the objective function decreases. They also argue that rescaled SGD performs as well as SHB and demonstrate it empirically but fall short of rigorously coming up with a lower bound that supports their claim.

SHB and ASG have also been studied in the decreasing stepsize setting. \cite{ghadimi15} had given the first global convergence of SHB for quadratic objectives while \cite{yang,gadat,Orvieto} gave a.s. convergence rates for convex objectives. In \citep{sebbouh21}, improved bounds on both SGD and SHB have been provided, as compared to previously known bounds. %Yet their bound on SHB is not shown to be better than that on SGD.
\cite{Hu,ghadimi,xio} study Nesterov's momentum under a decreasing stepsize setting and show that though the momentum scheme accelerates the convergence of the iterates in the initial part, the acceleration is lost in the asymptotic regime. \citet{vaswani} study ASG with a decreasing momentum parameter and show a linear convergence to the optimal point. However, the noise at any stationary point vanishes to zero in their setting.
Finally, we also note that other momentum methods have been studied in \citep{katyusha,proximal,defazio14,Johnson,roux} that can provably be shown to have a better performance than SGD.

The current literature can thus be summarized as follows. 

\textbf{Research Gap}: Existing works on SHB and ASG fall into two groups: i.) positive -  where the results claim advantages of these methods over SGD and ii.) negative - where the results claim the opposite. Results in the positive group either have a one-sided or a flawed analysis, while the ones in the negative  apply only in some  restricted settings. 

\textbf{Key Contribution:} Our work belongs to the negative group: SHB and ASG do not have an advantage over SGD. Specifically, for {\em all quadratic optimization problems} with persistent noise (noise variance is sufficiently bounded away from zero) and any sufficiently small $\epsilon > 0,$ we show that the number of iterations needed by SHB and ASG to find an $\epsilon$-optimal solution is not better than that of SGD. More technically, we obtain a lower bound on the sample complexities of SHB and ASG (Theorem~\ref{Lower_bound}) and show that these are of the same order as the corresponding upper bound for SGD (Proposition~\ref{Theorem_OTS_mom}).
Our proof techniques are also significantly different from those used in existing lower bounds such as \citep{rahul,liu}. This is because, under non-vanishing noise, the expected error contains an additional term that cannot be accounted for from their analyses (see Remark \ref{R3}).


\section{Main Results}
\label{sec:main}

We state our main results here that provide lower and upper bounds on the sample complexities of SHB and ASG. We use these bounds along with those of SGD to show that all these methods need a similar effort to find an $\epsilon$-optimal solution. 

Throughout, we consider minimizing
\begin{equation}
    \label{e:obj.fn}
    f(x) = \frac{1}{2}x^TAx - b^Tx + c,
\end{equation}
%
where $A$ is some symmetric $d \times d$ matrix, $b \in \bR^d,$ and $c \in \bR.$ The update rules for standard algorithms such as SHB, ASG, and SGD for solving this problem can be jointly expressed as 
%
\begin{align}
    \label{sgd-m}
    x_{n} = {} & x_{n-1} + \alpha(b - Ax_{n-1} + M_{n}) \nonumber \\
    & +\eta(I_d-\alpha\beta A)(x_{n-1} - x_{n-2}) \\
    \label{sgd-m-2}
    = {} & x_{n-1} + \alpha(b - A(x_{n-1}+\eta \beta (x_{n-1} -x_{n-2})) + M_{n})\nonumber\\ 
    & + \eta (x_{n-1} - x_{n-2})
\end{align}
%
with $x_{-1}= x_0$. Here, $I_d$ denotes the $d\times d$ identity matrix, and $M_{n} \in \mathbb{R}^d$ is noise. Henceforth, we will refer to the above generic algorithm as Linear Stochastic Approximation with Momentum (LSA-M).
Note that LSA-M is equivalent to SGD (if $\eta = 0$ in \eqref{sgd-m}), to SHB (if $\beta = 0$ in \eqref{sgd-m}), and to ASG (if $\beta=1$ in \eqref{sgd-m-2}).

%
 

We make the following assumption on the driving matrix.
\begin{assumption}[\textbf{Driving matrix property}]
\label{A1}
$A$ is real symmetric and all its eigenvalues are positive.
\end{assumption}
We also denote the eigenvalues of $A$ by $\lambda_{\max} = \lambda_1 \geq \lambda_{2} \geq \ldots \geq \lambda_{d} = \lambda_{\min}$.
Under the above assumption, one would expect the iterates in \eqref{sgd-m} to go to a neighborhood of $x^* := A^{-1} b.$

We next state two assumptions on the noise sequence $(M_n)$, the first is used in Theorem~\ref{Lower_bound}, while the other is used in Proposition~\ref{Theorem_OTS_mom} and Corollary~\ref{Upper_bound_cor}. The notation $A \succeq B$ means $A-B$ is positive semi-definite.

\begin{assumption}[\textbf{Noise attributes for Theorem~\ref{Lower_bound}}]
\label{A2}
$(M_{n})$ is a martingale difference sequence with respect to the filtration $(\mathcal{F}_{n})$, where $\mathcal{F}_{n} = \sigma(x_{m},M_{m};m\leq n)$. Further, $\exists K > 0$ such that $\mathbb{E}[M_{n+1} M_{n+1}^T|\mathcal{F}_{n}]\succeq K I_d$ a.s. $\forall n \geq 0.$
\end{assumption}
\begin{assumption}[\textbf{Noise attributes for Proposition~\ref{Theorem_OTS_mom}}]
\label{A3}
$(M_n)$ is a martingale difference sequence with respect to the filtration $(\mathcal{F}_{n})$, where $\mathcal{F}_{n} = \sigma(x_{m},M_{m};m\leq n)$. Further, $\exists K \geq 0$ such that  $\mathbb{E}[\|M_{n+1}\|^{2}|\mathcal{F}_{n}] \leq K(1+\|x_{n}-x^{*}\|^2)$ a.s. $\forall n \geq 0.$
\end{assumption}
Assumptions~\ref{A2} and \ref{A3} are standard \citep{mandt, jas, cheng, Borkar_Book}. The first of these holds if and only if all the eigenvalues of $\E[M_{n + 1} M_{n + 1}^T |\cF_n]$ are bounded from below by $K,$ i.e., noise is persistent (or non-vanishing) in all directions. On the other hand, Assumption~\ref{A3} requires that the trace of $\E[M_{n + 1} M_{n + 1}^T |\cF_n]$ be bounded from above. This bound can scale with $\|x_n - x^*\|$ and need not vanish near $x^*.$

Next, we define sample complexity to quantify the effort required by LSA-M to obtain an $\epsilon$-close solution to $x^*.$

\begin{definition}[\textbf{Sample Complexity}]
The sample complexity of \eqref{sgd-m} is the minimum number of iterations $n_0$ such that the expected error $\mathbb{E}[\|x_n-x^*\|^2] \leq \epsilon$,  $ \forall n \geq n_0$. 
%
\end{definition}
\begin{table*}[t]
\begin{center}
\begin{tabular}{ |c|c|c|c| }
\hline
 Method &$\beta$ & $\eta$ & $\alpha$ \\ \hline
\multirow{3}{*}{SGD}
 & & & \\
 & - & 0 & $\min\Big(\frac{\lambda_{\min}}{\frac{3}{4}\lambda_{\min}^{2} + C^{2}K}, \frac{\epsilon\lambda_{\min}}{4C^2K}, \frac{2}{\lambda_{\max}+\lambda_{\min}} \Big) $ \\
 & & & \\

 \hline
\multirow{1}{*}{SHB}
 & 0 & $ \left(1-\frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2} $ &$ \min\Big((\frac{\lambda_{\min}^{3/2}}{\frac{3}{8}\lambda_{\min}^{2} + 25C^2K})^{2}, (\frac{\epsilon(\lambda_{\min})^{3/2}}{200 C^2 K})^2,$ \\ 
 & &  &  $(\frac{2}{\sqrt{\lambda_{\min}}+\sqrt{\lambda_{\max}}})^2\Big) $ \\
 & & & \\
  
 \hline
 \multirow{1}{*}{ASG}
 & 1 & $\frac{ \left(1-\frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2}}{(1-\alpha\lambda_{\min})}$ & $\min\Big((\frac{\lambda_{\min}^{3/2}}{\frac{3}{8}\lambda_{\min}^{2} + 25C^2K})^{2}, (\frac{\epsilon(\lambda_{\min})^{3/2}}{200 C^2 K})^2,\frac{1}{\lambda_{\max}}\Big)$\\
 & &  &   \\
  
 \hline
\end{tabular}
\\
\caption{Parameter choices for Proposition
\ref{Theorem_OTS_mom}. Here $C=1$ when the matrix $A$ is symmetric and $C = \frac{\sqrt{d}}{\sigma_{\min}(S)\sigma_{\min}(S^{-1})}$ when $A$ is not symmetric, where $\sigma_{\min}(\cdot)$ denotes the smallest singular value and $S$ is the matrix that diagonalizes $A$, i.e., $S^{-1}AS = D$, a diagonal matrix. {When $A$ is symmetric, indeed the three parameter choices correspond to SGD, SHB and ASG.} We stick to the same naming convention even when the driving matrix $A$ is not symmetric.}
\label{table}
\end{center}
\end{table*}

To enable easy comparison between different algorithms, we shall look at the order of their sample complexities. Towards that, we shall use the notation $n_0 \in \Theta(t)$ to imply that there exist constants $c_1$ and $c_2$ (independent of $t$) such that $c_1 t \leq n_0 \leq c_2 t $. 
The notation $\tilde{\Theta}(t)$ has a similar meaning but hides the dependence on logarithmic terms.
{Further,  $n_0 \in \Omega(t)$ implies there exists $c_1$ such that $n_0 \geq c_1 t$ and $n_0 \in \mathcal{O}(t)$ implies there exists $c_2$ such that $n_0 \leq c_2 t$.}

\begin{theorem}
    \label{Lower_bound}
    \textbf{(Lower bound on sample complexity).} Consider the LSA-M update rule \eqref{sgd-m}, and suppose Assumptions \ref{A1} and \ref{A2} hold. Then there exists an $\epsilon' > 0$ such that, for any $\epsilon \in (0,\epsilon')$
    and for any choice of $\alpha > 0,$ $\beta \in [0,1],$ and $\eta \in [0,1]$, the expected error $\mathbb{E}[\|x_{n_{0}} - x^{*}\|^2]\geq\epsilon$ for $n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_{\min}^2}\right).$ The constant $K$ here is the one from Assumption~\ref{A2}.
    % \rddelete{Thus, $n_0$ is a lower bound on the sample complexity of LSA-M.}
\end{theorem}
See Section~\ref{sec:proof_lower_bound} for the proof of the above Theorem.
\begin{remark}
    \label{R1}
    As stated below \eqref{sgd-m-2}, LSA-M includes SHB and Nesterov's ASG method as special cases and, hence, the above result directly applies to them. In fact, this is the first lower bound on SHB and ASG's sample complexities in quadratic optimization.
\end{remark}

\begin{remark}
\label{R3}
    The lower bounds in \cite{rahul} and \cite{liu} are obtained by viewing the expected error in SHB and ASG iterates for least squares as update rules of the form $z_{n + 1} = P z_n$ for some matrix $P$ (see \cite[Appendix~A, p.~16]{rahul} and \cite[Appendix~C, p.~12]{liu}). In particular, they obtain bounds on the eigenvalues of $P$ to get the desired claim. In contrast, the error relations for SHB and ASG methods in our setup (quadratic optimization with persistent noise) have the form $z_{n + 1} = P z_n + \alpha W_n$ for some matrix $P$ and vector $W_n$ (cf.~\eqref{recursion}). This forces us to develop a new proof technique that jointly looks at both these terms and shows that at least one of them remains larger than $\epsilon$ for the choice of $n_0$ given in Theorem~\ref{Lower_bound}.
\end{remark}
%

We next state our upper bound on the sample complexity of \eqref{sgd-m} in Proposition~\ref{Theorem_OTS_mom} and Corollary~\ref{Upper_bound_cor}. Similar bounds already exist in the literature when $A$ is assumed to be symmetric and the noise is assumed to be iid with variance bounded by a constant \citep{zhu2,nqm}. Here, we show that a similar upper bound holds under more general settings: i.) $A$ is not symmetric but is diagonalizable and has real positive eigenvalues, and ii.) the noise is a martingale difference sequence satisfying Assumption~\ref{A3}.

\begin{proposition}
\label{Theorem_OTS_mom} Consider the LSA-M update rule \eqref{sgd-m}, and suppose $A$ is a (not necessarily symmetric) real diagonalizable matrix with real positive eigenvalues\footnote{When $A$ is not symmetric, LSA-M \emph{cannot} be viewed as a gradient-based algorithm for minimizing \eqref{e:obj.fn}. However, the update rule still makes sense, and it can be seen as one that is useful for solving $Ax = b.$}. Further, suppose Assumption~\ref{A3} holds.
Then, $\forall \epsilon > 0$, there exists a choice of
$\alpha$, $\beta$ and $\eta$ (see Table~\ref{table} for exact values) such that the expected error $\mathbb{E}[\| x_{n}-x^*\|^2] \leq \epsilon$, $\forall n > n_{0}$, where
%
\begin{enumerate}
    \item[(i)] $n_{0} \in \Tilde{\Theta}(\frac{1}{{\alpha\lambda_{\min}}}),$ when $\eta = 0,$ and 
    \item[(ii)] $n_{0} \in \Tilde{\Theta}(\frac{1}{\sqrt{\alpha\lambda_{\min}}}),$ when $\eta > 0$.
\end{enumerate}
\end{proposition}

For the proof see Appendix~\ref{app-sec:appendix_proof_upper_bound}.

From Table \ref{table}, we see that $\alpha$ is a minimum of three terms in each case. The first term arises due to the unbounded noise (Assumption \ref{A3}), the second due to the target neighborhood $\epsilon$  and the third from the optimal choice of stepsize in the deterministic (no noise scenario) case. Since the bound on $n_0$ provided in Proposition \ref{Theorem_OTS_mom} is in terms of $\alpha$, the minimum of the three terms dictates the sample complexity. Note that $\epsilon$ only influences the middle term in all the choices of $\alpha$ given in Table~\ref{table}. 

Let $\bar{\epsilon}> 0$ be such that, for any $\epsilon \in (0, \bar{\epsilon}),$ the value of $\alpha$ equals the middle term in each of the three cases in Table~\ref{table}. Then the following result is immediate. 

%
\begin{corollary}[\textbf{Upper bound on sample complexity}]
    \label{Upper_bound_cor}
    Consider the LSA-M update rule \eqref{sgd-m}, and suppose $A$ is as in Proposition~\ref{Theorem_OTS_mom}. Further, suppose Assumption \ref{A3} holds. Then, for choice of parameters in Table~\ref{table}, and any $\epsilon \in (0, \bar{\epsilon}),$ $\exists n_0 \in \Tilde{\Theta}\left(\frac{K}{\epsilon\lambda_{\min}^2}\right)$ such that $\mathbb{E}[\| x_{n}-x^*\|^2] \leq \epsilon$, $\forall n \geq n_{0}$. The constant $K$ here is the one from Assumption~\ref{A3}.
\end{corollary}

\begin{remark}
    \label{R4}
    From Corollary \ref{Upper_bound_cor}, we see that the upper bounds on the sample complexities of SGD, SHB, and ASG match the lower bound given in Theorem \ref{Lower_bound} for small enough $\epsilon>0.$ In particular, since an upper bound on the sample complexity of SGD matches a lower bound for SHB and ASG, these latter methods do not always outperform SGD from a sample complexity perspective.
\end{remark}


\begin{remark}
\label{R5_new}
    Consider $\epsilon$  small enough such that the minimum in choice of $\alpha$ is achieved by the second term in Table \ref{table}. For SGD, the stepsize $\alpha \in \Theta(\frac{\epsilon\lambda_{\min}}{K})$ is larger than the choice of stepsize for SHB and ASG, $\alpha \in \Theta(\frac{\epsilon^2\lambda_{\min}^3}{K^2})$. Observe that SGD chooses a larger stepsize than SHB and ASG to reach the $\epsilon$ ball. Therefore, although momentum methods appear to have a better performance than SGD if the same stepsize is chosen, SGD can match this performance by re-scaling its stepsize (see Figure \ref{figure}).
\end{remark}

\begin{remark}
    \label{R5}
    When the noise is assumed to be bounded by a constant, i.e., $\mathbb{E}[\|M_{n+1}\|^{2}|\mathcal{F}_{n}] \leq K$ a.s. in Assumption~\ref{A3}, the first term in the choice of $\alpha$ in Table~\ref{table} does not appear for all three methods. Under such an assumption, if $\epsilon$ is large enough or $K$ is small enough such that the third term in the choice of $\alpha$ is the minimum, then the sample complexity of both SHB and ASG is better than SGD. We emphasize that such improvements are lost when the noise variance is large or the neighbourhood under consideration is small.
\end{remark}

\section{Proof of the Lower Bound (Theorem \ref{Lower_bound})}\label{sec:proof_lower_bound}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
We begin by defining the transformed iterates $\Tilde{x}_{n} = x_{n} - x^*$ and rewriting \eqref{sgd-m} as
%
\begin{equation}
\label{recursion}
    \Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n},
\end{equation}
where
\(\Tilde{X}_{n} \triangleq
\begin{pmatrix}
    \Tilde{x}_{n} \\
    \Tilde{x}_{n-1}      
\end{pmatrix},
W_{n} \triangleq
\begin{pmatrix}
    M_{n}\\
    0
\end{pmatrix}\)
and
\[
P \triangleq
\begin{pmatrix}
    I_{d} - \alpha A + \eta(I_{d}-\alpha\beta A) & -\eta(I_{d}-\alpha\beta A)   \\
    I_{d} & 0      
\end{pmatrix}. 
\]
%
We derive the bound in Theorem~\ref{Lower_bound} by obtaining a lower bound for the error expression $\E[\|\tilde{X}_{n}\|^2]$.

The proof can be summarized by the following key steps.
%
\begin{enumerate}
    \item[{1.}] Transform $\tilde{X}_n$ to obtain $\tilde{Y}_n$ (see \eqref{e:tildeYn_tildeWn.Defn}). Decompose the $2d$-dimensional update rule for $\tilde{Y}_n$ (see \eqref{eq:y_tilde}) into $d$ separate two-dimensional update rules (see \eqref{eq:y_tilde_i}) using a block diagonalization argument.
    
    
    % Decompose the original 2d-dimensional update rule for $\tilde{X}_n$ in \eqref{recursion} into $d$ separate two-dimensional update rules for $\tilde{Y}_n$ in \eqref{eq:y_tilde_i} using a block diagonalization argument. 
    
    \item[{2.}] For each of the two-dimensional components of $\tilde{Y}_n$ (denoted $\tilde{Y}_n^{(i)},$ $i = 1, \ldots, d),$ obtain a lower bound on the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$. We do this using the following three steps. 
    %
    \begin{enumerate}
        \item[{(a)}] Decompose the error into two components: one that captures the impact of the initialization (\emph{bias}), and the other that concerns the effect of the cumulative noise (\emph{variance}); {see Lemma~\ref{bias-variance-lemma}}. 
        
        \item[{(b)}] Use the above decomposition to derive a lower bound on $\bE \|\tilde{Y}_n^{(i)}\|^2$ for the special case of $\beta = 0$. The core idea is to show that the \emph{bias} and the \emph{variance} in $\tilde{Y}^{(i)}_n$ cannot be simultaneously small;  {see Lemma~\ref{lower_main_lemma}}.
       
        \item[{(c)}] Generalize the result to $\beta \in [0, 1]$ case by showing that it can be reduced to the former case.     
    \end{enumerate}
    \item[{3.}] Use the lower bound on $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$ from Step 2 to obtain a lower bound on the original error $\mathbb{E}\| \tilde{x}_n\|^2.$ This proves  the desired result for SHB with $\beta = 0$ and ASG with $\beta = 1.$   
\end{enumerate}
Next we describe the technical results involved in each of the above steps.
\begin{enumerate}
\item[\textbf{1.}]
\textbf{Reducing the $2d$-dimensional updates into $d$ separate two-dimensional updates.}

We follow a block diagonalization argument as in \citep{MJ} to transform the update rule \eqref{recursion} below.

\begin{lemma}
    \label{lemma_step_1}
    There exists a transformation matrix $Z$ and a block diagonal matrix $B = \diag(B_i),$ where $B_i \in \R^{2 \times 2},$ so that
    %
    \begin{equation}
    \label{e:tildeYn_tildeWn.Defn}
        \tilde{Y}_n = Z \tilde{X}_n \qquad \text{and} \qquad \tilde{W}_n = Z W_n
    \end{equation} 
    %
    satisfy
    %
    \begin{align}
    \label{eq:y_tilde}
        \tilde{Y}_n = B \tilde{Y}_{n-1} + \alpha \tilde{W}_{n}.
    \end{align}
    %
    In particular, if we break $\tilde{Y}_n$ into $d$ disjoint components of $2$-dimensional vectors, then the $i$-th component
    %
    \begin{align}
    \label{eq:y_tilde_i}
        \tilde{Y}_{n}^{(i)} \!\!=\!\! \begin{pmatrix}
        1  - \alpha \lambda_{i} + \eta' & -\eta'\\
        1 & 0
    \end{pmatrix} \!\Tilde{Y}_{n-1}^{(i)} \! + \! \alpha \tilde{W}_n^{(i)}
    \end{align}
    %
    where $\eta' = \eta(1-\alpha\lambda_i\beta)$.
    %
\end{lemma}

See Section~\ref{d_reduction} for the proof. Notice that the driving matrix $B$ in the transformed update rule \eqref{eq:y_tilde} is a block diagonal matrix unlike the driving matrix $P$ in \eqref{recursion}. In the next step we exploit this structure to lower bound $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$.

    \item[\textbf{2.}] \textbf{Bounding the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$.}
    
    We consider the two dimensional decoupled update given in \eqref{eq:y_tilde_i} for a specific $i$ and express the lower bound on the sample complexity with respect to $\lambda_i$. 
    
    \item[\textbf{(a)}] \textbf{Decompose the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$ as a sum of bias and variance.} 

    First observe that the update from Lemma \ref{lemma_step_1} can be re-written as
\begin{gather}
\label{decomposed_recursion}
    \Tilde{Y}_{n}^{(i)} = B_i^n \Tilde{Y}_{0}^{(i)} + \alpha \sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}_{j+1}^{(i)}.
\end{gather}
Taking the square of the norm on both sides of the above equation we get
\begin{equation}
\label{norm-sq}
    \begin{split}
        &\|\Tilde{Y}_{n}^{(i)}\|^{2} = \underbrace{\|B_i^{n}\Tilde{Y}_{0}^{(i)}\|^{2}}_{I} \\
        & + \underbrace{2\alpha \left(B_i^{n}\Tilde{Y}_{0}^{(i)}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}^{(i)}_{j+1}\right)}_{II}\\
        & + \underbrace{\alpha^{2} \left(\sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}^{(i)}_{j+1}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{n-1-j}\tilde{W}^{(i)}_{j+1}\right)}_{III}.\\
    \end{split}
\end{equation}
Using the fact that $(\tilde{W}_n) = (Z{W}_n)$ is a martingale difference sequence, it can be shown that expectation of term $II$ is 0 and that of term $III$ is $\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n-1} \|B_{i}^{n-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]$ (See Section \ref{proof_lemma_bias-variance-lemma} for details). This leads to the following lemma.
\begin{lemma}
    \label{bias-variance-lemma}
    For the update in \eqref{decomposed_recursion} the error can be decomposed as follows:
    \begin{align}
    \label{eq:bias_variance_y_tilde}
        \mathbb{E}\| \tilde{Y}_n^{(i)}\|^2 \nonumber
        &= \underbrace{\| B_{i}^n  \Tilde{Y}_0^{(i)} \|^2}_{Bias} \\
        & \qquad + \underbrace{\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n-1} \|B_{i}^{n-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]}_{Variance}.
        %&\geq \| P^n  \Tilde{X}_{0} \|^2 + \alpha^2  K \sum_{j=0}^{n-1} \|P^{j} e_1 \|^2,
    \end{align}
\end{lemma}
See Section~\ref{proof_lemma_bias-variance-lemma} for the proof.
The \emph{bias} and \emph{variance} here correspond to that of the $i$-th block of the transformed iterates in \eqref{eq:y_tilde_i}.

\item [\textbf{(b)}] \textbf{Bounding the error $\mathbb{E}\| \tilde{Y}_n^{(i)}\|^2$ for $\boldsymbol{\beta=0}$.}

Using the fact that $\eta' = \eta$ when $\beta = 0$, the update in \eqref{eq:y_tilde_i} reduces to
\begin{align*}
    \tilde{Y}_{n}^{(i)} = 
     \begin{pmatrix}
         1  - \alpha \lambda_{i} + \eta & -\eta\\
         1 & 0
    \end{pmatrix} \Tilde{Y}_{n-1}^{(i)}+ 
    \alpha 
    \tilde{W}_n^{(i)}.
\end{align*}
We show that there exists an $\epsilon > 0$ such that for some $n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_i^2}\right)$, either the \emph{bias} or the \emph{variance} is larger than $\epsilon$. This is established in the following key lemma.

\begin{lemma}
\label{lower_main_lemma}
Let $\epsilon'_i = \min\left(\frac{K}{32 \lambda_i^2},\frac{(\tilde{x}_0^{(i)})^2}{72}\right)$. Then for any $\epsilon \in (0,\epsilon'_i)$,
 and any $\alpha > 0,$ $\beta = 0$, $\eta \in [0,1],$ there exists 
\( n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_i^2}\right),\) such that at least one of the following statements holds:
%
\begin{enumerate}
    \item $\| B_{i}^{n_0}  \tilde{Y}_0^{(i)} \|^2 > \epsilon$

    \item $\displaystyle\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]  > \epsilon.$
\end{enumerate}
%
\end{lemma}

See Section~\ref{proof_lower_main_lemma} for the proof. Lemma \ref{lower_main_lemma} along with Lemma \ref{bias-variance-lemma} immediately provides a lower bound on the error, i.e., $\E\|\tilde{Y}_{n_0}^{(i)}\|^2 > \epsilon$ for $\beta=0$. 
Lemma \ref{lower_main_lemma} is the core of the lower bound analysis and the proof is provided in Section \ref{proof_lower_main_lemma}. 


\item[\textbf{(c)}] \textbf{Extending (b) to the case $\boldsymbol{\beta \in (0,1]}$.}

We complete Step 2 by extending Lemma~\ref{lower_main_lemma} to the case when $\beta\in[0,1]$ as formalized below.
\begin{lemma}
\label{lower_main_lemma_general_beta}
Let $\epsilon'_i$ be defined as in Lemma~\ref{lower_main_lemma}. Then for any $\epsilon \in (0,\epsilon'_i)$,
 and any $\alpha > 0,$ $\beta \in [0,1]$, $\eta \in [0,1],$ there exists 
\( n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_i^2}\right),\) such that at least one of the following statements holds:
%
\begin{enumerate}
    \item $\| B_{i}^{n_0}  \tilde{Y}_0^{(i)} \|^2 > \epsilon$

    \item $\displaystyle\alpha^2 \mathbb{E}\Big[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \Big]  > \epsilon.$
\end{enumerate}
%
\end{lemma} 
See Section~\ref{lemma_b_general} for the proof. Note that the general $\beta \in [0,1]$ update rule in \eqref{eq:y_tilde_i} is equivalent to the $\beta=0$ update with $\eta$ redefined as $\eta'$ and therefore we can re-use Lemma~\ref{lower_main_lemma} if we can ensure $\eta' \in [0,1]$. We show this holds when $\alpha\lambda_i\leq1$. For the case $\alpha\lambda_i > 1$, we show that the \emph{variance} term is greater than $\epsilon$ thus implying the conclusion of Lemma~\ref{lower_main_lemma}.

\item[\textbf{3.}]\textbf{Bounding the original error $\mathbb{E}[\|\tilde{X}_{n}\|^2]$}.

{Recall that the original update rule is given by}
\begin{align*}
    \Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n}.
\end{align*}
To provide a bound on the error $\mathbb{E}[\|\tilde{X}_{n}\|^2] $, we invoke Lemma~\ref{lower_main_lemma_general_beta} together with Lemma~\ref{bias-variance-lemma} for $i = d$ and $\lambda_{d} = \lambda_{\min}$ and use the fact that $Z$ is an orthogonal matrix. We have
\begin{align*}
   \mathbb{E}[\|\tilde{X}_{n_0}\|^2] &= \E[\|Z^{-1}\tilde{Y}_{n_0}\|^2]\nonumber\\
   &= \mathbb{E}[\|\tilde{Y}_{n_0}\|^2]\geq \mathbb{E}[\|\tilde{Y}_{n_0}^{(d)}\|^2]\geq \epsilon
\end{align*}
for all $\epsilon \in (0,\epsilon'_d)$ and for
{$n_0$ as defined in Lemma~\ref{lower_main_lemma} with $\lambda_i$ substituted with $\lambda_{\min}$}.

Now to obtain a bound for $\mathbb{E} \|\tilde{x}_n\|^2$ from $\mathbb{E} \|\tilde{X}_n\|^2$, we note that 
\begin{align*}
    2\max{(\|\tilde{x}_n\|^2,\|\tilde{x}_{n-1}\|^2)} &\geq \|\tilde{x}_n\|^2 + \|\tilde{x}_{n-1}\|^2 \\
    &= \|\tilde{X}_{n}\|^2.
\end{align*}
Therefore the lower bound on $\E[\|\tilde{X}_{n}\|^2]$ is enough to prove Theorem \ref{Lower_bound}. Choosing $\epsilon' = \epsilon_d'$ and noting that $n_0 \in \tilde{\Theta}\left(\frac{K}{\epsilon\lambda_{\min}^2}\right)$ completes the proof of Theorem~\ref{Lower_bound}.
 \end{enumerate}

\subsection{Proof of Lemma \ref{lemma_step_1}}
\label{d_reduction}
%
We first discuss how the update rule for $\tilde{Y}_n$ in \eqref{eq:y_tilde} can be obtained using that of $\tilde{X}_n$ in \eqref{recursion}. Towards this, we define
$D = \text{diag}(\lambda_i)_{i=1}^d.$ 
% \begin{align*}
%     B &:= \begin{pmatrix}
%         B_1 & 0 &\ldots & 0\\
%         0 & B_2 & \ddots & \vdots\\
%         \vdots & \ddots & \ddots & 0 \\
%         0 & \ldots & 0 & B_d
%     \end{pmatrix} \\
% \end{align*}
% where $B_{i} = 
%     \begin{pmatrix}
%         1 + \eta - \alpha \lambda_{i} & -\eta\\
%         1 & 0
%     \end{pmatrix}.$
Since $A$ is real symmetric (see Assumption~\ref{A1}), it has a spectral decomposition of the form $A = S^{-1} D S$ with $S$ orthogonal, i.e., $S A S^{-1} = D$. We define the transformation matrix $Z$ as
\begin{align}
    \label{Z_tranform}
    Z=E_{2d\times 2d}
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix}
\end{align}
where $E_{2d\times 2d}$ is the permutation matrix that changes the order $(1, 2, \ldots , 2d)$ into $ (1, d+1, 2, d+2, \ldots , d, 2d)$.

Since $\Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n},$ we get
\begin{align*}
    \tilde{Y}_{n} &=Z\Tilde{X}_{n} = Z P \Tilde{X}_{n-1} + \alpha Z W_{n} \\
    &= Z P Z^{-1}\Tilde{Y}_{n-1}+ \alpha Z W_{n}
    = B \Tilde{Y}_{n-1}+ \alpha Z W_{n} \\
    %
    & = B \Tilde{Y}_{n-1}+ \alpha \tilde{W}_{n},
\end{align*}
%
as desired. The last but one equality follows because $ZPZ^{-1}=B,$ which itself holds since
%
\begin{gather*}
    Z P Z^{-1} = E_{2d \times 2d} \begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} P 
\begin{pmatrix}
    S^{-1} & 0 \\
    0 & S^{-1}
\end{pmatrix} E_{2d \times 2d}^{-1}\\
\stackrel{(a)}{=}  E_{2d \times 2d} \underbrace{\begin{pmatrix}
        I_{d \times d}-\alpha D +\eta I_{d \times d} & -\eta I_{d \times d} \\
        I_{d \times d} & 0_{d \times d}
    \end{pmatrix}}_{\Gamma} E_{2d \times 2d}^{-1} \\
    \stackrel{(b)}{=}  B.
\end{gather*}
Here $(a)$ follows from the spectral decomposition of $A$; note also that $E_{2d \times 2d}^{-1} = E_{2d \times 2d}^{T}$ since $E_{2d \times 2d}$ is a permutation matrix.
Further $(b)$ follows because the left multiplication of $E_{2d \times 2d}$ to $\Gamma$ changes the order of rows from $(1, 2, \ldots , 2d)$ to $ (1, d+1, 2, d+2, \ldots , d, 2d)$ and the right multiplication of $E_{2d \times 2d}^{-1} = E_{2d \times 2d}^{T}$ changes the order of columns from $(1, 2, \ldots , 2d)$ to $ (1, d+1, 2, d+2, \ldots , d, 2d)$ which exactly results in $B$. 

% Therefore we have
% \begin{align*}
%     \tilde{Y}_n = B\tilde{Y}_{n-1} + \alpha \tilde{W}_n, 
% \end{align*}
% where $\tilde{W}_n = Z {W}_n$, which completes the proof.

To see why \eqref{eq:y_tilde_i} holds, let

$\tilde{Y}_n = \begin{pmatrix}
    \tilde{Y}_n^{(1)} \\
    \tilde{Y}_n^{(2)} \\
    \vdots \\
    \tilde{Y}_n^{(d)} \\
\end{pmatrix}$ and 
$\tilde{M}_{n} =\begin{pmatrix}
    \tilde{M}_{n,1} \\
    \tilde{M}_{n,2} \\
    \vdots \\
    \tilde{M}_{n,d} \\
\end{pmatrix} = S M_{n}$ ,
where $\tilde{Y}_{n} \in \R^{2d},\tilde{Y}_n^{(i)} \in \mathbb{R}^2$, $\tilde{M}_{n} \in \R^{d}, \tilde{M}_{n,i} \in \mathbb{R}$. Now notice that 
\begin{align}
\label{eq:ZWn}
    ZW_n &= E_{2d\times 2d}
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} \begin{pmatrix}
    M_n\\
    0
\end{pmatrix}\\
\nonumber
&= E_{2d\times 2d} \begin{pmatrix}
    \tilde{M}_n\\
    0
\end{pmatrix} = \begin{pmatrix}
    \tilde{M}_{n,1} \\
    0\\
    \tilde{M}_{n,2} \\
    0\\
    \vdots \\
    \tilde{M}_{n,d} \\
    0\\
\end{pmatrix},
\end{align}
where the last equality follows because the left multiplication of $E_{2d\times 2d}$ changes the order of rows from $(1, 2, \ldots , 2d)$ to $ (1, d+1, 2, d+2, \ldots , d, 2d)$. Therefore,  $\forall i \in [d],$
$$\tilde{Y}_{n}^{(i)} = B_i \Tilde{Y}_{n-1}^{(i)}+ \alpha \tilde{W}_n^{(i)}$$
where $\tilde{W}_n^{(i)} = \begin{pmatrix}
    \tilde{M}_{n,i} \\
    0
\end{pmatrix}$.
\subsection{Proof of Lemma \ref{bias-variance-lemma}}
\label{proof_lemma_bias-variance-lemma}
Recall the error expression from \eqref{norm-sq}: 
%
\begin{align*}
        &\|\Tilde{Y}_{n}^{(i)}\|^{2} = \underbrace{\|B_i^{n}\Tilde{Y}_{0}^{(i)}\|^{2}}_{I} \\
        & + \underbrace{2\alpha \left(B_i^{n}\Tilde{Y}_{0}^{(i)}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)}_{II}\\
        & + \underbrace{\alpha^{2} \left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)}_{III}.\\
\end{align*}
%
Since $\tilde{W}_n = Z{W}_n,$  it follows that $(\tilde{W}_n)$ is also a martingale difference sequence w.r.t. the filtration $(\mathcal{F}_n),$ where $\mathcal{F}_n$ is as in Assumption~\ref{A2}. In particular, since $\mathbb{E}[\tilde{W}_{n}^{(i)}] = 0$ for each $n,$ we get that the expectation of Term $II$ is $0.$ With regards to Term $III,$ we have
%
\begin{align*}
    &\alpha^{2} \left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right)^{T}\left(\sum_{j=0}^{n-1}B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1}\right) \\
    &= \alpha^2 \sum_{j,k} (\tilde{W}^{(i)}_{j+1})^T (B_i^{(n-1-j)})^{T} B_i^{(n-1-k)}\tilde{W}^{(i)}_{k+1}\\
    &= \underbrace{\alpha^2 \sum_{j \neq k} (\tilde{W}^{(i)}_{j+1})^T (B_i^{(n-1-j)})^{T} B_i^{(n-1-k)}\tilde{W}^{(i)}_{k+1}}_{III(a)}\\
    &\qquad \qquad + \underbrace{\alpha^2\sum_{j}\| B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1} \|^2}_{III(b)}
\end{align*}
 We now show that the expectation of $III(a)$ is 0. Without loss of generality, suppose $j<k$. Then,
\begin{equation*}
    \begin{split}
        & \mathbb{E}\left[(\tilde{W}^{(i)}_{j+1})^{T}(B_i^{(n-1-j)})^T B_i^{(n-1-k)} \tilde{W}^{(i)}_{k+1}\right]\\
        & =\mathbb{E}\left[\mathbb{E}\left[(\tilde{W}^{(i)}_{j+1})^{T}(B_i^{(n-1-j)})^T B_i^{(n-1-k)} \tilde{W}^{(i)}_{k+1}\vert\mathcal{F}_{k}\right]\right]\\
        & = \mathbb{E}\left[(\tilde{W}^{(i)}_{j+1})^{T}(B_i^{(n-1-j)})^{T}B_i^{(n-1-k)}\mathbb{E}[\tilde{W}^{(i)}_{k+1}|\mathcal{F}_{k}]\right]
        = 0.
    \end{split}
\end{equation*}

Therefore, taking expectation on both sides of \eqref{norm-sq} gives
\begin{align}
    \label{bias_variance}
    \mathbb{E}\| \tilde{Y}_n^{(i)}\|^2 \nonumber
    &= \underbrace{\| B_i^n  \Tilde{Y}_{0}^{(i)} \|^2}_{I} \\
    &+  \mathbb{E}\Big[ \underbrace{\alpha^2\sum_{j=0}^{n-1} \|B_i^{(n-1-j)}\tilde{W}^{(i)}_{j+1} \|^2}_{III(b)} \Big]
\end{align}

\subsection{Proof of Lemma \ref{lower_main_lemma}}
\label{proof_lower_main_lemma}
This is the key result in the lower bound proof. Here we outline the main steps involved in proving the result. The detailed proofs of all the auxiliary lemmas are deferred to Appendix~\ref{app-App_thm3}.

Before we proceed with the main proof, we provide a lower bound on the \emph{variance} term in the following lemma.
\begin{lemma}
\label{variance_lower_bound}
    Under Assumption \ref{A2} and $n_0$ as in Lemma~\ref{lower_main_lemma}, the variance term in \eqref{eq:bias_variance_y_tilde} can be lower bounded as follows:
    \begin{align*}
        \alpha^2\mathbb{E}\left[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \right] \geq \alpha^2K \sum_{j=0}^{n_0-1}  \|B_{i}^{j}e_1 \|^2
    \end{align*}
    where $e_1 = \begin{pmatrix}
    1\\ 0
\end{pmatrix}$ and $K$ is as in Assumption~\ref{A2}.
\end{lemma}

For convenience we redefine the term in the right hand side of the above inequality as the \emph{variance}. If $\alpha$ and $\eta$ are such that $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 > \epsilon$, then Lemma~\ref{lower_main_lemma} immediately follows for this choice of $\alpha$ and $\eta$. We now consider the case where $\alpha$ and $\eta$ are such that $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$. Now we show that for this choice of $\alpha$ and $\eta$, the \emph{variance} is necessarily greater than $\epsilon$. Let $\mu^{(i)}_{+}$ and $\mu^{(i)}_{-}$ be the eigenvalues of $B_i$. It is easy to check that
\begin{equation}
\begin{aligned}
\label{mu_def}
    \mu_{+}^{(i)} = \frac{1}{2}\left((1 - \alpha\lambda_{i} + \eta) + \Delta^{(i)}\right)\\
    \mu_{-}^{(i)} = \frac{1}{2}\left((1 - \alpha\lambda_{i} + \eta) - \Delta^{(i)}\right)
\end{aligned}
\end{equation}
where $\Delta^{(i)} = \sqrt{(1 - \alpha\lambda_{i} + \eta)^2 - 4\eta}$.

Recall that $\epsilon \in (0,\epsilon_i')$ in Lemma~\ref{lower_main_lemma} and therefore $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$ implies $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 < \epsilon_i'$.
The following lemma provides a lower bound on the \emph{variance} in terms of the eigenvalues of $B_i$ and the momentum parameter $\eta$ assuming the \emph{bias} is less than $\epsilon_i'$.
\begin{lemma}[]
\label{Aux_lemma_1}
    Let $\alpha>0$ and $\eta \in [0,1]$ such that $\|B_{i}^{n_0} \tilde{Y}_0^{(i)} \|^2 < \epsilon'_i.$ Then 
    % we have the following bound, 
    \begin{align*}
        \alpha^2 K \sum_{j=0}^{n_0-1} \|B_i^{j}e_{1}\|^2 \geq \frac{\alpha^2 K}{2(1-\mu_+^2)(1-\mu_-^2)(1-\eta)}.
    \end{align*}
\end{lemma}

It can be shown that $(1-\mu_+^2)(1-\mu_-^2) = \alpha\lambda_{i}$ and therefore the RHS in the above expression reduces to $\frac{\alpha K}{2\lambda_i(1-\eta)}$. We define the following function
\begin{align*}
    Q(\eta;\alpha,\lambda_{i}) \equiv \frac{\alpha K}{2\lambda_{i}(1-\eta)}\frac{1}{(1-\rho(B_i))}
\end{align*}
where $\rho(B_{i}) = |\mu_{+}^{(i)}|$ is the spectral radius of $B_i$. 
Note that $\rho(B_i)$ depends on $\eta$ (see \eqref{mu_def}). 
Now to obtain a further lower bound on the \emph{variance} we optimize over the choice of $\eta$ and show that 
\begin{align*}
    Q(\eta;\alpha,\lambda_{i}) \geq \frac{K}{16 \lambda_{i}^2}
\end{align*}
Combining this with the definition of $Q$ and Lemma \ref{Aux_lemma_1} gives the following bound:
\begin{align*}
    \alpha^2 K \sum_{j=0}^{n_0-1} \Big\|B_i^{j}e_{1}\Big\|^2 \geq \frac{K}{16 \lambda_i^2}(1-\rho(B_i))
\end{align*}
The following lemma proves all these above claims.
\begin{lemma}
\label{Aux_lemma_2}
    Let $\alpha>0$ and $\eta \in [0,1]$ such that $\|B_{i}^{n_0} \tilde{Y}_0^{(i)} \|^2 < \epsilon'_i.$ Then we have the following bound
    $\alpha^2 K \sum_{j=0}^{n_0-1} \|B_i^{j}e_{1}\|^2 \geq \frac{K}{16 \lambda_{i}^2}(1-\rho(B_i)).$
\end{lemma}
Lastly, to show that the \emph{variance} is lower bounded by $\epsilon \in (0,\epsilon_i')$, we need to show that $(1-\rho(B_i)) \geq \frac{16\lambda_{i}^2}{K}\epsilon$. The choice of $n_0$ and the fact that we assumed $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$ ensure exactly that. The following lemma proves this claim.

\begin{lemma}
\label{Aux_lemma_3}
For any $\epsilon \in (0,\epsilon_i')$, if $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\|^2 \leq \epsilon$, then $1-\rho(B_i) \geq \frac{16\lambda_i^2}{K}\epsilon $.
\end{lemma}
This completes the proof of Lemma \ref{lower_main_lemma}.

\subsection{Proof of Lemma~\ref{lower_main_lemma_general_beta}}
\label{lemma_b_general}
We handle the cases $\alpha \lambda_i \leq 1$ and $\alpha \lambda_i > 1$ separately. 

\textbf{Case 1 ($\boldsymbol{\alpha \lambda_i \leq 1}$):} 
Observe that the general $\beta$ update rule in \eqref{eq:y_tilde_i} is equivalent to the $\beta=0$ update with $\eta$ redefined as $\eta'$. Moreover in this case $\eta' \in [0,1]$. To see this first observe that 
\begin{align*}
    \eta' = \eta(1-\alpha\lambda_{i}\beta) \geq \eta (1-\beta) \geq 0.
\end{align*}
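Here the first inequality follows because $\alpha\lambda_i \leq 1$ and $\eta, \beta \geq 0$, and the second inequality follows because $\beta,\eta \in [0,1]$. Moreover, since $\alpha\lambda_i\beta \geq 0$, we also have $\eta' = \eta(1-\alpha\lambda_{i}\beta) \leq \eta \leq 1$.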

Therefore in this case Lemma~\ref{lower_main_lemma} holds with $\eta$ redefined as $\eta'$.

\textbf{Case 2 ($\boldsymbol{\alpha \lambda_i > 1}$):} 
In this case we show that the variance term is greater than $\epsilon$. This follows as shown below
\begin{align*}
 \alpha^2 \E\left[\sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \right] &\stackrel{(A)}{\geq}  \alpha^2 K \sum_{j=0}^{n_0-1}\|B_{i}^{j}e_{1}\|^2  \\
    &\stackrel{(B)}{\geq} \alpha^2 K \stackrel{(C)}{>}  \frac{K}{\lambda_{i}^2}  \stackrel{(D)}{>} \epsilon.
\end{align*}
Here $(A)$ follows from Lemma~\ref{variance_lower_bound}, $(B)$ follows from non-negativity of the norm and lower bounding the sum with the $j=0$ term (which equals $\|e_1\|^2 = 1$), and $(C)$ follows since $\alpha\lambda_i > 1$. Finally, $(D)$ follows because $\epsilon < \epsilon_i' \leq \frac{K}{32\lambda_{i}^2} < \frac{K}{\lambda_{i}^2}$, with $\epsilon_i'$ as defined in Lemma~\ref{lower_main_lemma}. 

\section{Concluding Remarks}
\label{sec:conclusion}
In this work, we analyze the sample complexity of SHB and ASG and provide matching lower and upper bounds up to constants and logarithmic terms.
More importantly, we show that the same sample complexity bound can be obtained by standard SGD. Our work also calls into question some of the recent positive results in favour of SHB and ASG in the stochastic regime. We show that such improvements do not take into account all the terms involved in the error decomposition, or have major flaws. 
We emphasize that our results hold  specifically for SHB and ASG. Other momentum methods could offer provable improvements over SGD \citep{pmlr-v75-jain18a, liu}. 

% Although some other results do question the superiority of SHB and ASG in the stochastic regime, the assumptions and the setting that these works look at do not correspond to those in the positive results. We also emphasize that the negative results either only consider small stepsizes and momentum parameters not close to 1 or provide such results for specific instances in linear regression. In contrast, our work shows that SHB and ASG cannot obtain an improvement in sample complexity (for small neighbourhoods) over SGD for the entire family of quadratic optimization and holds for all stepsizes and momentum parameters in $[0,1]$.  

\section*{Acknowledgements}
SG was supported by the Prime Minister's Research Fellowship (PMRF).
{RD was supported in part by grants from the National Science Foundation (NSF) through awards IIS 21-31335, OAC 21-30835, DBI 20-21898, as well as a C3.ai research award.}
{GT was supported in part by DST-SERB's Core Research Grant CRG/2021/00833, in part by IISc Start-up grants SG/MHRD-19-0054 and SR/MHRD-19-0040, and in part by the “Pratiksha Trust Young
Investigator” award.}
{AB was supported in part by the NSF (DMS-2152577, DMS-2134107)}.


\bibliography{ganesh_669-supp}
\appendix
 \onecolumn
 \newpage
  \begin{center}
    \vspace{50mm}
     \Huge\textbf{Appendix}
 \end{center}
 \vspace{-2mm}
 \par\noindent\rule{\textwidth}{0.1pt}

\section{Comparison with recent works}
\label{App_A}
\subsection[Comparison with]{Comparison with \citet{MJ}}
\label{App_A1}
Claim 1 in \citet[p.~20]{MJ} analyzes the asymptotic covariance of the heavy ball momentum algorithm (with Polyak averaging) and claims a correction term that satisfies:
\[\mbox{Tr}(L_{\eta}) \in \mathcal{O}\left(\eta\frac{\kappa^2(U)}{\lambda_{\min}^{3/2}}\right)\]
where \(\tilde{A} = \begin{pmatrix}
    0 & I_{d}\\
    -\bar{A} & \alpha I_{d}+\eta \bar{A}
\end{pmatrix} = UDU^{-1}\) as in the decomposition of $\tilde{A}$ in Lemma 1 of \cite{MJ} and $\kappa(U) = \|U\|_{op}\|U^{-1}\|_{op}$.



However, in the proof of Claim 1, we are not sure if the following bound holds, since the matrix $\tilde{A}$ is not symmetric:
\[\text{Tr}(\tilde{A}^{-1}\mathbb{E}(\tilde{\Xi}_{A}\Lambda_{\eta}^{*}(\tilde{\Xi}_{A})^{T})(\tilde{A}^{-1})^{T}) \leq (\min_{i}|\lambda_{i}(\tilde{A})|)^{-2}(1 + \eta^{2}) v_{A}^{2}\mathbb{E}_{\pi_{\eta}}\|x_{t} - x^{*}\|_{2}^{2}
\]

Our calculation points towards the following bound:
\[\mbox{Tr}(L_{\eta}) \in \mathcal{O}\left(\eta\frac{\kappa^2(U)}{\lambda_{\min}^{5/2}}\right).\]
We outline the proof for the uni-variate case, when $\bar{A} = \lambda$ for some $\lambda>0$. Then, 
\[\tilde{A} = \begin{pmatrix}
     0 & 1\\
    -\lambda & \alpha +\eta \lambda
\end{pmatrix},
\mbox{ and } 
\tilde{A}^{-1} = \frac{1}{\lambda}\begin{pmatrix}
     \alpha +\eta \lambda & -1\\
    \lambda & 0
\end{pmatrix}.\]
Observe that \(\displaystyle\tilde{A}^{-1}(\tilde{A}^{-1})^T = \frac{1}{\lambda^2}
\begin{pmatrix}
    1 + (\alpha+\eta\lambda)^2 & \lambda(\alpha + \eta\lambda)\\
    \lambda(\alpha + \eta\lambda) & \lambda^2
\end{pmatrix}\)
and therefore $\displaystyle Tr(\tilde{A}^{-1}(\tilde{A}^{-1})^T) = \mathcal{O}\left(\frac{1}{\lambda^2}\right)$.
Using this we have, 
\begin{align}
    \text{Tr}(\tilde{A}^{-1}\mathbb{E}(\tilde{\Xi}_{A}\Lambda_{\eta}^{*}(\tilde{\Xi}_{A})^{T})(\tilde{A}^{-1})^{T}) &\leq \mathcal{O}\left(\frac{1}{\lambda^2}\right)\; \text{Tr}(\mathbb{E}(\tilde{\Xi}_{A}\Lambda_{\eta}^{*}(\tilde{\Xi}_{A})^{T})) \in \mathcal{O}\left(\eta\frac{\kappa^2(U)}{\lambda^{5/2}}\right)
    \label{mou_bound}
\end{align}
The second inequality follows as in \cite{MJ}. Next we analyze the dependence of $\kappa^2(U)$ on $\lambda$. Again for simplicity we consider the uni-variate case where $\bar{A} = \lambda$. Let
\(\displaystyle\tilde{A} = \begin{pmatrix}
     0 & 1\\
    -\lambda & \alpha +\eta \lambda
\end{pmatrix},\) be diagonalizable. Therefore,
\[
    \tilde{A} = U 
    \begin{pmatrix}
        \mu_{+} & 0\\
        0 & \mu_{-}
    \end{pmatrix}
    U^{-1},
\]
where $\mu_{+}$ and $\mu_{-}$ are the eigenvalues of $\tilde{A}$.
Let \(U = \begin{pmatrix}
    u_{1} & u_{2}\\
    u_{3} & u_{4}
\end{pmatrix}\). We therefore have,
\[
    \begin{pmatrix}
     0 & 1\\
    -\lambda & \alpha +\eta \lambda
    \end{pmatrix}
    \begin{pmatrix}
    u_{1} & u_{2}\\
    u_{3} & u_{4}
\end{pmatrix} = 
\begin{pmatrix}
    u_{1} & u_{2}\\
    u_{3} & u_{4}
\end{pmatrix}
\begin{pmatrix}
        \mu_{+} & 0\\
        0 & \mu_{-}
    \end{pmatrix}.
\]
Solving the system of equations, we get:
\[
U = \begin{pmatrix}
        1 & 1\\
        \mu_{+} & \mu_{-}
    \end{pmatrix}
    \mbox{ and }
    U^{-1} = \frac{1}{\mu_{+} - \mu_{-}}
    \begin{pmatrix}
        -\mu_{-} & 1\\
        \mu_{+} & -1
    \end{pmatrix}.
\]
Now, $\mu_{+} - \mu_{-} = \sqrt{(\alpha + \eta\lambda)^2-4\lambda}$. Using the choice of $\alpha = \sqrt{\lambda}$ as in \cite{MJ}, we have:
\[
\mu_{+} - \mu_{-} = \sqrt{ \lambda + \eta^2\lambda^2 + 2\alpha\eta\lambda - 4\lambda}
\]
\[
= \sqrt{\lambda}\sqrt{\eta^2\lambda+2\eta\sqrt{\lambda} - 3}
\]
For $\lambda \ll 1$ (which is the case where the momentum algorithm is claimed to improve the mixing time in \cite{MJ}), \(|\mu_{+} - \mu_{-}| \in \Omega(\sqrt{\lambda})\).
As in proof of Lemma \ref{c_hat_lemma} in Appendix \ref{C2} and using the fact that $\displaystyle\|U\|_{op}\|U^{-1}\|_{op} = \sigma_{\max}(U)\sigma_{\max}(U^{-1})$, we can show that $\kappa^2(U) \leq \mathcal{O}(\frac{1}{\lambda})$. Combining with \eqref{mou_bound}, we have:
\[
\mbox{Tr}(L_{\eta}) \in \mathcal{O}(\eta\lambda^{-7/2}).
\]
A similar analysis can be carried out for the multivariate case to show that 
\[\mbox{Tr}(L_{\eta}) \in \mathcal{O}(\eta\lambda_{\min}(\bar{A})^{-7/2})\]
The correction term in SGD is $\mathcal{O}(\eta\lambda_{\min}(\bar{A})^{-3})$ (See \cite{MJ}, Claim 1). 
The stationary distribution for the momentum algorithm is larger than that of SGD when $\lambda_{\min} \ll 1$. Indeed if we enforce that the two asymptotic covariances are of the same size by choosing the stepsize for momentum iterate $\eta^{m}$ in terms of the stepsize of SGD, i.e.,
\[
\mathcal{O}\left(\eta\frac{1}{\lambda_{\min}(\bar{A})^3}\right) = \mathcal{O}\left(\eta^{m} \frac{1}{\lambda_{\min}(\bar{A})^{7/2}}\right),
\]
then we must choose $\eta^{m} \in \mathcal{O}(\eta\sqrt{\lambda_{\min}(\bar{A})})$. 
In Appendix C.1. of \cite{MJ}, the mixing time of momentum iterate is shown to be $\displaystyle\mathcal{O}\left(\frac{1}{\eta^m\sqrt{\lambda_{\min}(\bar{A})}}\right)$, while the mixing time of SGD is $\displaystyle\mathcal{O}\left(\frac{1}{\eta\lambda_{\min}(\bar{A})}\right)$. When we choose $\eta^{m} \in \mathcal{O}(\sqrt{\lambda_{\min}(\bar{A})}\eta)$, then the mixing time of momentum algorithm turns out to be the same as SGD.  This behaviour is identical to what we observe in Proposition \ref{Theorem_OTS_mom}, where if we choose the same stepsize then there is improvement by a square root factor. 

\subsection{Comparison with SHB, Can et al. (2019)}
\label{App_A2}
For strongly convex quadratic functions of the form:
\[
f(x) = \frac{1}{2}x^TQx + a^Tx + b,
\]
where $x\in \mathbb{R}^d$, $Q\in \mathbb{R}^{d\times d}$ is p.s.d, $a\in \mathbb{R}^d$, $b\in \mathbb{R}$ and $\mu I_d \preceq Q \preceq LI_d$, \cite{zhu2} shows acceleration in Wasserstein distance by a factor of $\displaystyle\sqrt{\kappa} = \sqrt{\frac{L}{\mu}}$. The trace of the asymptotic covariance matrix $X_{HB}$ is given by (See Appendix C.2 of \cite{zhu2}):
\[
\mbox{Tr}(X_{HB}) = \sum_{i=1}^{d}\frac{2\alpha(1+\beta)}{(1-\beta)\lambda_{i}(2+2\beta-\alpha\lambda_{i})},
\]
where, $\alpha$ is the stepsize, $\beta$ is the momentum parameter and $\lambda_{i}$ is the $i^{th}$ eigenvalue of $Q$.
We show that the asymptotic covariance matrix is worse compared to when no momentum is used, i.e., $\beta = 0$ and the optimal stepsize $\alpha = \frac{2}{\mu+L}$ is used. Substituting these values for $\beta$ and $\alpha$ we get:
\begin{align*}
    \mbox{Tr}(X_{\beta=0}) &= \sum_{i=1}^{d}\frac{2\frac{2}{\mu + L}}{\lambda_{i}(2-\frac{2}{\mu + L}\lambda_{i})}\\
    &= \sum_{i=1}^{d}\frac{2\frac{2}{\mu + L}}{\lambda_{i}\frac{2}{\mu+L}(\mu + L -\lambda_{i})}\\
    &= \sum_{i=1}^{d}\frac{2}{\lambda_{i}(\mu + L -\lambda_{i})}
\end{align*}
To compute the size of the stationary distribution with the iterates of heavy ball we set the stepsize $\alpha =\displaystyle \frac{4}{(\sqrt{\mu}+\sqrt{L})^2}$ and momentum parameter $\beta = \left(\frac{\sqrt{L}-\sqrt{\mu}}{\sqrt{L}+\sqrt{\mu}}\right)^2$ and get:
\begin{align*}
    \mbox{Tr}(X_{HB}) &= \sum_{i=1}^{d}\frac{2\displaystyle\left(\frac{4}{(\sqrt{\mu} + \sqrt{L})^2}\right)(1+\beta)}{(1-\beta)\lambda_{i}\left(2+2\left(\frac{\sqrt{L}-\sqrt{\mu}}{\sqrt{L}+\sqrt{\mu}}\right)^2-\frac{4}{(\sqrt{\mu} + \sqrt{L})^2}\lambda_{i}\right)}\\
    &= \sum_{i=1}^{d}\frac{2\left(\displaystyle\frac{4}{(\sqrt{\mu} + \sqrt{L})^2}\right)(1+\beta)}{(1-\beta)\lambda_{i}\left(\displaystyle\frac{2(\sqrt{L}+\sqrt{\mu})^2 + 2(\sqrt{L}-\sqrt{\mu})^2-4\lambda_{i}}{(\sqrt{L}+\sqrt{\mu})^2}\right)}\\
    &= \sum_{i=1}^{d}\frac{2\left(\displaystyle\frac{4}{(\sqrt{\mu} + \sqrt{L})^2}\right)(1+\beta)}{(1-\beta)\lambda_{i}\left(\displaystyle\frac{4}{(\sqrt{\mu} + \sqrt{L})^2}\right)(\mu + L -\lambda_{i})}\\
    &= \sum_{i=1}^{d}\frac{2(1+\beta)}{(1-\beta)\lambda_{i}(\mu + L -\lambda_{i})}\\
    &= \sum_{i=1}^{d}\frac{2}{\lambda_{i}(\mu + L -\lambda_{i})}\left(\frac{1+\left(\frac{\sqrt{L} - \sqrt{\mu}}{\sqrt{L} + \sqrt{\mu}}\right)^2}{1-\left(\frac{\sqrt{L} - \sqrt{\mu}}{\sqrt{L} + \sqrt{\mu}}\right)^2}\right)\\
    &= \sum_{i=1}^{d}\frac{2}{\lambda_{i}(\mu + L -\lambda_{i})}\frac{L+\mu}{2\sqrt{\mu L}}\\
    &= \sum_{i=1}^{d}\frac{2}{\lambda_{i}(\mu + L -\lambda_{i})}\frac{1}{2}\left(\sqrt{\kappa}+\frac{1}{\sqrt{\kappa}}\right)\\
    &\geq \text{Tr}(X_{\beta=0}) \frac{\sqrt{\kappa}}{2}
\end{align*}
The above calculation shows that the size of the asymptotic covariance matrix of SHB is worse by a factor of ${\Theta}(\sqrt{\kappa})$.

\subsection{Comparison with Assran and Rabbat (2020)}

\label{App_A4}

{Let $Q$ be the condition number of the given quadratic optimization problem's driving matrix (denoted by $A$ in our work). The key argument in \citet{Assran} to claim ``ASG converges at an accelerated rate'' is that \textbf{both} its bias and variance are better than that of SGD (see the discussion below Theorem 2 there). Specifically, \citet{Assran} claims that (under optimal stepsize choices)     
    %
    \begin{enumerate}
        \item ASG's bias decays as (roughly) $\left(\frac{\sqrt{Q} - 1}{\sqrt{Q}}\right)^{2k},$ while  SGD's decays at  $O\left(\frac{Q - 1}{Q + 1}\right)^{2k};$

        \item also, ASG's asymptotic variance is $O(Q^{1/2}),$ while that of SGD's is $O(Q).$
    \end{enumerate}
    %
    While the bias computations are correct, there is an order of magnitude error in ASG's variance estimate. Instead of $O(Q^{3/2}),$ they incorrectly deduce it to be $O(Q^{1/2})$.
    
    
    Based on the incorrect variance estimate, \citet{Assran}'s claim essentially is that, compared to SGD, the ASG iterates converge to a `smaller' ball and also does so at a `faster' rate. However, with the correct variance estimate, one can only conclude that ASG's iterates converge faster but settle on a `bigger ball,' as illustrated in our Figure~\ref{sub-figure(a)}. Instead, our work shows that, if we look at the number of iterations needed to reach the same-sized ball as that of SGD, then there is a constraint on the choice of the stepsize and momentum parameters for ASG. Under this constraint, the ASG's optimal sample complexity  turns out to be of the same order as that of SGD.


    We now show that ASG's variance estimate in \citet{Assran} should be $O(Q^{3/2}),$ and not $O(Q^{1/2}).$ In what follows, the matrix $A$ is the stacked matrix defined in Eq. (15) of \citet{Assran}, and not the driving matrix $A$ of our work. In the display below Eq. (29) in \citet{Assran}, the authors have correctly shown that $\|A^k\|_2^2 \lesssim  k^2 \rho^{2k},$ where $\rho = \frac{\sqrt{Q} - 1}{\sqrt{Q}}$ and $\lesssim$ hides global constants. As can be seen in the proof of their [Appendix B, Cor. 1.1], they plug this estimate back in Eq. (25) in the proof of their Theorem~1 to obtain a bound on $\displaystyle \sum_{j = 1}^k \|A^{k - j}\|^2 = \sum_{j = 0}^{k - 1} \|A^j\|^2 \lesssim \sum_{j = 0}^{k - 1} j^2 \rho^{2j}.$ It is easy to check that 
    \[
        \sum_{j = 0}^{\infty} j^2 \rho^{2j} =  \frac{\rho^2 (1+\rho^2)}{(1  - \rho^2)^3}. 
    \]
    Thus, for all sufficiently large $k,$
    \[
        \sum_{j = 0}^{k - 1} j^2 \rho^{2j} \geq  \frac{\rho^2 (1+\rho^2)}{2(1  - \rho^2)^3}\; {\geq}\; \frac{\rho^2}{2(1  - \rho^2)^3}.
    \]
    Note that for $Q\geq 4$, $\rho^2 \geq \frac{1}{4}$ and therefore,
    \begin{align}
    \label{eq:rho_bound}
        \frac{\rho^2}{2(1  - \rho^2)^3}  \gtrsim 
        \frac{1}{\left(1 - \left(\frac{\sqrt{Q}-1}{\sqrt{Q}}\right)^2\right)^3} = \left(\frac{Q}{Q - (\sqrt{Q}-1)^2}\right)^3 \in \Omega(Q^{3/2}) 
    \end{align}
    where $ \gtrsim $ hides global constants.
    However, \citet{Assran} deduce $\displaystyle \sum_{j = 1}^k \|A^{k - j}\|^2$ to be $\mathcal{O}\left(\frac{1}{1 - \rho^2}\right);$ see the second display below (29). 
     
    This is in contradiction to \eqref{eq:rho_bound} as shown below
    \begin{align*}
        \frac{1}{1-\rho^2} &= \frac{1}{1 - \left(\frac{\sqrt{Q}-1}{\sqrt{Q}}\right)^2} = \frac{Q}{Q - (\sqrt{Q}-1)^2} \in \Theta(\sqrt{Q}).
    \end{align*}
    }
   

\section{Proof of Claims in Section \ref{proof_lower_main_lemma}}
\label{App_thm3}
\begin{customlemma}{\ref{variance_lower_bound}}
    Under Assumption \ref{A2} and $n_0$ as in Lemma~\ref{lower_main_lemma}, the variance term in \eqref{eq:bias_variance_y_tilde} can be lower bounded as follows:
    \begin{align*}
        \alpha^2\mathbb{E}\left[ \sum_{j=0}^{n_0-1} \|B_{i}^{n_0-1-j}\tilde{W}_{j+1}^{(i)} \|^2 \right] \geq \alpha^2K \sum_{j=0}^{n_0-1}  \|B_{i}^{j}e_1 \|^2
    \end{align*}
    where $e_1 = \begin{pmatrix}
    1\\ 0
\end{pmatrix}$ and $K$ is as in Assumption~\ref{A2}.
\end{customlemma}

\begin{proof}
Recall that
$$\tilde{W}_n^{(i)}=\begin{pmatrix}
    \tilde{M}_{n}^{(i)}\\
    0
\end{pmatrix} = \tilde{M}_{n}^{(i)} e_1; \mbox{where }  e_1=\begin{pmatrix}
1\\
0
\end{pmatrix} $$ 

Therefore the \emph{variance} can be bounded as follows:
\begin{align*}
   &\mathbb{E}\left[ \sum_{j=0}^{n-1} \|B_i^{n-1-j}\tilde{W}^{(i)}_{j+1} \|^2 \right] = \mathbb{E}\left[ \sum_{j=0}^{n-1} \|B_i^{(n-1-j)}\tilde{M}^{(i)}_{j+1} e_1 \|^2 \right] \\
    &\qquad\qquad= \mathbb{E}\left[ \sum_{j=0}^{n-1} (\tilde{M}^{(i)}_{j+1})^2 \|B_i^{(n-1-j)}e_1 \|^2 \right]\\
    &\qquad\qquad=  \sum_{j=0}^{n-1} \mathbb{E}[(\tilde{M}^{(i)}_{j+1})^2] \|B_i^{(n-1-j)}e_1 \|^2 \\
    &\qquad\qquad\geq K \sum_{j=0}^{n-1}  \|B_i^{n-1-j}e_1 \|^2. 
\end{align*}
Here the last inequality follows because $\mathbb{E}[(\tilde{M}^{(i)}_{j+1})^2] \geq K$. To see this, observe that $\forall j$, almost surely
\begin{align*}
    \mathbb{E}[(\tilde{M}^{(i)}_j)^2| \mathcal{F}_{j-1}] &\geq \min_{\| v\| =1} v^T\mathbb{E}[\tilde{M}_j \tilde{M}^{T}_j | \mathcal{F}_{j-1}]v \\
    &= \min_{\| v\| =1} v^T S \mathbb{E}[ M_j M^{T}_j| \mathcal{F}_{j-1}] S^T v
\end{align*}
Define $\hat{w} = S^Tv$. Observe that since $A$ is symmetric, $S$ is an orthogonal matrix. Thus, for $\|v\| = 1$, we have
\(\|\hat{w}\| = \|S^T v\| = 1\).
Finally, using Assumption~\ref{A2}, we have
\begin{align*}    
    \mathbb{E}[(\tilde{M}^{(i)}_j)^2| \mathcal{F}_{j-1}] &\geq \min_{\|\hat{w}\| = 1} \mathbb{E}[\hat{w}^TM_j M^{T}_j\hat{w}| \mathcal{F}_{j-1}] \\
    &\geq K \mbox{ a.s.}
\end{align*}
and therefore, $\E[(\tilde{M}^{(i)}_j)^2] \geq K, \forall j$
\end{proof}

\begin{customlemma}{\ref{Aux_lemma_1}}
Let $\alpha>0$ and $\eta \in [0,1]$ such that $\|B_{i}^{n_0} \tilde{X_0} \|^2 < \epsilon'_i.$ Then 
    \begin{align*}
        \alpha^2 K \sum_{j=0}^{n_0-1} \|B_i^{j}e_{1}\|^2 \geq \frac{\alpha^2 K}{2(1-\mu_+^2)(1-\mu_-^2)(1-\eta)}.
    \end{align*}
\end{customlemma}
\begin{proof}
We omit superscript/subscript $i$ for ease of exposition. 

We first find an expression for $B^{n_0}$ in terms of the eigenvalues of $B$. Towards this, first consider the case where $\eta \neq (1-\sqrt{\alpha \lambda})^2$. Then, $B$ is diagonalizable, since the eigenvalues of $B$, $\mu_+$ and $\mu_-$, are distinct.  

Thus, there exists $U$ such that $B=U D_{B} U^{-1}$, where $D_B=\begin{pmatrix}
    \mu_+ & 0 \\
    0   & \mu_-
\end{pmatrix}$. 
With this, we obtain the following expression for $B^{n_0}$
\begin{align*}
    B^{n_0} &= U D_B^{n_0} U^{-1} \\
    &= \begin{pmatrix}
        \frac{\mu_+^{n_0+1}-\mu_-^{n_0+1}}{\mu_+-\mu_-} & \frac{\mu_+\mu_-^{n_0+1}-\mu_-\mu_+^{n_0+1}}{\mu_+-\mu_-}\\
    \frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-} & \frac{\mu_+\mu_-^{n_0}-\mu_-\mu_+^{n_0}}{\mu_+-\mu_-}
    \end{pmatrix}.
\end{align*}


Since $x_{-1}=x_0$, it can be seen that $\Tilde{Y}_0=\begin{pmatrix}
    \tilde{x}_0\\
    \tilde{x}_0 
\end{pmatrix} = \tilde{x}_0 \begin{pmatrix}
    1\\
    1
\end{pmatrix}$. We will denote $\begin{pmatrix}
    1\\
    1
\end{pmatrix}$ with $\mathbbm{1}$ henceforth. Using this, we obtain the following expression for the \emph{bias} 
\begin{align*}
    \|B^{n_0}\Tilde{Y}_0\|^2 = \tilde{x}_0^2 \|B^{n_0}\mathbbm{1}\|^2.
\end{align*}


It follows that 
\begin{align*}
    \|B^{n_0} \mathbbm{1}\|^2 &= \frac{1}{(\mu_+ - \mu_-)^2} (((\mu_+^{n_0+1}-\mu_-^{n_0+1})+(\mu_+\mu_-^{n_0+1} - \mu_-\mu_+^{n_0+1}))^2+((\mu_+^{n_0}-\mu_-^{n_0})+(\mu_+\mu_-^{n_0} - \mu_-\mu_+^{n_0}))^2)
\end{align*}
and 
\begin{align*}
    \|B^{n_0} e_1\|^2 &= \frac{1}{(\mu_+ - \mu_-)^2} ((\mu_+^{n_0+1}-\mu_-^{n_0+1})^2+(\mu_+^{n_0}-\mu_-^{n_0})^2).
\end{align*}

From the expression for $\|B^{n_0} e_1\|^2$, we can see it can be lower bounded as
\begin{align*}
    \|B^{n_0} e_1\|^2 &\geq \left( \frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+ - \mu_-}\right)^2.
\end{align*}

Using this bound, we obtain
\begin{align*}
    \sum_{j=0}^{n_0-1} \Big\|B^{j}e_{1}\Big\|^2 &\geq \sum_{j=0}^{n_0-1} \left(\frac{\mu_+^j-\mu_-^j}{\mu_+-\mu_-}\right)^2 \\
    &= \frac{1}{(\mu_+-\mu_-)^2} \sum_{j=0}^{n_0-1}(\mu_+^{2j}+\mu_-^{2j}-2(\mu_+\mu_-)^{j})\\
    &= \frac{1}{(\mu_+-\mu_-)^2} \left(\sum_{j=0}^{n_0-1} \mu_+^{2j}+ \sum_{j=0}^{n_0-1} \mu_-^{2j}-2 \sum_{j=0}^{n_0-1} (\mu_+\mu_-)^{j}\right) \\
    &= \frac{1}{(\mu_+-\mu_-)^2} \left( \frac{1-\mu_+^{2n_0}}{1-\mu_+^2}+ \frac{1-\mu_-^{2n_0}}{1-\mu_-^2}-2 \frac{1-(\mu_+\mu_-)^{n_0}}{1-\mu_+\mu_-}\right). 
\end{align*}

The sum of fractions in the bracket can be expressed as a single fraction with denominator $(1-\mu_+^2)(1-\mu_-^2)(1-\mu_+\mu_-)$. The numerator turns out to be:


\begin{gather*}
    num = \underbrace{1+\mu_+\mu_--2(\mu_+\mu_-)^{n_0}}_{T_1}\\+\underbrace{\mu_+\mu_- \left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+ - \mu_-}\right)^{2}+(\mu_+\mu_-)^2\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+ - \mu_-}\right)^{2} -  \left(\mu_+\mu_-\right)^3\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+ - \mu_-}\right)^2 -\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+ - \mu_-}\right)^2}_{T_2}.
\end{gather*}

Our aim now is to show $num \geq \frac{1}{2}$, which would then complete the proof of the lemma. We do this by lower bounding terms $T_1$ and $T_2$ separately. 

Note that $(\mu_+\mu_-)^{n_0} \leq \rho(B)^{2 n_0}$. From \eqref{rho-bound}, $\frac{\rho(B)^{2(n_0+1)}}{16} < \frac{\epsilon}{\tilde{x}_0^2}$, and we get $ 2(\mu_+\mu_-)^{n_0} \leq \frac{32 \epsilon}{\tilde{x}_0^2}.$ Thus,
\begin{align*}
    T_1 = 1+\mu_+\mu_--2(\mu_+\mu_-)^{n_0} \geq 1-\frac{32 \epsilon}{\tilde{x}_0^2}.
\end{align*}

Consider term $T_2$. It can be re-written as  
\begin{align}
\label{T2}
    T_2 &= (1-\mu_+\mu_-)\left((\mu_+ \mu_-)^2\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)^2-\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}\right)^2\right)\\
    &= (1-\mu_+\mu_-) \underbrace{ \left(\mu_+ \mu_-\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)-\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}\right)\right) }_{T_2(i)}   \underbrace{\left(\mu_+ \mu_-\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)+\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}\right)\right)}_{T_2(ii)}.
\end{align}

We now lower bound $T_2$ by bounding the magnitude of $T_2(i)$ and $T_2(ii)$. Since $\|B^{n_0} \Tilde{X}_0\|^2 = \Tilde{x}_0^2 \|B^{n_0} \mathbbm{1}\|^2 < \epsilon$, we get
\begin{align}
\label{bias-inequality}
    |T_2(i)|=\left|\mu_+ \mu_-\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)-\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}\right)\right|<\frac{\sqrt{\epsilon}}{\tilde{x}_0}.
\end{align}

We first consider the case $\eta \in [0, (1-\sqrt{\alpha\lambda})^2)$. Here, eigenvalues $\mu_{+}$ and $\mu_{-}$ are real. Then,
\begin{align*}
    \mu_+\mu_-\left(\mu_+^{n_0-1}-\mu_-^{n_0-1}\right)&=\mu_-(\mu_+^{n_0}-\mu_-^{n_0-1}\mu_+)\\
    &\leq \mu_-(\mu_+^{n_0} - \mu_-^{n_0}).
\end{align*}
Consequently, dividing by $\mu_+ - \mu_- > 0$,
\begin{align*}
    \mu_+\mu_-\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)&=\mu_-\left(\frac{\mu_+^{n_0}-\mu_+\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)\\
    &\leq \mu_-\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}\right).
\end{align*}
It follows that \begin{align}
   (1-\mu_-)\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-} \right) \leq \frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}  - \mu_+ \mu_-\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right)\leq \|B^{n_0} \mathbbm{1}\| < \frac{\sqrt{\epsilon}}{\tilde{x}_0}.
\end{align}
We then obtain
\begin{align}
   \mu_+ \mu_-\left(\frac{\mu_+^{n_0-1}-\mu_-^{n_0-1}}{\mu_+-\mu_-}\right) \leq \frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-} \leq \frac{\sqrt{\epsilon}}{\tilde{x}_0(1-\mu_-)}.
\end{align}

Thus, $|T_2(ii)| \leq  \frac{2\sqrt{\epsilon}}{\tilde{x}_0(1-\mu_-)} $ and we can bound term $T_2$ as follows

\begin{align}
    |T_2| &= (1-\mu_+\mu_-)|T_2(i)||T_2(ii)|\\
    &\leq (1-\mu_+\mu_-)\frac{\sqrt{\epsilon}}{\tilde{x}_0}\frac{2\sqrt{\epsilon}}{\tilde{x}_0(1-\mu_-)}\\
    &\leq \frac{4\epsilon}{\tilde{x}_0^2}.
\end{align}
The last inequality follows since
\begin{align*}
    1-\mu_+ \mu_- \leq 1-\mu_-^2 = (1-\mu_-)(1+\mu_-)\leq 2(1-\mu_-).
\end{align*}

When $\mu_+$ and $\mu_-$ are complex, $\left(\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+-\mu_-}\right)=\frac{\sin n_0 \omega }{\sin \omega } \rho(B)^{n_0-1}$. The choice of $n_0$ ensures that $\frac{\sin n_0 \omega }{\sin \omega },\frac{\sin (n_0+1) \omega }{\sin \omega } \geq 0$ and $\frac{\sin n_0 \omega }{\sin \omega }<\frac{\sin (n_0+1) \omega }{\sin \omega }$ (see Lemma~\ref{phi-lemma}). Using this, we obtain 
\begin{align*}
\frac{\sqrt{\epsilon}}{\tilde{x}_0} &>\left|\rho(B)^{n_0} \frac{\sin (n_0+1)\omega}{\sin \omega }-\rho(B)^{n_0+1} \frac{\sin n_0 \omega}{\sin \omega } \right| \\
&=\rho(B)^{n_0} \frac{\sin (n_0+1)\omega}{\sin \omega }-\rho(B)^{n_0+1} \frac{\sin n_0 \omega}{\sin \omega }  \\
&\geq \rho(B)^{n_0} \frac{\sin (n_0+1)\omega}{\sin \omega }-\rho(B)^{n_0+1} \frac{\sin (n_0+1) \omega}{\sin \omega } \\
 &\geq  \rho(B)^{n_0} (1-\rho(B)) \frac{\sin (n_0+1)\omega}{\sin \omega }
\end{align*}
It follows that \begin{align}
    \frac{\sin n_0\omega}{\sin \omega }\rho(B)^{n_0+1} < \frac{\sin (n_0+1)\omega}{\sin \omega }\rho(B)^{n_0}<\frac{\sqrt{\epsilon}}{\tilde{x}_0(1-\rho(B))}
\end{align}
since $\frac{\sin n_0 \omega }{\sin \omega }<\frac{\sin (n_0+1) \omega }{\sin \omega }$ and $\rho(B)<1$. Thus, $|T_2(ii)| \leq  \frac{2\sqrt{\epsilon}}{\tilde{x}_0(1-\rho(B))} $ and we can bound term $T_2$ as follows

\begin{align*}
    |T_2| &= (1-\mu_+\mu_-)|T_2(i)||T_2(ii)|\\
    &\leq (1-\mu_+\mu_-)\frac{\sqrt{\epsilon}}{\tilde{x}_0}\frac{2\sqrt{\epsilon}}{(1-\rho(B))\tilde{x}_0}\\
    &\leq \frac{4\epsilon}{\tilde{x}_0^2}.
\end{align*}
The last inequality follows since
\begin{align*}
    1-\mu_+ \mu_- = 1-\rho(B)^2 = (1-\rho(B))(1+\rho(B))\leq 2(1-\rho(B)).
\end{align*}

Thus, $num \geq T_1 - |T_2| \geq 1-\frac{36 \epsilon}{\tilde{x}_0^2}$. When $\epsilon \leq \frac{\tilde{x}_0^2}{72}$, it follows that $num \geq \frac{1}{2}$, and the proof is complete for the case $\eta \neq (1-\sqrt{\alpha \lambda})^2$.


Finally, consider the case when $\eta = (1 - \sqrt{\alpha\lambda})^2$. Here, $B$ is not diagonalizable because it now has a repeated eigenvalue. Instead, we can find a matrix $V$ such that $B=V J_B V^{-1}$, where $J_B$ is the Jordan block matrix $\begin{pmatrix}
    \mu_+ & 1 \\
    0 &  \mu_+
\end{pmatrix}$. We find that $V=\begin{pmatrix}
    0 & 1\\
    1 & -\sqrt{\eta}
\end{pmatrix}$ is one such solution. Using this, we obtain the following expression for $B^{n_0}$
\begin{align*}
    B^{n_0} &= V J_B^{n_0} V^{-1} \\
    &= \begin{pmatrix}
    (n_0+1)\mu_+^{n_0} & -n_0\mu_+^{n_0+1}\\
    n_0 \mu_+^{n_0-1} & -(n_0-1) \mu_+^{n_0}
\end{pmatrix}.
\end{align*}
Thus, we find that

\begin{align*}
    \|B^{n_0} \mathbbm{1}\|^2 &= ((n_0+1)\mu_+^{n_0} -n_0\mu_+^{n_0+1})^2+ (n_0 \mu_+^{n_0-1} -(n_0-1) \mu_+^{n_0})^2.
\end{align*}

when $\eta = (1-\sqrt{\alpha \lambda})^2$. 

Observe that,
\begin{align*}
    \sum_{j=0}^{n_0-1} \|B^{j}e_{1}\|^2 &= \sum_{j=0}^{n_0-1} \left((j+1)^2\mu^{2j} + j^2\mu^{2(j-1)}\right) \geq  \sum_{j=0}^{n_0-1} (j\mu^{j-1})^2\\
    & = \lim_{\mu_+ \rightarrow \mu_-}  \sum_{j=0}^{n_0-1}\left(\frac{\mu_+^j - \mu_-^j}{\mu_+-\mu_-}\right)^2 \\
    & \geq  \lim_{\mu_+ \rightarrow \mu_-} \frac{1}{2(1-\mu_+^2)(1-\mu_-^2)(1-\mu_+\mu_-)},
\end{align*}
where $\mu$ denotes the repeated eigenvalue $\mu_+ = \mu_-$ and the last inequality follows from the bounds for the case $\eta \neq (1-\sqrt{\alpha \lambda})^2$. Therefore,
\[\alpha^2 K \sum_{j=0}^{n_0-1} \Big\|B^{j}e_{1}\Big\|^2 \geq \frac{\alpha^2 K}{2(1-\mu^2)(1-\mu^2)(1-\eta)}.\]
\end{proof}


\begin{customlemma}{\ref{Aux_lemma_2}}
Let $\alpha>0$ and $\eta \in [0,1]$ such that $\|B_{i}^{n_0} \tilde{X_0} \|^2 < \epsilon'_i.$ Then we have the following bound
    $\alpha^2 K \sum_{j=0}^{n_0-1} \|B_i^{j}e_{1}\|^2 \geq \frac{K}{16 \lambda_{i}^2}(1-\rho(B_i)).$
\end{customlemma}
\begin{proof}
\label{C4}
We omit superscript/subscript $i$ for ease of exposition. 
Before proceeding to the proof, we introduce a new function 
\begin{align*}
\label{h-def}
    h(\eta, \alpha \lambda) \triangleq \frac{(1-\mu_+^2)(1-\mu_-^2)(1-\eta)}{(\alpha \lambda)^2}    
\end{align*}
to simplify calculations. 
First we consider the case $\eta \in [0,(1-\sqrt{\alpha\lambda})^2]$ when $\mu_+$ and $\mu_-$ are real.
Since $\mu_+$ and $\mu_-$ are only functions of $\alpha \lambda$ and $\eta$, $h$ is well-defined.

The proof of the Lemma follows by showing $h(\eta, \alpha \lambda)(1-\rho(B)) \leq 8$. Note that
$1-\mu_+^2 = (1-\mu_+)(1+\mu_+) \leq 2(1-\mu_+)$ since $\mu_+ \leq 1$. Similarly, $(1-\mu_-^2) \leq 2(1-\mu_-)$. Thus, $h(\eta, \alpha \lambda)(1-\mu_+) \leq \frac{4 (1-\mu_+)^2(1-\mu_-)(1-\mu_+\mu_-)}{(\alpha \lambda)^2}$. 
Since $\mu_+ + \mu_- = 1-\alpha \lambda + \eta$ and $\mu_+\mu_- = \eta$, it follows that $(1-\mu_+)(1-\mu_-)=1+\mu_+\mu_- -\mu_+ -\mu_-= 1-(1-\alpha \lambda + \eta)+\eta=\alpha\lambda$. Thus,
\begin{align*}
    h(\eta, \alpha \lambda)(1-\mu_+) &\leq \frac{4(1-\mu_+)(1-\eta)}{\alpha\lambda}\\
    & = \frac{4(1-\eta)}{\alpha\lambda}\Bigg(1-\frac{( 1 + \eta - \lambda \alpha) + \sqrt{(\lambda \alpha - 1 - \eta)^2 - 4\eta}}{2}\Bigg) \triangleq g(\eta, \alpha\lambda).
\end{align*}

We see that 
\begin{align*}
    \frac{\partial g(\eta, \alpha\lambda)}{\partial \eta} = \frac{2\Big(\sqrt{(1-\alpha\lambda+\eta)^2 - 4\eta}+\eta-1\Big)\Big(\sqrt{(1-\alpha\lambda+\eta)^2 - 4\eta}+\eta-\alpha\lambda-1\Big)}{\alpha\lambda\sqrt{(1-\alpha\lambda+\eta)^2 - 4\eta}}
\end{align*}

Observe that the denominator in the above expression is positive. We consider the following two cases:  (i) $\alpha\lambda \geq 1$ and (ii) $\alpha\lambda < 1$. When $\alpha\lambda \geq 1$, we can directly bound $g(\eta,\alpha\lambda)=\frac{4(1-\mu_+)(1-\eta)}{\alpha\lambda}$. Since $\mu_+ \geq -1$ and $\eta \geq 0$, we get $g(\eta,\alpha\lambda) \leq 8$.

Now consider the case  $\alpha\lambda < 1$. We have that, 
\begin{align*}
    \sqrt{(1-\alpha\lambda+\eta)^2 - 4\eta}+\eta-1 &= \sqrt{(1-\eta)^2+(\alpha\lambda)^2 - 2\alpha\lambda(1+\eta)} - (1-\eta)
\end{align*}
When $\alpha\lambda < 1$, $(\alpha\lambda)^2 - 2\alpha\lambda(1+\eta)<0$ for all $\eta \geq 0$, thus $\Big(\sqrt{(1-\alpha\lambda+\eta)^2 - 4\eta}+\eta-\alpha\lambda-1\Big) \leq \Big(\sqrt{(1-\alpha\lambda+\eta)^2 - 4\eta}+\eta-1\Big) < 0$. This implies that the numerator of the partial derivative is also positive and we have that $\displaystyle\frac{\partial g(\eta, \alpha\lambda)}{\partial \eta} > 0$. Since the partial derivative is positive $g(\eta, \alpha\lambda)$ is an increasing function of $\eta$ and thus the maximum is achieved at $\eta=(1-\sqrt{\alpha \lambda})^2$ and is given by:
\begin{align*}
    g(\eta, \alpha\lambda) &\leq \frac{4(1-\mu_+)(1-\eta)}{\alpha\lambda}\\
    & = \frac{4(1-\rho(B))(1-\eta)}{\alpha\lambda}\\
    & = \frac{4(1-(1-\sqrt{\alpha\lambda}))(1-\eta)}{\alpha\lambda}\\
    & = \frac{4\sqrt{\alpha\lambda}(2\sqrt{\alpha\lambda} - \alpha\lambda)}{\alpha\lambda}\\
    &\leq 8
\end{align*}

Next, for $\eta \in ((1-\sqrt{\alpha\lambda})^2,1]$, the eigenvalues $\mu_+$ and $\mu_-$ are complex. We have,
\begin{align*}
    h(\eta,\alpha\lambda) = \frac{(1-\mu_+^2)(1-\mu_-^2)(1-\eta)}{(\alpha\lambda)^2}.
\end{align*}
First we show that $(1-\mu_+^2)(1-\mu_-^2) \leq 4 (1-\mu_+)(1-\mu_-)$. Notice,
\begin{align*}
    (1-\mu_+^2)(1-\mu_-^2) &= (1-\mu_+)(1+\mu_+)(1-\mu_-)(1+\mu_-)\\
    & = (1 - \mu_+)(1 - \mu_-)(1+ \mu_+ + \mu_- + \mu_+\mu_-)\\
    & = (1 - \mu_+)(1 - \mu_-)(1 + (1-\alpha\lambda+\eta) + \eta) \\
    & \leq 4 (1 - \mu_+)(1 - \mu_-).
\end{align*}
It follows that, 
\begin{align*}
    h(\eta,\alpha\lambda)(1-\rho(B)) &\leq \frac{4(1-\rho(B))(1-\eta)}{\alpha\lambda}\\
    &= \frac{4(1-\sqrt{\eta})(1-\eta)}{\alpha\lambda} \triangleq l(\eta,\alpha\lambda)
\end{align*}
Observe that $l(\eta,\alpha\lambda)$ is a decreasing function of $\eta$ and therefore, its supremum over this range is given by its value at $\eta = (1-\sqrt{\alpha\lambda})^2$. For this choice of $\eta$,
\begin{align*}
    l(\eta,\alpha\lambda) &= \frac{4(1 - (1-\sqrt{\alpha\lambda}))(1-(1-\sqrt{\alpha\lambda})^2)}{\alpha\lambda}
    \leq 8
\end{align*}


\end{proof}



\begin{customlemma}{\ref{Aux_lemma_3}}
 For any $\epsilon \in (0,\epsilon_i')$, if $\|B_i^{n_0}\tilde{Y}_{0}^{(i)}\| < \epsilon$, then $1-\rho(B_i) \geq \frac{16\lambda_i^2}{K}\epsilon $.
\end{customlemma}
\begin{proof}
We omit superscript/subscript $i$ for ease of exposition. 

We first consider the case where $\mu_+$ and $\mu_-$ are complex. Here, we require the following lemma to complete the proof: 

\begin{lemma}
\label{phi-lemma}

    There exists $n_0\in \left[\frac{K}{768\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1,\frac{K}{64\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1\right]$ such that $\frac{\sin(n_0+1)\omega}{\sin \omega},\frac{\sin n_0\omega}{\sin \omega} \geq 0$ and $\frac{\sin(n_0+1)\omega}{\sin \omega} - \frac{\sin n_0\omega}{\sin \omega} > \frac{1}{2}$.
\end{lemma}


Using Lemma \ref{phi-lemma}, we can now obtain the following bound for the \emph{bias} for this case as follows


\begin{align}
    \| B^{n_0} \mathbbm{1}\|^2 &= \frac{1}{(\mu_+ - \mu_-)^2} (((\mu_+^{n_0+1}-\mu_-^{n_0+1})+(\mu_+\mu_-^{n_0+1} - \mu_-\mu_+^{n_0+1}))^2+((\mu_+^{n_0}-\mu_-^{n_0})+(\mu_+\mu_-^{n_0} - \mu_-\mu_+^{n_0}))^2) \\
    &\geq \frac{1}{(\mu_+ - \mu_-)^2} ((\mu_+^{n_0+1}-\mu_-^{n_0+1})+(\mu_+\mu_-^{n_0+1} - \mu_-\mu_+^{n_0+1}))^2 \\
    &= \frac{1}{\sin^2 \omega } (\rho(B)^{n_0} \sin (n_0+1)\omega-\rho(B)^{n_0+1} \sin n_0\omega)^2 \\
     &= \left(\rho(B)^{n_0} \frac{\sin (n_0+1)\omega}{\sin \omega }-\rho(B)^{n_0+1} \frac{\sin n_0 \omega}{\sin \omega }\right)^2 \\
     &\geq \left(\left(\frac{\sin (n_0+1)\omega}{\sin \omega }-\frac{\sin n_0\omega}{\sin \omega }-\frac{1}{4}\right)\rho(B)^{n_0} \right)^2 \\
     &\geq \frac{\rho(B)^{2n_0}}{16}.
\end{align} 

Now we consider the case where $\mu_+$ and $\mu_-$ are real. We obtain a bound on the \emph{bias} as follows

\begin{align}
    \| B^{n_0} \mathbbm{1}\|^2 &= \frac{1}{(\mu_+ - \mu_-)^2} (((\mu_+^{n_0+1}-\mu_-^{n_0+1})+(\mu_+\mu_-^{n_0+1} - \mu_-\mu_+^{n_0+1}))^2+((\mu_+^{n_0}-\mu_-^{n_0})+(\mu_+\mu_-^{n_0} - \mu_-\mu_+^{n_0}))^2) \\
    &\geq \left(\frac{\mu_+^{n_0+1}-\mu_-^{n_0+1}}{\mu_+ - \mu_-}+\frac{\mu_+\mu_-^{n_0+1}-\mu_-\mu_+^{n_0+1}}{\mu_+ - \mu_-}\right)^2 \\
    &= \left(\frac{\mu_+^{n_0+1}-\mu_-^{n_0+1}}{\mu_+ - \mu_-}-\mu_+\mu_-\frac{\mu_+^{n_0}-\mu_-^{n_0}}{\mu_+ - \mu_-}\right)^2 \\
    &= \left(\sum_{j=0}^{n_0} \mu_+^j \mu_-^{n_0-j}  -\mu_+\mu_-\left(\sum_{j=0}^{n_0-1} \mu_+^j \mu_-^{n_0-j-1}\right)\right)^2 \\
    &= \left(\sum_{j=0}^{n_0} \mu_+^j \mu_-^{n_0-j}  - \mu_+ \left(\sum_{j=0}^{n_0} \mu_+^{j} \mu_-^{n_0-j} - \mu_+^{n_0}\right)\right)^2 \\
    &= \left(\mu_+^{n_0+1}+(1-\mu_+)\left(\sum_{j=0}^{n_0} \mu_+^{j} \mu_-^{n_0-j}\right)\right)^2 \\
    &\geq \mu_+^{2(n_0+1)} = \rho(B)^{2(n_0+1)}.
\end{align} 

From the above two bounds, we find that 

\begin{align}
\label{rho-bound}
    \epsilon > \|B^{n_0} \Tilde{Y}_0\|^2 \geq  \frac{\rho(B)^{2(n_0+1)}\tilde{x}_0^2}{16}.
\end{align}

We can further bound the above expression by using the fact that $x^m \geq e^ {-m\frac{1-x}{x}}$, for all $x\in (0,1]$ to get

\begin{align*}
    \epsilon \geq \frac{\tilde{x}_0^2}{16}e^{-2(n_0+1)\frac{1-\rho(B)}{\rho(B)}}
\end{align*}

From the above inequality, we obtain the following bound on $n_0+1$

\begin{align*}
    n_0+1  \geq \frac{\rho(B)}{2(1-\rho(B))} \log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right) = \frac{1}{2}\left(\frac{1}{1-\rho(B)}-1\right) \log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right) 
\end{align*}

Since $n_0 +1 \leq \frac{K}{64 \epsilon \lambda^2}\log(\frac{\tilde{x}_0^2}{16 \epsilon})$, we obtain the following bound on $\frac{1}{1-\rho(B)}$
\begin{align}
    \frac{1}{1-\rho(B)} \leq \frac{K}{32\epsilon \lambda^2}+1 \leq \frac{K}{16 \epsilon \lambda^2},
\end{align}
provided that $1 \leq \frac{K}{32\epsilon \lambda^2}$.

\end{proof}

\section{Proof of Proposition \ref{Theorem_OTS_mom}}
\label{sec:appendix_proof_upper_bound}


We prove the proposition separately for $\eta = 0$ (corresponding to SGD), $\beta = 0$ (corresponding to SHB) and $\beta = 1$ (corresponding to ASG).

\textbf{Case-1: $\eta = 0$ (SGD)}

The LSA-M iterate in \eqref{sgd-m} with $\eta = 0$ corresponds to:
\begin{gather}
    \label{transformedSA}
    x_{n+1} - x^* = x_{n}-x^* + \alpha(Ax^* - Ax_{n}+M_{n+1})\\
    \label{transformedSAMom}
    x_{n+1} - x^* = x_{n}-x^* + \alpha(Ax^* - Ax_{n}+M_{n+1}) + \eta((x_{n}-x^{*}) - (x_{n-1}-x^{*}))
\end{gather}
Let $\tilde{x}_{n} = x_{n} - x^{*}$. Then, equation \eqref{transformedSA} can be rewritten as:
\begin{equation*}
    \begin{split}
        \Tilde{x}_{n} &= \Tilde{x}_{n-1} + \alpha(-A\Tilde{x}_{n-1} + M_{n})
        = (I - \alpha A)\Tilde{x}_{n-1} + \alpha M_{n}\\
        &= (I - \alpha A)^{n}\Tilde{x}_{0} + \alpha\sum_{i=0}^{n-1}[(I-\alpha A)^{n-1-i} M_{i+1}]
    \end{split}
\end{equation*}
Taking the square of the norm on both sides of the above equation, we obtain
\begin{equation*}
    \begin{split}
        \|\Tilde{x}_{n}\|^2 &= \|(I-\alpha A)^n\Tilde{x}_{0}\|^{2} + 2\alpha\left((I-\alpha A)^{n}\Tilde{x}_{0}\right)^T\left(\sum_{i=0}^{n-1}(I-\alpha A)^{n-1-i}M_{i+1}\right)\\
        & + \alpha^2 \sum_{i=0}^{n-1}\sum_{j=0}^{n-1}((I-\alpha A)^{n-1-i} M_{i+1})^T ((I-\alpha A)^{(n-1-j)} M_{j+1})
    \end{split}
\end{equation*}
Now we take expectation on both sides to obtain
\begin{equation*}
    \begin{split}
        \mathbb{E}[\|\Tilde{x}_{n}\|^2] 
        &\leq \|(I-\alpha A)^n\|^{2} \|\Tilde{x}_{0}\|^2 + 2\alpha\left((I-\alpha A)^{n}\Tilde{x}_{0}\right)^T\left(\sum_{i=0}^{n-1}(I-\alpha A)^{(n-1-i)}\mathbb{E}[M_{i+1}]\right)\\
        & + \alpha^2 \sum_{i=0}^{n-1}\sum_{j=0}^{n-1}\mathbb{E}\left[((I-\alpha A)^{(n-1-i)} M_{i+1})^T ((I-\alpha A)^{(n-1-j)} M_{j+1})\right]
    \end{split}
\end{equation*}
Now, from \textbf{Assumption \ref{A2}}, $\mathbb{E}[M_{i+1}] = \mathbb{E}[\mathbb{E}[M_{i+1}|\mathcal{F}_{i}]] = 0.$
Therefore the second term becomes 0. Next consider the term inside the double summation. First consider the case $i\neq j$. Without loss of generality, suppose $i<j$.
\begin{equation*}
    \begin{split}
        & \mathbb{E}\left[M_{i+1}^{T}\left((I-\alpha A)^{(n-1-i)}\right)^T (I-\alpha A)^{(n-1-j)} M_{j+1}\right]\\
        & =\mathbb{E}\left[\mathbb{E}\left[M_{i+1}^{T}\left((I-\alpha A)^{(n-1-i)}\right)^T (I-\alpha A)^{(n-1-j)} M_{j+1}\vert\mathcal{F}_{j}\right]\right]\\
        & = \mathbb{E}\left[M_{i+1}^{T}\left((I-\alpha A)^{(n-1-i)}\right)^{T}(I-\alpha A)^{(n-1-j)}\mathbb{E}[M_{j+1}|\mathcal{F}_{j}]\right]
        = 0
    \end{split}
\end{equation*}
The last equality follows from \textbf{Assumption \ref{A2}}. When $i=j$,
\begin{equation*}
    \begin{split}
        & \mathbb{E}\left[M_{i+1}^{T}\left((I-\alpha A)^{(n-1-i)}\right)^T (I-\alpha A)^{(n-1-i)} M_{i+1}\right]\\
        & \leq \mathbb{E}\left[\|(I-\alpha A)^{(n-1-i)}\|^{2}\mathbb{E}\left[\|M_{i+1}\|^{2}|\mathcal{F}_{i}\right]\right]
         \leq \|(I-\alpha A)^{(n-1-i)}\|^{2} K\left(1 + \mathbb{E}[\|\Tilde{x}_{i}\|^{2}]\right)
    \end{split}
\end{equation*}
Substituting the above values and using $\Lambda = \|x_{0} - x^*\|^{2}$, we get
\begin{equation*}
    \begin{split}
        \mathbb{E}[\|\Tilde{x_n}\|^2] &\leq \|(I-\alpha A)^{n}\|^2\Lambda + \alpha^2 K\sum_{i=0}^{n-1}\|(I-\alpha A)^{(n-1-i)}\|^2 (1 + \mathbb{E}[\|\Tilde{x}_{i}\|^2])
    \end{split}
\end{equation*}
We next use the following lemma to bound $\|(I-\alpha A)^i\|$.
\begin{lemma}
\label{norm_upper_bound}
 Let $M \in \mathbb{R}^{d\times d}$ be a matrix and $\lambda_{i}(M)$ denote the $i$-th eigenvalue of $M$. Then, $\forall \delta > 0$
\[\|M^{n}\|\leq C_{\delta}(\rho(M) + \delta)^n\]
where $\rho(M) = \max_{i}|\lambda_{i}(M)|$ is the spectral radius of $M$ and $C_{\delta}$ is a constant that depends on $\delta$. Furthermore, if $M$ is diagonalizable, then
\[\|M^{n}\|\leq C(\rho(M))^n\]
\end{lemma}
\begin{proof}
See Appendix \ref{Lemma5_proof}.
\end{proof}
We let $\lambda_{\min} = \min_{i}\lambda_{i}$. Using Lemma \ref{norm_upper_bound} and the fact that $A$ is diagonalizable, we have
\begin{equation*}
    \begin{split}
        \mathbb{E}[\|\Tilde{x}_{n}\|^2] &\leq C^2(1-\alpha\lambda_{\min})^{2n}\Lambda + C^2\alpha^2K\sum_{i=0}^{n-1}(1-\alpha\lambda_{\min})^{2(n-1-i)}(1 + \mathbb{E}[\|\Tilde{x}_{i}\|^2])
    \end{split}
\end{equation*}
\begin{equation}
    \label{C_thm1}
    \mbox{where, \quad } C = \frac{\sqrt{d}}{\sigma_{\min}(S)\sigma_{\min}(S^{-1})},
\end{equation}
$S$ is the matrix in Jordan decomposition of $A$ and $\sigma_{\min}(S)$ is the smallest singular value of $S$. 
We define the sequence $\{U_{k}\}$ as below:
\[U_{k} = C^2(1-\alpha\lambda_{\min})^{2k}\Lambda + C^2\alpha^2K\sum_{i=0}^{k-1}(1-\alpha\lambda_{\min})^{2(k-1-i)}(1 + U_{i})\]
Observe that $\mathbb{E}[\|\Tilde{x}_{n}\|^2] \leq U_{n}$ and that the sequence $\{U_{k}\}$ satisfies
\begin{gather*}
    U_{k+1} = (1-\alpha\lambda_{\min})^{2}U_{k} + C^{2}K\alpha^{2}(1 + U_{k});\quad
    U_{0} = C^{2}\Lambda
\end{gather*}
Therefore, we have 
\[U_{k+1} = \left((1-\alpha\lambda_{\min})^{2} + C^{2}K\alpha^{2}\right)U_{k} + \alpha^{2}C^{2}K\]
To ensure that $(1-\alpha\lambda_{\min})^{2} + C^{2}K\alpha^{2} \leq (1-\alpha\lambda_{\min}/2)^2$, choose $\alpha$ as follows:
\begin{gather*}
    \alpha^{2}\lambda_{\min}^{2} - 2\alpha\lambda_{\min} + C^{2}K\alpha^{2} \leq \frac{\alpha^{2}\lambda_{\min}^{2}}{4} - \alpha\lambda_{\min}\\
    \mbox{ or } \alpha \leq \frac{\lambda_{\min}}{\frac{3}{4}\lambda_{\min}^{2} + C^{2}K}
\end{gather*}
\begin{equation*}
    \begin{split}
        U_{n} & \leq \left(1 - \frac{\alpha\lambda_{\min}}{2}\right)^{2}U_{n-1} + \alpha^{2}C^{2}K \\
        & \leq \left(1 - \frac{\alpha\lambda_{\min}}{2}\right)^{2n}U_{0} + \alpha^{2}C^{2}K\sum_{i=0}^{n-1} \left(1 - \frac{\alpha\lambda_{\min}}{2}\right)^{2i}\\
        & \leq  \left(1 - \frac{\alpha\lambda_{\min}}{2}\right)^{2n}U_{0} + \alpha^{2}C^{2}K \frac{1}{1-\left(1-\frac{\alpha\lambda_{\min}}{2}\right)^{2}}\\
        & \leq \left(1 - \frac{\alpha\lambda_{\min}}{2}\right)^{2n}U_{0} + \alpha^{2}C^{2}K \frac{2}{\alpha\lambda_{\min}}\\
    \end{split}
\end{equation*}
We assume that $\alpha \leq \frac{1}{\lambda_{\min}}$ and therefore $(1-\frac{\alpha\lambda_{\min}}{2})^{2} \leq e^{-\alpha\lambda_{\min}}$.
\begin{equation*}
    \begin{split}
        U_{n}\leq e^{-n\alpha\lambda_{\min}}C^{2}\Lambda + \frac{2\alpha C^{2}K}{\lambda_{\min}}
    \end{split}
\end{equation*}
Choose $\alpha$ as below:
\[\alpha \leq \frac{\epsilon\lambda_{\min}}{4C^2K}\]
Then,
\[\frac{2\alpha C^{2}K}{\lambda_{\min}} \leq \frac{\epsilon}{2}
\Rightarrow 
\mathbb{E}[\|\Tilde{x}_{n}\|^2] \leq U_{n} \leq \frac{\epsilon}{2} + \frac{\epsilon}{2} = \epsilon,\]
when the sample complexity is:
\[n = \frac{1}{\alpha\lambda_{\min}}\log\left(\frac{2C^2 \Lambda}{\epsilon}\right)\]

\textbf{Case-2: $\beta = 0$ (SHB)}
\label{App_thm2}
LSA-M iterate with $\beta = 0$ can be re-written as:
\[\Tilde{x}_{n+1} = (I-\alpha A)\Tilde{x}_{n} + \alpha(M_{n+1}) + \eta(\Tilde{x}_{n} - \Tilde{x}_{n-1})\]
This can be re-written as:
\[
\begin{pmatrix}
    \Tilde{x}_{n+1} \\
    \Tilde{x}_{n}      
\end{pmatrix}
=
\begin{pmatrix}
    I - \alpha A + \eta I & -\eta I   \\
    I & 0      
\end{pmatrix} 
\begin{pmatrix}
    \Tilde{x}_{n} \\
    \Tilde{x}_{n-1}      
\end{pmatrix}
+ \alpha
\begin{pmatrix}
    M_{n+1}\\
    0      
\end{pmatrix}
\]
Let us define 
\[\Tilde{X}_{n} \triangleq
\begin{pmatrix}
    \Tilde{x}_{n} \\
    \Tilde{x}_{n-1}      
\end{pmatrix}
, P \triangleq
\begin{pmatrix}
    I - \alpha A + \eta I & -\eta I   \\
    I & 0      
\end{pmatrix} 
\mbox{ and } W_{n} \triangleq
\begin{pmatrix}
    M_{n}\\
    0
\end{pmatrix}.
\]
Note that $\mathbb{E}[W_{n+1}|\mathcal{F}_{n}] = 0$ and $\mathbb{E}[\|W_{n+1}\|^{2}|\mathcal{F}_{n}] = \mathbb{E}[\|M_{n+1}\|^{2}|\mathcal{F}_{n}] \leq K(1 + \|\Tilde{x}_{n}\|^{2}) \leq K(1+ \|\Tilde{X}_{n}\|^{2})$. It follows that,
\begin{gather*}
    \Tilde{X}_{n} = P \Tilde{X}_{n-1} + \alpha W_{n}
    = P^n \Tilde{X}_{0} + \alpha \sum_{i=0}^{n-1}P^{(n-1-i)}W_{i+1}
\end{gather*}
The norm square of the above equation gives:
\begin{equation*}
    \begin{split}
        \|\Tilde{X}_{n}\|^{2} & = \|P^{n}\Tilde{X}_{0}\|^{2} + \alpha \left(P^{n}\Tilde{X}_{0}\right)^{T}\left(\sum_{i=0}^{n-1}P^{(n-1-i)}W_{i+1}\right) + \alpha\left(\sum_{i=0}^{n-1}P^{(n-1-i)}W_{i+1}\right)^{T}\left(P^{n}\Tilde{X}_{0}\right)\\
        & + \alpha^{2} \left(\sum_{i=0}^{n-1}P^{(n-1-i)}W_{i+1}\right)^{T}\left(\sum_{i=0}^{n-1}P^{(n-1-i)}W_{i+1}\right)
    \end{split}
\end{equation*}
Taking expectation on both sides as well as using the fact that $\mathbb{E}[W_{k+1}|\mathcal{F}_{k}] = 0$ and $\mathbb{E}[\|W_{k+1}\|^{2}|\mathcal{F}_{k}] \leq K(1 + \|\Tilde{X}_{k}\|^{2})$, we have
\[
\mathbb{E}[\|\Tilde{X}_{n}\|^2] \leq \|P^n\|^2 \|\Tilde{X}_{0}\|^2 + \alpha^2K\sum_{i=0}^{n-1}\|P^{(n-1-i)}\|^2(1 + \mathbb{E}[\|\Tilde{X}_{i}\|^{2}])
\]
As before, for a matrix $M$, let $\rho(M) = \max_{i} |\lambda_{i}(M)|$ denote the spectral radius of $M$.

Next, we compute $\rho(P)$. Consider the characteristic equation of $P$:
\[
\det\left(
\begin{pmatrix}
    I - \alpha A + \eta I -\mu I & -\eta I   \\
    I & -\mu I      
\end{pmatrix} \right) = 0
\]
When $A_{21}$ and $A_{22}$ commute, we have the following formula for the determinant of a block matrix \citep{horn}:
\[
\det\left(
\begin{pmatrix}
A_{11} & A_{12}\\
A_{21} & A_{22}
\end{pmatrix}\right) = \det\left(A_{11}A_{22} - A_{12}A_{21}\right)
\]
Using this, the characteristic equation of $P$ simplifies to:
\begin{gather*}
    \det(- \mu I + \alpha\mu A - \eta \mu I + \mu^2I + \eta I)  = 0
\end{gather*}
We note that when $\mu=0$, the LHS of the above equation becomes $\det(\eta I)$. Thus, $\mu=0$ can never be a solution of the characteristic equation of $P$ whenever $\eta \neq 0$. We now further simplify the characteristic equation of $P$ to a more convenient form:
\[\det\left(A - I \left(\frac{\mu+\eta\mu-\mu^2-\eta}{\alpha\mu}\right)\right)=0\]
The only zeros of the characteristic equation of a matrix are its eigenvalues. Let $\lambda_{i}$ be the eigenvalue of $A$ with \(\lambda_{i} = \frac{\mu + \eta \mu - \mu^2 - \eta}{\alpha\mu}\) so that
\begin{gather*}
    \mu^2 + \mu(\alpha\lambda_{i}-1-\eta) + \eta = 0
\end{gather*}
The above is a quadratic equation in $\mu$ and the solution is given by
\begin{gather*}
    \mu = \frac{-(\lambda_{i}\alpha - 1 - \eta) \pm \sqrt{(\lambda_{i}\alpha - 1 - \eta)^2 - 4\eta}}{2}
\end{gather*}

When $(\lambda_{i}\alpha - 1 - \eta)^2 - 4\eta \leq 0$, the absolute value of eigenvalues of $P$ are independent of $\alpha$ and 
\[|\mu| = \frac{1}{2}\left(\sqrt{(\lambda_{i}\alpha - 1 - \eta)^2 + |(\lambda_{i}\alpha - 1 - \eta)^2 - 4\eta|} \right)= \sqrt{\eta}\]
To ensure that $(\lambda_{i}\alpha - 1 - \eta)^2 - 4\eta \leq 0$, we must have
\[(\alpha\lambda_{i} + 1) - 2\sqrt{\lambda_{i}\alpha} \leq \eta \leq (\alpha\lambda_{i} +1) + 2\sqrt{\lambda_{i}\alpha}\]
\[\left(1-\sqrt{\lambda_{i}\alpha}\right)^2 \leq \eta \leq \left(1+\sqrt{\lambda_{i}\alpha}\right)^2\]
For the spectral radius of $P$ to be $\sqrt{\eta}$, the above must hold for all $i$. We choose $\alpha$ as: 
\[\alpha \leq \left(\frac{2}{\sqrt{\lambda_{\min}} + \sqrt{\lambda_{\max}}}\right)^{2}\] 
and $\eta$ as:
\[(1-\sqrt{\lambda_{\min}\alpha})^2\leq \eta \leq(1+\sqrt{\lambda_{\min}\alpha})^2\]
Observe that if we choose the momentum parameter $\eta = \left(1-\sqrt{\lambda_{\min}\alpha}\right)^2$, then $P$ has two repeated roots since $\sqrt{(\lambda_{i}\alpha - 1 - \eta)^2 - 4\eta} = 0$. To ensure that $P$ does not have any repeated root we choose the momentum parameter $\eta = \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^2$. Therefore, $\rho(P) = \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)$.
Using Lemma \ref{norm_upper_bound} and the fact that $A$ is diagonalizable, we get
\[\mathbb{E}[\|\Tilde{X}_{n}\|^2] \leq \hat{C}^2\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2n} \Lambda + \alpha^2\hat{C}^2K\sum_{i=0}^{n-1}\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2(n-1-i)}(1 + \mathbb{E}[\|\Tilde{X}_{i}\|^{2}])\]

However, unlike in Case-1, here the constant $\hat{C}$ is not independent of $\alpha$ and $\lambda_{\min}$. The following lemma specifies an upper bound on $\hat{C}$.

\begin{lemma}
\label{c_hat_lemma}
\(\hat{C} \leq C\frac{5}{\sqrt{\alpha\lambda_{\min}}}\), where $C$ is as defined in \eqref{C_thm1}.
\end{lemma}
\begin{proof}
    See Appendix \ref{C2}
\end{proof}
We define the sequence $\{V_{n}\}$ as follows
\[V_{n} = \hat{C}^2\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2n} \Lambda + \alpha^2\hat{C}^2K\sum_{i=0}^{n-1}\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2(n-1-i)}(1 + V_{i})\]
Observe that $\mathbb{E}[\|\Tilde{X}_{n}\|^2] \leq V_{n}$, and that $\{V_{k}\}$ satisfies 
\begin{gather*}
    V_{k+1} = \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2}V_{k} + \hat{C}^{2}K\alpha^{2}(1 + V_{k}); \quad
    V_{0} = \hat{C}^{2}\Lambda
\end{gather*}
Therefore, we have
\[V_{k+1} = \left(\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2} + \hat{C}^{2}K\alpha^{2}\right)V_{k} + \hat{C}^{2}K\alpha^{2}\]
To ensure that $\left(\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2} + \hat{C}^{2}K\alpha^{2}\right) \leq \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2}$ we need to choose $\alpha$ such that
\[1 + \frac{\lambda_{\min}\alpha}{4} - \sqrt{\lambda_{\min}\alpha} + \hat{C}^{2}K\alpha^{2} \leq 1 + \frac{\lambda_{\min}\alpha}{16} - \frac{\sqrt{\lambda_{\min}\alpha}}{2} \]
\[\mbox{or \quad } \frac{3\sqrt{\alpha}\lambda_{\min}}{16} + \hat{C}^2K\alpha^{\frac{3}{2}} \leq \frac{\sqrt{\lambda_{\min}}}{2}\]
Next, using Lemma \ref{c_hat_lemma}, the above can be ensured by choosing $\alpha$ such that
\[\frac{3\sqrt{\alpha}\lambda_{\min}}{16} + C^2\frac{25}{\alpha\lambda_{\min}}K\alpha^{\frac{3}{2}} \leq \frac{\sqrt{\lambda_{\min}}}{2}\]
\[\mbox{ or } \alpha \leq \left(\frac{\sqrt{\lambda_{\min}}}{\frac{3}{8}\lambda_{\min} + \frac{50C^{2}K}{\lambda_{\min}}}\right)^{2}\]

The sequence $\{V_{k}\}$ then satisfies the recursion
\begin{equation*}
    \begin{split}
        V_{k+1} & \leq \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2}V_{k} + \hat{C}^{2}K\alpha^{2}
    \end{split}
\end{equation*}
Unrolling the recursion, we get
\begin{equation*}
    \begin{split}
         V_{n} & \leq \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2n}V_{0} + \hat{C}^{2}K\alpha^{2} \sum_{i=0}^{n-1} \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2i}\\
         & \leq \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2n}V_{0} + \hat{C}^{2}K\alpha^{2} \frac{1}{1-\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2}}\\
         & \leq \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2n}V_{0} + \hat{C}^{2}K\alpha^{2} \frac{4}{\sqrt{\alpha\lambda_{\min}}}
    \end{split}
\end{equation*}

Further it follows from $\alpha \leq \left(\frac{2}{\sqrt{\lambda_{\min}}+\sqrt{\lambda_{\max}}}\right)^2$ that $\alpha \leq \frac{1}{\lambda_{\min}}$ and  $\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{4}\right)^{2} \leq e^{-\frac{\sqrt{\lambda_{\min}\alpha}}{2}}$.
\[V_{n} \leq  e^{-n\frac{\sqrt{\lambda_{\min}\alpha}}{2}}\hat{C}^{2}\Lambda + \frac{4\alpha^{2}\hat{C}^{2}K}{\sqrt{\alpha\lambda_{\min}}}
\]
Again using Lemma \ref{c_hat_lemma},
\[V_{n} \leq  e^{-n\frac{\sqrt{\lambda_{\min}\alpha}}{2}}\frac{25 C^{2}}{\lambda_{\min}\alpha}\Lambda + \alpha^{2}\frac{100 C^{2}K}{\left(\lambda_{\min}\alpha\right)^{3/2}}
\]
Observe that
\[\frac{e^{-n\frac{\sqrt{\lambda_{\min}\alpha}}{2}}}{\lambda_{\min}\alpha} \leq e^{-n\frac{\sqrt{\lambda_{\min}\alpha}}{4}}\]
\[\mbox{ for } n \geq \frac{4}{\sqrt{\alpha\lambda_{\min}}}\log\left(\frac{1}{\lambda_{\min}\alpha}\right).\]
Let $n$ be as above. Then,
\[V_{n} \leq 25 C^{2}\Lambda e^{-\frac{n}{4}\sqrt{\lambda_{\min}\alpha}} + \sqrt{\alpha}\frac{100 C^{2}K}{\left(\lambda_{\min}\right)^{3/2}}\]

Choose $\alpha$ as below:
\[\alpha \leq \left(\frac{\epsilon(\lambda_{\min})^{3/2}}{200 C^2 K}\right)^2\]
Then,
\[\sqrt{\alpha}\frac{100 C^{2}K}{\left(\lambda_{\min}\right)^{3/2}} \leq \frac{\epsilon}{2} \Rightarrow \mathbb{E}[\|\Tilde{x}_{n}\|^2] \leq \mathbb{E}[\|\Tilde{X}_{n}\|^2] \leq V_{n} \leq \frac{\epsilon}{2} + \frac{\epsilon}{2} = \epsilon,\]
when $n$ satisfies
\[n \geq \frac{4}{\sqrt{\alpha\lambda_{\min}}}\log\left(\frac{50 C^2 \Lambda}{\epsilon}\right).\]
Combining this with the earlier requirement on $n$, it suffices to choose
\[n = \max\left(\frac{4}{\sqrt{\alpha\lambda_{\min}}}\log\left(\frac{50 C^2 \Lambda}{\epsilon}\right),\frac{4}{\sqrt{\alpha\lambda_{\min}}}\log\left(\frac{1}{\lambda_{\min}\alpha}\right)\right).\]

\textbf{Case-3: $\beta = 1$ (ASG)}

The proof progresses in a similar way as in Case-2. It is easy to see that the following relation holds with a modified definition of the matrix $P$.
\[
\mathbb{E}[\|\Tilde{X}_{n}\|^2] \leq \|P^n\|^2 \|\Tilde{X}_{0}\|^2 + \alpha^2K\sum_{i=0}^{n-1}\|P^{(n-1-i)}\|^2(1 + \mathbb{E}[\|\Tilde{X}_{i}\|^{2}]),
\]
where, 
\[ P \triangleq
\begin{pmatrix}
    I - \alpha A + \eta (I-\alpha A) & -\eta (I-\alpha A)   \\
    I & 0      
\end{pmatrix} 
\]
As in the previous case, we compute the eigenvalues of $P$. The characteristic equation of $P$ is given by:
\[
\det\left(
\begin{pmatrix}
    I - \alpha A + \eta (I-\alpha A) -\mu I & -\eta (I-\alpha A)  \\
    I & -\mu I      
\end{pmatrix} \right) = 0
\]
As in the previous case, this can be simplified to the following equation:
\begin{align*}
    \det(- \mu I + \alpha\mu A - \mu\eta(I-\alpha A) + \mu^2I + \eta (I-\alpha A))  = 0
\end{align*}
We now further simplify the characteristic equation of $P$ to a more convenient form:
\[\det\left(A - I \left(\frac{\mu+\eta\mu-\mu^2-\eta}{\alpha\mu+\alpha\mu\eta-\eta\alpha}\right)\right)=0\]
Progressing as in the previous case, we get that the eigenvalues of $P$ satisfy:
\begin{gather*}
    \mu = \frac{-(\lambda_{i}\alpha(1+\eta) - 1 - \eta) \pm \sqrt{(\lambda_{i}\alpha(1+\eta) - 1 - \eta)^2 - 4\eta(1-\alpha\lambda_{i})}}{2}
\end{gather*}
When $(\lambda_{i}\alpha(1+\eta) - 1 - \eta)^2 - 4\eta(1-\alpha\lambda_{i}) \leq 0$, we have that
\[|\mu| = \frac{1}{2}\left(\sqrt{(\lambda_{i}\alpha(1+\eta) - 1 - \eta)^2 - (\lambda_{i}\alpha(1+\eta) - 1 - \eta)^2 + 4\eta (1-\alpha\lambda_{i})} \right)= \sqrt{\eta(1-\alpha\lambda_{i})}\]
This implies that,
\begin{equation}
    \label{rho(P)ASG}
    \rho(P) = \sqrt{\eta(1-\alpha\lambda_{\min})}
\end{equation}
To ensure that $(\lambda_{i}\alpha(1+\eta) - 1 - \eta)^2 - 4\eta(1-\alpha\lambda_{i}) \leq 0$, we must have
\[\eta^2(1-\alpha\lambda_{i})^2 - 2\eta(1-\alpha^2\lambda_{i}^2) + (1-\alpha\lambda_{i})^2 \leq 0\]
We assume $\alpha\leq \frac{1}{\lambda_{\max}}$ and therefore, $(1 - \alpha\lambda_{i}) \geq 0$ holds for all $i$. Using this, the above relation simplifies to:
\[\eta^2(1-\alpha\lambda_{i}) - 2\eta(1+\alpha\lambda_{i}) + (1-\alpha\lambda_{i}) \leq 0\]
For the above to hold, we must have that
\[\frac{2(1+\alpha\lambda_{i}) - 4\sqrt{\alpha\lambda_{i}}}{2(1-\alpha\lambda_{i})}\leq \eta \leq \frac{2(1+\alpha\lambda_{i}) + 4\sqrt{\alpha\lambda_{i}}}{2(1-\alpha\lambda_{i})}\]
\[\frac{(1-\sqrt{\alpha\lambda_{i}})^2}{(1-\alpha\lambda_{i})}\leq \eta \leq \frac{(1+\sqrt{\alpha\lambda_{i}})^2}{(1-\alpha\lambda_{i})}\]
The above must hold $\forall i$ and therefore we choose $\eta$ as:
\[\frac{(1-\sqrt{\alpha\lambda_{\min}})^2}{(1-\alpha\lambda_{\min})}\leq \eta \leq \frac{(1+\sqrt{\alpha\lambda_{\min}})^2}{(1-\alpha\lambda_{\min})}\]
As in Case-2, if we choose the momentum parameter $\displaystyle\eta = \frac{(1-\sqrt{\lambda_{\min}\alpha})^2}{(1-\alpha\lambda_{\min})}$, then $P$ has two repeated roots. To ensure that $P$ does not have any repeated root we choose the momentum parameter $$\displaystyle\eta = \frac{\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^2}{(1-\alpha\lambda_{\min})}.$$
Using \eqref{rho(P)ASG}, we get
$\rho(P) = \left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)$ which is same as in Case-2 and therefore we have:
\[\mathbb{E}[\|\Tilde{X}_{n}\|^2] \leq \tilde{C}^2\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2n} \Lambda + \alpha^2\Tilde{C}^2K\sum_{i=0}^{n-1}\left(1-\frac{\sqrt{\lambda_{\min}\alpha}}{2}\right)^{2(n-1-i)}(1 + \mathbb{E}[\|\Tilde{X}_{i}\|^{2}]).\]
The above expression is the same as that in Case-2 except for the term $\Tilde{C}$. Since the matrix $P$ is different when $\beta = 1$, $\hat{C}$ in Case-2 need not be equal to $\Tilde{C}$ in Case-3.
However, we next show that $\Tilde{C}$ follows the exact same upper bound as in Case-2, Lemma~\ref{c_hat_lemma}. Towards this, we have the following lemma:
\begin{lemma}
\label{c_tilde_lemma}
\(\Tilde{C} \leq C\frac{5}{\sqrt{\alpha\lambda_{\min}}}\), where $C$ is as defined in \eqref{C_thm1}.
\end{lemma}
\begin{proof}
    See Appendix \ref{C3}
\end{proof}
Thereafter, we can proceed exactly as in case-2 to prove the theorem.






\section{Proof of Auxiliary Lemmas}
\subsection{Proof of Lemma~\ref{phi-lemma}}

We begin the proof by defining $\phi(n)\triangleq \frac{\sin(n+1)\omega}{\sin \omega} - \frac{\sin n\omega}{\sin \omega}$. We now state and prove a few intermediate results below.
\begin{enumerate}
    \item $\phi(n)$ is periodic with period $\frac{2 \pi}{\omega}$: This follows by observing that $\phi(n+\frac{2 \pi}{\omega})=\frac{\sin((n+1)\omega+2 \pi)}{\sin \omega} - \frac{\sin (n\omega+2\pi)}{\sin \omega}=\phi(n)$. 

    \item If $n \in [0, \frac{\pi}{6 \omega}]$, then $\sin (n \omega )$ and $\sin (n+1) \omega \geq 0$: If $n \in [0, \frac{\pi}{6 \omega}]$, then $n \omega \in [0,\frac{\pi}{6}]$. This implies $\sin(n \omega) \geq 0$. Since $n \geq 1$, we have $\omega \leq n \omega \leq \frac{\pi}{6}$. This implies that $(n+1)\omega \in [0, \frac{\pi}{3}]$, and we get that $\sin (n+1) \omega \geq 0$.
    
    \item  If $n \in [0,\frac{\pi}{6\omega}]$, then $\phi(n) \geq \frac{1}{2}$: We have 
    \begin{align*}
        \frac{\sin(n+1)\omega-\sin n\omega}{\sin \omega} = \frac{2\cos(n+\frac{1}{2})\omega \sin \frac{\omega}{2}}{\sin \omega}.
    \end{align*}

    Note that 
\begin{align*}
    \frac{2 \sin \frac{\omega}{2}}{\sin \omega} = \frac{2 \sin \frac{\omega}{2}}{2 \sin \frac{\omega}{2} \cos \frac{\omega}{2}} =  \frac{1}{\cos \frac{\omega}{2}} \geq 1.
\end{align*}
    
    Moreover, since $n \geq 1$, we have $0 \leq (n+\frac{1}{2})\omega \leq (n+\frac{n}{2})\omega \leq \frac{\pi}{6}+\frac{\pi}{12} \leq \frac{\pi}{3}$. Thus, $\cos(n +\frac{1}{2})\omega \geq \cos \frac{\pi}{3} = \frac{1}{2}$. Combining this with the two displays above yields $\phi(n) \geq \frac{1}{2}$.

    
\end{enumerate}
    
Before proceeding, we define $\bar{n} \triangleq \frac{K}{64\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1$. We now consider two cases: $\bar{n} \geq \frac{2\pi}{\omega}$ and $\bar{n} < \frac{2\pi}{\omega}$.


In the first case, $\bar{n} \in [\frac{2m\pi}{\omega},\frac{2(m+1)\pi}{\omega}]$, for some integer $m \geq 1$. Then, $\frac{\bar{n}}{12} \leq \frac{2(m+1)\pi}{12 \omega} \leq  \frac{2m\pi}{\omega}$. Setting $n_0 = \frac{2m\pi}{\omega}$, it follows that 
\begin{align*}
    n_0 = \frac{2m\pi}{\omega} \in \left[\frac{\bar{n}}{12},\bar{n}\right] \subseteq \left[ \frac{K}{768\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1,\frac{K}{64\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1\right]
\end{align*}
and $\phi(n_0)=\phi(\frac{2m\pi}{\omega})=\phi(0)$. Since $0 \in [0,\frac{\pi}{6\omega}]$, the lemma follows from points 1 and 2.

In the second case, we have $\bar{n} < \frac{2\pi}{\omega}$, and consequently $0 \leq \frac{\bar{n}}{12}<\frac{\pi}{6 \omega}$. Setting $n_0=\frac{\bar{n}}{12}$, we see that 
\begin{align*}
    n_0 \in \left[ \frac{K}{768\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1,\frac{K}{64\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1\right]
\end{align*}
and 
$n_0 \in [0,\frac{\pi}{6 \omega}]$. Thus, the lemma follows from points 1 and 2.

To summarise, we set
\begin{align}
\label{n0-expression}
    n_0 &=
    \begin{cases}
    \frac{\bar{n}}{12},&\text{ when }\bar{n} < \frac{2\pi}{\omega}\\
    \frac{2 m \pi}{\omega}\text{, where }m= \left\lfloor \frac{\bar{n} \omega}{2\pi} \right \rfloor, &\text{ otherwise}
    \end{cases}\\
    &\mbox{ where, } \bar{n} = \frac{K}{64\epsilon \lambda_{i}^2}\log\left(\frac{\tilde{x}_0^2}{16 \epsilon}\right)-1.\nonumber
\end{align}
and showed that it satisfies the lemma. 


\subsection{Proof of Lemma \ref{norm_upper_bound}}
\label{Lemma5_proof}
As in \citet{foucart}, we first construct a matrix norm $\vertiii{\cdot}$ such that $\vertiii{M} = \rho(M) + \delta$. Consider the Jordan canonical form of $M$
\begin{center}
    \[M = S\begin{pmatrix}
        J_{n_{1}}(\lambda_{1}(M)) & 0 &\ldots & 0\\
        0 & J_{n_{2}}(\lambda_{2}(M)) & \ddots & \vdots\\
        \vdots & \ddots & \ddots & 0 \\
        0 & \ldots & 0 & J_{n_{k}}(\lambda_{k}(M))
    \end{pmatrix} S^{-1}\]
\end{center}
We define 
\begin{center}
    \[D(\delta) =
    \begin{pmatrix}
        D_{n_{1}}(\delta) & 0 &\ldots & 0\\
        0 & D_{n_{2}}(\delta) & \ddots & \vdots\\
        \vdots & \ddots & \ddots & 0 \\
        0 & \ldots & 0 & D_{n_{k}}(\delta)
    \end{pmatrix},\] where \[D_{j}(\delta) = 
    \begin{pmatrix}
        \delta & 0 & \ldots & 0\\
        0 & \delta^{2} & \ddots & \vdots\\
        \vdots & \ddots & \ddots & 0 \\
        0 & \ldots & 0 & \delta^{j}
    \end{pmatrix}\]
\end{center}
Therefore,
\begin{center}
    \[D(\frac{1}{\delta})S^{-1}MSD(\delta) = 
    \begin{pmatrix}
        B_{n_{1}}(\lambda_{1}(M),\delta) & 0 &\ldots & 0\\
        0 & B_{n_{2}}(\lambda_{2}(M),\delta) & \ddots & \vdots\\
        \vdots & \ddots & \ddots & 0 \\
        0 & \ldots & 0 & B_{n_{k}}(\lambda_{k}(M),\delta)
    \end{pmatrix}\]
\end{center}
where,
\begin{center}
    \[B_{i}(\lambda,\delta) = D_{i}(\frac{1}{\delta})J_{i}(\lambda)D_{i}(\delta) = 
    \begin{pmatrix}
        \lambda & \delta & 0 &\ldots & 0\\
        0 & \lambda & \delta & \ddots & \vdots\\
        0 & \ddots & \ddots & \ddots & 0 \\
        \vdots & \ddots & \ddots & \lambda & \delta \\
        0 & \ldots & 0 & 0 & \lambda
    \end{pmatrix}\]
\end{center}
We define the matrix norm $\vertiii{\cdot}$ as
\[\vertiii{M} \triangleq \|D(\frac{1}{\delta})S^{-1}MSD(\delta)\|_{1}\]
where $\|\cdot\|_{1}$ is the matrix norm induced by the vector $L_{1}$-norm. Using the fact that $\|M\|_{1} = \max_{j\in[1:d]}\sum_{i=1}^{d}|m_{i,j}|$, where $m_{i,j}$ is the $i,j$-th entry of $M$, we have
\[\vertiii{M} = \max_{j\in [1:d]}(|\lambda_{j}| + \delta) = \rho(M) + \delta.\]
\[\mbox{ and }\vertiii{M^{n}} \leq \vertiii{M}^{n} \leq (\rho(M)+\delta)^{n}\]
It can be easily seen that $\|M\|_{1} \geq \frac{1}{\sqrt{d}}\|M\|_{2}$. Therefore it follows that
\begin{equation*}
    \begin{split}
        \vertiii{M} & = \|D\left(\frac{1}{\delta}\right)S^{-1}MSD(\delta)\|_{1}\\
        & \geq \frac{1}{\sqrt{d}} \|D\left(\frac{1}{\delta}\right)S^{-1}MSD(\delta)\|_{2}\\
        &\geq \frac{1}{\sqrt{d}} \sigma_{\min}\left(D\left(\frac{1}{\delta}\right)\right)\sigma_{\min}(S^{-1})\|M\|_{2}\sigma_{\min}(S)\sigma_{\min}(D(\delta))
    \end{split}
\end{equation*}
where $\sigma_{\min}(\cdot)$ denotes the smallest singular value and we have used the fact that $\|AB\| \geq \sigma_{\min}(A)\|B\|$ repeatedly. For $\delta < 1$, and $r$ defined as the size of largest Jordan block of $M$ \[\sigma_{\min}\left(D\left(\frac{1}{\delta}\right)\right)\sigma_{\min}\left(D\left(\delta\right)\right) = \delta^{r}\frac{1}{\delta} = \delta^{r-1}.\]
We conclude the first half of the lemma by defining $C_{\delta}$ as $\frac{\sqrt{d}}{\delta^{r-1}\sigma_{\min}(S)\sigma_{\min}(S^{-1})}$. 


In the case that the matrix is diagonalizable $C_\delta$ defined above becomes independent of $\delta.$ Moreover, in this case, each Jordan block is $J_{n_i}(\lambda_{i}(M)) = [\lambda_{i}(M)]$ and the second half follows.

\subsection{Proof of Lemma \ref{c_hat_lemma}}
\label{C2}
 
Let $S$ be the matrix in Jordan decomposition of $A$, i.e., $SAS^{-1} = D$, where $D$ is a diagonal matrix with eigenvalues of $A$ as its diagonal elements. Then,

\[
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} P
\begin{pmatrix}
    S^{-1} & 0 \\
    0 & S^{-1}
\end{pmatrix} = 
\begin{pmatrix}
    I - \alpha S A S^{-1}+\eta I & -\eta SS^{-1}\\
    SS^{-1} & 0
\end{pmatrix} = 
\begin{pmatrix}
    I - \alpha D +\eta I & -\eta I\\
    I & 0
\end{pmatrix}, 
\]
where $0$ denotes the zero matrix of dimension $d \times d$. For ease of exposition, suppose $A$ is a $2 \times 2$ matrix with eigenvalues $\lambda_1$ and $\lambda_{2}$. Then 
\[
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} P
\begin{pmatrix}
    S^{-1} & 0 \\
    0 & S^{-1}
\end{pmatrix} = 
\begin{pmatrix}
    1 + \eta - \alpha \lambda_{1} & 0 & -\eta & 0\\
    0 & 1 + \eta - \alpha \lambda_{2} & 0 & -\eta\\
    1 & 0 & 0 & 0\\
    0 & 1 & 0 & 0
\end{pmatrix}
\]
Suppose $E$ is the elementary matrix associated with the exchange of row-2 and row-3. 
It is easy to see that $E=E^T=E^{-1}$ and that,
\[
E
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} P
\begin{pmatrix}
    S^{-1} & 0 \\
    0 & S^{-1}
\end{pmatrix}E^{-1} = 
\begin{pmatrix}
    1 + \eta - \alpha \lambda_{1} & -\eta & 0 & 0\\
    1 & 0 & 0 & 0\\
    0 & 0 & 1 + \eta - \alpha \lambda_{2} & -\eta\\
    0 & 0 & 1 & 0
\end{pmatrix}= 
\begin{pmatrix}
    B_{1} & 0 \\
    0 & B_{2}
\end{pmatrix}
\]
where, 
\[B_{i} = 
\begin{pmatrix}
    1 + \eta - \alpha \lambda_{i} & -\eta\\
    1 & 0
\end{pmatrix}
\]
Suppose, \(X_{i} = \begin{pmatrix}
    x_{i,1} & x_{i,2}\\
    x_{i,3} & x_{i,4}
\end{pmatrix}\) and,
\[
X_{i}^{-1}B_{i}X_{i} = 
\begin{pmatrix}
    \mu_{i,+} & 0\\
    0 & \mu_{i,-}
\end{pmatrix}.
\]
Here $\mu_{i,+} = \frac{(1 - \alpha\lambda_{i} + \eta) + \sqrt{\Delta_{i}}}{2}$ and $\mu_{i,-} =  \frac{(1 - \alpha\lambda_{i} + \eta) - \sqrt{\Delta_{i}}}{2}$, where $\Delta_i = (1+\eta - \alpha\lambda_{i})^2 - 4\eta$. Solving the above equation we get, 
\[
X_{i} = \begin{pmatrix}
    x_{i,3}\mu_{i,+} & x_{i,4}\mu_{i,-}\\
    x_{i,3} & x_{i,4}
\end{pmatrix}
\]
Setting $x_{i,3} = x_{i,4} = 1$,
\[
X_{i} = \begin{pmatrix}
    \mu_{i,+} & \mu_{i,-}\\
    1 & 1
\end{pmatrix} \mbox{ and }
X_{i}^{-1} = 
\frac{1}{\mu_{i,+} - \mu_{i,-}}
\begin{pmatrix}
    1 & -\mu_{i,-}\\
    -1 & \mu_{i,+}\\
\end{pmatrix} 
\]
For a general $d\times d$ matrix $A$, using a similar procedure, it can be shown that 
\[
\begin{pmatrix}
    X_{1}^{-1} & 0 & 0\\
    0 & \ddots & \vdots\\
    0 & \cdots &X_{d}^{-1}\\
\end{pmatrix}
E_{2d\times 2d}
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix} P
\begin{pmatrix}
    S^{-1} & 0 \\
    0 & S^{-1}
\end{pmatrix}E_{2d\times 2d}^{-1} 
\begin{pmatrix}
    X_{1} & 0 & 0\\
    0 & \ddots & \vdots\\
    0 & \cdots &X_{d}\\
\end{pmatrix} 
\]
\[= 
\begin{pmatrix}
    \mu_{1,+} &0 &0 &\cdots& 0\\
    0 &\mu_{1,-}& 0& \cdots& 0\\
    \vdots& 0 &\ddots& \cdots& 0\\
    0& \cdots& 0& \mu_{d,+}& 0\\
    0& \cdots& 0& 0&\mu_{d,-}
\end{pmatrix}
\]
where $E_{2d\times 2d}$ and $E_{2d\times 2d}^{-1}$ are permutation matrices that transform the matrix between them to a block diagonal matrix.
Let \(\hat{S} = \begin{pmatrix}
    X_{1}^{-1} & 0 & 0\\
    0 & \ddots & \vdots\\
    0 & \cdots &X_{d}^{-1}\\
\end{pmatrix}
E_{2d\times 2d}
\begin{pmatrix}
    S & 0 \\
    0 & S
\end{pmatrix}\).
Therefore, 
\[
\hat{C} = \frac{\sqrt{d}}{\sigma_{\min}(\hat{S}) \sigma_{\min}(\hat{S}^{-1})}
\]
In order to simplify the expression for $\hat C$, we require the following lemma:
\begin{lemma}
\label{singular-val}
For all invertible matrices $M$ of order $d \times d$, the following identity holds:
 \[\frac{1}{\sigma_{d}(M)\sigma_{d}(M^{-1})} = \sigma_{1}(M)\sigma_{1}(M^{-1}),\]
 where $\sigma_1(X) \geq \cdots \geq \sigma_d(X)$ denote the singular values of $X$.
\end{lemma}

\begin{proof}
By definition, $\sigma_1^2(M) \geq \cdots \geq \sigma_d^2(M)$ are the eigenvalues of $M^{T}M$. Then, the eigenvalues of $(M^TM)^{-1}=M^{-1}(M^{-1})^T$ are $\frac{1}{\sigma_d(M)^2} \geq \cdots \geq \frac{1}{\sigma_1(M)^2}.$ Note that $M^{-1}(M^{-1})^T$ and $(M^{-1})^TM^{-1}$ are similar since $(M^{-1})^TM^{-1}=M(M^{-1}(M^{-1})^T)M^{-1}$. Consequently, 
$M^{-1}(M^{-1})^{T}$ and $(M^{-1})^{T}M^{-1}$ have the same set of eigenvalues and we find that the singular values of $M^{-1}$ are $\frac{1}{\sigma_d(M)} \geq \cdots \geq \frac{1}{\sigma_1(M)}.$

Thus,
$$\sigma_1(M^{-1})=\frac{1}{\sigma_d(M)},$$
$$\sigma_d(M^{-1})=\frac{1}{\sigma_1(M)}$$
and the result follows.
\end{proof}


Using Lemma \ref{singular-val},
we have 
\begin{equation*}
    \begin{split}
        \hat{C} &= \sqrt{d}\sigma_{\max}(\hat{S}) \sigma_{\max}(\hat{S}^{-1})\\
        &\leq \sqrt{d} \max_{i}\{\sigma_{\max}(X_{i})\}\sigma_{\max}(S)\sigma_{\max}(S^{-1})\max_{i}\{\sigma_{\max}(X_{i}^{-1})\}\\
        & = C \max_{i}\{\sigma_{\max}(X_{i})\}\max_{i}\{\sigma_{\max}(X_{i}^{-1})\},
    \end{split}
\end{equation*}
where $C$ is as defined in \eqref{C_thm1}. Now, for any matrix $X$ of order $d \times d$, $$\sigma_{\max}(X)=\|X\|_2\leq \|X\|_F=(\sum_{i,j}|x_{ij}|^2)^{1/2} \leq d\max_{i,j}|x_{ij}|,$$
where $\|\cdot\|_F$ denotes the Frobenius norm. Using the above inequality, $\sigma_{\max}(X_{i}) \leq 2$ and $\sigma_{\max}(X_{i}^{-1}) \leq \frac{2}{|\mu_{i,+}-\mu_{i,-}|}= \frac{2}{|\sqrt{\Delta_{i}}|}$. Next we lower bound $|\sqrt{\Delta_{i}}|$.

For a complex number $z$, observe that $|\sqrt{z}| = \sqrt{|z|}$. Now,
\begin{align}
        \label{delta_i}
        |\Delta_{i}| &= 4\eta - (1+\eta-\alpha\lambda_{i})^2\\
        &\geq  4\eta - (1+\eta-\alpha\lambda_{\min})^2 \nonumber
\end{align}

Using $\eta = \left(1 - \frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2}$,
\begin{equation*}
    \begin{split}
        |\Delta_{i}| & \geq 4\eta - \left(1+ 1 + \frac{\alpha\lambda_{\min}}{4} - \sqrt{\alpha\lambda_{\min}} -\alpha\lambda_{\min}\right)^{2}\\
        & = 4\eta - (2\sqrt{\eta} - \frac{3\alpha}{4}\lambda_{\min})^{2}\\
        & = 4 \left[ (\sqrt{\eta})^{2} - \left(\sqrt{\eta} - \frac{3\alpha\lambda_{\min}}{8}\right)^{2}  \right]\\
        & = \frac{3\alpha\lambda_{\min}}{2}\left(2\sqrt{\eta} - \frac{3\alpha\lambda_{\min}}{8}\right)\\
        & = \frac{3\alpha\lambda_{\min}}{2} \left(2 - \sqrt{\alpha\lambda_{\min}} - \frac{3\alpha\lambda_{\min}}{8}\right)\\
        & \geq \frac{3\alpha\lambda_{\min}}{2}\left(2 - 1 - \frac{3}{8}\right) \\
        & = \frac{15}{16}\alpha\lambda_{\min}
    \end{split}
\end{equation*}
Using this, we get $\max_{i}\{\sigma_{\max}(X_{i}^{-1})\} \leq 2\sqrt{\frac{16}{15}}\frac{1}{\sqrt{\alpha\lambda_{\min}}}$, and therefore
\(\hat{C} \leq \frac{5C}{\sqrt{\alpha\lambda_{\min}}}\).

\subsection{Proof of Lemma \ref{c_tilde_lemma}}
\label{C3}
The proof of the lemma can proceed exactly as in the proof of Lemma~\ref{c_hat_lemma}. The only difference is that one needs to lower bound $|\Delta_i| = 4\eta(1-\alpha\lambda_{i}) - (\alpha(1+\eta)\lambda_{i}-1-\eta)^2$ for $\eta = \frac{\left(1 - \frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2}}{(1-\alpha\lambda_{i})}.$ We define $\eta' = \left(1 - \frac{\sqrt{\alpha\lambda_{\min}}}{2}\right)^{2}$ and therefore $\eta = \frac{\eta'}{(1-\alpha\lambda_{i})}$. Using this, we get
\begin{align*}
    |\Delta_{i}| &= 4\eta(1-\alpha\lambda_{i}) - (1+\eta-\alpha(1+\eta)\lambda_{i})^2\\
    &= 4\eta' - (1+\eta-\alpha\lambda_{i}(1+\eta))^2\\
    &= 4\eta' - \Big((1+\eta)(1-\alpha\lambda_{i})\Big)^2\\
    &= 4\eta' - \bigg(\Big(1+\frac{\eta'}{1-\alpha\lambda_{i}}\Big)(1-\alpha\lambda_{i})\bigg)^2\\
    &= 4\eta' - (1+\eta'-\alpha\lambda_{i})^2
\end{align*}
The expression for $|\Delta_{i}|$ is exactly the same as in the proof of Lemma~\ref{c_hat_lemma} (cf. \eqref{delta_i}). Thereafter, we proceed as in the proof of Lemma~\ref{c_hat_lemma} to prove the claim.

\section{Details of Simulation}
\label{s:Simulation.Details}
Here we provide details of the simulation in Figure \ref{figure}. The MSE at each iteration $n$ is defined as the average of the errors obtained, $\|x_n-x^*\|^2$, over the number of runs. We consider the same objective function and noise distribution for each sub-figure and plot the MSE obtained for SGD, SHB and ASG against the number of iterations. The only difference between the sub-figures is the choice of stepsize and momentum parameters.

We consider a quadratic optimization problem of the form $\frac{1}{2}x^TAx-b^Tx+c$, where $$A=\begin{pmatrix}
    1 & 0 \\
    0 & 50
\end{pmatrix}, b=\begin{pmatrix}
    0\\
    0
\end{pmatrix}, c=0.$$ 
The solution to the above problem is $x^*=\begin{pmatrix}
    0\\
    0
\end{pmatrix}$. To simulate stochastic gradients, we add Gaussian noise and the gradient at $x$ is given by $Ax-b+\nu$, where $\nu \sim \mathcal{N}(0,1500I)$. We run SGD, SHB and ASG on the above problem, starting at $[600,600]$, averaged over 50 runs. In particular, we run SHB and ASG in all three sub-figures with momentum parameter $\eta$ as 0.9025 and 0.9048, respectively. The stepsize choices for each case are provided below Figure \ref{figure}. Due to space constraints, the y-axes are scaled down by a factor of 10000. 



\end{document}
