\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{amsmath,amssymb,amsfonts,amsthm,mathrsfs}
\usepackage[page,title,titletoc,header]{appendix}
\usepackage{enumerate}
\usepackage{bm}
%\usepackage[notref,notcite]{showkeys}
\usepackage{indentfirst}
\usepackage[mathscr]{eucal}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{threeparttable}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}[section]
\newtheorem{Exa}{Example}[section]
\newtheorem{assumption}{Assumption}
\newcommand{\creflastconjunction}{, and~}
\newcommand{\mR}{\mathbb{R}}
\newcommand{\mN}{\mathbb{N}}
\newcommand{\mE}{\mathbb{E}}
%\newcommand{\mS}{\mathbb{S}}
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
\newcommand{\tr}{\operatorname{tr}}
\newcommand{\teta}{\tilde{\eta}}
\newcommand{\heta}{\hat{\eta}}
\newcommand{\ep}{\epsilon}
\newcommand{\gsi}{g_{s,i}}
\newcommand{\bgsi}{\bar{g}_{s,i}}
\newcommand{\xsi}{x_{s,i}}
\newcommand{\bsi}{b_{s,i}}
\newcommand{\asi}{a_{s,i}}
\newcommand{\xisi}{\xi_{s,i}}
\newcommand{\msi}{m_{s,i}}
\newcommand{\hmsi}{\hat{m}_{s,i}}
\newcommand{\vsi}{v_{s,i}}
\newcommand{\siginf}{\sigma_{\infty}}
\newcommand{\vx}{\bm{x}}
\newcommand{\vy}{\bm{y}}
\newcommand{\vm}{\bm{m}}
\newcommand{\vv}{\bm{v}}
\newcommand{\tvv}{\tilde{\bm{v}}}
\newcommand{\vb}{\bm{b}}
\newcommand{\va}{\bm{a}}
\newcommand{\vep}{\boldsymbol{\epsilon}}
\newcommand{\vg}{\bm{g}}
\newcommand{\vz}{\bm{z}}
\newcommand{\vp}{\bm{p}}
\newcommand{\vxi}{\boldsymbol{\xi}}
\newcommand{\sig}{\boldsymbol{\sigma}_0}
\newcommand{\sigg}{\boldsymbol{\sigma}_1}
\newcommand{\mG}{\mathcal{G}}
\newcommand{\mLx}{\mathcal{L}^{(x)}}
\newcommand{\mLy}{\mathcal{L}^{(y)}}
\newcommand{\tG}{\tilde{G}}
\newcommand{\mL}{\mathcal{L}}
\newcommand{\tC}{\tilde{C}}
\newcommand{\mH}{\mathcal{H}}
\newcommand{\mI}{\mathcal{I}}
\newcommand{\mJ}{\mathcal{J}}
\newcommand{\mM}{\mathscr{M}}
\newcommand{\bD}{\bar{\del}}
\newcommand{\delx}{\Delta^{(x)}}
\newcommand{\dely}{\Delta^{(y)}}
\newcommand{\mF}{\mathcal{F}}
\newcommand{\mO}{\mathcal{O}}
\newcommand{\del}{\Delta}
\newcommand{\tdel}{{\Delta}}
\newcommand{\lam}{\Lambda}
\newcommand{\tL}{\tilde{L}}
\newcommand{\tmG}{\tilde{\mG}}
\title{Revisiting Convergence of AdaGrad with Relaxed Assumptions}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[]{Yusu Hong}{}
\author[]{\href{mailto:<junhong@zju.edu.cn>}{Junhong Lin}}
% Add affiliations after the authors
\affil[]{%
    Zhejiang University
}
  
\begin{document}
\maketitle
\begin{abstract}
    In this study, we revisit the convergence of AdaGrad with momentum (covering AdaGrad as a special case) on non-convex smooth optimization problems. We consider a general noise model where the noise magnitude is controlled by the function value gap together with the gradient magnitude. This model encompasses a broad range of noises including bounded noise, sub-Gaussian noise, affine variance noise and the expected smoothness, and it has been shown to be more realistic in many practical applications. Our analysis yields a probabilistic convergence rate which, under the general noise, could reach $\tilde{\mathcal{O}}(1/\sqrt{T})$. This rate does not rely on prior knowledge of problem-parameters and could accelerate to $\tilde{\mathcal{O}}(1/T)$, where $T$ denotes the total number of iterations, when the noise parameters related to the function value gap and noise level are sufficiently small. The convergence rate thus matches the lower bound for stochastic first-order methods over non-convex smooth landscapes up to logarithmic terms \citep{arjevani2023lower}. We further derive a convergence bound for AdaGrad with momentum, considering the generalized smoothness where the local smoothness is controlled by a first-order function of the gradient norm.
\end{abstract}

\section{Introduction}
In recent years, AdaGrad \citep{duchi2011adaptive} and its variants have witnessed a large success in solving the following stochastic optimization problems:
\begin{align*}
    \min_{\vx \in \mR^d } f(\vx), \quad \text{where} \quad f(\vx) = \mE_{{\bm \zeta} }[f_{{\bm \zeta}}(\vx;{\bm \zeta})].
\end{align*}
Distinct from vanilla Stochastic Gradient Descent (SGD) \citep{robbins1951stochastic}, which typically requires smoothness or Lipschitz constants for tuning step-sizes, AdaGrad
applies an adaptive step-size with each coordinate satisfying that
\begin{align*}
    \eta_{t,i} = \frac{\eta}{\sqrt{\sum_{s=1}^t g_{s,i}^2}+\ep}, \quad \forall t \in \mN, i \in [d],
\end{align*}
where $\ep > 0$ is a constant and $\gsi$ denotes the $i$-th coordinate of the stochastic gradient $\vg_s$. This approach assigns larger step-sizes for infrequent features whose corresponding gradients are small, reminding learners of taking notice of those infrequent features. It also liberates the algorithm from the need for problem-parameters, which may be challenging to obtain in practical applications. Moreover, AdaGrad's efficiency has been empirically validated, especially in scenarios with sparse gradients \citep{duchi2011adaptive}.

Numerous works have studied the convergence of AdaGrad and its scalar version, AdaGrad-Norm \citep{duchi2011adaptive,streeter2010lesson}. \cite{duchi2011adaptive} first provided the convergence bound of AdaGrad on online convex optimization. In non-convex smooth scenario, \cite{ward2020adagrad} first obtained a convergence bound for AdaGrad-Norm without pre-tuning step-sizes, assuming bounded gradients and noises. \cite{alina2023high} proved the convergence for AdaGrad under coordinate-wise sub-Gaussian noise, discarding the bounded gradient assumption. 

Recently, several studies have proven AdaGrad-Norm's convergence under the affine variance noise, both in expectation \citep{faw2022power,wang2023convergence} and in high probability \citep{attia2023sgd}. The noise model assumes that the stochastic gradient $g(\vx),\forall \vx \in \mR^d$ satisfies that for some constants $B,C > 0$,
\begin{equation}\label{eq:affine}
    \begin{split}
         \mE &\|g(\vx)-\nabla f(\vx)\|^2 \le   B \|\nabla f(\vx)\|^2 +C  \\
        \text{or} \quad &\|g(\vx)-\nabla f(\vx)\|^2 \le   B \|\nabla f(\vx)\|^2 + C.  
    \end{split}
\end{equation}
This noise model, verified in machine learning applications with feature noise \citep{fuller2009measurement,khani2020feature}, and in robust linear regression \citep{xu2008robust}, offers a more realistic portrayal by allowing the noise norm to increase with the gradient norm, covering both bounded noise and sub-Gaussian noise. These studies not only provided a convergence rate of $\tilde{\mO}(1/\sqrt{T})$, but also addressed challenges posed by the entanglement of adaptive step-sizes and stochastic gradients, and the additional variance in \eqref{eq:affine}. However, to the best of our knowledge, none of the existing works has proved the convergence of vanilla AdaGrad under \eqref{eq:affine} without assuming bounded gradients. Moreover, the distinct step-size for each coordinate in AdaGrad, as opposed to a unified step-size for all coordinates in AdaGrad-Norm, brings more challenges when considering \eqref{eq:affine}.


In this paper, we provide a deep analysis framework and establish a probabilistic convergence bound for AdaGrad with heavy-ball style momentum, covering AdaGrad as a special case. More importantly, we consider a general noise model such that for some constants $A,B,C \ge 0$,
\begin{align}\label{eq:noise}
    \|g(\vx) - \nabla f(\vx)\|^2 \le A(f(\vx)-f^*) + B \|\nabla f(\vx)\|^2 + C,
\end{align}
where $f(\vx) \ge f^*,\forall \vx \in \mR^d$.
It is easy to verify that \eqref{eq:noise} is strictly weaker than the almost-sure affine variance noise in \eqref{eq:affine}, and thus weaker than the bounded noise or sub-Gaussian noise condition\footnote{For conciseness, we mainly consider the almost-sure version of this general noise model. Extending our high probability analysis from the almost-sure version to a sub-Gaussian version is easy, and the extension will be included in the appendix.}. Indeed, \eqref{eq:noise} could be regarded as an extension of \eqref{eq:affine} and the following expected smoothness condition \citep{gower2019sgd,grimmer2019convergence,wangjmlr2023convergence},
\begin{equation}\label{eq:expect_smooth}
    \begin{split}
        \mE &\|g(\vx)\|^2 \le A(f(\vx)-f^*) + B  \\
        \text{or} \quad &\|g(\vx)\|^2 \le A(f(\vx)-f^*) + B.
    \end{split}
\end{equation}
Existing research has studied SGD's convergence behavior under \eqref{eq:noise} with smooth objective functions, from both the asymptotic \citep{poljak1973pseudogradient} and the non-asymptotic \citep{khaled2022better} viewpoints.
More importantly, it has been shown that numerous practical stochastic gradient settings satisfy \eqref{eq:noise} but fall outside the scope of \eqref{eq:affine}, including commonly used perturbation, sub-sampling and compression \citep{khaled2022better}. However, the analysis for SGD could not be directly extended to AdaGrad due to the correlation of adaptive step-sizes and stochastic gradients, and the coordinate-wise performance in AdaGrad.

Finally, we apply our analysis framework to the $(L_0,L_1)$-smoothness where the local smoothness of $f$ satisfies that when $\|\vy-\vx\| \le 1/L_1$,
\begin{align}
    \|\nabla f(\vy) - \nabla f(\vx) \|\le (L_0 + L_1\|\nabla f(\vx)\|) \|\vy-\vx\|. \label{eq:general}
\end{align}
This assumption was proposed by \citet{zhang2020why} through empirical studies on language models and later verified in large language models, e.g., \citep{zhang2020improved,crawshaw2022robustness}. Condition \eqref{eq:general} generalizes the standard global smoothness condition and allows an unbounded smoothness parameter, bringing more challenges for the convergence analysis of adaptive methods. Previous works \citep{faw2023beyond,wang2023convergence} have derived convergence bounds for AdaGrad-Norm with \eqref{eq:affine} and \eqref{eq:general}. Also, prior knowledge of problem-parameters is necessary, as pointed out by the counterexamples in \citet{wang2023convergence}. However, the analysis for coordinate-wise AdaGrad is non-trivial and requires more delicate constructions, particularly when considering the weaker noise assumption in \eqref{eq:noise}.

We summarize our main contributions as follows. We also refer readers to the comparison of our results with existing works in Table~\ref{table} in the appendix.

\paragraph{Contribution}
\begin{itemize}
    \item We demonstrate the probabilistic convergence of AdaGrad with momentum on non-convex smooth optimization under a general noise assumption in \eqref{eq:noise}. For an $L$-smooth function $f$, we demonstrate that after $T$ iterations of the algorithm, with probability at least $1-\delta$, $\sum_{s=1}^T \|\nabla f(\vx_s)\|^2/T$ is bounded by
    \begin{align*}
         \mO\left( \frac{{\rm poly}\left(\log \frac{T}{\delta} \right)}{T} + \sqrt{\frac{(A+C){\rm poly}\left(\log \frac{T}{\delta} \right)}{T} }\right),
    \end{align*}
    which could also accelerate to an $\tilde{\mO}(1/T)$ rate when the noise parameters $A,C$ are sufficiently small.
    \item As direct corollaries, we also derive similar probabilistic convergence results of AdaGrad on non-convex smooth optimization with affine variance noise. More importantly, the convergence rate is optimal and adaptive to the noise level $C$ in \eqref{eq:affine}.
   \item We derive a convergence bound for AdaGrad with momentum considering \eqref{eq:noise} and \eqref{eq:general}. The rate is similar to that of the smooth case and is adaptive to the noise level as well, although it requires prior knowledge of problem-parameters to tune the step-sizes.
\end{itemize}

Our analysis relies on the descent lemma with telescoping, and the novel decomposition and estimations over the first-order term related to new proxy step-sizes that are used to decorrelate stochastic gradients and adaptive step-sizes in this new noise regime. We also prove that the function value gap as well as the gradient norm are controlled by the polynomial of $\log T$ along the optimization process.

The rest of the paper is organized as follows. The next section introduces some additional related works. Section~\ref{sec:preliminary} provides the problem setup and basic assumptions, and introduces AdaGrad with momentum. Section~\ref{sec:convergence} provides high probability convergence bounds for AdaGrad with momentum, and also for AdaGrad as direct corollaries. Section~\ref{sec:proof} provides proof details for the main results. Section~\ref{sec:general} presents the necessary introduction of the generalized smooth condition and the subsequent convergence result.
The missing proofs of the lemmas and of the convergence results under generalized smoothness are given in the appendix.
\section{Related works}
SGD and its adaptive variants have been a target of intense interest in the last decade. We refer to \citep{bottou2018optimization,ruder2016overview} for an overview. We limit our discussions to the most relevant literature in the sequel.

\paragraph{Convergence of AdaGrad} 
Numerous works have mainly studied the convergence of AdaGrad-Norm over non-convex smooth landscapes. \citet{li2019convergence} first proved the convergence for AdaGrad-Norm. However, they studied a variant with a delayed step-size that is independent of the current stochastic gradient and required knowledge of the smoothness parameter for tuning step-sizes. Getting rid of prior knowledge of problem-parameters, \citet{ward2020adagrad} relied on a novel proxy step-size technique and showed the convergence with a uniform bound on the stochastic gradients for vanilla AdaGrad-Norm. \cite{kavis2022high} and \cite{alina2023high} proved probabilistic convergence under the sub-Gaussian noise without relying on the bounded gradient assumption. In the case of the affine variance noise, \citet{faw2022power} provided the convergence bound. However, their rate is adaptive to the noise level only when $B \sim \mO(1/T)$.
\cite{wang2023convergence} relied on a distinct framework to improve the dependency on $T$ in the convergence rate of \citet{faw2022power} and achieved the adaptivity to the noise level without any restriction on $B$. Concurrently, \cite{attia2023sgd} deduced a probabilistic bound, using a novel induction argument to control the function value gap. Their result also adapted to the noise level without further requirement on $B$. \citet{alina2023on} formulated a convergence bound for AdaGrad-Norm and its accelerated version in the quasar-convex smooth setting.

The element-wise version of AdaGrad was first studied by \citet{duchi2011adaptive} on online convex optimization. In the non-convex smooth case, a line of works investigated AdaGrad with bounded gradients. \citet{zou2019sufficient} explored the convergence of AdaGrad with a heavy-ball or Nesterov style momentum. \cite{zhou2018convergence} also covered AdaGrad in their analysis, but their bound requires the summation of the gradients to be bounded.
\citet{defossez2020simple} studied AdaGrad with Adam-type momentum and improved the dependency on the momentum parameter $\beta$ to $\mO((1-\beta)^{-1})$. \cite{shen2023unified} introduced a weighted AdaGrad with unified momentum covering both heavy-ball and Nesterov’s acceleration. Recently, \citet{alina2023high}, a work mentioned before, derived a convergence bound under coordinate-wise sub-Gaussian noise, i.e., $g(\vx)_i - \nabla f(\vx)_i$ is sub-Gaussian for each $i\in [d]$, without requiring bounded gradients.

\paragraph{Convergence with affine variance noise}
We briefly summarize some works on the convergence of SGD or AdaGrad with \eqref{eq:affine} and its variants under the non-convex smooth landscape. \citet{bertsekas2000gradient} provided an almost-sure convergence result for SGD. From the non-asymptotic viewpoint, \citet{bottou2018optimization} derived a convergence bound for SGD of the form $\mO(1/T+\sqrt{C/T})$ when step-sizes are well tuned by the smoothness parameter, $B$ and $C$. They also pointed out that the extension is immediate from the bounded noise case \citep{ghadimi2013stochastic}.
The convergence of AdaGrad-Norm under \eqref{eq:affine} has been well studied by \citep{faw2022power,wang2023convergence,attia2023sgd} as mentioned before. \citet{faw2023beyond} further extended the analysis considering a generalized smooth condition. 
%\citet{wang2023convergence} obtained expected convergence rate for AdaGrad under a stronger coordinate-wise version of (3):
However, none of these existing works could prove the convergence of coordinate-wise version of AdaGrad under \eqref{eq:affine} or a weaker noise condition.

\paragraph{Convergence with the expected smoothness}
The expected smoothness condition was once applied for convex optimization such that for some constant $A > 0$,
\begin{align}\label{eq:expect_smooth_1}
    \mE [\|g(\vx) - g(\vx^*)\|^2 ] \le A(f(\vx)-f^*),
\end{align}
where $\vx^*$ denotes the global minimizer. Based on \eqref{eq:expect_smooth_1}, \cite{richtarik2020stochastic} relied on matrix analysis to bound the identities of expected iterates of SGD in the setting of stochastic reformulations of linear systems. \citet{gower2021stochastic} applied \eqref{eq:expect_smooth_1} to analyze the JacSketch method (a general form of SAGA) over strongly convex optimization. 

Since $\vx^*$ is ill-defined for non-convex optimization, \cite{gower2019sgd} then directly set $\mE[\|g(\vx^*)\|^2] = B$ and deduced the non-convex version of the expected smoothness in \eqref{eq:expect_smooth}, which aligns with the weak growth condition \citep{vaswani2019fast} when $B=0$. \cite{gower2019sgd} relied on \eqref{eq:expect_smooth} to analyze SGD over quasi-strongly convex optimization.
Independently, \citet{grimmer2019convergence} relied on \eqref{eq:expect_smooth} and developed a general framework for SGD equipped with projection operators
over convex non-smooth functions. \citet{wangjmlr2023convergence} also used \eqref{eq:expect_smooth} to derive a convergence bound for SGD using
bandwidth-based step-sizes.

Regarding the noise model in \eqref{eq:noise}, \citet{poljak1973pseudogradient} provided an asymptotic convergence bound for SGD  with smooth objective functions. Very recently, \cite{khaled2022better} derived a non-asymptotic convergence rate of $\mO(1/\sqrt{T})$ for SGD with non-convex smooth functions.

In conclusion, it is clear that \eqref{eq:noise} is weaker than the above conditions including \eqref{eq:affine} (when assuming the existence of $f^*$), \eqref{eq:expect_smooth} and \eqref{eq:expect_smooth_1}\footnote{We make the comparison assuming that all conditions are in their almost-sure form.}. Our result then shows that AdaGrad could find a stationary point under this mild noise assumption without prior knowledge of problem-parameters.

\paragraph{Convergence with generalized smoothness}
The generalized smooth condition \citep{zhang2020why} has been well studied under different algorithms, e.g., \citep{qian2021understanding,zhao2021convergence,reisizadeh2023variance,zhang2020improved,crawshaw2022robustness}. Considering AdaGrad and its variants, \cite{faw2023beyond} established a convergence bound for AdaGrad-Norm considering \eqref{eq:affine}. However, their result required $B < 1$. \cite{wang2023convergence} further tightened the dependency to the iteration number $T$ and got rid of restriction on $B$. 
% \paragraph{Contributions } We summarize our main contributions as follows.
% \begin{itemize}
%     \item We provide a high probability convergence bound for AdaGrad-Norm with heavy-ball momentum under ``sub-Gaussian affine" variance noise. The bound is of order $\tilde{\mathcal{O}}(1/T + \sigma_0/ \sqrt{T})$, which aligns with the convergence rate of some SGD variants, including non-adaptive SGD  \citep{bottou2018optimization}  and AdaGrad-Norm  \citep{attia2023sgd}, under the same conditions. 
%     \item We derive a high probability convergence bound for AdaGrad with heavy-ball momentum under coordinate-wise ``sub-Gaussian affine" variance noise. We demonstrate that after $T$ iterations of the algorithm, with probability at least $1-\delta$, it holds that
%     \begin{align*} 
%         \frac{1}{T}\sum_{t=1}^T\|\nabla f(x_t)\|^2 \le \tilde{\mathcal{O}}\left(\frac{1}{T} + \frac{\|\boldsymbol{\sigma}_0\|_{\infty}}{\sqrt{T}} \right). 
%     \end{align*} 
%    \item  We also establish the high probability convergence bound for AdaGrad with Adam-type momentum under coordinate-wise ``sub-Gaussian affine" variance noise. The bound shares the same form to AdaGrad with heavy-ball momentum which is also adaptive to the noise level.
% \end{itemize}



% \input{Related_work}

% \SetAlgoNlRelativeSize{-2}

\section{Problem setting and algorithm}\label{sec:preliminary}

We consider unconstrained stochastic optimization over the Euclidean space $\mR^d$ with $l_2$-norm. The objective function $f: \mR^d \rightarrow \mR$ is $L$-smooth satisfying that for any $\vx, \vy \in \mR^d$,
\begin{align*}
f(\vy) - f(\vx) - \la \nabla f(\vx), \vy-\vx \ra \le \frac{L}{2}\|\vx-\vy\|^2.
\end{align*}
Given $\vx \in \mathbb{R}^d$, we assume a gradient oracle that returns a random vector $ g(\vx,\vz) \in \mathbb{R}^d$, where $\vz $ denotes a random sample. The deterministic gradient of $f$ at $\vx$ is denoted by $\nabla f(\vx)  \in \mathbb{R}^d$.
\paragraph{Notations}  
% We use $[T]$ to denote the set $\{1,2,\cdots, T\}$ and $\| \cdot \|, \| \cdot \|_1$ and $\| \cdot \|_{\infty}$ to denote $l_2$-norm, $l_1$-norm and $l_\infty$-norm respectively. $a \sim \mathcal{O}(b)$ and $a \le \mathcal{O}(b)$ denote $a = C_1b$ and $a \le C_2b$ for some positive universal constants $C_1, C_2$ and $a \le \tilde{\mathcal{O}}(b)$ denotes $a \le \mathcal{O}(b)\text{poly}(\log b)$. 
% For any vector $\vx \in \mR^d$, $\vx^2$ and $\sqrt{\vx}$ denote coordinate-wise square and square root respectively. 
% For any two vectors $\vx,\vy \in \mR^d$, we use $\vx \odot \vy$ and $\vx/\vy$ to denote the coordinate-wise product and quotient respectively. ${\bf 0}_d$ and ${\bf 1}_d$ represent zero and one $d$-dimensional vectors respectively.
We denote the set $\{1,2,\cdots, T\}$ as $[T]$, and use $\| \cdot \|, \| \cdot \|_1$, and $\| \cdot \|_{\infty}$ to represent the $l_2$-norm, $l_1$-norm, and $l_\infty$-norm, respectively. The notations $a \sim \mathcal{O}(b)$ and $a \le \mathcal{O}(b)$ refer to $a = c_1b$ and $a \le c_2b$ with $c_1, c_2$ being positive universal constants, and $a \le \tilde{\mathcal{O}}(b)$ indicates $a \le \mathcal{O}(b) \text{poly}(\log b)$. For any vector $\vx \in \mR^d$, the expressions $\vx^2$ and $\sqrt{\vx}$ refer to the coordinate-wise square and square root. For two vectors $\vx,\vy \in \mR^d$, $\vx \odot \vy$ and $\vx/\vy$ denote the coordinate-wise product and quotient. ${\bf 0}_d$ and ${\bf 1}_d$ signify zero and one vectors in $d$ dimensions. Further, we write ${\bf 1}_d/ \vx$ as $1/\vx$, whenever there is no confusion.

\paragraph{Assumption}
We make the following assumptions.
\begin{itemize}
    \item  \textbf{(A1) Bounded below:} The objective function is bounded below, i.e., there exists $f^* > -\infty$ such that $f(\vx) \ge f^*, \forall \vx \in \mR^d$;
    \item \textbf{(A2) Unbiased estimator:} The gradient oracle provides an unbiased estimator of $\nabla f(\vx)$, i.e., $\forall \vx \in \mR^d$, $\mE_{\vz}\left[  g(\vx,\vz) \right]=\nabla f(\vx)$;
    \item  \textbf{(A3) Relaxed affine variance noise: }  The gradient oracle satisfies that for some constants $A,B,C \ge 0$, $\|  g(\vx,\vz)-\nabla f(\vx)\|^2 \le A(f(\vx)-f^*) + B \|\nabla f(\vx)\|^2 + C, \ \text{a.s.}, \ \forall \vx \in \mR^d$.
    % \item \textbf{(A5) Sub-Gaussian noise:} For any $x \in \mR^d$, $\mE\left[\exp\left(\|g(x) - \nabla f(x)\|_{\infty}^2/\sigma^2 \right)\right] \le \exp(1)$.
\end{itemize}

The first two assumptions are standard in the analysis of algorithm's convergence. With a simple calculation, it's easy to verify that Assumption (A3) is equivalent to 
\begin{align*}
     \|g(\vx,\vz)\|^2 \le A'(f(\vx)-f^*) + B' \|\nabla f(\vx)\|^2 + C'
\end{align*}
for another three positive constants $A',B',C'$. Therefore, (A3) is a generalization of \eqref{eq:affine} and \eqref{eq:expect_smooth}. For more detailed examples of stochastic gradient settings satisfying Assumption (A3), we refer interested readers to \citet[Propositions~2 and~3]{khaled2022better}.
%Under Assumption (A3), we can obtain high probability convergence, while under Assumption (A3'), we can get expected convergence for the studied algorithm. 
\begin{algorithm}[H]
\caption{AdaGrad with momentum}
\label{alg:AdaGrad}
\begin{algorithmic}
    \STATE{ \textbf{Input: }Horizon $T$, $\vx_1 \in \mathbb{R}^d$, $\beta\in [0,1)$, $\vm_0 = \vv_0 = {\bf 0}_d$, $\eta,\ep > 0$, $\vep = \ep {\bf 1}_d$}
    \FOR{$s=1,\cdots,T$}
    \STATE{Draw a random sample $\vz_s$ and generate $\vg_s =   g(\vx_s,\vz_s) $;}
    \STATE{$\vv_{s} = \vv_{s-1} + \vg_{s}^2 $;}
    \STATE{$\vm_{s}=\beta \vm_{s-1} - \eta \vg_{s}/\left(\sqrt{\vv_s}+\vep \right)$;}
    \STATE{$\vx_{s+1} = \vx_{s} + \vm_s$; }
    \ENDFOR
    \end{algorithmic}
\end{algorithm}

\paragraph{AdaGrad with momentum} Throughout the paper, we study AdaGrad with momentum given in Algorithm \ref{alg:AdaGrad}. 
We can transform Algorithm \ref{alg:AdaGrad} into the classical Polyak's heavy-ball method \citep{polyak1964some} with an adaptive step-size:
\begin{equation}\label{eq:x_iterate}
    \vx_{s+1} = \vx_s - \eta \frac{\vg_{s}}{\sqrt{\vv_s}+\vep} + \beta(\vx_{s} - \vx_{s-1}),\ \  \forall s \in [T],
\end{equation}
where we set $\vx_0 = \vx_1$.
\paragraph{AdaGrad} AdaGrad is  Algorithm \ref{alg:AdaGrad}   with  $\beta = 0$.
\section{Main Convergence Result}\label{sec:convergence}
In this section, we provide the probabilistic convergence result for Algorithm \ref{alg:AdaGrad} under Assumption (A3) and smooth objective functions.
\begin{theorem}\label{thm:1}
    Given $T \ge 1$, let $\{\vx_s\}_{s \in [T]}$ be generated by Algorithm \ref{alg:AdaGrad}. If Assumptions (A1), (A2), (A3) hold, then for any $\beta \in [0,1),\eta,\ep > 0$ and $\delta \in (0,1)$, it holds that with probability at least $1-\delta$,
    \begin{align*}
       &\frac{1}{T}\sum_{s=1}^T\|\nabla f(\vx_s)\|^2 \\
       \le &\mO\left[\tdel_1\left(\frac{B_1\tdel_1+ \sqrt{B_1L\del}+\ep}{T}  + \sqrt{\frac{A\del+C}{T}} \right)\right],
    \end{align*}
    where $B_1=B+1,$ $\tdel_1 = \tdel(1-\beta)/\eta$, and $\tdel$ is given by\footnote{The detailed expression of $\tdel$ could be found in \eqref{eq:define_tD}.}
    \begin{align*}
        \tdel \sim &\mO\left[f(\vx_1)-f^*+ \frac{\sqrt{C}\eta d}{1-\beta}\log\left(\frac{T}{\delta} + \frac{T}{\ep^2}\right)\right.\\
        &\left.+ \frac{(A+B_1L)\eta^2d^2}{(1-\beta)^3}\log^2\left(\frac{T}{\delta}+ \frac{T}{\ep^2}\right) \right].
    \end{align*}
\end{theorem}
\begin{remark}\label{rem:beta}
With a simple calculation, when $\eta = c_1(1-\beta)^{3/2}$ for some constant $c_1 > 0$, the above upper bound has a minimum order of $\mO((1-\beta)^{-1})$ with respect to $(1-\beta)^{-1}$. The comparison of existing results with our convergence bound could be found in Table \ref{table} from the appendix.
\end{remark}

\paragraph{Convergence of AdaGrad with affine variance noise}
As a direct consequence of Theorem~\ref{thm:1}, it is worth mentioning the following convergence bound for AdaGrad with affine variance noise, considering its empirical significance.
\begin{corollary}
	Under the assumptions and notations of Theorem \ref{thm:1}, let  $\beta = 0$ and $A=0$.
    Then for any $\eta,\ep > 0$ and $\delta \in (0,1)$, it holds that with probability at least $1-\delta$,
    \begin{align*}
       &\frac{1}{T}\sum_{s=1}^T\|\nabla f(\vx_s)\|^2 \\
       \le &\mO\left[\tdel_1\left(\frac{B_1\tdel_1 + \sqrt{B_1L\del}+\ep}{T} + \sqrt{\frac{C}{T}} \right)\right],
    \end{align*}
    where $B_1=B+1,$ $\tdel_1 = \tdel/\eta$, and $\tdel$ is defined as follows
    \begin{align*}
        \tdel \sim &\mO\left[f(\vx_1)-f^*+ {\sqrt{C}\eta d}\log\left(\frac{T}{\delta}+\frac{T}{\ep^2} \right)\right.\\
       &\left.+ {B_1L\eta^2d^2}\log^2\left(\frac{T}{\delta}+\frac{T}{\ep^2} \right) \right].
    \end{align*}
\end{corollary}
\begin{remark}\label{rem:highpro}
	1) Setting $\eta\sim {1 / \left(d \log\left(\frac{T}{\delta} \right)\right)}$ yields $\del \sim 1$, and the above upper bound is of order $\mO \left( {d^2 } \log^2\left(\frac{T}{\delta}\right) /T + d \log\left(\frac{T}{\delta}\right)\sqrt{C/T}\right)$, matching the lower bound in \citet{arjevani2023lower} up to logarithmic factors.
	\\
	2)  The convergence rate is of order $\tilde{\mO}({1/ T} + \sqrt{{C / T}}),$
	and when the noise level $C$ is sufficiently low, the convergence rate could be $\tilde{\mO}(1/T)$, which aligns with the result for non-adaptive SGD under the same conditions \citep{ghadimi2013stochastic,bottou2018optimization} up to logarithmic terms.  \\
	3)  As in standard probability theory, the derived high-probability convergence can ensure expected convergence. \\
4) Assumption (A3) can be replaced by its sub-Gaussian form where
$\mE_{\vz}\left[\exp\left(\frac{\|  g(\vx,\vz)-\nabla f(\vx)\|^2}{A(f(\vx)-f^*) + B \|\nabla f(\vx)\|^2 + C}\right)\right]\le \mathrm{e}$, and our results still hold true, as shown in the appendix.
% 5) Assumption (A3) can be replaced with its expected version where $\mE_{\vz}\left[\|  g(\vx,\vz)-\nabla f(\vx)\|^2\right] \le A(f(\vx)-f^*) + B \|\nabla f(\vx)\|^2 + C$, as will be shown in Appendix, where our results lose the adaptivity on the noise parameters $C$ and $A$.
\end{remark}



\section{Proof detail}\label{sec:proof}
To start with, we let $\vg_s = (\gsi)_i$ be as in Algorithm \ref{alg:AdaGrad} and let $\nabla f(\vx_s) = \bar{\vg}_s = (\bar{g}_{s,i})_i $, $\vxi_s = (\xi_{s,i})_i = \vg_s - \bar{\vg}_s$ and $\delx_s = f(\vx_s)-f^*$. 

During the proof, we will introduce several key lemmas to deduce the final results. All the missing proofs could be found in Appendix.
\subsection{Preliminary} Before proving the main result, we shall introduce several useful auxiliary sequences. The first sequence $\{\vy_s \}_{ s \ge 1}$ is defined as
\begin{align}
	& \vy_1 = \vx_1, \vy_s = \frac{\beta}{1-\beta}(\vx_s-\vx_{s-1}) + \vx_s, \quad\forall s \ge 2, \label{eq:define_y_s}
\end{align}
following \citep{ghadimi2015global,yang2016unified}, where it was used to prove the convergence of SGD with momentum; it was later applied to handle many variants of momentum-based algorithms. When $\vx_s$ is generated by Algorithm \ref{alg:AdaGrad}, we show that $\vy_s$ satisfies, for any $s \ge 1$,
\begin{align}\label{eq:y_iterative}
    \vy_{s+1} = \vy_s -\frac{\eta}{1-\beta}  \frac{\vg_s}{\vb_s},\quad \vb_s = \sqrt{\vv_s} + \vep.
\end{align}
We let the function value gap $\dely_s = f(\vy_s)-f^*$.
In addition, we introduce $\{  \mG_s \}_{s \ge 1}$ and the value $\mG$,
\begin{equation}\label{eq:define_G_t}
    \begin{split}
     &\mG_s  =   \sqrt{X\delx_s+ 2C },  \\
     &\mG =  \sqrt{X\del+ 2C },\quad  X = 2A+4LB+4L,
    \end{split}
\end{equation}
where $\del$ is as in Theorem \ref{thm:1}.

\subsection{Rough estimations}
Motivated by \citep{faw2022power}, we provide some rough estimations for several key algorithm-dependent terms in this section. These estimations are not delicate, but they play vital roles in further deducing the final convergence rate. 
\begin{lemma}\label{lem:estimation_rough}
    For any $s \ge 1$ and $\beta \in [0,1)$,
    \begin{align*}
        \|\vm_s \| \le \frac{\eta\sqrt{d}}{1-\beta},\quad \|\bar{\vg}_s \|  \le \|\bar{\vg}_1\| + \frac{L\eta s\sqrt{d}}{1-\beta}.
    \end{align*}
\end{lemma}

\begin{lemma}\label{lem:delta_rough}
    Suppose that $\beta \in [0,1)$. Then for any $T \ge 1$,
    \begin{align*}
        \sum_{t=1}^T &\delx_t \le \delx_1  T\\
    & + \left(\frac{\eta \|\bar{\vg}_1\| \sqrt{d}}{1-\beta} + \frac{L\eta^2 d}{2(1-\beta)^2}\right) T^2 + \frac{L\eta^2dT^3}{(1-\beta)^2}  .
    \end{align*}
\end{lemma}


\subsection{Start Point and decomposition}
We now proceed with the proof of the main result. 
We fix the horizon $T$. Following \citep{ward2020adagrad}, we start from the descent lemma of smoothness over $\vy_s$, with $f^*$ subtracted from both sides,
\begin{align*}
    \dely_{s+1} \le \dely_s   +   \left\la \nabla f(\vy_s) ,\vy_{s+1} - \vy_s \right\ra  + \frac{L}{2}\|\vy_{s+1}-\vy_s\|^2.
\end{align*}
Combining with \eqref{eq:y_iterative}, and summing over $s \in [t]$,
\begin{align}
    \dely_{t+1}
               &\le \delx_1 +\frac{\eta}{1-\beta}  \underbrace{\left(- \sum_{s=1}^t  \left\la \nabla f(\vy_s), \frac{\vg_s}{\vb_s} \right\ra \right)}_{\textbf{A}} \nonumber \\
               &\quad+ \frac{L\eta^2}{2(1-\beta)^2}\sum_{s=1}^t\left\|   \frac{\vg_s}{\vb_s}  \right\|^2,  \label{eq:A+B}
\end{align}
where we apply $\vy_1= \vx_1$.
We then further decompose {\bf A} as
\begin{align}\label{eq:A_decomp}
    \textbf{A} 
    &=  \underbrace{  -\sum_{s=1}^t\left\langle \bar{\vg}_s, \frac{\vg_s}{\vb_s}\right\rangle }_{\textbf{A.1}} + \underbrace{ \sum_{s=1}^t \left\langle \bar{\vg}_s - \nabla f(\vy_s), \frac{\vg_s}{\vb_s}\right\rangle}_{\textbf{A.2}}.
\end{align}

\subsection{Estimating {\bf A}}
The first main challenge comes from the entanglement of $\vg_s$ and $\vb_s$ emerging in {\bf A}, which is a key problem distinct  from the analysis for SGD.

\paragraph{Estimating {\bf A.1}}We adopt the so-called proxy step-size technique, which is commonly used for breaking the correlation of $\vb_s$ and $\vg_s$ in the analysis of adaptive methods. This technique relies on introducing appropriate proxy step-sizes. It was first introduced in \citep{ward2020adagrad} for AdaGrad-Norm with bounded stochastic gradients, and variants of proxy step-sizes have been developed in the related literature, e.g., \citep{defossez2020simple,faw2022power,attia2023sgd,alina2023high}. However, none of these proxy step-sizes can be applied to AdaGrad with potentially unbounded gradients under the mild noise model in Assumption (A3).

We thus provide a construction of proxy step-sizes that is general enough to handle Assumption (A3). The proxy step-sizes rely on $ \mG_s$ given in \eqref{eq:define_G_t} and are defined as
\begin{align}\label{eq:proxy_stepsize}
    \va_s = \sqrt{  \vv_{s-1}+ \left( \mG_s{\bf 1}_d\right)^2} + \vep, \quad \forall s \in [T].
\end{align}
Based on the proxy step-size $\eta/\va_s$, we further have
\begin{align}
    &{\bf A.1} =  - \sum_{s=1}^t  \left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}}\right\|^2 \nonumber \\
    &\quad\quad\underbrace{-  \sum_{s=1}^t \left\la  \bar{\vg}_s, \frac{\vxi_s}{\va_s} \right\ra }_{{\bf A.1.1}} + \underbrace{\sum_{s=1}^t  \left\la \bar{\vg}_s,  \left( \frac{1}{\va_s} - \frac{1}{\vb_s} \right)\vg_s 
        \right\ra}_{\textbf{A.1.2}}.  \label{eq:A.1.12}
\end{align}
The estimation for {\bf A.1.1} relies on a probabilistic analysis of a sum of a martingale difference sequence.
\begin{lemma}\label{lem:1_bounded}
    Given $T \ge 1$ and $\delta \in (0,1)$, if Assumptions (A2) and (A3) hold, then with probability at least $1-\delta$, 
    \begin{align}
        {\bf A.1.1} \le \frac{1}{4}\sum_{s=1}^t\frac{ \mG_s}{\mG}\left\|\frac{ \bar{\vg}_s  }{\sqrt{\va_s}}\right\|^2 + 3 \mG  \log \left(\frac{T}{\delta} \right),\forall t \in [T], \label{eq:A.1.1}
    \end{align}
    where $\mG_s,\mG$ are as in \eqref{eq:define_G_t}. 
\end{lemma}


The term {\bf A.1.2} serves as an error term arising from the introduction of $\va_s$. However, due to the delicate construction of $\va_s$, we can estimate the gap as follows, 
 \begin{lemma}\label{lem:gap_as_bs}
Under Assumption (A3),  let $\vb_s=(\bsi)_i,\va_s=(\asi)_i$ be defined in \eqref{eq:y_iterative} and \eqref{eq:proxy_stepsize}. Then 
    \begin{align*}
        \left|\frac{1}{\asi} - \frac{1}{\bsi} \right| \le \frac{\mG_s}{\asi\bsi},\quad \forall s \in[T] ,\forall i \in [d].
    \end{align*}
 \end{lemma}
 
Based on this lemma, it's then shown in the following lemma that {\bf A.1.2} could be controlled.
\begin{lemma}\label{lem:A.1.2}
    Under Assumption (A3),   for any $t \ge 1$, if $\beta \in [0,1)$, it holds that 
    \begin{align}
        {\bf A.1.2} \le  \frac{1}{4} \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 + \sum_{s=1}^t \mG_s \left\|\frac{\vg_s}{\vb_s} \right\|^2. \label{eq:A.1.2}
    \end{align}
\end{lemma}
 
Finally, we rely on the smoothness to estimate {\bf A.2}.
\begin{lemma}\label{lem:A.2}
    For any $t \ge 1$, if $\beta \in [0,1)$, it holds that 
    \begin{align}
        {\bf A.2}\le \frac{L }{2\eta}\sum_{s=1}^t \|\vm_{s-1}\|^2 + \frac{L\eta}{2(1-\beta)^2}\sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2. \label{eq:A.2}
    \end{align}
\end{lemma}
% \begin{proof}
%     Using the smoothness of $f$, \eqref{eq:define_y_s} and $\beta \in [0,1)$,
%     \begin{align}
%         &\|\bar{\vg}_s - \nabla f(\vy_s)\| 
%         \le L \|\vy_s - \vx_s\| \nonumber\\
%         =& \frac{L\beta}{1-\beta}\|\vx_s - \vx_{s-1}\| 
%         \le \frac{L}{1-\beta}\|\vm_{s-1}\|. \label{eq:gradient_xs_ys}
%     \end{align}
%     Applying Cauchy-Schwarz inequality and using \eqref{eq:gradient_xs_ys}, 
%     \begin{align*}
%         {\bf A.2} &\le \sum_{s=1}^t \|\bar{\vg}_s - \nabla f(\vy_s)\| \left\|\frac{\vg_s}{\vb_s}\right\| \nonumber\\
%         &\le \frac{L}{1-\beta} \sum_{s=1}^t \|\vm_{s-1}\|\left\|\frac{\vg_s}{\vb_s} \right\| \nonumber \\
%         &\le  \frac{L }{2\eta}\sum_{s=1}^t \|\vm_{s-1}\|^2 + \frac{L\eta}{2(1-\beta)^2}\sum_{s=1}^t\left\|\frac{\vg_s}{\vb_s} \right\|^2 . 
%     \end{align*}
%     The proof is complete.
% \end{proof}
\subsection{Bounding the function value gap}
Based on the above estimations, we could use an induction argument to deduce an upper bound for function value gaps. The induction technique is motivated by \citep{attia2023sgd} where AdaGrad-Norm with affine variance noise was studied. As we study a more relaxed assumption on AdaGrad, it's required to provide some new estimations.
\begin{proposition}\label{pro:delta_s}
    Under the same conditions of Theorem \ref{thm:1}, the following two inequalities hold with probability at least $1-\delta$,
    \begin{align}\label{eq:pro_1}
        \delx_t \le \del, \quad \mG_t \le \mG, \quad \forall t \in [T+1],
    \end{align}
    and
    \begin{align}\label{eq:pro_2}
        \delx_{t+1} \le \del-  \frac{\eta}{1-\beta} \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2,\quad \forall  t \in [T],
    \end{align}
    where $\del$ is as in Theorem \ref{thm:1} and $\mG_t,\mG$ are as in \eqref{eq:define_G_t}.
\end{proposition}
%\begin{proof}
In what follows, we prove Proposition \ref{pro:delta_s}. We condition on the event that \eqref{eq:A.1.1} holds and then deduce \eqref{eq:pro_1} and \eqref{eq:pro_2}. Recall that \eqref{eq:A.1.1} holds with probability at least $1-\delta$. We therefore obtain that both \eqref{eq:pro_1} and \eqref{eq:pro_2} hold with probability at least $1-\delta$.
We first plug \eqref{eq:A.1.12}, \eqref{eq:A.1.1} and \eqref{eq:A.1.2} into \eqref{eq:A_decomp}, and then combine with \eqref{eq:A.2} and \eqref{eq:A+B} to get that
\begin{align}\label{eq:final_1}
        \dely_{t+1}
        &\le \delx_1 + \frac{\eta}{1-\beta}\sum_{s=1}^t\left(\frac{\mG_s}{4\mG}-\frac{3}{4} \right) \left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2  \nonumber \\
        &+ \frac{3 \mG \eta}{1-\beta} \log \left(\frac{T}{\delta} \right)  + \frac{\eta}{1-\beta}\sum_{s=1}^t \mG_s \left\|\frac{\vg_s}{\vb_s} \right\|^2 \nonumber\\
        &+ \frac{L }{2(1-\beta)}\sum_{s=1}^t \|\vm_{s-1}\|^2 + \tL\sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2,
    \end{align}
where we let $\tL = \frac{L\eta^2}{2(1-\beta)^3} + \frac{L\eta^2}{2(1-\beta)^2}$. Then, we present the specific definition of $ \Delta$ as
\begin{align}\label{eq:define_tD}
    \Delta&:= 4\delx_1+ \frac{12 \sqrt{2C } \eta }{1-\beta} \log \left(\frac{T}{\delta} \right) \\
    &+4 \left(\frac{ \sqrt{2C }\eta}{1-\beta}+ \frac{ \eta^2L}{(1-\beta)^3}+\tL \right)d\log \mF_T \nonumber\\
    &+ \frac{72X\eta^2}{(1-\beta)^2}\log^2\left(\frac{T}{\delta} \right) 
    +\frac{8X\eta^2}{(1-\beta)^2} d^2\log^2\mF_T.  \nonumber
%    \nonumber
\end{align}
Here, $\mF_T$ is a polynomial with respect to $T$ with the detailed expression in \eqref{eq:define_poly_F} from Appendix.
Then, it's easy to verify that $\delx_1 \le \del$. Suppose that for some $t \in [T]$, 
\begin{align}
    \delx_s \le \del, \forall s \in [t], \quad \text{thus}, \quad \mG_s \le \mG, \forall s \in [t]. \label{eq:induction_assumption}
\end{align}
In order to apply \eqref{eq:final_1} to control $\delx_{t+1}$, we introduce the following lemma to lower bound the LHS of \eqref{eq:final_1}.
\begin{lemma}\label{lem:delta_y_x}
Let $\vy_s$ be defined in \eqref{eq:define_y_s} and $\beta \in [0,1)$. Then for any $s \ge 1$,
\begin{align*}
    \dely_s \ge \frac{\delx_s}{2} - \frac{L\|\vm_{s-1} \|^2 }{2(1-\beta)^2}.
\end{align*}
\end{lemma}
Based on Lemma \ref{lem:delta_y_x}, the LHS of \eqref{eq:final_1} could be lower bounded in terms of $\delx_{t+1}$. We use \eqref{eq:induction_assumption} to upper bound the RHS of \eqref{eq:final_1}, which leads to
\begin{align*}
    &\frac{\delx_{t+1}}{2}
    \le \delx_1 - \frac{\eta}{2(1-\beta)}\sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 + \frac{L \|\vm_t\|^2}{2(1-\beta)^2}  \\
    &+ \frac{3  (\sqrt{X\del}+\sqrt{2C}) \eta }{1-\beta} \log \left(\frac{T}{\delta} \right)+ \tL\sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2  \\
    &+ \frac{(\sqrt{X\del}+\sqrt{2C})\eta }{1-\beta}\sum_{s=1}^t   \left\|\frac{\vg_s}{\vb_s} \right\|^2 + \sum_{s=1}^t \frac{L \|\vm_{s-1}\|^2}{2(1-\beta)},
\end{align*}
where we use $\mG \le \sqrt{X\del}+\sqrt{2C}$. 
Further, using Young's inequality twice for the terms related to $\sqrt{X\Delta}$, and $\beta<1,$
\begin{align}
    &\frac{\delx_{t+1}}{2}
    \le \frac{\del}{4}+ \delx_1- \frac{\eta}{2(1-\beta)} \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 \nonumber \\
    & + \frac{ L\|\vm_t \|^2}{2(1-\beta)^2}+ \frac{3 \sqrt{2C } \eta }{1-\beta} \log \left(\frac{T}{\delta} \right) \nonumber\\
    &+ \left(\frac{ \sqrt{2C }\eta}{1-\beta}+\tL \right)\sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2+ \sum_{s=1}^t \frac{L \|\vm_{s-1}\|^2}{2(1-\beta)} \nonumber\\
    &+ \frac{18X\eta^2}{(1-\beta)^2}\log^2\left(\frac{T}{\delta} \right) +\frac{2X\eta^2}{(1-\beta)^2}\left( \sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2\right)^2. \label{eq:final_3} 
\end{align}

%Let
%\begin{align*}
%	 \eta L \sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2   \leq  \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2?
%\end{align*}
%Or to prove 
%\begin{align*}
%	\eta L \sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2   \leq  \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2\\
%	 \geq \sum_{s=1}^t\frac{\left\|\bar{\vg}_s \right\|^2}{\sqrt{\sum \|\bar{\vg}_s\|^2+ t C}}  \geq \sqrt{\sum \|\bar{\vg}_s\|^2+ t C} - \sqrt{tC} \end{align*}


Finally, we shall use the following lemma to further estimate $\|\vm_t\|$ and the other two summations related to $\vg_s,\vb_s,\vm_s$. 
\begin{lemma}\label{lem:sum_1}
    Given $T \ge 1$ and $\beta \in [0,1)$, then for any $t \in [T]$,
    \begin{align*}
        &\sum_{s=1}^t \left\|\frac{\vg_s}{\vb_s}\right\|^2 \le d\log \mF_T, \quad \|\vm_t\|^2  \le \frac{\eta^2d}{1-\beta}\log\mF_T, \\
        &\sum_{s=1}^t\|\vm_s\|^2  \le \frac{\eta^2d}{(1-\beta)^2}\log\mF_T,
    \end{align*}
    where $\mF_T$ is a polynomial with respect to $T$ with the detailed expression in \eqref{eq:define_poly_F} from Appendix.
\end{lemma}
% \begin{lemma}\label{lem:sum_2}
%     Given $T \ge 1$ and $\beta \in [0,1)$, let $\mF_T$ be as in Lemma \ref{lem:sum_1}. Then for any $1  \le t \le T$,
%     \begin{align*}
%         & \quad .
%     \end{align*}
% \end{lemma}
%\begin{align}
%	& + \frac{ L\|\vm_t \|^2}{2(1-\beta)^2} 
%	+ \frac{3 \sqrt{2C } \eta }{1-\beta} \log \left(\frac{T}{\delta} \right) \nonumber\\
%	&+ \left(\frac{ \sqrt{2C }\eta}{1-\beta}+\tL \right)\sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2+ \sum_{s=1}^t \frac{L \|\vm_{s-1}\|^2}{2(1-\beta)} \nonumber\\
%	&+ \frac{18X\eta^2}{(1-\beta)^2}\log^2\left(\frac{T}{\delta} \right) +\frac{2X\eta^2}{(1-\beta)^2}\left( \sum_{s=1}^t  \left\|\frac{\vg_s}{\vb_s} \right\|^2\right)^2\\
%	&= 
%	+ \frac{6 \sqrt{2C } \eta }{1-\beta} \log \left(\frac{T}{\delta} \right) \nonumber\\
%	&+2 \left(\frac{ \sqrt{2C }\eta}{1-\beta}+ \frac{ \eta^2L}{(1-\beta)^3}+\tL \right)d\log \mF_T \nonumber\\
%	&+ \frac{36X\eta^2}{(1-\beta)^2}\log^2\left(\frac{T}{\delta} \right) +\frac{4X\eta^2}{(1-\beta)^2}d^2\log^2 \mF_T\\
%\end{align}
Compared with Lemma \ref{lem:estimation_rough}, Lemma \ref{lem:sum_1} improves the dependency on $1-\beta$ in the estimate of $\|\vm_t\|^2$, which leads to the $\mO((1-\beta)^{-1})$ order for the final convergence as in Remark \ref{rem:beta}. Thus, applying Lemma \ref{lem:sum_1} over \eqref{eq:final_3}, and then combining with $\del$ in \eqref{eq:define_tD},
    % \begin{align*}
    %     &\frac{\delx_{t+1}}{L+1}
    %     \le \frac{\del}{2(L+1)}+ \delx_1 \\
    %     &- \frac{\eta}{2(1-\beta)} \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2+ \frac{3 \sqrt{2C } \eta d}{1-\beta} \log \left(\frac{dT}{\delta} \right) \\
    %     & + \left(\frac{ \sqrt{2C }\eta}{1-\beta}+\tL+\frac{(L+1)\eta^2}{2(1-\beta)^3} \right)d\log \mF_T  \\
    %     &+ \frac{9X(L+1)\eta^2 d^2}{(1-\beta)^2}\log^2\left(\frac{dT}{\delta} \right)+\frac{X(L+1)\eta^2 d^2\log^2\mF_T}{(1-\beta)^2}.
    % \end{align*}
    \begin{align*}
        \frac{\delx_{t+1}}{2} &\le \frac{\del}{2}- \frac{\eta}{2(1-\beta)} \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 \leq {\Delta \over 2}.
%        \\
%        &\le \tdel- \frac{\eta}{2(1-\beta)} \sum_{s=1}^t\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 \le \tdel.
    \end{align*}
%    Then, we obtain $\delx_{t+1} \le L_1\tdel = \del$. 
    The induction is complete and the desired result in \eqref{eq:pro_1} is proved. Finally, as an intermediate result, we verify \eqref{eq:pro_2}.
%\end{proof}
\subsection{Proof of the main result}
Based on Proposition \ref{pro:delta_s}, we are able to prove Theorem \ref{thm:1}.
\begin{proof}[Proof of Theorem \ref{thm:1}]
In what follows, we will obtain the final convergence result based on \eqref{eq:pro_1} and \eqref{eq:pro_2}. Since \eqref{eq:pro_1} and \eqref{eq:pro_2} hold with probability at least $1-\delta$, the final convergence result then holds with probability at least $1-\delta$.
Let us first set $t = T$ in \eqref{eq:pro_2}, and we get
\begin{align}
     \frac{\eta}{1-\beta} \sum_{s=1}^T\frac{\|\bar{\vg}_s\|^2}{\|\va_s\|_{\infty}}  \le \frac{\eta}{1-\beta} \sum_{s=1}^T\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 \le \del. \label{eq:final_2}
\end{align}
Using \eqref{eq:proxy_stepsize}, the basic inequality and Assumption (A3), we have that for any $s \in [T]$, with $B_1=B+1,$
\begin{align}\label{eq:upper_bound_asi}
    &\|\va_s\|_{\infty}  -\epsilon 
    \le \max_{i \in [d]}\sqrt{v_{s-1,i}+\mG_s^2} = \max_{i \in [d]}\sqrt{\sum_{j=1}^{s-1} g_{j,i}^2 +\mG_s^2} \nonumber \\
    &\le \sqrt{\sum_{j=1}^{s-1} \|\vg_{j}\|^2 +\mG_s^2} \le \sqrt{2\sum_{j=1}^{s-1} (\|\bar{\vg}_{j}\|^2 + \|\vxi_j\|^2) +\mG_s^2}\nonumber \\
    &\le \sqrt{2\sum_{j=1}^{s-1} (A \delx_j + B_1  \|\bar{\vg}_{j}\|^2 + C) +\mG_s^2} .
\end{align}
Further applying \eqref{eq:pro_1} where $\delx_s \le \del,\mG_s \le \mG, \forall s \in [T]$,
\begin{align*}
    \|\va_s\|_{\infty}  -\epsilon
    &\le \sqrt{2B_1\sum_{j=1}^{T} \|\bar{\vg}_j\|^2 + 2(A\del+C)T +\mG^2 }.  
\end{align*}
Combining with \eqref{eq:final_2}, using $\del_1 = \del(1-\beta)/\eta$, then applying Young's inequality,
\begin{align*}
    &\quad\sum_{s=1}^T\|\bar{\vg}_s\|^2 - \del_1 \epsilon \\
    &\le \del_1\left(\sqrt{2B_1\sum_{s=1}^{T} \|\bar{\vg}_s\|^2}+\sqrt{  2(A\del+C)T} + \mG \right)\\
    &\le \sum_{s=1}^T\frac{\|\bar{\vg}_s\|^2}{2} + \del_1^2B_1+\del_1\left(\sqrt{  2(A\del+C)T} + \mG \right).
\end{align*}
We then rearrange the terms and divide both sides by $T$, leading to the desired convergence result
\begin{align*}
   \frac{1}{T}\sum_{s=1}^T\|\bar{\vg}_s\|^2 \le 2\del_1\left[\frac{\del_1{B_1} + \mG + \epsilon}{T} + \sqrt{\frac{2(A\del+C)}{T}} \right].
 \end{align*}
 The proof is complete.
\end{proof}

\input{paper_general}

%\section{Conclusion}
%In this paper, we investigate the probabilistic convergence behavior of AdaGrad with heavy-ball style momentum over the non-convex smooth landscape. More importantly, we study the convergence under a mild noise model which covers the commonly used bounded noise, sub-Gaussian noise, affine variance noise and the expected smoothness condition. We rely on a new proxy step-size to disentangle the adaptive step-size and stochastic gradient and use an induction argument to control the function value gap.
%The theoretical result shows that AdaGrad (with momentum) could converge to a stationary point and achieve the optimal convergence rate of $\tilde{\mO}(1/\sqrt{T})$ without using any prior knowledge of problem parameters. 
\bibliography{ref}
\input{Appendix}

\end{document}