\section{Convergence under generalized smoothness}\label{sec:general}
In this section, we present the convergence of Algorithm \ref{alg:AdaGrad} in the generalized smooth case.  
\subsection{Generalized smoothness}
For a differentiable objective function $f: \mR^d \rightarrow \mR$, we consider the following $(L_0,L_1)$-smoothness condition: there exist
 constants $L_0,L_1 > 0$, satisfying that for any $\vx,\vy \in \mR^d$ with $\|\vx-\vy\| \le 1/L_1$, 
\begin{equation}\label{eq:general_smooth_1}
    \|\nabla f(\vy) - \nabla f(\vx) \| \le \left(L_0 + L_1\|\nabla f(\vx)\| \right) \|\vx-\vy\|.
\end{equation}
   The generalized smooth condition was originally put forward by \citet{zhang2020why} for any twice differentiable function $f$ satisfying
    \begin{align}\label{eq:general_smooth_2}
        \|\nabla^2 f(\vx)\| \le L_0 + L_{1} \|\nabla f(\vx)\|.
    \end{align}
    They revealed the superiority of SGD with gradient-clipping over the vanilla SGD in terms of convergence when considering \eqref{eq:general_smooth_2}. Moreover, empirical evidence has demonstrated that numerous objective functions satisfy \eqref{eq:general_smooth_2} while deviating from the global smoothness, particularly in large language models, see e.g., \citep[Figure 1]{zhang2020why} and \citep{crawshaw2022robustness}. To better explain the convergence of gradient-clipping algorithms, \cite{zhang2020improved} provided an alternative form in \eqref{eq:general_smooth_1}, only requiring $f$ to be first-order differentiable. 
    
    The condition in \eqref{eq:general_smooth_1} is selected in our paper for three main reasons. First, it is easy to verify that \eqref{eq:general_smooth_1} is strictly weaker than $L$-smoothness. A concrete example is the simple function $f(x)=x^3, x\in \mR$, which is not globally smooth for any constant $L$ but satisfies \eqref{eq:general_smooth_1}. Second, \eqref{eq:general_smooth_1} aligns with the practical limitation to first-order stochastic gradients in our setting, making it more reasonable to assume that $f$ is only first-order differentiable. Finally, \eqref{eq:general_smooth_1} and \eqref{eq:general_smooth_2} are shown to be equivalent up to constant factors for twice differentiable functions, see \citep[Lemma A.2]{zhang2020improved} and \citep[Proposition 1]{faw2023beyond}. Thus, \eqref{eq:general_smooth_1} includes more functions than \eqref{eq:general_smooth_2}. We refer interested readers to \citep{zhang2020why,zhang2020improved,faw2023beyond} for more discussions and concrete examples of the generalized smoothness. 
    
    
\subsection{Convergence result}
In the following, we establish the convergence bound for AdaGrad with momentum under the generalized smooth condition.
\begin{theorem}\label{thm:general_smooth}
    Let $T \ge 1$ and $\delta \in (0,1)$. Suppose that $\{\vx_s\}_{s \in [T]}$ is a sequence generated by Algorithm \ref{alg:AdaGrad}, $f$ is $(L_0,L_{1})$-smooth satisfying \eqref{eq:general_smooth_1}, Assumptions (A1), (A2), (A3) hold, and the parameters satisfy that $\beta \in [0,1)$,
    \begin{align}\label{eq:general_parameter}
        &\ep > 0,\quad \eta \le \min\left\{C_0, \frac{C_0}{\mH}, \frac{C_0}{\mL},  \frac{(1-\beta)^2}{L_{1}\sqrt{d}}\right\},
    \end{align}
    where $C_0 > 0$ is a constant, $\mH,\mL$ are defined as 
    \begin{equation}\label{eq:define_H_L}
        \begin{split}
        &\mH = \sqrt{2A \lam_x + 2(B+1)\left(4L_1\lam_x + \sqrt{4L_0\lam_x}\right)^2 + 2C}, \\
        &\mL =2L_0 +2L_{1}\left(4L_1\lam_x + \sqrt{4L_0\lam_x}\right), 
        \end{split}
    \end{equation}
    and $\lam_y,\tilde{\lam}_y,\lam_x$ are of the following order,\footnote{The detailed expressions of $\lam_y,\lam_x$ can be found in \eqref{eq:define_lamy} and \eqref{eq:define_lamx}, respectively, in the Appendix.}
    \begin{align*}
        &\lam_y \sim \mO\left( \delx_1 + \frac{C_0^2d +C_0d}{(1-\beta)^3}\log\left( \frac{T}{\delta} + \frac{T}{\ep^2}\right)  \right), \\
        &\tilde{\lam}_y = \frac{2\lam_y(1-\beta)}{\eta},\quad \lam_x \sim \mO\left( \lam_y^2 \right).
    \end{align*}
    Then, with $B_1=B+1$, it holds that with probability at least $1-\delta$,
    \begin{align*}
        &\frac{1}{T}\sum_{s=1}^T\|\nabla f(\vx_s)\|^2 \\
        \le &\mO\left(\tilde{\lam}_y\left( \frac{B_1\tilde{\lam}_y +\sqrt{B_1L\lam_x} + \ep}{T}+\sqrt{\frac{A\lam_x+C}{T}} \right)\right).
    \end{align*}
\end{theorem}
It is easy to verify that $\lam_y\sim \mathcal{O}(\log(T/\delta))$ and thereby $\lam_x,\mH,\mL \sim \mathcal{O}(\log^2(T/\delta))$. Then, from \eqref{eq:general_parameter}, since $\eta \le \min\{C_0/\mH, C_0/\mL\}$, when $T \gg d$, a typical setting is $\eta \sim \mO(1/\log^2(T/\delta))$. Moreover, the convergence rate is still adaptive to the noise parameters $A,C$, but it requires problem-parameters to tune step-sizes, potentially due to the relaxation of smoothness. 
The corresponding result for AdaGrad under \eqref{eq:general_smooth_1} can be directly deduced from Theorem \ref{thm:general_smooth} and is presented in the Appendix.

\section{Conclusion}
In this paper, we provide high probability convergence bounds for AdaGrad and its momentum variant for non-convex smooth optimization. In particular, we consider a mild noise model incorporating affine variance noise and the expected smoothness. We rely on a new proxy step-size and some delicate estimations to derive the bound. Our findings reveal that without problem-parameters dependent step-sizes, AdaGrad can find a stationary point with a rate of $\tilde{\mathcal{O}}(1/\sqrt{T})$, particularly accelerating to $\tilde{\mathcal{O}}(1/T)$ when specific noise parameters are sufficiently small. Furthermore, we extend our framework to the generalized smooth case that allows for unbounded smoothness parameters, showing the same convergence rate, although problem-parameters dependent step-sizes
are required in the latter.

\paragraph{Limitation} Although AdaGrad plays an important role in the adaptive method field, several other adaptive methods including RMSProp, Adam and AdamW, may be preferred in some real applications. Therefore, it is also pertinent to study these algorithms under relaxed assumptions. In addition, it is still unknown whether a similar convergence result can also be achieved under an expected version of Assumption (A3). Finally, as we study a new assumption over AdaGrad, it would be beneficial to provide more experimental results to support the theoretical results.
% The result reveals that Algorithm \ref{alg:AdaGrad} obtains the same convergence rate as in the smooth case in Theorem \ref{thm:1}, but at the expense of requiring problem-parameters $L_0,L_1,A,B,C$ to tune step-sizes. The necessity of prior knowledge of problem-parameters was also indicated by the counter example in \citep{wang2023convergence} for AdaGrad.


\section*{Acknowledgments}
The authors would like to thank the reviewers and area chairs for their constructive comments. This work was supported in part by the National Key Research and Development Program of China under grant number 2021YFA1003500, and NSFC under grant number 11971427. We also thank Chenhao Yu very much for pointing out an error in the generalized-smooth analysis, and Mouxiang Chen for his great help with the experimental results.
The corresponding author is Junhong Lin.
% \paragraph{Convergence of AdaGrad }
% As a consequence of Theorem \ref{thm:general_smooth}, we obtain the following convergence bound for AdaGrad considering affine variance noise and the generalized smoothness.
% \begin{corollary}\label{coro:general}
%     Given $T \ge 1$ and $\delta \in (0,1)$. Suppose that $\{\vx_s\}_{s \in [T]}$ is a sequence generated by Algorithm \ref{alg:AdaGrad} with $\beta = 0$, $f$ is $(L_0,L_{1})$-smooth satisfying \eqref{eq:general_smooth_1}, Assumptions (A1), (A2) hold and Assumption (A3) holds with $A=0$, and the parameters follow the condition in \eqref{eq:general_parameter}, $\mH,\mL$ follow the definitions in \eqref{eq:define_H_L}, 
%         \begin{align*}
%         &\lam_y \sim \mO\left( \delx_1 + C_0^2d\log\left( \frac{T}{\delta}\right)  \right), \quad \lam_x \sim \mO\left( \lam_y^2 \right).
%     \end{align*}
%     Then it holds that with probability at least $1-\delta$,
%     \begin{align*}
%         \frac{1}{T}\sum_{s=1}^T\|\nabla f(\vx_s)\|^2 \le \frac{4\lam_y}{\eta}\left( \frac{2\lam_y(B+1)/\eta+\mH + \ep}{T}+\sqrt{\frac{2C}{T}} \right).
%     \end{align*}
% \end{corollary}
% \begin{remark}\cite{wang2023convergence} provided a convergence rate for AdaGrad-Norm under the generalized smoothness with affine variance noise, specifically when $\eta < \frac{1}{L_1}\min\left\{ \frac{1}{64B},\frac{1}{8\sqrt{B}}\right\}$,
% \begin{align}
%     \min_{t \in [T]} \|\nabla f(\vx_t)\|^2 = \mO\left( \frac{\log(\sqrt{CT})}{T\delta^2}+ \frac{\sqrt{C}\log(\sqrt{CT})}{\sqrt{T}\delta^2} \right). \label{eq:adagrad-norm}
% \end{align}
% Thus, our convergence bound in Corollary \ref{coro:general} could reduce to the AdaGrad-Norm case and match the rate in \eqref{eq:adagrad-norm} up to logarithm factors, while with a better dependency on the probability parameter.
% \end{remark}