\section{Proof of convergence under the generalized smoothness}\label{appendix:general}
In this section, we provide the detailed proof of Theorem \ref{thm:general_smooth}. We still follow all the notations defined in Section \ref{sec:proof}.
We shall first introduce two sequences $\{\mH_s\}_{s \ge 1}$ and $\{\mL_s \}_{s\ge1}$ as follows:
\begin{align}\label{eq:define_L_s}
    \mH_s = \sqrt{2A \delx_s + 2(B+1)\left(4L_1\delx_s + \sqrt{4L_0\delx_s}\right)^2+2C} ,\quad\mL_s =2L_0 +2L_{1}\left(4L_1\delx_s + \sqrt{4L_0\delx_s}\right).
\end{align}
\newcommand{\tva}{\tilde{\bm a}}
\newcommand{\tasi}{\tilde{a}_{s,i}}
As a consequence, we slightly change the proxy step-size $\va_s$ in \eqref{eq:proxy_stepsize} as 
\begin{align}\label{eq:general_proxy_stepsize}
    \tilde{\va}_s = \sqrt{  \vv_{s-1}+ \left( \mH_s{\bf 1}_d\right)^2} + \vep, \quad \forall s \in [T].
\end{align}
\subsection{Convergence of AdaGrad}
As a consequence of Theorem \ref{thm:general_smooth}, we obtain the following convergence bound for AdaGrad considering affine variance noise and the generalized smoothness.
\begin{corollary}\label{coro:general}
    Let $T \ge 1$ and $\delta \in (0,1)$. Suppose that $\{\vx_s\}_{s \in [T]}$ is a sequence generated by Algorithm \ref{alg:AdaGrad} with $\beta = 0$, $f$ is $(L_0,L_{1})$-smooth satisfying \eqref{eq:general_smooth_1}, Assumptions (A1), (A2) hold and Assumption (A3) holds with $A=0$, and the parameters follow the condition in \eqref{eq:general_parameter} with $\beta= 0$, $\mH,\mL$ follow the definitions in \eqref{eq:define_H_L}, and
        \begin{align*}
        &\lam_y \sim \mO\left( \delx_1 + C_0^2d\log\left( \frac{T}{\delta} + \frac{T}{\ep^2}\right)  \right), \quad \lam_x \sim \mO\left( \lam_y^2 \right).
    \end{align*}
    Then it holds that with probability at least $1-\delta$,
    \begin{align*}
        \frac{1}{T}\sum_{s=1}^T\|\nabla f(\vx_s)\|^2 \le \frac{4\lam_y}{\eta}\left( \frac{2\lam_y(B+1)/\eta+\mH + \ep}{T}+\sqrt{\frac{2C}{T}} \right).
    \end{align*}
\end{corollary}
\begin{remark}\cite{wang2023convergence} provided a convergence rate for AdaGrad-Norm under the generalized smoothness with the expected affine variance noise, specifically when $\eta < \frac{1}{L_1}\min\left\{ \frac{1}{64B},\frac{1}{8\sqrt{B}}\right\}$, with probability at least $1-\delta,$
\begin{align}
    \min_{t \in [T]} \|\nabla f(\vx_t)\|^2 = \mO\left( \frac{\log(\sqrt{CT})}{T\delta^2}+ \frac{\sqrt{C}\log(\sqrt{CT})}{\sqrt{T}\delta^2} \right). \label{eq:adagrad-norm}
\end{align}
Thus, our convergence bound in Corollary \ref{coro:general} could reduce to the AdaGrad-Norm case and match the rate in \eqref{eq:adagrad-norm} up to logarithmic factors, while enjoying a better dependence on the probability margin $\delta$.
\end{remark}

\subsection{Technical lemmas}
We provide a result equivalent to \citep[Lemma A.2]{zhang2020improved}, which establishes a different relationship between the gradient norm and the function value gap. For the proof, we refer to that of \citep[Lemma A.2]{zhang2020improved}.
\begin{lemma}\label{lem:general_gradient_value}
    Suppose that $f$ is $(L_0,L_{1})$-smooth and Assumption (A1) holds. Then, for any $\vx \in \mR^d$,
    \begin{align*}
        \|\nabla f(\vx) \| \le \max \left\{ 4L_{1}(f(\vx)- f^*),\sqrt{4L_{0}(f(\vx)- f^*)}  \right\}.
    \end{align*}
\end{lemma}
We also have the following lemma, which ensures that the distance between $\vy_s$ and $\vx_s$ is within $1/L_1$, so that the generalized smoothness condition in \eqref{eq:general_smooth_1} can be applied.
\begin{lemma}\label{lem:gap_xs_ys}
    Let $\vx_s,\vy_s$ be as in Algorithm \ref{alg:AdaGrad} and \eqref{eq:define_y_s}. If $\beta \in [0,1)$, then for any $s \ge 1$,
    \begin{align}
        \max\{\|\vx_{s+1} - \vx_{s}\|,\|\vy_s - \vx_s\|,\|\vy_{s+1} - \vy_s\| \} \le \frac{\eta\sqrt{d}}{(1-\beta)^2}. \label{eq:gap_xs_ys}
    \end{align}
    As a consequence, when 
    \begin{align}\label{eq:eta_local_smooth}
        \eta \le \frac{(1-\beta)^2}{L_{1}\sqrt{d}}, \quad \text{then}, \quad \max\{\|\vx_{s+1} - \vx_{s}\|,\|\vy_s - \vx_s\|,\|\vy_{s+1} - \vy_s\| \} \le \frac{1}{L_1},\quad \forall s \ge 1.
    \end{align}
\end{lemma}
\begin{proof}
    Recall that in Lemma \ref{lem:estimation_rough}, we have already bounded $\|\vm_s\|$ independently of any smoothness-related condition as follows:
    \begin{align}\label{eq:xs_xs+1}
        \|\vx_{s+1} - \vx_s \| = \|\vm_s\| \le \frac{\eta\sqrt{d}}{1-\beta},\quad \forall s \ge 1.
    \end{align}
    Applying the definition of $\vy_s$ in \eqref{eq:define_y_s}, \eqref{eq:xs_xs+1} and $\beta \in [0,1)$,\footnote{The inequality still holds for $s =1$ since $\vx_1 = \vy_1$.}
    \begin{align}\label{eq:ys_xs}
        \|\vy_s - \vx_s \| = \frac{\beta}{1-\beta}\|\vx_s - \vx_{s-1}\| \le \frac{\eta\sqrt{d}}{(1-\beta)^2} ,\quad \forall s \ge 1.
    \end{align}
    Using the iteration of $\vy_s$ in \eqref{eq:y_iterative} and the norm inequality $\|\vx\| \le \sqrt{d}\|\vx\|_{\infty}$ for any $\vx \in \mR^d$,
    \begin{align}\label{eq:ys+1_ys}
        \|\vy_{s+1}-\vy_s\| = \frac{\eta}{1-\beta}\left\| \frac{\vg_s}{\vb_s}\right\| \le \frac{\eta\sqrt{d}}{1-\beta}\left\| \frac{\vg_s}{\vb_s}\right\|_{\infty} \le\frac{\eta\sqrt{d}}{1-\beta},\quad \forall s \ge 1,
    \end{align}
    where we apply $|\gsi/\bsi| \le 1,\forall i \in [d]$ in the last inequality.
    Combining \eqref{eq:xs_xs+1}, \eqref{eq:ys_xs} and \eqref{eq:ys+1_ys}, and using $\beta \in [0,1)$, we then deduce a uniform bound for all three gaps. Finally, letting $\frac{\eta\sqrt{d}}{(1-\beta)^2} \le \frac{1}{L_1}$, we then prove that \eqref{eq:eta_local_smooth} holds.
\end{proof}
\begin{lemma}\label{lem:local_smooth}
    Suppose that \eqref{eq:eta_local_smooth} holds. If $f$ is $(L_0,L_1)$-smooth, then for any $s \ge 1$, 
    \begin{equation}\label{eq:general_gradient_gap}
        \begin{split}
            \|\nabla f(\vx_{s+1})- \nabla f(\vx_s)\| &\le \mL_s \|\vx_{s+1}-\vx_s\|,\\
            \|\nabla f(\vy_s)- \nabla f(\vx_s)\|  &\le \mL_s\|\vy_s - \vx_s\| ,\\
            \|\nabla f(\vy_{s+1})- \nabla f(\vy_s)\| &\le \mL_s \|\vy_{s+1} - \vy_s\|.
        \end{split}
    \end{equation}
    As a consequence, for any $s \ge 1$,
    \begin{equation}\label{eq:general_descent}
        \begin{split}
            f(\vy_{s+1}) - f(\vy_s) - \la \nabla f(\vy_s),\vy_{s+1} - \vy_s \ra &\le \frac{\mL_s}{2}\|\vy_{s+1} - \vy_s\|^2, \\
            f(\vx_s) - f(\vy_s) - \la \nabla f(\vy_s),\vx_s-\vy_s\ra &\le \frac{\mL_s}{2}\|\vx_{s}-\vy_s\|^2,\\
            f(\vx_{s+1}) - f(\vx_s) - \la \nabla f(\vx_s),\vx_{s+1} - \vx_s \ra &\le \frac{\mL_s}{2}\|\vx_{s+1} - \vx_s\|^2.
        \end{split}
    \end{equation}
\end{lemma}
\begin{proof}
    Noting that when \eqref{eq:eta_local_smooth} holds, we could use $\|\vy_s-\vx_s\|\le 1/L_1$ and the generalized smoothness in \eqref{eq:general_smooth_1} to get that
    \begin{align*}
        \|\nabla f(\vy_s) \| 
        &\le \|\nabla f(\vx_s) \| + \|\nabla f(\vy_s)-\nabla f(\vx_s) \| \\
        &\le \|\nabla f(\vx_s) \| + (L_0+L_{1}\|\nabla f(\vx_s) \| )\|\vy_s - \vx_s\|  \le  2\|\nabla f(\vx_s) \|  + L_0/L_{1}. 
    \end{align*}
    Using Lemma \ref{lem:general_gradient_value} and combining with $\mL_s$ in \eqref{eq:define_L_s}, we have 
    \begin{align}\label{eq:Lys_local_smooth}
        &L_0 + L_{1} \|\nabla f(\vx_s) \| \le L_0 +L_1 \left(4L_1\delx_s + \sqrt{4L_0\delx_s}\right)  \le \mL_s, \nonumber \\ 
        &L_0 + L_{1} \|\nabla f(\vy_s) \| \le 2L_0 +2L_{1} \|\nabla f(\vx_s) \| \le  2L_0 +2L_{1}\left(4L_1\delx_s + \sqrt{4L_0\delx_s}\right)\le  \mL_s.
    \end{align}
    Thus, combining with \eqref{eq:general_smooth_1}, we prove \eqref{eq:general_gradient_gap}. Now based on \eqref{eq:general_gradient_gap}, we could deduce \eqref{eq:general_descent}. We refer to the proof of \citep[Lemma A.3]{zhang2020improved}. 
\end{proof}
\subsection{Rough Estimations}
In generalized smooth cases, we revise the estimations in Lemmas \ref{lem:estimation_rough} and \ref{lem:delta_rough} as follows.
\begin{lemma}\label{lem:general_estimation_rough}
    Suppose that $f$ is $(L_0,L_1)$-smooth, $\beta \in [0,1)$ and \eqref{eq:eta_local_smooth} holds. Then, for any $s \ge 1$, 
    \begin{align*}
        \|\vm_s \| \le \frac{\eta\sqrt{d}}{1-\beta},\quad \|\bar{\vg}_s \|  \le \|\bar{\vg}_1\| + \frac{\eta\sqrt{d}}{1-\beta}\sum_{j=1}^{s} \mL_j.
    \end{align*}
\end{lemma}
\begin{proof}
First, the estimation for $\|\vm_s\|$ remains unchanged as in Lemma \ref{lem:estimation_rough} since it does not rely on smooth-related conditions.
Note that \eqref{eq:eta_local_smooth} holds. Then using \eqref{eq:general_gradient_gap}, for any $s \ge 2$,
    \begin{align}\label{eq:general_gs_gs-1}
        \|\bar{\vg}_s \| \le \|\bar{\vg}_{s-1}\| + \|\bar{\vg}_s - \bar{\vg}_{s-1}\| \le \|\bar{\vg}_{s-1}\| + \mL_{s-1} \|\vx_s - \vx_{s-1}\| = \|\bar{\vg}_{s-1}\| + \mL_{s-1}\|\vm_{s-1}\|.
    \end{align}
    Further using \eqref{eq:general_gs_gs-1} and the estimation for $\|\vm_s\|$, for any $s \ge 2$,
    \begin{align*}
        \|\bar{\vg}_s \| \le \|\bar{\vg}_{s-1}\| + \frac{\mL_{s-1}\eta\sqrt{d}}{1-\beta} \le \|\bar{\vg}_1\| + \frac{\eta\sqrt{d}}{1-\beta}\sum_{j=1}^{s-1} \mL_j .    
    \end{align*}
    Note that the above inequality also holds when $s = 1$. Thus, the proof is complete.
\end{proof}
\begin{lemma}\label{lem:general_delta_rough}
    Under the same conditions of Lemma \ref{lem:general_estimation_rough}, for any $T \ge 1$,
    \begin{align}\label{eq:poly_I_t}
        \sum_{t=1}^T \delx_t \le\mI_T,\quad \mI_T:= \delx_1  T+ \frac{\eta \sqrt{d}}{1-\beta} \sum_{t=1}^T \sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{\eta \sqrt{d}}{1-\beta}\sum_{j=1}^{s
        }\mL_j \right) + \frac{\eta^2 d}{2(1-\beta)^2}\sum_{t=1}^T\sum_{s=1}^t \mL_s.
    \end{align}
\end{lemma}
\begin{proof} 
    Since \eqref{eq:eta_local_smooth} holds, we could rely on the update rule in Algorithm \ref{alg:AdaGrad} and \eqref{eq:general_descent} to obtain that
    \begin{align}\label{eq:general_xs_descent}
        f(\vx_{s+1})
        &\le f(\vx_s)+ \la \bar{\vg}_s, \vx_{s+1}-\vx_s \ra + \frac{\mL_s}{2} \|\vx_{s+1} - \vx_s\|^2  = f(\vx_s) + \la \bar{\vg}_s, \vm_s \ra + \frac{\mL_s}{2} \|\vm_s\|^2.
    \end{align} 
    Using Cauchy-Schwarz inequality and Lemma \ref{lem:general_estimation_rough}, for any $s \ge 1$,
    \begin{align*}
        &\la \bar{\vg}_s, \vm_s \ra \le \|\bar{\vg}_s\|  \|\vm_s\|  \le \frac{\eta \sqrt{d}}{1-\beta}\left(\|\bar{\vg}_1\| + \frac{\eta \sqrt{d}}{1-\beta}\sum_{j=1}^{s} \mL_j \right),\quad \frac{\mL_s}{2}\|\vm_s\|^2 \le \frac{\mL_s\eta^2 d}{2(1-\beta)^2}.
    \end{align*}
    Combining the above, subtracting $f^*$ on both sides of \eqref{eq:general_xs_descent} and summing up over $s \in [t]$, we obtain that for any $t \ge 1$,
    \begin{align*}
        \delx_{t+1} 
        &\le \delx_1 +\frac{\eta \sqrt{d}}{1-\beta} \sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{\eta \sqrt{d}}{1-\beta}\sum_{j=1}^{s}\mL_j \right) + \frac{\eta^2 d}{2(1-\beta)^2}\sum_{s=1}^t \mL_s.
    \end{align*}
    We adopt the convention $\sum_{s=a}^b = 0$ when $a > b$, so that the above inequality also holds for $t = 0$. Then, we sum up over $t \in \{0,1,\cdots,T-1\}$ and
    obtain the desired result.
%     that
%    \begin{align*}
%        \sum_{t=1}^T \delx_{t} 
%        &\le \sum_{t=1}^T\delx_1 + \frac{\eta \sqrt{d}}{1-\beta} \sum_{t=1}^T \sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{\eta \sqrt{d}}{1-\beta}\sum_{j=1}^{s}\mL_j \right) + \frac{\eta^2 d}{2(1-\beta)^2}\sum_{t=1}^T\sum_{s=1}^t \mL_s.
%    \end{align*}
\end{proof}

\subsection{Start Point and Decomposition}
To start with, we recall $\eta$ in \eqref{eq:general_parameter} and verify that \eqref{eq:eta_local_smooth} always holds. Then, we could use the descent lemma \eqref{eq:general_descent} and apply \eqref{eq:y_iterative}, and sum up both sides over $s \in [t]$ to get that
\begin{align}
    f(\vy_{t+1}) 
    &\le f(\vx_1) + \sum_{s=1}^t\langle \nabla f(\vy_s), \vy_{s+1}-\vy_s \rangle + \sum_{s=1}^t\frac{\mL_s}{2}\|\vy_{s+1}-\vy_s\|^2 \nonumber \\
               &= f(\vx_1) + \frac{\eta}{1-\beta}\cdot \textbf{A} + \frac{\eta^2}{2(1-\beta)^2} \sum_{s=1}^t\mL_s \left\| \frac{\vg_s}{\vb_s} \right\|^2 ,  \label{eq:general_A+B}
\end{align}
where we use $\vx_1 = \vy_1 $ and the definition of \textbf{A} in \eqref{eq:A+B}. We follow the same decomposition of \textbf{A} as in Section \ref{sec:proof} and restate it as follows,
\begin{align}\label{eq:general_A_decomp}
    \textbf{A} 
    &=  \underbrace{  -\sum_{s=1}^t\left\langle \bar{\vg}_s, \frac{\vg_s}{\vb_s}\right\rangle }_{\textbf{A.1}} + \underbrace{ \sum_{s=1}^t \left\langle \bar{\vg}_s - \nabla f(\vy_s), \frac{\vg_s}{\vb_s}\right\rangle}_{\textbf{A.2}}.
\end{align}

\subsection{Estimating {\bf A}}
We then introduce $\tva$ in \eqref{eq:general_proxy_stepsize} into \eqref{eq:general_A_decomp} to derive that
\begin{align}
    &{\bf A.1} =  - \sum_{s=1}^t  \left\|\frac{\bar{\vg}_s}{\sqrt{\tva_s}}\right\|^2 \underbrace{-  \sum_{s=1}^t \left\la  \bar{\vg}_s, \frac{\vxi_s}{\tva_s} \right\ra }_{\textbf{A'.1.1}} + \underbrace{\sum_{s=1}^t  \left\la \bar{\vg}_s,  \left( \frac{1}{\tva_s} - \frac{1}{\vb_s} \right)\vg_s 
        \right\ra}_{\textbf{A'.1.2}}.  \label{eq:general_A.1.12}
\end{align}
The technique for estimating {\bf A'.1.1} is similar to that in Lemma \ref{lem:1_bounded}. Let $X'_s = -\left\la  \bar{\vg}_s, \frac{\vxi_s}{\tva_s} \right\ra$. We can still verify that $X'_s$ is a martingale difference sequence and define $\zeta'_{s} =  \left\|\frac{\bar{\vg}_s}{\tva_s}\right\| \sqrt{A \delx_s + B \|\bar{\vg}_s\|^2 + C}$. Similarly, $\zeta'_{s}$ is a random variable depending on $\vz_1,\cdots,\vz_{s-1}$. Using the Cauchy--Schwarz inequality and Assumption (A3), we have
    \begin{align*}
        &\mE \left[\exp\left[\left(\frac{X'_{s}}{\zeta'_{s}}\right)^2\right] \mid \vz_1,\cdots,\vz_{s-1} \right] 
        \le \mE \left[ \exp\left(\frac{\| \vxi_s \|^2}{A \delx_s + B \|\bar{\vg}_s\|^2 + C} \right)\mid \vz_1,\cdots,\vz_{s-1} \right] \le \mathrm{e}.
    \end{align*}
However, using Lemma \ref{lem:general_gradient_value} and $\mH_s$ in \eqref{eq:define_L_s}, we derive that
\begin{align*}
    A \delx_s + B \|\bar{\vg}_s\|^2 + C \le A\delx_s + B\left( 4L_1\delx_s + \sqrt{4L_0\delx_s}\right)^2  + C \le \mH_s^2.
\end{align*}
Hence, we obtain a different inequality from \eqref{eq:martin_1} that
\begin{align}
        \textbf{A'.1.1} 
        &\le \frac{3\lambda }{4}\sum_{s=1}^t \sum_{i=1}^d\frac{ \bgsi^2}{\tasi^2}\left(A \delx_s + B \|\bar{\vg}_s\|^2 + C \right)+ \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right) \nonumber \\
        &\le \frac{3\lambda }{4}\sum_{s=1}^t \sum_{i=1}^d\frac{ \bgsi^2  \mH_s^2}{\tasi^2} + \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right) \le \frac{3\lambda }{4}\sum_{s=1}^t \sum_{i=1}^d \frac{ \bgsi^2  \mH_s }{\tasi} + \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right), \label{eq:martin_2}
    \end{align}
where the last inequality applies $1/\tasi \le 1/\mH_s $ from \eqref{eq:general_proxy_stepsize}.
Then, we can re-scale $\delta$ and take $\lambda = 1/(3\mH)$, leading to the new estimation as follows: with probability at least $1-\delta$, 
\begin{align}\label{eq:general_A.1.1}
    \textbf{A'.1.1} \le \frac{1}{4}\sum_{s=1}^t\frac{ \mH_s}{\mH}\left\|\frac{ \bar{\vg}_s  }{\sqrt{\tva_s}}\right\|^2 + 3 \mH  \log \left(\frac{T}{\delta} \right),\quad \forall t \in [T].
\end{align}
The estimation for {\bf A'.1.2} remains similar to \eqref{eq:A.1.2}. We first derive from the basic inequality and Assumption (A3) that 
\begin{align*}
    \|\vg_s\|^2 \le 2\|\bar{\vg}_s\|^2 + 2\|\vxi_s\|^2 \le 2A\delx_s + 2(B+1)\|\bar{\vg}_s\|^2 + 2C \le \mH_s^2.
\end{align*}
Then, based on $\|\vg_s\|^2 \le  \mH_s^2$, we derive a similar result as in \eqref{eq:a-b} where 
\begin{align*}
    \left|\frac{1}{\tasi} - \frac{1}{\bsi} \right| \le \frac{\mH_s}{\tasi\bsi},\quad \forall s \in[T] ,\forall i \in [d].
\end{align*}
Then, using a similar deduction in \eqref{eq:A.1.2}, we derive that
\begin{align}
    \textbf{A'.1.2} 
    &\le  \frac{1}{4}\sum_{s=1}^t \left\|\frac{\bar{\vg}_s}{\sqrt{\tva_s}}\right\|^2 +   \sum_{s=1}^t \mH_s \left\|\frac{\vg_s}{\vb_s}\right\|^2. \label{eq:general_A.1.2}
\end{align}
The estimation for {\bf A.2} in \eqref{eq:general_A_decomp} is revised by the following lemma.
\begin{lemma}\label{lem:general_A.2}
    Suppose that $f$ is $(L_0,L_1)$-smooth and \eqref{eq:eta_local_smooth} holds.
    For any $t \ge 1$, if $\beta \in [0,1)$, it holds that 
    \begin{align}
        {\bf A.2}\le \sum_{s=1}^t \frac{\mL_s }{2\eta}\|\vm_{s-1}\|^2 + \sum_{s=1}^t \frac{\mL_s\eta}{2(1-\beta)^2} \left\|\frac{\vg_s}{\vb_s} \right\|^2. \label{eq:general_A.2}
    \end{align}
\end{lemma}
\begin{proof}
    Noting that when \eqref{eq:eta_local_smooth} holds, we could rely on \eqref{eq:general_gradient_gap} and $\beta \in [0,1)$ to obtain that
    \begin{align}
        &\|\bar{\vg}_s - \nabla f(\vy_s)\| 
        \le \mL_s \|\vy_s - \vx_s\| 
        = \frac{\mL_s\beta}{1-\beta}\|\vx_s - \vx_{s-1}\| 
        \le \frac{\mL_s}{1-\beta}\|\vm_{s-1}\|. \label{eq:general_gradient_xs_ys}
    \end{align}
    Applying Cauchy-Schwarz inequality and using \eqref{eq:general_gradient_xs_ys}, 
    \begin{align*}
        {\bf A.2} &\le \sum_{s=1}^t \|\bar{\vg}_s - \nabla f(\vy_s)\| \left\|\frac{\vg_s}{\vb_s}\right\|  \le  \sum_{s=1}^t \frac{\mL_s}{1-\beta}\|\vm_{s-1}\|\left\|\frac{\vg_s}{\vb_s} \right\| \le  \sum_{s=1}^t \frac{\mL_s }{2\eta}\|\vm_{s-1}\|^2 + \sum_{s=1}^t\frac{\mL_s\eta}{2(1-\beta)^2}\left\|\frac{\vg_s}{\vb_s} \right\|^2.
    \end{align*}
\end{proof}
\subsection{Bounding the function value gap}
Based on the above estimations, we are now ready to provide the bound for the function value gap along the optimization trajectory in the following proposition.
\begin{proposition}\label{pro:general_delta_s}
    Under the same conditions in Theorem \ref{thm:general_smooth}, for any given $\delta \in (0,1)$, the following two inequalities hold with probability at least $1-\delta$,
    \begin{align}
        \delx_t \le \lam_x,\quad \mH_t \le \mH,\quad \mL_t \le \mL,\quad \forall t \in [T+1],\label{eq:general_delta_s_1}
    \end{align}
    and
    \begin{align}
         \dely_{t+1} \le \lam_y- \frac{\eta}{2(1-\beta)}\sum_{s=1}^t \left\| \frac{\bar{\vg}_s}{\sqrt{\tva_s}} \right\|^2, \quad \forall t \in [T],\label{eq:general_delta_s}
    \end{align}
    where $\mH_t,\mL_t$ are as in \eqref{eq:define_L_s} and $\lam_x,\lam_y,\mH,\mL$ are as in Theorem \ref{thm:general_smooth}.
\end{proposition}
%\begin{proof} 
In what follows, we prove Proposition \ref{pro:general_delta_s}.
First, we verify that \eqref{eq:eta_local_smooth} holds from the setting of $\eta$ in \eqref{eq:general_parameter}. Hence, we ensure that \eqref{eq:general_A+B} and \eqref{eq:general_A.2} hold.

In the following, we will assume the inequality \eqref{eq:general_A.1.1} holds and then deduce the results in \eqref{eq:general_delta_s_1} and \eqref{eq:general_delta_s}. Noting that \eqref{eq:general_A.1.1} holds with probability at least $1-\delta$, we thereby deduce that the desired results hold with probability at least $1-\delta$. 

Plugging the estimations \eqref{eq:general_A.1.1} and \eqref{eq:general_A.1.2} into \eqref{eq:general_A.1.12},  we obtain the bound for \textbf{A.1}. Then, combining with
\eqref{eq:general_A.2}, \eqref{eq:general_A_decomp} and \eqref{eq:general_A+B}, and subtracting $f^*$ on both sides of \eqref{eq:general_A+B}, 
\begin{align}\label{eq:general_final_1}
    \dely_{t+1}
    &\le \delx_1 + \frac{\eta}{1-\beta}\sum_{s=1}^t\left(\frac{\mH_s}{4\mH}-\frac{3}{4} \right) \left\|\frac{\bar{\vg}_s}{\sqrt{\tva_s}} \right\|^2   + \frac{3 \mH \eta}{1-\beta} \log \left(\frac{T}{\delta} \right)  + \frac{\eta}{1-\beta}\sum_{s=1}^t \mH_s \left\|\frac{\vg_s}{\vb_s} \right\|^2 \nonumber\\
    &+ \sum_{s=1}^t \frac{\mL_s }{2(1-\beta)}\|\vm_{s-1}\|^2 + \sum_{s=1}^t \left(\frac{\mL_s\eta^2}{2(1-\beta)^3} + \frac{\mL_s\eta^2}{2(1-\beta)^2} \right)  \left\|\frac{\vg_s}{\vb_s} \right\|^2.
\end{align}
We still rely on an induction argument to deduce the result. First, we define two polynomials $\tilde{\mI}_t$ and $\tilde{\mJ}_t$ with respect to $t$ and present the detailed expressions of $\lam_y$ and $\lam_x$ that are independent from $t$ as follows:
\begin{align}
    &\tilde{\mI}_t:=\delx_1 \cdot t+ \frac{C_0\sqrt{d}}{1-\beta} \left(\|\bar{\vg}_1\| + \frac{C_0\sqrt{d}t}{1-\beta}  \right)\cdot t^2 + \frac{C_0^2 d}{2(1-\beta)^2}\cdot t^2, \label{eq:tilde_poly_I_t}\\
    &\tilde{\mJ}_t:= 1+ \frac{1}{\ep^2}\left[2A\tilde{\mI}_t + 2(B+1)   \left(\|\bar{\vg}_1\| + \frac{C_0\sqrt{d}t}{1-\beta} \right)^2\cdot t + 2Ct\right], \label{eq:tilde_poly_J_t}\\
    &\lam_y:= \delx_1   + \frac{3 C_0}{1-\beta} \log \left(\frac{T}{\delta} \right)  + \frac{C_0 d}{1-\beta}\log \tilde{\mJ}_T \nonumber\\
    &\quad+  \frac{C_0^2 d}{2(1-\beta)^3}\log \tilde{\mJ}_T + \left(\frac{C_0^2 d}{2(1-\beta)^3} + \frac{C_0^2 d}{2(1-\beta)^2} \right) \log \tilde{\mJ}_T, \label{eq:define_lamy} \\
    &\lam_x:= (2L_0+1)\lam_y + 8L_1\lam_y^2 + \frac{C_0^2 d}{(1-\beta)^2} \log \tilde{\mJ}_T. \label{eq:define_lamx}
\end{align}
It is worth noting that $\tilde{\mI}_t$ and $\tilde{\mJ}_t$ are deterministic polynomials with respect to $t$ and depend on the hyper-parameters $C_0,\beta,d$ and the problem-parameters $A,B,C$. We could verify that 
$\lam_y \sim \mO(\log(T/\delta))$ and $\lam_x \sim \mO(\log^2(T/\delta))$. 
\paragraph{Induction argument}
The induction begins by noting that $\delx_1 \le \lam_x$ from \eqref{eq:define_lamy} and \eqref{eq:define_lamx}. Suppose that for some $t \in [T]$, 
\begin{align}\label{eq:general_induction_1}
    \delx_s \le \lam_x, \quad \text{thus}, \quad \mH_s \le \mH, \quad \mL_s \le \mL,\quad \forall s \in [t],
\end{align}
where we rely on $\mH_s$, $\mL_s$ in \eqref{eq:define_L_s}, and $\mH,\mL$ in \eqref{eq:define_H_L}. 
We thus apply \eqref{eq:general_induction_1} to \eqref{eq:general_final_1} and get
\begin{align}
    \dely_{t+1}
    &\le \delx_1 - \frac{\eta}{2(1-\beta)}\sum_{s=1}^t  \left\|\frac{\bar{\vg}_s}{\sqrt{\tva_s}} \right\|^2   + \frac{3 \mH \eta}{1-\beta} \log \left(\frac{T}{\delta} \right)  + \frac{\eta\mH}{1-\beta}\sum_{s=1}^t   \left\|\frac{\vg_s}{\vb_s} \right\|^2 \nonumber\\
    &+  \frac{\mL }{2(1-\beta)}\sum_{s=1}^t\|\vm_{s-1}\|^2 + \left(\frac{\mL\eta^2}{2(1-\beta)^3} + \frac{\mL\eta^2}{2(1-\beta)^2} \right)  \sum_{s=1}^t \left\|\frac{\vg_s}{\vb_s} \right\|^2. \label{eq:general_final_1.1}
\end{align}
Then, Lemma \ref{lem:sum_1} should be further revised under the generalized smoothness condition as follows. 
\begin{lemma}\label{lem:general_sum_1}
    Suppose that $f$ is $(L_0,L_1)$-smooth and \eqref{eq:eta_local_smooth} holds. Then for any $ t \ge 1$,
    \begin{align*}
        \sum_{s=1}^t \left\|\frac{\vg_s}{\vb_s}\right\|^2 \le d\log \mJ_t,\quad \|\vm_t\|^2  \le \frac{\eta^2d}{1-\beta}\log\mJ_t, \quad \sum_{s=1}^t\|\vm_s\|^2  \le \frac{\eta^2d}{(1-\beta)^2}\log\mJ_t,
    \end{align*}
    where $\mJ_t$ is a polynomial with respect to $t$ with the detailed expression in \eqref{eq:poly_J_T}.
\end{lemma}
\begin{proof}
    Recall that the three estimations in \eqref{eq:gs/bs}, \eqref{eq:ms} and \eqref{eq:sum_ms} remain unchanged as they do not depend on smoothness-related conditions. However, we shall revise the estimation for the term inside the logarithm operator. We again start from the basic inequality and Assumption (A3),
    \begin{align*}
        \sum_{j=1}^t g_{j,i}^2 
        &\le 2\sum_{j=1}^t (\bar{g}_{j,i}^2 + \xi_{j,i}^2) \le 2\sum_{j=1}^t (\|\bar{\vg}_{j}\|^2 + \|\vxi_{j}\|^2)  \le 2 \sum_{j=1}^t( A \delx_j + (B+1) \|\bar{\vg}_j\|^2 + C).
    \end{align*}
    Then, applying Lemma \ref{lem:general_estimation_rough} 
    and Lemma \ref{lem:general_delta_rough},
    \begin{align}\label{eq:poly_J_T}
        1+ \frac{1}{\ep^2}\sum_{j=1}^t g_{j,i}^2  \le \mJ_t :=1+ \frac{1}{\ep^2}\left[2A\mI_t + 2(B+1)  \sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{\eta\sqrt{d}}{1-\beta}\sum_{j=1}^{s} \mL_j\right)^2 + 2Ct\right].
    \end{align}
    Plugging \eqref{eq:poly_J_T} into \eqref{eq:gs/bs}, \eqref{eq:ms} and \eqref{eq:sum_ms}, we thereby deduce the final result.
\end{proof}
Therefore, applying Lemma \ref{lem:general_sum_1} to \eqref{eq:general_final_1.1}, we obtain that 
\begin{align}
    \dely_{t+1}
    &\le \delx_1 - \frac{\eta}{2(1-\beta)}\sum_{s=1}^t  \left\|\frac{\bar{\vg}_s}{\sqrt{\tva_s}} \right\|^2   + \frac{3 \mH \eta}{1-\beta} \log \left(\frac{T}{\delta} \right)  + \frac{  \mH \eta d}{1-\beta}\log \mJ_t \nonumber\\
    &+  \frac{\mL \eta^2 d}{2(1-\beta)^3}\log \mJ_t + \left(\frac{\mL\eta^2 d}{2(1-\beta)^3} + \frac{\mL\eta^2 d}{2(1-\beta)^2} \right) \log \mJ_t. \label{eq:general_final_2}
\end{align}
Using \eqref{eq:general_parameter}, we have that
\begin{align}
    \mH \eta \le C_0, \quad \mL \eta \le C_0, \quad \mL\eta^2 \le C_0^2. \label{eq:C_0}
\end{align}
We should also note that $\mI_t$ in \eqref{eq:poly_I_t} and $\mJ_t$ in \eqref{eq:poly_J_T} both include $\mL_s,s\le t$. Then recalling $\tilde{\mI}_t$ and $\tilde{\mJ}_t$ in \eqref{eq:tilde_poly_I_t} and \eqref{eq:tilde_poly_J_t}, and applying $\mL_s \le \mL,\forall s \le t$ in \eqref{eq:general_induction_1} and \eqref{eq:C_0}, we have that for any $t \in [T]$, 
\begin{align}\label{eq:log_J_t}
    \mI_t \le \tilde{\mI}_t \le \tilde{\mI}_T, \quad \mJ_t \le \tilde{\mJ}_t \le \tilde{\mJ}_T,\quad \log \mJ_t \le \log \tilde{\mJ}_t \le \log \tilde{\mJ}_T.
\end{align}
Then, applying \eqref{eq:C_0} and \eqref{eq:log_J_t} to \eqref{eq:general_final_2}, and recalling $\lam_y$ in \eqref{eq:define_lamy}, 
\begin{align}
    &\dely_{t+1} \le \lam_y - \frac{\eta}{2(1-\beta)}\sum_{s=1}^t  \left\|\frac{\bar{\vg}_s}{\sqrt{\tva_s}} \right\|^2 \le \lam_y\label{eq:general_final_3}.
\end{align}
Finally, we use the following lemma to control $\delx_s$ by $\dely_s$.
\begin{lemma}\label{lem:general_delta_y_x}
Suppose that $f$ is $(L_0,L_1)$-smooth and \eqref{eq:eta_local_smooth} holds.
Let $\vy_s$ be defined in \eqref{eq:define_y_s} and $\beta \in [0,1)$. Then for any $s \ge 1$,
\begin{align*}
    \delx_s \le (2L_0+1)\dely_s + 8L_1\left(\dely_s\right)^2 + \frac{\mL_s+1 }{2(1-\beta)^2}\|\vm_{s-1} \|^2.
\end{align*}
\end{lemma}
\begin{proof}
    Since \eqref{eq:eta_local_smooth} holds, then using \eqref{eq:general_descent},
    \begin{align*}
        f(\vx_s) 
        &\le f(\vy_s) + \la \nabla f(\vy_s), \vx_s - \vy_s \ra + \frac{\mL_s}{2} \|\vx_s - \vy_s \|^2 \\
        &= f(\vy_s) - \frac{\beta}{1-\beta} \la \nabla f(\vy_s), \vx_s - \vx_{s-1} \ra + \frac{\mL_s\beta^2}{2(1-\beta)^2} \|\vx_s - \vx_{s-1} \|^2.
    \end{align*}
    Using Young's inequality and subtracting $f^*$ on both sides,
    \begin{align*}
        \delx_s &\le \dely_s + \frac{1}{2}\|\nabla f(\vy_s)\|^2 + \frac{(\mL_s+1)\beta^2}{2(1-\beta)^2}\|\vx_s - \vx_{s-1} \|^2 \le \dely_s + 8L_1\left(\dely_s\right)^2 + 2L_0\dely_s + \frac{\mL_s+1 }{2(1-\beta)^2}\|\vm_{s-1} \|^2,
    \end{align*}
    where we apply Lemma \ref{lem:general_gradient_value} and $\beta \in [0,1)$ for the last inequality. 
\end{proof}
Using Lemma \ref{lem:general_delta_y_x}, \eqref{eq:general_final_3}, and Lemma \ref{lem:general_sum_1}, we could bound $\delx_{t+1}$ as
\begin{align*}
    \delx_{t+1} &\le (2L_0+1)\lam_y + 8L_1\lam_y^2 + \frac{(\mL_s+1)\eta^2 d}{2(1-\beta)^2} \log \mJ_t \le (2L_0+1)\lam_y + 8L_1\lam_y^2 + \frac{2C_0^2 d}{2(1-\beta)^2} \log \tilde{\mJ}_T,
\end{align*}
where the last inequality applies \eqref{eq:C_0} and  \eqref{eq:log_J_t}. Recalling $\lam_x$ in \eqref{eq:define_lamx}, we find that 
\begin{align*}
    \delx_{t+1} \le \lam_x.
\end{align*}
Combining with \eqref{eq:general_induction_1},
the induction is thus complete. We prove the desired result in \eqref{eq:general_delta_s_1}. Finally, as an intermediate result, \eqref{eq:general_delta_s} is verified by \eqref{eq:general_final_3}. The proof of Proposition \ref{pro:general_delta_s} is complete.
%\end{proof}

\subsection{Proof of the main result}
Based on Proposition \ref{pro:general_delta_s}, we could prove the final convergence result as follows.
\begin{proof}[Proof of Theorem \ref{thm:general_smooth}]
    The proof of the final convergence rate follows a similar idea to, and reuses some estimations from, the proof of Theorem \ref{thm:1}.
    Setting $t =T$ in \eqref{eq:general_delta_s}, it holds that with probability at least $1-\delta$,
    \begin{align}
         \frac{\eta}{2(1-\beta)} \sum_{s=1}^T \frac{ \left\| \bar{\vg}_s \right\|^2}{\|\tva_s\|_{\infty}} \le \frac{\eta}{2(1-\beta)}\sum_{s=1}^T \left\| \frac{\bar{\vg}_s}{\sqrt{\tva_s}} \right\|^2 \le \lam_y. \label{eq:general_final_5}
    \end{align}
    In what follows, we assume that \eqref{eq:general_delta_s_1} and \eqref{eq:general_delta_s} always hold and deduce the convergence bound under these two inequalities. Note that \eqref{eq:general_delta_s_1} and \eqref{eq:general_delta_s} hold with probability at least $1-\delta$ according to Proposition \ref{pro:general_delta_s}. Thus, the final convergence bound also holds with probability at least $1-\delta$.
    Applying $\tva_s$ in \eqref{eq:general_proxy_stepsize} and \eqref{eq:general_delta_s_1}, and following the similar analysis in \eqref{eq:upper_bound_asi}, 
    \begin{align*}
        \|\tva_s\|_{\infty} - \ep &\le \sqrt{2\sum_{j=1}^{s-1} (A \delx_j + (B+1)\|\bar{\vg}_{j}\|^2 + C) +\mH_s^2} \le \sqrt{2(B+1)\sum_{j=1}^{s-1} \|\bar{\vg}_j\|^2 + 2(A\lam_x+C)s +\mH^2 } \\
        &\le \sqrt{2(B+1)\sum_{s=1}^{T} \|\bar{\vg}_s\|^2 + 2(A\lam_x+C)T +\mH^2 }, \quad \forall s \in [T].
    \end{align*}
    Then combining with \eqref{eq:general_final_5}, and letting $\tilde{\lam}_y = \frac{2\lam_y(1-\beta)}{\eta}$,
    \begin{align*}
    \sum_{s=1}^T\|\bar{\vg}_s\|^2  &\le \tilde{\lam}_y \left(\sqrt{2(B+1)\sum_{s=1}^{T} \|\bar{\vg}_s\|^2 + 2(A\lam_x+C)T +\mH^2 } + \ep \right)\\
    &\le \tilde{\lam}_y\left(\sqrt{2(B+1)\sum_{s=1}^{T} \|\bar{\vg}_s\|^2}+\sqrt{  2(A\lam_x+C)T} + \mH + \ep\right)\\
    &\le \sum_{s=1}^T\frac{\|\bar{\vg}_s\|^2}{2} + \tilde{\lam}_y^2(B+1)+\tilde{\lam}_y\left(\sqrt{  2(A\lam_x+C)T} + \mH + \ep \right),
\end{align*}
where we apply Young's inequality for the last inequality.
Re-arranging the order and dividing $T$ on both sides, we get
\begin{align*}
    \frac{1}{T}\sum_{s=1}^T\|\bar{\vg}_s\|^2 \le 2\tilde{\lam}_y\left( \frac{\tilde{\lam}_y(B+1)+\mH + \ep}{T}+\sqrt{\frac{2(A\lam_x+C)}{T}} \right).
\end{align*}
\end{proof}


\section{Experiment}
In this section, we present an experiment to verify that AdaGrad can find a stationary point when the noise satisfies Assumption (A3).


\begin{figure}[h]
\centering
\includegraphics[width=0.8\textwidth]{figure.pdf}
\caption{Training loss vs. steps and gradient norms vs. steps using SGD and AdaGrad.}
\label{fig}
\end{figure}
\paragraph{Experimental Setup} We follow the experimental task outlined in \citep{khaled2022better}, where the objective is to minimize a regularized logistic regression problem defined as follows:
\begin{align}
\min_{x \in \mathbb{R}^d}\left\{\frac{1}{n}\sum_{i=1}^n \log \left(1+\exp(-a_i^\top x)\right)+\lambda \sum_{j=1}^d \frac{x_j^2}{1+x_j^2} \right\}. \label{eq:logistic_regression}
\end{align}
In \citep{khaled2022better}, it was verified that using uniform sampling over the a9a dataset and the loss function in \eqref{eq:logistic_regression}, the noise conforms to Assumption (A3) with $\hat{A}= 10.09, \hat{B}=0, \hat{C}=0.373$, which closely aligns with the theoretical values where $A=9, B=0, C=0.994$. We then executed both SGD and AdaGrad to minimize \eqref{eq:logistic_regression} using a batch size of 256. The learning rates were set to $7 \times 10^{-6}$ for SGD and $5 \times 10^{-4}$ for AdaGrad.

We utilized the a9a dataset and the PyTorch implementations of SGD and AdaGrad. The experiments were conducted on a single NVIDIA GeForce RTX 4090 GPU.



\paragraph{Results} We plotted the training loss and gradient norms against the number of steps in Figure \ref{fig}, training for 1200 epochs. The results indicate that both SGD and AdaGrad can find a stationary point given a sufficiently large number of steps $T$, thereby supporting the conclusion in Theorem \ref{thm:1}. 

% \subsection{Adaptive}
% We now have
% \begin{align*}
%     \mE\left[\frac{2B\sum_{s=1}^t\left\|\bar{\vg}_s \right\|^2}{\sqrt{\sum_{j=1}^t \|\bar{\vg}_j\|^2 + 2(A\max_{j \in [T]}\mG_j^2+C)T}}\right]\le \sum_{s=1}^t\mE\left[\frac{\left\|\bar{\vg}_s \right\|^2}{\sqrt{\sum_{j=1}^t \|\vg_j\|^2 + \mG_j^2}}\right] \le \sum_{s=1}^t\mE\left[\frac{\left\|\bar{\vg}_s \right\|^2}{\|\va_s\|_{\infty}}\right] \le \sum_{s=1}^t\mE\left[\left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2\right] \leq \del.
% \end{align*}
% Then we could rely on a simple inequality that for any two positive real numbers $X,C$,
% \begin{align*}
%     \frac{X}{\sqrt{X+C}} \ge \sqrt{X}- \sqrt{C}.
% \end{align*}
% We thus have
% \begin{align*}
%     \mE \left[\sqrt{\sum_{s=1}^t \|\bar{\vg}_s}\|^2 - \sqrt{(A\max_{j \in [T]}\mG_j^2+C)T}\right] \le \del.
% \end{align*}
% But we only have $\mE [\mG_j] \le \mG$, but how about $\mE[\max_{j \in [T]}\mG_j^2]$ ??.