\newpage
\onecolumn

\title{Revisiting Convergence of AdaGrad with Relaxed Assumptions\\(Supplementary Material)}
\maketitle




\appendix
\begin{table}[htbp]
\centering
\caption{Comparison of existing results with ours for AdaGrad/AdaGrad-Norm in the non-convex smooth case}
\label{table}
\begin{threeparttable}
\begin{tabular}{cccccc}
\hline
                               & Alg. type  & Smooth        & Noise                                                                  & \begin{tabular}[c]{@{}c@{}}Unbounded\\ Gradients\end{tabular}                                & Conv. type   \\ \hline
\citep{li2019convergence} & $\text{Both}^{a}$      & $L$           & Sub-Gaussian                                                           &          -                                                         & w.h.p.       \\
\citep{ward2020adagrad}   & Scalar     & $L$           & Bounded                                                                &                    -                                               & $\mathbb{E}$ \\
\citep{kavis2022high}  & Scalar     & $L$           & Sub-Gaussian                                                           & \checkmark                                         & w.h.p.       \\
\citep{faw2022power}    & Scalar     & $L$           & Affine                                                                 & \checkmark                                         & $\mathbb{E}$ \\
\citep{wang2023convergence}   & Both       & $L$/$(L_0,L_1)$ & Affine                                                                 & \checkmark                                         & $\mathbb{E}$ \\
\citep{alina2023high}    & Both       & $L$           & \begin{tabular}[c]{@{}c@{}}Coordinate-wise\\ Sub-Gaussian\end{tabular} & \checkmark                                         & w.h.p.       \\
\citep{attia2023sgd}  & Scalar     & $L$           & Affine                                                                 & \checkmark                                         & w.h.p.       \\
\citep{faw2023beyond}    & Scalar     & $(L_0,L_1)$     & Affine                                                                 & \checkmark                                         & $\mathbb{E}$ \\
This paper, Thm. 1                         & Coordinate & $L$           & Relaxed Affine                                                              & \checkmark                                       & w.h.p.       \\
This paper, Thm. 2                         & Coordinate & $(L_0,L_1)$     & Relaxed Affine                                                             & \checkmark                                       & w.h.p.       \\ \hline
\end{tabular}
\begin{tablenotes}
    \item  In the ``Alg.\ type'' column, ``Scalar'' refers to AdaGrad-Norm, ``Coordinate'' refers to AdaGrad, and ``Both'' refers to both algorithms. ``Relaxed Affine'' corresponds to Assumption (A3) in this paper. In the ``Conv.\ type'' column, ``w.h.p.''\ stands for the high probability convergence bound, and ``$\mathbb{E}$'' represents the expected convergence bound.
    \item[a] \citet{li2019convergence} studied a variant of AdaGrad/AdaGrad-Norm using a delayed step-size which is independent of the current stochastic gradient.
\end{tablenotes}
\end{threeparttable}
\end{table}
\section{Complementary lemmas}
Following \citep{li2020high,attia2023sgd}, we will first present several important technical lemmas. The first lemma is a standard result in the smooth-based optimization which will be used in our analysis motivated also by \citep{attia2023sgd,yusu2023}.
\begin{lemma}\label{lem:gradient_delta_s}
	Suppose that $f$ is $L$-smooth and Assumption (A1) holds. Then for any $\vx\in \mR^d$,
	\begin{align*}
	\|\nabla f(\vx) \|^2 \le 2L(f(\vx)-f^*).
	\end{align*}
\end{lemma}
We introduce a concentration inequality for the martingale difference sequence, see \citep{li2020high} for a proof.
\begin{lemma} \label{lem:Azuma}
    Suppose that $\{Z_s\}_{s \in [T]}$ is a martingale difference sequence with respect to $\zeta_1,\cdots,\zeta_T$. Assume that for each $s \in [T]$, $\sigma_s$ is a random variable dependent on $\zeta_1,\cdots,\zeta_{s-1}$ and satisfies that
    \begin{align*}
        \mE\left[ \exp\left(\frac{Z_s^2}{\sigma_s^2 }\right) \mid \zeta_1,\cdots,\zeta_{s-1} \right] \le \mathrm{e}.
    \end{align*}
    Then, for any $\lambda > 0$, and for any $\delta \in (0,1)$, it holds that
    \begin{align*}
        \mathbb{P}\left(\sum_{s=1}^T Z_s > \frac{1}{\lambda}\log \left({1\over \delta}\right) + \frac{3}{4}\lambda \sum_{s=1}^T \sigma_s^2 \right) \le \delta.
    \end{align*}
\end{lemma}
The following lemma is a commonly used result in the analysis of adaptive methods. See \citep{levy2018online} for a proof.
\begin{lemma}\label{lem:log_sum}
    Let $\{\alpha_s\}_{s\ge 1}$ be a non-negative sequence. For any $\varepsilon > 0$ and positive integer $t$,
    \begin{align*}
        \sum_{s=1}^t \frac{\alpha_s}{\varepsilon+\sum_{k=1}^s \alpha_k} \le \log\left(1 + \frac{1}{\varepsilon}\sum_{s=1}^t \alpha_s \right).
    \end{align*}
\end{lemma}
% \begin{lemma}\label{lem:gradient_delta_s}
% 	Suppose that $f$ is $L$-smooth and Assumption (A1) holds, then for any $\vx\in \mR^d$,
% 	\begin{align*}
% 	\|\nabla f(\vx) \|^2 \le 2L(f(\vx)-f^*).
% 	\end{align*}
% \end{lemma}


% Then, we introduce a concentration inequality for the martingale difference sequence that is useful for achieving the high probability bounds, see \cite{li2020high} for a proof.
% \begin{lemma} \label{lem:Azuma}
%     Suppose $\{Z_s\}_{s \in [T]}$ is a martingale difference sequence with respect to $\zeta_1,\cdots,\zeta_T$. Assume that for each $s \in [T]$, $\sigma_s$ is a random variable dependent on $\zeta_1,\cdots,\zeta_{s-1}$ and satisfies that
%     \begin{align*}
%         \mE\left[ \exp(Z_s^2/\sigma_s^2) \mid \zeta_1,\cdots,\zeta_{s-1} \right] \le \exp(1).
%     \end{align*}
%     Then for any $\lambda > 0$, and for any $\delta \in (0,1)$, it holds that
%     \begin{align*}
%         \mathbb{P}\left(\sum_{s=1}^T Z_s > \frac{1}{\lambda}\log \left({1\over \delta}\right) + \frac{3}{4}\lambda \sum_{s=1}^T \sigma_s^2 \right) \le \delta.
%     \end{align*}
% \end{lemma}

\section{Omitted proofs under smooth case}
In this section, we provide the missing detailed proofs for some results and lemmas used in the proof of Theorem \ref{thm:1}.

%\begin{proof}[Proof of Remark \ref{rem:beta}]
%Let us set $\eta = c_1(1-\beta)^a$ for some constants $c_1,a > 0$. It's easy to verify that $\del$ and $\del_1$ have the following order with respect to $1-\beta$,
%\begin{align*}
%    \del \sim \mO\left( \frac{1}{(1-\beta)^{1-a}}+\frac{1}{(1-\beta)^{3-2a}}\right),\quad
%    \del_1  \sim \mO\left(\frac{1}{(1-\beta)^{a-1}}+\frac{1}{(1-\beta)^{2-a}}\right).
%\end{align*}
%The convergence rate satisfies the following order with respect to $1-\beta$:
%\begin{align*}
%    \frac{1}{T}\sum_{s=1}^T\|\bar{\vg}_s\|^2 &\le \mO\left(\del_1^2 + \del_1\sqrt{\del}\right).
%\end{align*}
%\paragraph{Case 1:}When $a \le 1$, it leads to $\del \sim \mO\left(\frac{1}{(1-\beta)^{3-2a}}\right), \del_1  \sim \mO\left(\frac{1}{(1-\beta)^{2-a}}\right)$. Then,
%\begin{align*}
%    &\frac{1}{T}\sum_{s=1}^T\|\bar{\vg}_s\|^2  \le \mO\left(\frac{1}{(1-\beta)^{4-2a}} + \frac{1}{(1-\beta)^{7/2-2a}}\right).
%\end{align*}
%Thus, when $a=1$, the order is minimum of $\mO((1-\beta)^{-2})$. 
%
%\paragraph{Case 2:}When $1 < a \le \frac{3}{2}$, it leads to $\del \sim \mO\left(\frac{1}{(1-\beta)^{3-2a}}\right), \del_1  \sim \mO\left( \frac{1}{(1-\beta)^{a-1}}+ \frac{1}{(1-\beta)^{2-a}}\right) \le \mO\left(\frac{1}{(1-\beta)^{2-a}}\right)$. Then,
%\begin{align*}
%    &\frac{1}{T}\sum_{s=1}^T\|\bar{\vg}_s\|^2  \le \mO\left(\frac{1}{(1-\beta)^{4-2a}} + \frac{1}{(1-\beta)^{7/2-2a}}\right) .
%\end{align*}
%Thus, when $a=3/2$, the order is minimum of $\mO((1-\beta)^{-1})$. 
%\paragraph{Case 3:}When $\frac{3}{2} < a \le 2$, we obtain that 
%\begin{align*}
%    &\del \sim \mO\left(1\right), \quad \del_1  \sim \mO\left( \frac{1}{(1-\beta)^{a-1}}+ \frac{1}{(1-\beta)^{2-a}}\right) \le \mO\left(\frac{1}{(1-\beta)^{a-1}}\right),\quad
%    \frac{1}{T}\sum_{s=1}^T\|\bar{\vg}_s\|^2 \le \mO\left(\frac{1}{(1-\beta)^{2a-2}} \right),
%\end{align*}
%which also achieves the minimum order when $a=3/2$. 
%\paragraph{Case 4:}When $a > 2$, the order is at least $\mO((1-\beta)^{-2})$.
%
%Combining the above analysis, we obtain that when $a =3/2$, the order for convergence bound with respect to $1-\beta$ is minimum.
%\end{proof}
%\begin{proof}[Proof of Lemma \ref{lem:gradient_delta_s}]
%    Let $\hat{\vx} = \vx - \frac{1}{L} \nabla f(\vx)$. Then using the descent lemma of smoothness,
%    \begin{align*}
%        f(\hat{\vx}) 
%        &\le f(\vx) + \la \nabla f(\vx), \hat{\vx} - \vx \ra + \frac{L}{2}\|\hat{\vx} - \vx\|^2\le f(\vx) - \frac{1}{2L}\|\nabla f(\vx)\|^2.
%    \end{align*}
%    Re-arranging the order, and noting that $f(\hat{\vx}) \ge  f^*$,
%    \begin{align*}
%        \|\nabla f(\vx)\|^2 \le 2L (f(\vx) - f(\hat{\vx})) \le 2L(f(\vx) - f^*). 
%    \end{align*}
%\end{proof}

\begin{proof}[Proof of Remark \ref{rem:highpro}]
    Here, we prove the fourth point in Remark \ref{rem:highpro}. Let us fix horizon $T$ and denote $\gamma_{s} =  \frac{\|\vg_s-\bar{\vg}_s\|^2 }{ A\delx_s+B\|\bar{\vg}_s\|^2 + C }, \forall s \in [T]$. Then from the new assumption, we have
    \begin{align*}
       \mE_{\vz_s}[\exp\left( \gamma_s \right)] \le \mathrm{e},\quad \text{thus}, \quad \mE \left[\exp(\gamma_s) \right] \le \mathrm{e}.
    \end{align*}
    By Markov's inequality, for any $E \in \mR$,
    \begin{align*}
        \mathbb{P} \left( 
        \max_{s\in [T]} \gamma_s \geq E    
        \right)
        &= \mathbb{P} \left( 
        \exp\left(\max_{s\in [T]} \gamma_s \right) \geq \mathrm{e}^E   
        \right)  \leq \mathrm{e}^{-E} \mE\left[ \exp\left(\max_{s\in [T]} \gamma_s \right) \right] \le  \mathrm{e}^{-E} \mE\left[ \sum_{s=1}^T\exp\left( \gamma_s \right) \right] \leq \mathrm{e}^{-E} T \mathrm{e},
    \end{align*}
    which, upon setting $E = \log(\mathrm{e}T/\delta)$, implies that with probability at least $1-\delta$,
    \begin{align}\label{assump:sub-Gaussian}
        \|\vxi_{s}\|^2 = \|\vg_s-\bar{\vg}_s\|^2 \leq\log{\left(\mathrm{e}T \over \delta \right)} \left(  A\delx_s+B\|\bar{\vg}_s\|^2 + C   \right), \quad \forall s \in [T].  
    \end{align}
    Comparing \eqref{assump:sub-Gaussian} with Assumption (A3), we see that an additional $\log T$ factor emerges. Hence, $\mG_s$ and $\mG$ defined in \eqref{eq:define_G_t} should be revised as
    \begin{align}\label{eq:define_new_G_s}
        \mG_s = \sqrt{\left(X\delx_s + 2C \right)\log{\left(\mathrm{e}T \over \delta \right)}}, \quad \mG = \sqrt{\left(X\del + 2C \right)\log{\left(\mathrm{e}T \over \delta \right)}}.
    \end{align}
    Consequently, using an analysis similar to that of Lemma \ref{lem:1_bounded}, Lemma \ref{lem:A.1.2} and Lemma \ref{lem:A.2},
    we can reach \eqref{eq:final_1} with the new $\mG_s$ and $\mG$ in \eqref{eq:define_new_G_s}. Then, using a similar induction argument, we can deduce the final convergence rate. The additional logarithmic factor makes no essential difference to the order of $\del,\del_1$ and the convergence rate in Theorem \ref{thm:1}, up to logarithmic factors.
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:estimation_rough}]
    Let us denote ${\bm \eta}_s = \eta/(\sqrt{\vv_s}+\vep)$.
    Recalling $\vm_0=0$ and $\vm_s$ in Algorithm \ref{alg:AdaGrad}, we have
    \begin{align}
        \vm_s = \beta \vm_{s-1} - {\bm \eta}_s \odot \vg_s= \beta^2 \vm_{s-2}-\beta {\bm \eta}_{s-1} \odot \vg_{s-1}-{\bm \eta}_s \odot\vg_s=\cdots = -\sum_{j=1}^s \beta^{s-j}{\bm \eta}_j \odot \vg_j. \label{eq:ms_iteration}
    \end{align}
    Note that $|g_{s,i}| \le \sqrt{\vsi},\forall i \in [d]$. We therefore verify that $\|\vg_s/\sqrt{\vv_s}\|\le \sqrt{d}\max_{i \in [d]}|\gsi/\sqrt{\vsi}| \le \sqrt{d}$. Moreover,
    \begin{align}\label{eq:gs_gs-1}
        \|\vm_{s}\| \le \sum_{j=1}^s \beta^{s-j} \|{\bm \eta}_j \odot \vg_j\| \le \eta \sqrt{d}\sum_{j=1}^s \beta^{s-j} \left\|\frac{\vg_j}{\sqrt{\vv_j}}\right\|_{\infty} \le \frac{\eta\sqrt{d}}{1-\beta}.
    \end{align} 
    Using the smoothness of $f$,
    \begin{align*}
        \|\bar{\vg}_s \| \le \|\bar{\vg}_{s-1}\| + \|\bar{\vg}_s - \bar{\vg}_{s-1}\| \le \|\bar{\vg}_{s-1}\| + L \|\vx_s - \vx_{s-1}\| = \|\bar{\vg}_{s-1}\| + L\|\vm_{s-1}\|.
    \end{align*}
    Further using \eqref{eq:gs_gs-1},
    \begin{align*}
        \|\bar{\vg}_s \| \le \|\bar{\vg}_{s-1}\| + \frac{L\eta\sqrt{d}}{1-\beta} \le \|\bar{\vg}_1\| + \frac{L\eta s\sqrt{d}}{1-\beta}.    
    \end{align*}
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:delta_rough}]
    Recalling the iteration in Algorithm \ref{alg:AdaGrad} and then using the descent lemma,
    \begin{align*}
        f(\vx_{s+1})
        &\le f(\vx_s)+ \la \bar{\vg}_s, \vx_{s+1}-\vx_s \ra + \frac{L}{2} \|\vx_{s+1} - \vx_s\|^2 = f(\vx_s) + \la \bar{\vg}_s, \vm_s \ra + \frac{L}{2} \|\vm_s\|^2.
    \end{align*} 
    Using Cauchy-Schwarz inequality and Lemma \ref{lem:estimation_rough}, 
    \begin{align*}
        &\la \bar{\vg}_s, \vm_s \ra \le \|\bar{\vg}_s\| \cdot \|\vm_s\| \le \frac{\eta \sqrt{d}}{1-\beta}\left(\|\bar{\vg}_1\| + \frac{L\eta \sqrt{d}s}{1-\beta} \right),\quad \frac{L}{2}\|\vm_s\|^2 \le \frac{L\eta^2 d}{2(1-\beta)^2}.
    \end{align*}
    Combining with the above, we obtain that 
    \begin{align*}
        f(\vx_{s+1}) \le f(\vx_{s}) + \frac{\eta \sqrt{d}}{1-\beta}\left(\|\bar{\vg}_1\| + \frac{L\eta\sqrt{d}s}{1-\beta} \right) +  \frac{L\eta^2 d}{2(1-\beta)^2}.
    \end{align*}
    Subtracting $f^*$ from both sides and summing up over $s \in [t]$, we obtain that
    \begin{align}\label{eq:delx_{t+1}_smooth}
        \delx_{t+1} 
        &\le \delx_1 +\frac{\eta \sqrt{d}}{1-\beta} \sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{L\eta \sqrt{d}s}{1-\beta} \right) + \frac{L\eta^2 d t}{2(1-\beta)^2}.
    \end{align}
    We adopt the convention that $\sum_{s=a}^b = 0$ when $a > b$.
    Then, we sum up both sides of \eqref{eq:delx_{t+1}_smooth} over $t \in \{0,1,\cdots ,T-1\}$ to obtain that
    \begin{align*}
        \sum_{t=1}^T \delx_{t}  
        &\le \sum_{t=0}^{T-1}\delx_1 + \frac{\eta \sqrt{d}}{1-\beta} \sum_{t=0}^{T-1}\sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{L\eta \sqrt{d}s}{1-\beta} \right) + \sum_{t=0}^{T-1}\frac{L\eta^2 d t}{2(1-\beta)^2} \\
        &\le  \delx_1 T + \frac{\eta \sqrt{d}}{1-\beta} \sum_{t=1}^{T}\sum_{s=1}^t \left(\|\bar{\vg}_1\| + \frac{L\eta \sqrt{d}s}{1-\beta} \right) + \sum_{t=1}^{T}\frac{L\eta^2 d t}{2(1-\beta)^2} \\
        &\le \delx_1 T + \left(\frac{\eta \|\bar{\vg}_1\| \sqrt{d}}{1-\beta} + \frac{L\eta^2 d}{2(1-\beta)^2}\right) T^2 + \frac{L\eta^2 dT^3}{(1-\beta)^2} .
    \end{align*}
\end{proof}
\begin{proof}[Proof of Lemma \ref{lem:1_bounded}]
   Let $	X_{s} = - \left\la  \bar{\vg}_s,\frac{\vxi_s}{\va_s} \right \ra$ for any $s \in [T]$. Note that $\bar{\vg}_s$ and  $\va_s$ are random variables dependent on ${\bm z}_1,\cdots,{\bm z}_{s-1}$ and $\vxi_s$ is dependent on ${\bm z}_1,\cdots,{\bm z}_{s-1}, {\bm z}_s$.  It is easy to prove that $X_{s}$ is a martingale difference sequence since
    \begin{align*}
        &\mE\left[ X_{s} \mid \vz_1,\cdots ,\vz_{s-1}\right] = 
        -  \left\la  	 \bar{\vg}_s, \frac{\mE_{{\bm z}_s}  [\vxi_s]}{\va_s} \right\ra    = 0,
    \end{align*}
    where 
    %the second equality comes from that $\bar{\vg}_s,\va_s$ are dependent on $\vz_1,\cdots,\vz_{s-1}$ and 
    the last equality follows from Assumption (A2). Let
    \begin{align*}
        \zeta_{s} =  \left\|{\bar{\vg}_s \over \va_s}\right\| \sqrt{A \delx_s + B \|\bar{\vg}_s\|^2 + C},\quad \forall s \in [T].
    \end{align*}
    Similarly, $\zeta_{s}$ is a random variable dependent on $\vz_1,\cdots,\vz_{s-1}$. Using Cauchy-Schwarz inequality and Assumption (A3), we have
    \begin{align*}
        &\mE \left[\exp\left(\frac{X_{s}^2}{\zeta_{s}^2}\right) \mid \vz_1,\cdots,\vz_{s-1} \right] 
        \le \mE \left[ \exp\left(\frac{\| \vxi_s \|^2}{A \delx_s + B \|\bar{\vg}_s\|^2 + C} \right)\mid \vz_1,\cdots,\vz_{s-1} \right] \le \mathrm{e}.
    \end{align*}
    Therefore, given any fixed $t \in [T]$, applying Lemma \ref{lem:Azuma}, we have that for any $\lambda > 0$, with probability at least $1-\delta$,
    \begin{align}
        \sum_{s=1}^t X_{s}
        &\le \frac{3\lambda}{4}\sum_{s=1}^t \zeta_{s}^2 + \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right)= \frac{3\lambda }{4}\sum_{s=1}^t \sum_{i=1}^d\frac{ \bgsi^2}{\asi^2}\left(A \delx_s + B \|\bar{\vg}_s\|^2 + C \right)+ \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right) \nonumber \\
        &\le \frac{3\lambda }{4}\sum_{s=1}^t \sum_{i=1}^d\frac{ \bgsi^2  \mG_s^2}{\asi^2} + \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right) \le \frac{3\lambda }{4}\sum_{s=1}^t \sum_{i=1}^d \frac{ \bgsi^2  \mG_s }{\asi} + \frac{1}{\lambda} \log \left(\frac{1}{\delta} \right), \label{eq:martin_1}
    \end{align}
    where the second inequality follows from Lemma \ref{lem:gradient_delta_s} and \eqref{eq:define_G_t}. The last inequality follows from using $1/\asi \le 1/ \mG_s$, implied by \eqref{eq:proxy_stepsize}.
    Hence, for any fixed $t \in [T]$, \eqref{eq:martin_1} holds with probability at least $1-\delta$. Therefore, for any fixed $\lambda > 0$, replacing $\delta$ by $\delta/T$ and taking a union bound over $t \in [T]$, we have that with probability at least $1-\delta$, for all $t \in [T]$,
    \begin{align}\label{eq:set_lambda}
        \sum_{s=1}^t X_{s} \le \frac{3\lambda }{4}\sum_{s=1}^t \left\|\frac{ \bar{\vg}_s  }{\sqrt{\va_s}}\right\|^2 \mG_s + \frac{1}{\lambda} \log \left(\frac{T}{\delta} \right).
    \end{align}
%which is exactly 
%%    Then, summing over $i \in [d]$, we have
%    \begin{align*}
%        \sum_{s=1}^t- \left\langle \bar{\vg}_s, \frac{\vxi_s}{\va_s}\right\rangle \le \frac{3\lambda }{4}\sum_{s=1}^t \mG_s\left\|\frac{ \bar{\vg}_s  }{\sqrt{\va_s}}\right\|^2 + \frac{1}{\lambda} \log \left(\frac{dT}{\delta} \right).
%    \end{align*}
    Finally setting $\lambda = 1/(3  \mG)$, we obtain the desired result.
\end{proof} 

\begin{proof}[Proof of Lemma \ref{lem:gap_as_bs}]
Using the basic inequality, Assumption (A3) and Lemma \ref{lem:gradient_delta_s}, we have
     \begin{align*}
	         \|\vg_s\|^2 \le 2 \|\bar{\vg}_s\|^2 + 2 \|\vxi_s\|^2 \le 2A\delx_s + 2(B+1)\|\bar{\vg}_s\|^2 +2C 
	         \le (2A+4LB+4L)\delx_s +2C .
	     \end{align*}
     Thus, $\|\vg_s\| \le \mG_s,\forall s \ge 1$.
	Let  $\va_s = \sqrt{\tvv_s}+\vep$ where $\tvv_s=(\tilde{v}_{s,i})_i$. Then, for any $i \in [d]$,
	\begin{align}
		&\left|\frac{1}{\asi} - \frac{1}{\bsi} \right| 
		= \frac{|\sqrt{\tilde{v}_{s,i}}-\sqrt{\vsi}|}{\asi\bsi} 
		=\frac{|\tilde{v}_{s,i}-\vsi|}{\asi\bsi(\sqrt{\tilde{v}_{s,i}}+\sqrt{\vsi})}
		= \frac{1}{\asi\bsi}\frac{|\mG_s^2 - \gsi^2|}{\sqrt{\vsi}+\sqrt{\tilde{v}_{s,i}}} \nonumber \\
		\le &\frac{1}{\asi\bsi}\frac{\mG_s^2}{\sqrt{\vsi}+\sqrt{\tilde{v}_{s,i}}} \le \frac{\mG_s}{\asi\bsi}.\label{eq:a-b}
	\end{align}
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:A.1.2}]
	Applying Lemma \ref{lem:gap_as_bs} and Young's inequality,
	\begin{align}
		{\bf  A.1.2} 
		&\le  \sum_{s=1}^t \sum_{i=1}^d   \left| \frac{1}{\asi} - \frac{1}{\bsi}\right| |\bgsi| |\gsi |
		\le  \sum_{s=1}^t \sum_{i=1}^d \frac{ \mG_s}{\asi\bsi}  |\bgsi| |\gsi | \le  \frac{1}{4}\sum_{s=1}^t \sum_{i=1}^d  \frac{\bgsi^2}{\asi} +   \sum_{s=1}^t \sum_{i=1}^d \frac{  \mG_s^2}{\asi} \frac{\gsi^2}{\bsi^2}\nonumber\\
		&\le  \frac{1}{4}\sum_{s=1}^t \sum_{i=1}^d  \frac{\bgsi^2}{\asi} +   \sum_{s=1}^t \sum_{i=1}^d {  \mG_s} \frac{\gsi^2}{\bsi^2}, \label{eq:estA12a}
	\end{align}
where we use $1/\asi \le 1/\mG_s$ for the last inequality. 
The proof is complete.
\end{proof}



%\begin{align}\label{eq:final_1exp}
%\mE	\dely_{t+1}
%	&\le \delx_1 - \frac{\eta}{4(1-\beta)}\mE \sum_{s=1}^t  \left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2  \nonumber \\
%	& + \frac{\eta}{1-\beta} \mE \sum_{s=1}^t \mG_s \left\|\frac{\vg_s}{\vb_s} \right\|^2 \nonumber\\
%	&+ \frac{L }{2(1-\beta)}\sum_{s=1}^t \mE \|\vm_{s-1}\|^2 + \tL\sum_{s=1}^t  \mE\left\|\frac{\vg_s}{\vb_s} \right\|^2,
%\end{align}
%
%
%  \begin{align}
%\mE[	{\bf A.1.2}] \le  \frac{3}{4} \mE\sum_{s=1}^t \left\|\frac{\bar{\vg}_s}{\sqrt{\va_s}} \right\|^2 + \mE \sum_{s=1}^t \mG_s \left\|\frac{\vg_s}{\vb_s} \right\|^2. \label{eq:A.1.2exptt}
%\end{align}

\begin{proof}[Proof of Lemma \ref{lem:delta_y_x}]
    Using the descent lemma of smoothness and \eqref{eq:define_y_s},
    \begin{align*}
        f(\vx_s) 
        &\le f(\vy_s) + \la \nabla f(\vy_s), \vx_s - \vy_s \ra + \frac{L}{2} \|\vx_s - \vy_s \|^2 \\
        &= f(\vy_s) - \frac{\beta}{1-\beta} \la \nabla f(\vy_s), \vx_s - \vx_{s-1} \ra + \frac{L\beta^2}{2(1-\beta)^2} \|\vx_s - \vx_{s-1} \|^2.
    \end{align*}
    Using Young's inequality and subtracting $f^*$ on both sides,
    \begin{align*}
        \delx_s &\le \dely_s + \frac{1}{2L}\|\nabla f(\vy_s)\|^2 + \frac{(L+L)\beta^2}{2(1-\beta)^2}\|\vx_s - \vx_{s-1} \|^2 \le 2\dely_s + \frac{L\|\vm_{s-1} \|^2}{(1-\beta)^2},
    \end{align*}
    where we apply Lemma \ref{lem:gradient_delta_s} and $\beta \in [0,1)$. Finally, dividing both sides by $2$ and re-arranging the terms, we obtain the desired result.
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:sum_1}]
	Recalling the definition of $\vb_s$ and $\vv_s$, and then using Lemma \ref{lem:log_sum},
	\begin{align}\label{eq:gs/bs}
		\sum_{s=1}^t \frac{\gsi^2}{\bsi^2} = \sum_{s=1}^t \frac{\gsi^2}{(\sqrt{\vsi}+\ep)^2} \le \sum_{s=1}^t \frac{\gsi^2}{\vsi+\ep^2} \le \log\left(1+\frac{1}{\ep^2}\sum_{s=1}^t \gsi^2  \right).
	\end{align}
	Using the basic inequality, Assumption (A3), and Lemma \ref{lem:gradient_delta_s},
	\begin{align}
		\sum_{j=1}^t g_{j,i}^2 &\le \sum_{j=1}^T g_{j,i}^2 
		\le 2\sum_{j=1}^T (\bar{g}_{j,i}^2 + \xi_{j,i}^2) \le 2\sum_{j=1}^T (\|\bar{\vg}_{j}\|^2 + \|\vxi_{j}\|^2)  \nonumber\\
		&\le 2 A \sum_{j=1}^T\delx_j + 2(B+1) \sum_{j=1}^T\|\bar{\vg}_j\|^2 + 2CT\le X\sum_{j=1}^T \delx_j + 2CT.\label{eq:analysis_poly_F_1}
	\end{align}
	Combining with \eqref{eq:gs/bs} and Lemma \ref{lem:delta_rough}, we obtain that 
	\begin{align}
		\sum_{s=1}^t \frac{\gsi^2}{\bsi^2} \le \log\left(1 + \frac{1}{\ep^2}\sum_{j=1}^T g_{j,i}^2\right) \le \log\left(1 + \frac{1}{\ep^2}\left(X\sum_{j=1}^T \delx_j + 2CT\right)\right) \le \log\mF_T,\label{eq:analysis_poly_F_2}
	\end{align}
	where $\mF_T$ is defined as
	\begin{align}\label{eq:define_poly_F}
		\mF_T:=  1 + \frac{1}{\ep^2}\left[ (\delx_1 X + 2C)   T + \left(\frac{\eta \|\bar{\vg}_1\| \sqrt{d}}{1-\beta} + \frac{L\eta^2 d}{2(1-\beta)^2}\right)X  T^2 + \frac{L\eta^2 dX  T^3}{(1-\beta)^2}\right].
	\end{align}
	Summing up \eqref{eq:analysis_poly_F_2} over $i \in [d]$, we prove the first desired result.
 
	Then we move to estimate terms related to $\vm_s$.
 Let  $M_s = \sum_{j=1}^s \beta^{s-j}$. For any $i \in [d]$, recalling \eqref{eq:ms_iteration} and using the convexity of the square function,
	\begin{align}
		\msi^2 = \eta^2\left(\sum_{j=1}^s \frac{\beta^{s-j} g_{j,i}}{\sqrt{v_{j,i}}+\ep} \right)^2 \le \eta^2 M_s\sum_{j=1}^s  \frac{\beta^{s-j}g_{j,i}^2}{(\sqrt{v_{j,i}}+\ep)^2} \le \eta^2M_s\sum_{j=1}^s  \frac{\beta^{s-j}g_{j,i}^2}{v_{j,i}+\ep^2}. \label{eq:bound_msi_middle}
	\end{align}
	Summing over $i \in [d]$, using $\beta < 1$ with $M_s \leq {1 \over 1-\beta}$, Lemma \ref{lem:log_sum} and $s \le T$,
	\begin{align}
		\|\vm_s\|^2 &\le \eta^2 M_s \sum_{i=1}^d \sum_{j=1}^s \frac{g_{j,i}^2}{\sum_{k=1}^j g_{k,i}^2+\ep^2}  \le \frac{\eta^2}{1-\beta} \sum_{i=1}^d \log \left(1+\frac{1}{\ep^2}\sum_{j=1}^s g_{j,i}^2 \right).\label{eq:ms}
	\end{align}
	We also sum up \eqref{eq:bound_msi_middle} over $s \in [t]$ and apply Lemma \ref{lem:log_sum} to obtain that
	\begin{align*}
		\sum_{s=1}^t \msi^2 &\le \eta^2 \sum_{s=1}^t M_s \sum_{j=1}^s  \frac{\beta^{s-j}g_{j,i}^2}{v_{j,i}+\ep^2} =\eta^2 \sum_{j=1}^t \frac{g_{j,i}^2}{v_{j,i}+\ep^2} \sum_{s=j}^t M_s \beta^{s-j} \\
		& \le \frac{\eta^2}{(1-\beta)^2}\sum_{j=1}^t \frac{g_{j,i}^2}{v_{j,i}+\ep^2} \le \frac{\eta^2}{(1-\beta)^2}\log\left(1+\frac{1}{\ep^2}\sum_{j=1}^t g_{j,i}^2 \right).
	\end{align*}
	Summing over $i \in [d]$, we obtain that
	\begin{align}
		\sum_{s=1}^t \|\vm_s\|^2 \le \frac{\eta^2}{(1-\beta)^2} \sum_{i=1}^d \log\left(1+\frac{1}{\ep^2}\sum_{j=1}^t g_{j,i}^2 \right). \label{eq:sum_ms}
	\end{align}
	Finally, we could follow the similar analysis for \eqref{eq:analysis_poly_F_1} and \eqref{eq:analysis_poly_F_2} to deduce that the terms inside the logarithm operator in \eqref{eq:sum_ms} could be further bounded by $\mF_T$ and thereby verify the target results.
\end{proof}


% \input{expectation}


\input{general}
