\section{Proof of Theorem~\ref{thm:large-step-size}}\label{sec:proof}

Before the analysis, we slightly modify the notation to simplify the presentation. The large step update takes $x_{t,K}$ as input and outputs $x_{t+1,1}$. Then, \algadamw uses small step updates to obtain $x_{t+1,2},\ldots,x_{t+1,K}$. Note that the indices of the small step updates in Algorithm~\ref{alg:agma} range from $0$ to $K-1$, whereas in the following analysis they range from $1$ to $K$.


We start with some important lemmas. Firstly, we consider the size of the small step updates between two large step updates. To start with, Lemma~\ref{lem:single-moment-tau} bounds the size of every small step update.
\begin{lemma}\label{lem:single-moment-tau}
When $\tau\geq 2$,
\begin{equation}
    \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}}\right\|\leq \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{\tau}} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\| + \tau-1\right).
\end{equation}
\end{lemma}
\begin{proof}
\begin{equation*}
\begin{aligned}
    \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}}\right\| & = \left\|\frac{\frac{\tau-1}{\tau}m_{t,\tau-1}+\frac{1-\beta_1}{\tau}g_{t,\tau-1}}{\sqrt{\frac{\tau-1}{\tau}v_{t,\tau-1}+\frac{1-\beta_2}{\tau}g_{t,\tau-1}^2}}\right\|\\
    & \leq \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{\tau}}\cdot \left\|\frac{m_{t,1}+\sum_{\sigma=2}^\tau g_{t,\sigma}}{\sqrt{v_{t,1}+\sum_{\sigma=2}^\tau g_{t,\sigma}^2}}\right\|\\
    & \leq \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{\tau}} \left(\sum_{\sigma=2}^{\tau}\left\|\frac{g_{t,\sigma}}{\sqrt{v_{t,1}+\sum_{\rho=2}^\tau g_{t,\rho}^2}}\right\| + \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}+\sum_{\sigma=2}^\tau g_{t,\sigma}^2}}\right\| \right)\\
    &\leq \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{\tau}} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\| + \sum_{\sigma=2}^{\tau}\left\|\frac{g_{t,\sigma}}{\sqrt{g_{t,\sigma}^2}}\right\| \right)\\
    &=\frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{\tau}} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\| + \tau-1\right)
\end{aligned}
\end{equation*}
\end{proof}

Then, we bound the squared size of the small step update.
\begin{corollary}\label{col:single-moment-tau-square}
\begin{equation}
    \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}}\right\|^2\leq \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{\tau} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 +\tau^2 \right).
\end{equation}
\end{corollary}
\begin{proof}
Since $(a+b)^2=a^2+b^2+2ab\leq 2(a^2+b^2)$,
\begin{equation*}
\begin{aligned}
    \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}}\right\|^2&\leq \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{1}{\tau} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\| + \tau-1\right)^2\\
    &\leq\frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{\tau} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 +\tau^2 \right).
\end{aligned}
\end{equation*}
\end{proof}

Then, we bound the sum of the squared size of small step updates between two large step updates.
\begin{corollary}\label{col:sum-moment-tau-square}
\begin{equation}
    \sum_{\sigma=1}^{\tau-1} \left\|\frac{m_{t,\sigma}}{\sqrt{v_{t,\sigma}}}\right\|^2\leq \frac{(1-\beta_1)^2}{1-\beta_2}\sum_{\sigma=1}^{\tau-2} \frac{2}{\sigma} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2+\sigma^2\right) + \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2.
\end{equation}
\end{corollary}
\begin{proof}
\begin{equation*}
\begin{aligned}
    \sum_{\sigma=1}^{\tau-1} \left\|\frac{m_{t,\sigma}}{\sqrt{v_{t,\sigma}}}\right\|^2&= \sum_{\sigma=1}^{\tau-2} \left\|\frac{m_{t,\sigma}}{\sqrt{v_{t,\sigma}}}\right\|^2 + \left\|\frac{m_{t,\tau-1}}{\sqrt{v_{t,\tau-1}}}\right\|^2\\
    &\underset{\leq}{\text{Corollary~\ref{col:single-moment-tau-square}}}\quad \sum_{\sigma=1}^{\tau-2} \left\|\frac{m_{t,\sigma}}{\sqrt{v_{t,\sigma}}}\right\|^2 + \frac{(1-\beta_1)^2}{1-\beta_2}\cdot\frac{2}{\tau-1}\cdot\left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2+(\tau-1)^2\right)\\
    &\leq \frac{(1-\beta_1)^2}{1-\beta_2}\sum_{\sigma=1}^{\tau-2} \frac{2}{\sigma} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2+\sigma^2\right) + \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2.
\end{aligned}
\end{equation*}
\end{proof}

After bounding the small steps between two large update steps, we consider the size of a large update step and $K$ following small steps. To start with, we assume that the update size of the first large step is bounded.
\begin{assumption}\label{asp:bounded-moment}
Let $a=\frac{\beta_1(1-\beta_1)}{\sqrt{\beta_2(1-\beta_2)}}\cdot \frac{1}{\sqrt{K}}$ and $b=\frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{K}}\cdot \left(1+\frac{\beta_1K}{\sqrt{\beta_2}}\right)$.
\begin{equation}
    \left\|\frac{m_{1,1}}{\sqrt{v_{1,1}}}\right\| \leq \frac{b}{1-a} + \alpha.
\end{equation}
\end{assumption}

Then, we make some assumptions on the weight of the momentum.
\begin{assumption}\label{asp:beta}
For all $t$,
\begin{equation*}
    \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}\leq 1.
\end{equation*}
\end{assumption}
If we take $\beta_1=0.9$, $\beta_2=0.99$ as the default configuration of Adam, this assumption holds.

Then, we bound the size of a large update step.
\begin{lemma}\label{lem:single-moment-1}
Suppose the hyper-parameters $\beta_1$ and $\beta_2$ are chosen such that $a\leq 1/2$. Then
\begin{equation}
    % \left\|\frac{m_{t,1}}{\sqrt{v_{t,\tau}}}\right\| \leq a^{t-1}\left(\left\|\frac{m_{1,1}}{\sqrt{v_{t,1}}}\right\|-\frac{b}{1-a}\right)+\frac{b}{1-a}.
    \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\| \leq \bar{\alpha}\cdot a^{t-1},
\end{equation}
where $\bar{\alpha}>\alpha$ is a constant to make $\bar{\alpha}\cdot a^{t-1}\geq \alpha\cdot a^{t-1}+\frac{b}{1-a}+K$.
\end{lemma}
\begin{proof}
\begin{equation*}
\begin{aligned}
    \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|&=\left\|\frac{\beta_1 m_{t-1,K}+\frac{1-\beta_1}{K}g_{t,1}}{\sqrt{\beta_2 v_{t-1,K}+\frac{1-\beta_2}{K}g_{t,1}^2}}\right\|\\
    &\leq \left\|\frac{\beta_1 m_{t-1,K}}{\sqrt{\beta_2 v_{t-1,K}}}\right\| + \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot\frac{1}{\sqrt{K}}\cdot \left\|\frac{g_{t,1}}{\sqrt{g_{t,1}^2}}\right\|\\
    &= \left\|\frac{\beta_1 m_{t-1,K}}{\sqrt{\beta_2 v_{t-1,K}}}\right\| + \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot\frac{1}{\sqrt{K}}\\
    &\underset{\leq}{\text{Lem.~\ref{lem:single-moment-tau}}}\quad \frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot\frac{1}{\sqrt{K}} + \frac{\beta_1}{\sqrt{\beta_2}}\cdot\frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{K}}\cdot \left(\left\|\frac{m_{t-1,1}}{\sqrt{v_{t-1,1}}}\right\|+K-1\right)\\
    &\leq \underbrace{\frac{\beta_1(1-\beta_1)}{\sqrt{\beta_2(1-\beta_2)}}\cdot \frac{1}{\sqrt{K}}}_{:=a}\cdot \left\|\frac{m_{t-1,1}}{\sqrt{v_{t-1,1}}}\right\| + \underbrace{\frac{1-\beta_1}{\sqrt{1-\beta_2}}\cdot \frac{1}{\sqrt{K}}\cdot \left(1+\frac{\beta_1K}{\sqrt{\beta_2}}\right)}_{:=b}
\end{aligned}
\end{equation*}
Let $x_t=\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|$, then
\begin{equation*}
    x_t\leq a^{t-1}\left(x_1-\frac{b}{1-a}\right)+\frac{b}{1-a}
\end{equation*}
Then, by Assumption~\ref{asp:bounded-moment}
\begin{equation*}
\begin{aligned}
    \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\| \leq \alpha\cdot a^{t-1}+\frac{b}{1-a} \leq \bar{\alpha}\cdot a^{t-1}.
\end{aligned}
\end{equation*}
\end{proof}

Similarly to the above approach, we assume that the squared size of the first large step is bounded, and then prove that the squared sizes of all large steps are bounded.
\begin{assumption}\label{asp:bounded-moment-square}
Let $c=\frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{K}\left(1+\frac{2\beta_1^2K^2}{\beta_2}\right)$.
\begin{equation}
    \left\|\frac{m_{1,1}}{\sqrt{v_{1,1}}}\right\|^2 \leq \frac{c}{1-4a^2} + \zeta.
\end{equation}
\end{assumption}

\begin{corollary}\label{col:single-moment-1-square}
% Let $c=\frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{K}\left(1+\frac{2\beta_1^2K^2}{\beta_2^2}\right)$
\begin{equation}
    \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2\leq \bar{\zeta}(2a)^{2t-2},
\end{equation}
where $\bar{\zeta}>\zeta$ is a constant to make $\bar{\zeta}(2a)^{2t-2}\geq \zeta(2a)^{2t-2}+\frac{c}{1-4a^2} + K^2$.
\end{corollary}
\begin{proof}
\begin{equation*}
\begin{aligned}
    \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 &\leq 2\left\|\frac{\beta_1 m_{t-1,K}}{\sqrt{\beta_2 v_{t-1,K}}}\right\|^2 + \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{K}\\
    &\leq \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{K} + \frac{4}{K}\cdot \frac{(1-\beta_1)^2\beta_1^2}{(1-\beta_2)\beta_2}\cdot \left(\left\|\frac{m_{t-1,1}}{\sqrt{v_{t-1,1}}}\right\|^2+K^2\right)\\
    &=4a^2 \left\|\frac{m_{t-1,1}}{\sqrt{v_{t-1,1}}}\right\|^2 + c.
\end{aligned}
\end{equation*}
Then, 
\begin{equation*}
\begin{aligned}
    \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 &\leq (2a)^{2t-2}\left(\left\|\frac{m_{1,1}}{\sqrt{v_{1,1}}}\right\|^2-\frac{c}{1-4a^2}\right)+\frac{c}{1-4a^2}\\
    &\leq \zeta(2a)^{2t-2}+\frac{c}{1-4a^2}\\
    &\leq \bar{\zeta}(2a)^{2t-2}.
\end{aligned}
\end{equation*}
\end{proof}

Before proving Theorem~\ref{thm:large-step-size}, we need more assumptions on the objective function and the initial point. First, we assume that $f$ has Lipschitz continuous gradient.
\begin{assumption}[$L$-smoothness]
A function $f$: $\mathbb{R}^d\to\mathbb{R}$ is differentiable and for any $x_1,x_2\in\mathbb{R}^d$, 
\begin{equation*}
    \|\nabla f(x_1)-\nabla f(x_2)\|\leq L\|x_1-x_2\|,
\end{equation*}
where $L$ is a constant.
\end{assumption}

% \begin{assumption}\label{asp:bounded-initial}
% The distance between the initial point $x_{1,1}$ and the optimal point is bounded, such that
% \begin{equation}
%     \|x_{1,1}-x^*\|\leq B,
% \end{equation}
% where $B$ is a constant and $B\leq \frac{2}{L} \left(1+\frac{\gamma^2L}{K} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \frac{\bar{\zeta}}{1-4a^2}$.
% \end{assumption}

Now, by putting everything together, we are ready to prove Theorem~\ref{thm:large-step-size}.
\begin{proof}[Proof of Theorem~\ref{thm:large-step-size}]
% \begin{equation*}
% \begin{aligned}
%     f(x_{t,1})-f(x^*)&\leq \langle\nabla f(x^*), x_{t,1}-x^*\rangle + \frac{L}{2}\|x_{t,1}-x^*\|^2\\
%     &\leq \frac{L}{2}\|x_{1,1}-x^*\|^2+\sum_{i=1}^{t-1}\frac{L}{2} \|x_{i+1,1}-x_{i}\|^2\\
%     &\underset{\leq}{\text{Assumption~\ref{asp:bounded-initial}}}\quad \frac{LB}{2}+\sum_{i=1}^{t-1}\frac{L}{2} \|x_{i+1,1}-x_{i}\|^2.
% \end{aligned}
% \end{equation*}
Let $\mathcal{T}=\frac{L}{2} \|x_{t+1,1}-x_{t,1}\|^2$. We first assume that \algadamw{} does not employ the bias correction shown in Lines 12--13 of Algorithm~\ref{alg:agma}.
\begin{equation*}
\begin{aligned}
    \mathcal{T}&=\frac{L}{2}\|x_{t+1,1}-x_{t,1}\|^2\\
    &\leq \frac{L}{2} \sum_{\tau=1}^{K-1} \|x_{t,\tau+1}-x_{t,\tau}\|^2 + \frac{L}{2}\|x_{t+1,1}-x_{t,K}\|^2\\
    &=\frac{L}{2} \sum_{\tau=1}^{K-1} \left\|\frac{\gamma}{\sqrt{K}}\frac{\hat{m}_{t,\tau}}{\sqrt{\hat{v}_{t,\tau}}} \right\|^2 + \frac{L}{2} \left\|\gamma\cdot \frac{\hat{m}_{t,K}}{\sqrt{\hat{v}_{t,K}}}\right\|^2\\
    &=\frac{\gamma^2L}{2\sqrt{K}} \sum_{\tau=1}^{K-1} \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}} \right\|^2 +\frac{\gamma^2L\sqrt{K}}{2} \left\|\frac{m_{t,K}}{\sqrt{v_{t,K}}}\right\|^2.
\end{aligned}
\end{equation*}
\begin{equation*}
\begin{aligned}
    &\mathcal{T} \leq \frac{\gamma^2L}{2\sqrt{K}} \sum_{\tau=1}^{K-1} \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}} \right\|^2 +\frac{\gamma^2L\sqrt{K}}{2} \left\|\frac{m_{t,K}}{\sqrt{v_{t,K}}}\right\|^2\\
    &\underset{\leq}{\text{Corollary~\ref{col:single-moment-tau-square}}}\quad \frac{\gamma^2L}{2\sqrt{K}} \sum_{\tau=1}^{K-1} \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}} \right\|^2 +\frac{\gamma^2L}{2} \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{\sqrt{K}} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 +K^2 \right)\\
    &=\frac{\gamma^2L}{2\sqrt{K}} \sum_{\tau=1}^{K-1} \left\|\frac{m_{t,\tau}}{\sqrt{v_{t,\tau}}} \right\|^2 + \frac{\gamma^2L}{\sqrt{K}}\cdot \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 +K^2 \right)\\
    &\underset{\leq}{\text{Corollary~\ref{col:sum-moment-tau-square}}}\quad \frac{\gamma^2L}{2\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2}\sum_{\sigma=1}^{K-2} \frac{2}{\sigma} \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2+\sigma^2\right) + \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 + \frac{\gamma^2L}{\sqrt{K}}\cdot \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \left(\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 +K^2 \right)\\
    &\leq \frac{\gamma^2L}{\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2} \left(K\cdot \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2+K^2\right) + \left(1+\frac{\gamma^2L}{\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2}\right)\left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 + \gamma^2LK^{\frac{3}{2}}\frac{(1-\beta_1)^2}{1-\beta_2}\\
    &=\left(1+\frac{\gamma^2L}{\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \left\|\frac{m_{t,1}}{\sqrt{v_{t,1}}}\right\|^2 + 2\gamma^2LK^{\frac{3}{2}}\frac{(1-\beta_1)^2}{1-\beta_2}\\
    &\underset{\leq}{\text{Corollary~\ref{col:single-moment-1-square}}}\quad \left(1+\frac{\gamma^2L}{\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \bar{\zeta}(2a)^{2t-2} + 2\gamma^2LK^{\frac{3}{2}}\frac{(1-\beta_1)^2}{1-\beta_2}\\
    &\underset{\leq}{\text{Larger }\bar{\zeta}}\quad \left(1+\frac{\gamma^2L}{\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \bar{\zeta}(2a)^{2t-2}.
\end{aligned}
\end{equation*}
Thus, 
\begin{equation*}
\begin{aligned}
    \|x_{t+1,1}-x_{t,1}\|^2\leq \frac{2}{L} \left(1+\frac{\gamma^2L}{\sqrt{K}} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \bar{\zeta}(2a)^{2t-2}.
\end{aligned}
\end{equation*}

Then, we consider the bias correction shown in Lines 12--13 of Algorithm~\ref{alg:agma}. With the bias correction, the learning rate at time step $t$ can be viewed as $\gamma_t=\frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}\gamma\leq \gamma$ by Assumption~\ref{asp:beta}. Hence, the bound still holds when the bias correction is applied.

% \begin{equation*}
% \begin{aligned}
%     f(x_{t,1})-f(x^*)&\leq \frac{LB}{2} +\sum_{i=1}^{t-1} \left(1+\frac{\gamma^2L}{K} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \bar{\zeta}(2a)^{2i-2}\\
%     &=\frac{LB}{2} + \left(1+\frac{\gamma^2L}{K} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \bar{\zeta}\frac{1-(2a)^{2t-2}}{1-4a^2}.
% \end{aligned}
% \end{equation*}

% Since
% \begin{equation*}
% \begin{aligned}
%     \|x_{t+1,1}-x_{t.1}\|^2 = \|(x_{t,1}-x^*)-(x_{t+,1}-x^*)\|^2\geq \|x_{t,1}-x^*\|^2-\|x_{t+1,1}-x^*\|^2,
% \end{aligned}
% \end{equation*}
% \begin{equation*}
% \begin{aligned}
%     \|x_{t,1}-x^*\|^2-\|x_{t+1,1}-x^*\|^2\leq \frac{2}{L} \left(1+\frac{\gamma^2L}{K} \frac{(1-\beta_1)^2}{1-\beta_2}(K+1)\right)\cdot \bar{\zeta}(2a)^{2t-2}
% \end{aligned}
% \end{equation*}

% Since
% \begin{equation*}
% \begin{aligned}
%     \|x_{t+1,1}-x_{t.1}\|^2 \geq \frac{1}{L}\|\nabla f(x_{t+1,1})-\nabla f(x_{t,1})\|^2
% \end{aligned}
% \end{equation*}

\end{proof}

\section{Proof of Theorem~\ref{thm:regret}}
Before the analysis, we assume that the variable is bounded, as assumed in~\cite{kingma2014adam}.
\begin{assumption}\label{asp:bounded-variable}
We assume that the distance between the variable and the optimal point is bounded during the optimization process, such that $\|x_{t,\tau}-x^*\|_2\leq D$, $\|x_{i,j}-x_{k,l}\|_{\infty}\leq D_{\infty}$.
\end{assumption}

\begin{proof}[Proof of Theorem~\ref{thm:regret}]
Since the objective function $f$ is convex,
\begin{equation*}
    f(x_{t,K})-f(x^*)\leq \langle\nabla f(x_{t,K}), x_{t,K}-x^*\rangle = \sum_{i=1}^d g_{t,K,i}(x_{t,K,i}-x^*_i).
\end{equation*}
Using the update method defined in Algorithm~\ref{alg:agma}, we can get
\begin{equation*}
\begin{aligned}
    x_{t+1,1,i} &= x_{t,K,i}-\gamma\frac{\hat{m}_{t,K,i}}{\sqrt{\hat{v}_{t,K,i}}}\\
    &= x_{t,K,i} - \frac{\gamma}{1-\beta_1^t}\left(\frac{K-1}{\sqrt{K}\sqrt{v_{t,K,i}}} m_{t,K-1,i} + \frac{1-\beta_1}{\sqrt{K}\sqrt{v_{t,K,i}}}g_{t,K,i}\right).
\end{aligned}
\end{equation*}
\begin{equation*}
\begin{aligned}
    (x_{t+1,1,i}-x^*_i)^2 &= (x_{t,K,i}-x^*_i)^2 -\frac{2\gamma}{1-\beta_1^t}\left( x_{t,K,i}-x^*_i\right) \left(\frac{K-1}{\sqrt{K}\sqrt{v_{t,K,i}}} m_{t,K-1,i} + \frac{1-\beta_1}{\sqrt{K}\sqrt{v_{t,K,i}}}g_{t,K,i}\right)\\
    &+ \gamma^2K\left(\frac{m_{t,K,i}}{\sqrt{v_{t,K,i}}}\right)^2.
\end{aligned}
\end{equation*}
Rearrange the equation above,
\begin{equation*}
\begin{aligned}
    &g_{t,K,i} (x_{t,K,i}-x_i^*) = \frac{(1-\beta_1^t)\sqrt{K}\sqrt{v_{t,K,i}}}{2\gamma (1-\beta_1)}\left((x_{t+1,K,i}-x^*_i)^2-(x_{t,K,i}-x^*_i)^2\right)\\
    &+ \frac{K-1}{1-\beta_1} m_{t,K-1,i} (x_{t,K,i}-x^*_i) + \frac{(1-\beta_1^t)\gamma K^{\frac{3}{2}}\sqrt{v_{t,K,i}}}{2(1-\beta_1)}\left(\frac{m_{t,K,i}}{\sqrt{v_{t,K,i}}}\right)^2\\
    &\leq \frac{\sqrt{K}\sqrt{v_{t,K,i}}}{2\gamma (1-\beta_1)}\left((x_{t+1,K,i}-x_i^*)^2-(x_{t,K,i}-x_i^*)^2\right)\\
    &+ \frac{K-1}{1-\beta_1} (x_{t,K,i}-x_i^*) \sqrt{v_{t,K-1,i}} \left(\frac{m_{t,K-1,i}}{\sqrt{v_{t,K-1,i}}}\right) + \frac{\gamma K^{\frac{3}{2}}}{2(1-\beta_1)} \frac{m_{t,K,i}^2}{\sqrt{v_{t,K,i}}}\\
    &\leq \frac{\sqrt{K}\sqrt{v_{t,K,i}}}{2\gamma (1-\beta_1)}\left((x_{t+1,K,i}-x_i^*)^2-(x_{t,K,i}-x_i^*)^2\right)\\
    &+ \frac{K-1}{2(1-\beta_1)} (x_{t,K,i}-x_i^*)^2\cdot \sqrt{v_{t,K,i}} + \frac{K-1}{2(1-\beta_1)} \frac{m_{t,K-1,i}^2}{\sqrt{v_{t,K-1,i}}} + \frac{\gamma K^{\frac{3}{2}}}{2(1-\beta_1)} \frac{m_{t,K,i}^2}{\sqrt{v_{t,K,i}}}.
\end{aligned}
\end{equation*}

\begin{equation*}
\begin{aligned}
    R_K(T)&\leq \sum_{t=1}^T \sum_{i=1}^d g_{t,K,i}(x_{t,K,i}-x^*_i)\\
    &\leq \sum_{i=1}^d\sum_{t=1}^T \frac{\sqrt{K}\sqrt{v_{t,K,i}}}{2\gamma (1-\beta_1)} (x_{t+1,K,i}-x_i^*)^2- \frac{\sqrt{K}\sqrt{v_{t,K,i}}}{2\gamma (1-\beta_1)}(x_{t,K,i}-x_i^*)^2\\
    &+ \frac{K-1}{2(1-\beta_1)} (x_{t,K,i}-x_i^*)^2\cdot \sqrt{v_{t,K,i}} + \frac{K-1}{2(1-\beta_1)} \frac{m_{t,K-1,i}^2}{\sqrt{v_{t,K-1,i}}} + \frac{\gamma K^{\frac{3}{2}}}{2(1-\beta_1)} \frac{m_{t,K,i}^2}{\sqrt{v_{t,K,i}}}\\
    &\underset{\leq}{\text{Lemma~\ref{lem:single-moment-1}}}\quad  \frac{\sqrt{K}D^2}{2\gamma (1-\beta_1)}  \sum_{i=1}^d\sqrt{T\hat{v}_{T,K,i}}+ \frac{D^2_{\infty}(K-1)}{2(1-\beta_1)} \cdot \sum_{i=1}^d\sum_{t=1}^T \sqrt{v_{T,K,i}}\\
    &+ \frac{(1+\gamma)K^{\frac{3}{2}}G_{\infty}}{2(1-\beta_1)}\sum_{i=1}^d \|g_{1:KT,i}\|_2\\
    % &+ \sum_{i=1}^d\left(\frac{(K-1)\sqrt{v_{t,K-1}}}{2(1-\beta_1)}\cdot \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{K-1} + \frac{\gamma K \sqrt{v_{t,K-1}}}{2(1-\beta_1)} \frac{(1-\beta_1)^2}{1-\beta_2}\cdot \frac{2}{K} \right)\sum_{t=1}^T \bar{\alpha} a^{t-1}\\
    % &\leq  \frac{KD^2 + \gamma D_{\infty}^2(K-1)}{2\gamma (1-\beta_1)} \| \sqrt{\hat{v}_{T,K}}\|\sqrt{T} + \frac{1-\beta_1}{2(1-\beta_2)} \frac{\bar{\alpha}(\gamma+1)}{1-a}.
    &\leq \frac{\sqrt{K}D^2}{2\gamma (1-\beta_1)}  \sum_{i=1}^d\sqrt{T\hat{v}_{T,K,i}}+ \frac{(1+\gamma)K^{\frac{3}{2}}G_{\infty}}{2(1-\beta_1)}\sum_{i=1}^d \|g_{1:KT,i}\|_2 + \frac{D^2_{\infty}G_{\infty}(K-1)}{2(1-\beta_1)}.
\end{aligned}
\end{equation*}

\end{proof}

