\clearpage
\section{Hot DoG Convergence}\label{sec:convergence_proof}
\subsection{Update Rule}\label{sec:update_rule}
The Hot DoG update for coordinate $i \in [M]$ at iteration $t\in\nats_+$ can be written as
\[
    m_{t,i} &= \beta_1 m_{t-1, i} + g_i(w_{t-1}, \theta_{t-1}, \mathcal{S}_{t-1}) \label{eq:m_update}\\
    v_{t,i} &= \beta_2 v_{t-1, i} + \left(g_i(w_{t-1}, \theta_{t-1}, \mathcal{S}_{t-1})\right)^2 \label{eq:v_update}\\
    w_{t,i} &= w_{t-1,i} - \alpha_{t,i} \frac{m_{t,i}}{\sqrt{t \cdot (\epsilon + v_{t,i})}},
\]
where
\[
    \alpha_{t,i} &= \frac{1-\beta_1}{1-\beta_1^t} \frac{\sqrt{1-\beta_2^t}}{\sqrt{1-\beta_2}} \tilde{r}_{t,i}\\
    \tilde{r}_{t,i} &= (1-\beta_1^t)^{-1}\left((1-\beta_1)\left(\sum_{k=0}^{t-1}\beta_1^k\bar{r}_{t-k,i}\right) + \beta_1^t\bar{r}_{0,i}\right)\\
    \bar{r}_{t,i} &= \left(\max_{k\leq t}\{ |w_{k,i} - w_{0,i}| \}\right) \vee r_\delta.
\]
We initialize the algorithm such that $m_0=0$ and $v_0=0$.
Define $s_t \in \reals^N$ such that $\forall j \in \mathcal{S}_t, s_{tj} = \frac{N}{S}$ and $0$ otherwise.
The $M$-dimensional subsampled gradient estimate as defined in \cref{eq:gradest} then takes the form
\[
    g(w_{t-1}, \theta_{t-1}, \mathcal{S}_{t-1}) = G_{t-1}(w_{t-1} - w^\star) + H_{t-1}(1-s_{t-1}), \label{eq:gradient}
\]
where  
\[
    G_{t\!-\!1} \!=\! \frac{1}{K\!-\!1} \!\sum_{k=1}^K\! \begin{bmatrix} \bar{\ell}_1 (\theta_{(t\!-\!1)k}) \\ \vdots \\ \bar{\ell}_M (\theta_{(t-1)k}) \end{bmatrix}
                                        \begin{bmatrix} \bar{\ell}_1 (\theta_{(t\!-\!1)k}) \\ \vdots \\ \bar{\ell}_M (\theta_{(t-1)k}) \end{bmatrix}^\top \!\in\! \reals^{M\!\times\! M},
    H_{t\!-\!1} \!=\! \frac{1}{K\!-\!1}\!\sum_{k=1}^K\!
                \bbmat \bar\ell_1(\theta_{(t\!-\!1)k})\\ \vdots \\ \bar\ell_M(\theta_{(t-1)k})\ebmat
                \bbmat \bar\ell_1(\theta_{(t\!-\!1)k})\\ \vdots \\ \bar\ell_N(\theta_{(t-1)k})\ebmat^\top
                \!\in\! \reals^{M\!\times\! N}.
\]
Note that in \cref{eq:gradient}, both matrix-vector products on the right hand side give us vectors of dimension $M$, 
which aligns with the desired dimension of the gradient estimate.
We also define here two quantities that improve the readability of the proofs presented in the following subsections:
\[
    R_{t-1} = \begin{bmatrix}\frac{\alpha_{t,1}}{\sqrt{t\cdot (\epsilon + v_{t,1})}} & \cdots & \frac{\alpha_{t,M}}{\sqrt{t\cdot (\epsilon + v_{t,M})}}\end{bmatrix}^\top, \quad\quad
    \Delta_{t-j} = w_{t-j-1} - w_{t-j} = \alpha_{t-j} \odot \frac{m_{t-j}}{\sqrt{(t-j)\cdot \left(\epsilon + v_{t-j}\right)}}. \label{eq:simple_notation}
\]
\subsection{Assumptions}\label{sec:assumptions}
\begin{assumption}[Coreset weight constraint]\label{assump:constraint}
    $\mathcal{W} = \{w \in \reals^M: w \geq 0, \sum_{m=1}^M w_{m} \leq B\}$.
\end{assumption}
\begin{assumption}[Exact coreset]\label{assump:exact}
    There exists a $w^\star \in \reals^M, c^\star \in \reals$ such that $w^\star \in \mathcal{W}$ and 
    \[
        \sum_{n=1}^N \ell_n(\cdot) = \sum_{m=1}^M w_m^\star \ell_m(\cdot) + c^\star \quad \pi_0\text{-a.e.}
    \]
\end{assumption}
\begin{assumption}[Bounded gradient]\label{assump:grad_bound}
    There exists $U>0$ such that
    \[
        \forall w_t \in \mathcal{W}, \theta_t \in \Theta^K, \mathcal{S}_{t} \subseteq [N] \,\,\,\, 
        \|g(w_t,\theta_t, \mathcal{S}_t)\|_\infty \leq U.
    \]
\end{assumption}
\begin{assumption}[Markov gradient mixing]\label{assump:mixing}
    There exists $\lambda>0$ such that 
    \[
        \forall w_t \in \mathcal{W}, \theta_{t-1} \in \Theta^K \,\,\,\, 
        \E\left[ G_t \middle | w_t, \theta_{t-1} \right] \succeq \lambda I.
    \]
\end{assumption}
\begin{assumption}[Markov gradient noise boundedness]\label{assump:noise}
    There exists $0<\bar{\lambda}<\infty$ such that 
    \[
        \forall w_t, w_{t-j} \in \mathcal{W}, \theta_{t-1}, \theta_{t-j-1} \in \Theta^K \,\,\,\, 
        \E\left[ G_{t-j}^\top G_{t} \middle | w_t, \theta_{t-1}, w_{t-j}, \theta_{t-j-1} \right] \preceq \bar{\lambda} I.
    \]
\end{assumption}

\subsection{Convergence Proof}\label{sec:proof}
\begin{proof}[Proof of \cref{thm:convergence}]
We begin by applying the projected gradient update to get
\[
    \|w_t - w^\star\|^2 
    &= \left\| \proj_\mathcal{W} \left( w_{t-1} - \alpha_t \odot \frac{m_t}{\sqrt{t \cdot (\epsilon + v_t)}} \right) - w^\star \right\|^2\\
    &= \left\| \proj_\mathcal{W} \left( w_{t-1} - \alpha_t \odot \frac{m_t}{\sqrt{t \cdot (\epsilon + v_t)}} \right) - \proj_\mathcal{W} w^\star \right\|^2\\
    &\leq \left\| w_{t-1} - \alpha_t \odot \frac{m_t}{\sqrt{t \cdot (\epsilon + v_t)}} - w^\star \right\|^2. \label{eq:one_step_update}
\]
Here $\odot$ denotes element-wise multiplication, and the fraction $\frac{m_t}{\sqrt{t \cdot (\epsilon + v_t)}}$ is also applied element-wise. The second equality follows because
$w^\star \in \mathcal{W}$ by assumption. The inequality follows because $\mathcal{W}$ defined in \cref{assump:constraint} 
is convex and closed, and hence $\proj_{\mathcal{W}}$ is a contraction.
We unroll $m_t$ by \cref{eq:m_update} and use $R_{t-1}$ as defined in \cref{eq:simple_notation} to get
\[
    \alpha_t \odot \frac{m_t}{\sqrt{t \cdot (\epsilon + v_t)}} = 
    \diag(R_{t-1}) \sum_{k=0}^{t-1}\beta_1^k g(w_{t-k-1}, \theta_{t-k-1}, \mathcal{S}_{t-k-1}).\label{eq:element_update}
\]
By substituting \cref{eq:gradient,eq:element_update} into \cref{eq:one_step_update} and taking expectations on both sides, we get
\[
    &\E\|w_t - w^\star\|^2\\
    &\leq \E\left[\left\| \left((w_{t-1} \!-\! w^\star) \!-\! \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} \!-\! w^\star)\right) \!-\! \left(\diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1\!-\!s_{t-k-1})\right) \right\|^2\right]\\
    &= \E\left[\left\| (w_{t-1} - w^\star) - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} - w^\star) \right\|^2\right] \\
    &\quad - 2\E\left[\left((w_{t-1} \!-\! w^\star) \!-\! \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} \!-\! w^\star)\right)^\top \left(\diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1\!-\!s_{t-k-1})\right)\right]\\
    &\quad + \E\left[\left\| \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1-s_{t-k-1}) \right\|^2\right]\\
    &= \E\left[\left\| (w_{t-1} \!-\! w^\star) \!-\! \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} \!-\! w^\star) \right\|^2\right]
        + \E\left[\left\| \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1\!-\!s_{t-k-1}) \right\|^2\right].
    \label{eq:explicit_update_1}
\]
In the above, the last equality follows due to unbiased subsampling, i.e., for all $t$, $\E[1-s_t] = 0$.
We now rewrite the first term in \cref{eq:explicit_update_1} as follows:
\[
    &\E\left[\left\| (w_{t-1} \!-\! w^\star) \!-\! \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} \!-\! w^\star) \right\|^2\right]\\
    &=\E\left[ \left\| (w_{t-1} - w^\star) - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} - w^\star + w_{t-1} - w_{t-1}) \right\|^2 \right]\\
    &=\E\left[ \left\| (w_{t-1} - w^\star) - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-1} - w^\star) - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}(w_{t-k-1} - w_{t-1}) \right\|^2 \right]\\
    &=\E\left[ \left\| \left(I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)(w_{t-1} - w^\star) - \diag(R_{t-1})\sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right) \right\|^2 \right],\label{eq:explicit_update}
\]
where $\Delta_{t-j}$ is as defined in \cref{eq:simple_notation}. The last equality above follows by rewriting $w_{t-k-1} - w_{t-1}$ as a telescoping sum.
Now let $A_t = \left(I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)$, $b_t= \diag(R_{t-1})\sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right)$, and
    $c_t = \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1-s_{t-k-1})$.
\cref{eq:explicit_update_1} then becomes
\[
    &\E \|w_t - w^\star\|^2\\
    &\leq \E \left[ \|A_t (w_{t-1} - w^\star) - b_t \|^2 \right] + \E\left[\| c_t \|^2 \right]\\
    &=\E \left[ \E \left[ (w_{t-1} - w^\star)^\top A_t^\top A_t (w_{t-1} - w^\star) - 2b_t^\top A_t (w_{t-1} - w^\star) + b_t^\top b_t \middle | w_{t-1}\right] \right] + \E\left[\| c_t \|^2 \right]\\
    &\leq\E \left[ (w_{t-1} - w^\star)^\top \E\left[ A_t^\top A_t \middle | w_{t-1} \right] (w_{t-1} - w^\star) + 2\left | \E\left[ b_t^\top A_t (w_{t-1} - w^\star) \middle | w_{t-1} \right] \right | + \E\left[ \|b_t\|^2 \middle | w_{t-1} \right] \right] + \E\left[\| c_t \|^2 \right]\\
    &\leq\E \left[ (w_{t-1} - w^\star)^\top \E\left[ A_t^\top A_t \middle | w_{t-1} \right] (w_{t-1} - w^\star)
        + 2 \sqrt{\E\left[ \|b_t\|^2 \middle | w_{t-1} \right]}
                    \sqrt{\E\left[ \|A_t (w_{t-1} - w^\star)\|^2 \middle | w_{t-1} \right]} \right]\\
        &\quad+ \E\left[\E\left[ \|b_t\|^2 \middle | w_{t-1} \right] \right] + \E\left[\| c_t \|^2 \right]\\
    &= \E \left[ (w_{t-1} \!-\! w^\star)^\top \E\left[ A_t^\top A_t \middle | w_{t-1} \right] (w_{t-1} \!-\! w^\star)
    \!+\! 2 \sqrt{\E\left[ \|b_t\|^2 \middle | w_{t-1} \right]}
                \sqrt{ (w_{t-1} \!-\! w^\star)^\top \E\left[A_t^\top A_t \middle | w_{t-1}\right] (w_{t-1} \!-\! w^\star)} \right]\\
    &\quad+ \E\left[\E\left[ \|b_t\|^2 \middle | w_{t-1} \right] \right] + \E\left[\| c_t \|^2 \right],
\]
where the last inequality is by Cauchy--Schwarz.
By \cref{lem:At,lem:bt,lem:ct}, we know that there exist $T^\star < \infty$ and $C_1, C_2>0$ such that for all $t > T^\star$, 
\[
    \E\left[ A_t^\top A_t \middle | w_{t-1} \right] \preceq \exp\left( -\frac{D}{\sqrt{t}} \right)I, \quad
    \E\left[ \|b_t\|^2 \middle | w_{t-1} \right] \leq \frac{C_1}{t^2},\quad
    \E\left[ \|c_t\|^2 \right] \leq \frac{C_2}{t}.
\]
Here $D=\frac{\lambda(1-\beta_1)r_\delta}{2\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}$ is as defined in \cref{lem:At}. 
We know $e^{-D/\sqrt{t}}\leq 1$. By \cref{assump:constraint}, we also have that for all $t \geq 1$, $\|w_{t-1}-w^\star\|^2 \leq \sum_{m=1}^M B^2 = MB^2$.
Therefore, 
\[
    \E \|w_t - w^\star\|^2 &\leq e^{-D/\sqrt{t}}\E\|w_{t-1} - w^\star\|^2 + 2\E\left[\sqrt{\frac{C_1}{t^2}}\sqrt{\exp\left( -\frac{D}{\sqrt{t}} \right) \|w_{t-1} - w^\star\|^2}\right] + \frac{C_1}{t^2} + \frac{C_2}{t}\\
    &\leq e^{-D/\sqrt{t}}\E\|w_{t-1} - w^\star\|^2 + 2\frac{B\sqrt{MC_1}}{t} + \frac{C_1}{t^2} + \frac{C_2}{t}\\
    &\leq e^{-D/\sqrt{t}}\E\|w_{t-1} - w^\star\|^2 + \frac{2B\sqrt{MC_1}+C_1+C_2}{t}.
\]
We unroll this recursion backward from $t$ to $T^\star$ to get
\[
    \E \|w_t - w^\star\|^2 &\leq e^{-D\sum_{\tau=T^\star+1}^{t}\frac{1}{\sqrt{\tau}}}\E\left[\left\| w_{T^\star}-w^\star \right\|^2\right] + \left( 2B\sqrt{MC_1}+C_1+C_2 \right) \sum_{\tau=T^\star+1}^{t} \frac{1}{\tau} e^{-D\sum_{u=\tau+1}^{t}\frac{1}{\sqrt{u}}}\\
    &\leq MB^2 e^{-D\sum_{\tau=T^\star+1}^{t}\frac{1}{\sqrt{\tau}}} + \left( 2B\sqrt{MC_1}+C_1+C_2 \right) \sum_{\tau=T^\star+1}^{t} \frac{1}{\tau} e^{-D\sum_{u=\tau+1}^{t}\frac{1}{\sqrt{u}}},
\]
where the last inequality again uses $\|w_{T^\star}-w^\star\|^2 \leq MB^2$.
Since $\frac{1}{\sqrt{\tau}}$ monotonically decreases in $\tau$, we have that
$\sum_{\tau=T^\star+1}^{t} \frac{1}{\sqrt{\tau}} \geq \int_{T^\star+1}^{t} \frac{1}{\sqrt{\tau}} d\tau = 2\left(\sqrt{t} - \sqrt{T^\star+1}\right)$.
Therefore, as $t\to\infty$,
\[
    \E \|w_t - w^\star\|^2 &\leq MB^2 e^{-2D(\sqrt{t} - \sqrt{T^\star+1})} + (2B\sqrt{MC_1}+C_1+C_2)\sum_{\tau=T^\star+1}^{t}\frac{1}{\tau}e^{-2D(\sqrt{t} - \sqrt{\tau+1})}\\
    &\leq MB^2e^{2D\sqrt{T^\star+1}} e^{-2D\sqrt{t}} + \left(2B\sqrt{MC_1}+C_1+C_2\right)e^{-2D\sqrt{t}}\sum_{\tau=1}^{t}\frac{1}{\tau}e^{2D\sqrt{\tau+1}}\\
    &= O\left( e^{-2D\sqrt{t}} + e^{-2D\sqrt{t}}\sum_{\tau=1}^{t}\frac{1}{\tau}e^{2D\sqrt{\tau+1}} \right). \label{eq:first_big_o}
\]
It is obvious that $e^{-2D\sqrt{t}} = O\left( \frac{1}{\sqrt{t}} \right)$ as $t \to\infty$. It remains to show that $e^{-2D\sqrt{t}}\sum_{\tau=1}^{t}\frac{1}{\tau}e^{2D\sqrt{\tau+1}} = O\left( \frac{1}{\sqrt{t}} \right)$ as $t\to\infty$.
We begin by noting that, since $\forall \tau \geq 1$, $\frac{\tau+1}{\tau} \leq 2$,
\[
    \sum_{\tau=1}^{t}\frac{1}{\tau}e^{2D\sqrt{\tau+1}} &= \sum_{\tau=1}^{t}\frac{1}{\tau}e^{2D\sqrt{\tau+1}}\frac{\tau}{\tau+1}\frac{\tau+1}{\tau}
    \leq 2\sum_{\tau=1}^{t}\frac{1}{\tau+1}e^{2D\sqrt{\tau+1}}
    = 2\sum_{\tau=2}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}}.
\]
It therefore suffices to show that $2e^{-2D\sqrt{t}}\sum_{\tau=2}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}} = O\left( \frac{1}{\sqrt{t}} \right)$ as $t\to\infty$.
We know that there exists $T' < \infty$ such that for all $\tau\geq T'$, $\frac{1}{\tau}e^{2D\sqrt{\tau}}$ monotonically increases with $\tau$. We therefore split the sum at $g(t)$, with $T' \leq g(t) \leq t$, to get 
\[
    2e^{-2D\sqrt{t}}\sum_{\tau=2}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}} &= 2e^{-2D\sqrt{t}}\sum_{\tau=2}^{g(t)-1}\frac{1}{\tau}e^{2D\sqrt{\tau}} + 2e^{-2D\sqrt{t}}\sum_{\tau=g(t)}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}}. \label{eq:split}
\]
We can bound the first term in \cref{eq:split} as follows
\[
    2e^{-2D\sqrt{t}}\sum_{\tau=2}^{g(t)-1}\frac{1}{\tau}e^{2D\sqrt{\tau}} \leq 2e^{2D\left( \sqrt{g(t)} - \sqrt{t} \right)}\sum_{\tau=2}^{g(t)-1}\frac{1}{\tau}
    \leq 2e^{2D\left( \sqrt{g(t)} - \sqrt{t} \right)}\left( \ln\left(g(t)\right) + 1 \right). \label{eq:condition_1}
\]

Looking at the second term in \cref{eq:split}, since $\tau \geq g(t)$ is large enough that the summand monotonically increases with $\tau$,
\[
    2e^{-2D\sqrt{t}}\sum_{\tau=g(t)}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}} \leq 2e^{-2D\sqrt{t}}\int_{g(t)}^{t+1}\frac{e^{2D\sqrt{\tau}}}{\tau}d\tau = 4e^{-2D\sqrt{t}}\int_{\sqrt{g(t)}}^{\sqrt{t+1}} \frac{e^{2Ds}}{s} ds, \label{eq:second_final_bound}
\]
where the last equality follows by setting $s = \sqrt{\tau}$, $\tau = s^2$, $d\tau = 2s ds$.
Now for the integral in \cref{eq:second_final_bound}, we integrate by parts with $u = \frac{1}{s}$ and $dv = e^{2Ds}ds$:
\[
    \int_{\sqrt{g(t)}}^{\sqrt{t+1}} \frac{e^{2Ds}}{s} ds &= \frac{1}{2D\sqrt{t+1}}e^{2D\sqrt{t+1}} - \frac{1}{2D\sqrt{g(t)}}e^{2D\sqrt{g(t)}} + \frac{1}{2D}\int_{\sqrt{g(t)}}^{\sqrt{t+1}}\frac{e^{2Ds}}{s^2}ds\\
    &\leq \frac{1}{2D\sqrt{t+1}}e^{2D\sqrt{t+1}} + \frac{e^{2D\sqrt{t+1}}}{2D}\int_{\sqrt{g(t)}}^{\sqrt{t+1}}\frac{1}{s^2}ds\\
    &= \frac{1}{2D\sqrt{t+1}}e^{2D\sqrt{t+1}} + \frac{e^{2D\sqrt{t+1}}}{2D}\left( \frac{1}{\sqrt{g(t)}} - \frac{1}{\sqrt{t+1}} \right).
\]
Substituting the above back into \cref{eq:second_final_bound}, we get
\[
    2e^{-2D\sqrt{t}}\sum_{\tau=g(t)}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}} &\leq 4e^{-2D\sqrt{t}}\left( \frac{1}{2D\sqrt{t+1}}e^{2D\sqrt{t+1}} + \frac{e^{2D\sqrt{t+1}}}{2D}\left( \frac{1}{\sqrt{g(t)}} - \frac{1}{\sqrt{t+1}} \right) \right)\\
    &= \frac{2}{D\sqrt{t+1}}e^{2D(\sqrt{t+1} - \sqrt{t})} + \frac{2}{D}e^{2D(\sqrt{t+1} - \sqrt{t})}\left( \frac{1}{\sqrt{g(t)}} - \frac{1}{\sqrt{t+1}} \right)\\
    &\leq \frac{2e^{2D}}{D\sqrt{t+1}} + \frac{2e^{2D}}{D}\left( \frac{1}{\sqrt{g(t)}} - \frac{1}{\sqrt{t+1}} \right). \label{eq:condition_2}
\]
Let $g(t) = \frac{t}{2}$. Then $T' \leq g(t) \leq t$ is satisfied for all $t \geq 2T'$. We can then combine \cref{eq:condition_1,eq:condition_2}, and have that for all $t \geq 2T'$,
\[
    2e^{-2D\sqrt{t}}\sum_{\tau=2}^{t+1}\frac{1}{\tau}e^{2D\sqrt{\tau}} &\leq 2e^{2D\left( \sqrt{t/2} - \sqrt{t} \right)}\left( \ln\left(\frac{t}{2}\right) + 1 \right) + \frac{2e^{2D}}{D\sqrt{t+1}} + \frac{2e^{2D}}{D}\left( \frac{1}{\sqrt{t/2}} - \frac{1}{\sqrt{t+1}} \right)\\
    &\leq 2e^{-2D\left( 1-\frac{1}{\sqrt{2}} \right)\sqrt{t}} \left( \ln t - \ln 2 + 1 \right) + \frac{2e^{2D}}{D\sqrt{t+1}} + \frac{4e^{2D}}{D\sqrt{t}},
\]
which is $O\left( \frac{1}{\sqrt{t}} \right)$ as $t\to\infty$.
Therefore, we arrive at the desired result that as $t\to\infty$,
\[
    \E \|w_t - w^\star\|^2 \leq O\left( e^{-2D\sqrt{t}} + 2e^{-2D\left( 1-\frac{1}{\sqrt{2}} \right)\sqrt{t}} \left( \ln t - \ln 2 + 1 \right) + \frac{2e^{2D}}{D\sqrt{t+1}} + \frac{4e^{2D}}{D\sqrt{t}} \right) = O\left( \frac{1}{\sqrt{t}} \right).
\]

\end{proof}

\subsection{Useful Lemmas}
We used several lemmas in the above proof of \cref{thm:convergence}. In this subsection, we present the proof of these lemmas.

\begin{lemma}\label{lem:At}
    Suppose \cref{assump:constraint,assump:exact,assump:grad_bound,assump:mixing,assump:noise} hold. 
    Define $D = \frac{\lambda(1-\beta_1)r_\delta}{2\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}$. 
    There exists $T<\infty$ such that $\forall t\geq T$,
    \[
        \E\left[ \left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right] \preceq \left(\exp\left( -\frac{D}{\sqrt{t}} \right)\right)I.
    \]
\end{lemma}

\begin{proof}[Proof of \cref{lem:At}]
    We begin by expanding the matrix product
    \[
        &\E\left[ \left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right]\\
        &= I - 2\E \left[ \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1} \middle | w_{t-1} \right] + 
            \E\left[ \left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\diag(R^2_{t-1})\left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right].\label{eq:first_lemma}
    \]
    We bound the first expectation from below and the second expectation from above.
    
    We begin by bounding $\E \left[ \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1} \middle | w_{t-1} \right]$ from below. 
    Following the update rule as specified in \cref{sec:update_rule}, we expand the $i^{\text{th}}$ entry of $R_{t-1}$ as defined by \cref{eq:simple_notation} and get
    \[
        R_{t-1,i} = \frac{\alpha_{t,i}}{\sqrt{t(\epsilon + v_{t,i})}} 
        = \left(\frac{1-\beta_1}{1-\beta_1^t}\right)\left(\frac{\sqrt{1-\beta_2^t}}{\sqrt{1-\beta_2}}\right)\frac{1}{1-\beta_1^t}\left( (1-\beta_1)\left( \sum_{k=0}^{t-1}\beta_1^k \bar{r}_{t-k,i} \right) + 
            \beta_1^t\bar{r}_{0,i} \right)\frac{1}{\sqrt{t(\epsilon + v_{t,i})}}. \label{eq:expanded_R}
    \]
    By \cref{assump:grad_bound} and that $|\beta_2| < 1$, we can bound $v_{t,i}$ as defined in \cref{eq:v_update} by
    \[
        v_{t,i} = \sum_{k=0}^{t-1}\beta_2^k g_i^2(w_{t-k-1}, \theta_{t-k-1}) \leq U^2\sum_{k=0}^{t-1}\beta_2^k 
        \leq U^2(1-\beta_2)^{-1}.
    \]
    Together with $|\beta_1|<1$ and that $\forall t, i$, $\bar{r}_{t,i} \geq r_\delta$, 
    \[
        R_{t-1,i} \geq 
        \left(\frac{1-\beta_1}{1-\beta_1^t}\right)\left(\frac{\sqrt{1-\beta_2^t}}{\sqrt{1-\beta_2}}\right)\frac{r_\delta}{1-\beta_1^t}\frac{1}{\sqrt{t}\sqrt{\epsilon + (1-\beta_2)^{-1}U^2}}
        \geq \frac{(1-\beta_1)r_\delta}{\sqrt{t}\sqrt{\epsilon + (1-\beta_2)^{-1}U^2}}.
    \]
    As a result, for all $t$, we have that $\diag(R_{t-1}) \succeq \frac{(1-\beta_1)r_\delta}{\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}I$.

    Now let $A=\diag(R_{t-1}) - \frac{1}{2}\left(\min_{1\leq i\leq M} R_{t-1,i}\right)I$. We know $A$ is diagonal and $A \succeq \frac{(1-\beta_1)r_\delta}{2\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}I$.
    We also know $Q \coloneqq \sum_{k=0}^{t-1}\beta_1^k G_{t-k-1} \succeq 0$ since $G_t$ are sample covariance matrices. 
    Together, using $\Lambda_{\min}$ to denote the minimum eigenvalue, we have
        $\Lambda_{\min}\left( AQ \right) = \Lambda_{\min}\left( A^\frac{1}{2}QA^\frac{1}{2} \right) \geq 0$, and so $AQ \succeq 0$.
    Therefore,
    \[
        \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1} \succeq \frac{1}{2}\left(\min_{1\leq i\leq M} R_{t-1,i}\right)\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1} \succeq \frac{(1-\beta_1)r_\delta}{2\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}.
    \]
    Using the above, we have that
    \[
        \E \left[ \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1} \middle | w_{t-1} \right] 
        &\succeq \frac{(1-\beta_1)r_\delta}{2\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}\sum_{k=0}^{t-1}\beta_1^k \E\left[G_{t-k-1} \middle | w_{t-1}\right]\\
        &= \frac{(1-\beta_1)r_\delta}{2\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}\sum_{k=0}^{t-1}\beta_1^k 
            \E\left[\E\left[ G_{t-k-1} \middle| w_{t-k-1}, \theta_{t-k-2} \right] \middle | w_{t-1}\right]\\
        &\succeq \frac{\lambda(1-\beta_1)r_\delta}{2\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}\left(\sum_{k=0}^{t-1}\beta_1^k\right)I\\
        &\succeq \frac{\lambda(1-\beta_1)r_\delta}{2\sqrt{t}\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}I, \label{eq:first_term}
    \]
    where the inequalities are due to \cref{assump:mixing} and $|\beta_1|<1$.

    We now bound $\E\left[ \left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\diag(R^2_{t-1})\left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right]$. We similarly begin 
    by bounding $R_{t-1,i}$ from the other direction. By \cref{assump:constraint}, $\bar{r}_{t,i} \leq B$. Together with $v_{t,i}\geq 0$, and that $|\beta_1|<1, |\beta_2|<1$, 
    we can bound \cref{eq:expanded_R} from above by
    \[
        R_{t-1,i} 
        \leq \left(\frac{1-\beta_1}{1-\beta_1^t}\right)\left(\frac{\sqrt{1-\beta_2^t}}{\sqrt{1-\beta_2}}\right)\frac{B}{1-\beta_1^t} \frac{1}{\sqrt{t}\sqrt{\epsilon}}
        \leq \frac{B}{\sqrt{t\epsilon}(1-\beta_1)\sqrt{1-\beta_2}}.\label{eq:R_ub}
    \]
    Again using $|\beta_1|<1, |\beta_2|<1$, and squaring $R_{t-1,i}$, we have that
    \[
        \diag(R_{t-1}^2) \preceq \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)}I. \label{eq:simple_R_ub}
    \]
    Therefore,
    \[
        &\E\left[ \left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\diag(R^2_{t-1})\left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right]\\
        &\preceq \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \E\left[ \left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\left(\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right]\\
        &= \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{k=0}^{t-1}\sum_{k'=0}^{t-1}\beta_1^k \beta_1^{k'} \E\left[ G_{t-k-1}^\top G_{t-k'-1} \middle | w_{t-1}\right]\\
        &= \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{k=0}^{t-1}\sum_{k'=0}^{t-1}\beta_1^k \beta_1^{k'} 
        \E\left[ \E\left[ G_{t-k'-1}^\top G_{t-k-1} \middle | w_{t-k'-1}, \theta_{t-k'-2}, w_{t-k-1}, \theta_{t-k-2} \right] \middle | w_{t-1}\right]\\
        &\preceq \frac{\bar{\lambda}B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)}\left(\sum_{k=0}^{t-1}\beta_1^k\right)^2I\\
        &\preceq \frac{\bar{\lambda}B^2}{t\epsilon(1-\beta_1)^4(1-\beta_2)}I, \label{eq:second_term}
    \]  
    where the second last inequality is due to \cref{assump:noise}, and the last inequality is by $|\beta_1|<1$.
    Let $D' = \frac{\bar{\lambda}B^2}{\epsilon(1-\beta_1)^4(1-\beta_2)}$ and recall that $D = \frac{\lambda(1-\beta_1)r_\delta}{2\sqrt{\epsilon+(1-\beta_2)^{-1}U^2}}$.
    Together by \cref{eq:first_term,eq:second_term}, we can bound \cref{eq:first_lemma} by 
    \[
        \E\left[ \left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right]
        \preceq \left(1 - \frac{2D}{\sqrt{t}} + \frac{D'}{t}\right)I.
    \]
    Since $D, D' > 0$, we have for all $t\geq \frac{D'^2}{D^2}$, 
    $1 - \frac{2D}{\sqrt{t}} + \frac{D'}{t} \leq 1 - \frac{D}{\sqrt{t}} \leq \exp\left(-{\frac{D}{\sqrt{t}}}\right)$.
    Therefore, for all $t\geq \frac{D'^2}{D^2}$, we have that
    \[
        \E\left[ \left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right)^\top\left( I - \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k G_{t-k-1}\right) \middle | w_{t-1} \right]
        \preceq \left(\exp\left( -\frac{D}{\sqrt{t}} \right)\right)I.
    \]
\end{proof}

\begin{lemma}\label{lem:bt}
    Suppose \cref{assump:constraint,assump:exact,assump:grad_bound,assump:mixing,assump:noise} hold. We have that as $t \to \infty$,
    \[
        \E\left[\left\| \diag(R_{t-1})\sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right)\right\|^2 \middle | w_{t-1}\right] = O\left(\frac{1}{t^2}\right).
    \]
\end{lemma}

\begin{proof}[Proof of \cref{lem:bt}]
    We begin by expanding the norm
    \[
        &\E\left[\left\| \diag(R_{t-1})\sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right)\right\|^2 \middle | w_{t-1}\right]\\
        &\leq \E\left[ \left(\max_{1\leq i\leq M}R_{t-1,i}\right)^2 \left\| \sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right)\right\|^2 \middle | w_{t-1}\right]\\
        &\leq \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k+k'} \left(\sum_{j=1}^k \Delta_{t-j} \right)^\top G_{t-k-1}^\top G_{t-k'-1}  \left(\sum_{j=1}^{k'} \Delta_{t-j} \right) \middle | w_{t-1}\right] \\
        &= \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k+k'} \left(w_{t-k-1}-w_{t-1} \right)^\top G_{t-k-1}^\top G_{t-k'-1}  \left(w_{t-k'-1}-w_{t-1} \right) \middle | w_{t-1}\right],\label{eq:intermediate}
    \]
    where the second inequality is by \cref{eq:simple_R_ub}, and the last equality follows after writing $w_{t-k-1}-w_{t-1}$ as a telescoping sum. 
    Using \cref{assump:noise}, we can bound the expectation in \cref{eq:intermediate} as follows:
    \[
        &\E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k+k'} \left(w_{t-k-1}-w_{t-1} \right)^\top G_{t-k-1}^\top G_{t-k'-1}  \left(w_{t-k'-1}-w_{t-1} \right) \middle | w_{t-1}\right]\\
        &= \E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k\!+\!k'} \left(w_{t-k-1}\!-\!w_{t-1} \right)^\top \E\left[G_{t-k-1}^\top G_{t-k'-1} \middle | w_{t-k-1}, \theta_{t-k-2}, w_{t-k'-1}, \theta_{t-k'-2} \right]
            \left(w_{t-k'-1}\!-\!w_{t-1} \right) \middle | w_{t-1}\right]\\
        &\leq \bar{\lambda}\E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k\!+\!k'} \left(w_{t-k-1}\!-\!w_{t-1} \right)^\top \left(w_{t-k'-1}\!-\!w_{t-1} \right) \middle | w_{t-1}\right]\\
        &= \bar{\lambda}\E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k\!+\!k'} \left( \sum_{j=1}^k \Delta_{t-j} \right)^\top \left( \sum_{j=1}^{k'} \Delta_{t-j} \right) \middle | w_{t-1}\right].
    \]
    Therefore,
    \[
        &\E\left[\left\| \diag(R_{t-1})\sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right)\right\|^2 \middle | w_{t-1} \right]\\
        &\leq \frac{\bar{\lambda}B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)}\E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k\!+\!k'} \left( \sum_{j=1}^k \Delta_{t-j} \right)^\top \left( \sum_{j=1}^{k'} \Delta_{t-j} \right) \middle | w_{t-1}\right]\\
        &\leq \frac{\bar{\lambda}B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)}\E\left[\sum_{k,k'=1}^{t-1} \beta_{1}^{k+k'} \left\| \sum_{j=1}^k \Delta_{t-j} \right\| \left\| \sum_{j=1}^{k'} \Delta_{t-j} \right\| \middle | w_{t-1} \right].
    \]
    We now bound $\left\| \sum_{j=1}^k \Delta_{t-j} \right\|^2$.
    \[
        \left\| \sum_{j=1}^k \Delta_{t-j} \right\|^2 
        = \sum_{j,j'=1}^k \Delta_{t-j}^\top\Delta_{t-j'}
        \leq \sum_{j,j'=1}^k \|\Delta_{t-j}\|\|\Delta_{t-j'}\|.
    \]  
    By \cref{eq:simple_notation,eq:m_update}, we can write
    \[
        \|\Delta_{t-j}\|^2 &= \sum_{i=1}^M R_{t-j-1,i}^2 m^2_{t-j,i}\\
        &\leq \frac{B^2}{(t-j)\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{i=1}^M m_{t-j,i}^2\\
        &= \frac{B^2}{(t-j)\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{i=1}^M \left( \sum_{k=0}^{t-j-1}\beta_1^k g_i(w_{t-j-k-1}, \theta_{t-j-k-1}) \right)^2\\
        &\leq \frac{U^2B^2}{(t-j)\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{i=1}^M \left( \sum_{k=0}^{t-j-1}\beta_1^k \right)^2\\
        &\leq \frac{U^2B^2M}{(t-j)\epsilon(1-\beta_1)^4(1-\beta_2)},
    \]  
    where the first inequality is by \cref{eq:R_ub}, and the second inequality by \cref{assump:grad_bound}, and the last inequality by $|\beta_1|<1$.
    Let $D_1 = \frac{\bar{\lambda}B^2}{\epsilon(1-\beta_1)^2(1-\beta_2)}$, $D_2 = \frac{U^2B^2M}{\epsilon(1-\beta_1)^4(1-\beta_2)}$, we have
    \[
        &\E\left[\left\| \diag(R_{t-1})\sum_{k=1}^{t-1}\beta_1^k G_{t-k-1}\left( \sum_{j=1}^k \Delta_{t-j} \right)\right\|^2 \middle | w_{t-1}\right]\\
        &\leq \frac{D_1}{t}\sum_{k,k'=1}^{t-1}\beta_{1}^{k+k'}\sum_{j=1}^{k}\sum_{j'=1}^{k'}\frac{\sqrt{D_2}}{\sqrt{t-j}}\frac{\sqrt{D_2}}{\sqrt{t-j'}}
        = \frac{D_1D_2}{t}\left( \sum_{k=1}^{t-1}\beta_1^k \sum_{j=1}^{k}\frac{1}{\sqrt{t-j}} \right)^2.
    \]
    If we can show that, as $t\to\infty$, $S(t)\coloneqq \sum_{k=1}^{t-1}\beta_1^k\sum_{j=1}^k \frac{1}{\sqrt{t-j}} = O\left( \frac{1}{\sqrt{t}} \right)$, 
    then we have, as $t\to\infty$, $\frac{D_1D_2}{t}\left( \sum_{k=1}^{t-1}\beta_1^k \sum_{j=1}^{k}\frac{1}{\sqrt{t-j}} \right)^2 = O\left(\frac{1}{t^2}\right)$,
    thus concluding the proof.
    We now show that $S(t) = O\left( \frac{1}{\sqrt{t}} \right)$ as $t\to\infty$.
    \[
        S(t) = \sum_{k=1}^{t-1}\beta_1^k \sum_{j=1}^{k}\frac{1}{\sqrt{t-j}}
        = \sum_{j=1}^{t-1}\sum_{k=j}^{t-1}\beta_1^k \frac{1}{\sqrt{t-j}}
        = \sum_{j=1}^{t-1}\frac{1}{\sqrt{t-j}} \sum_{k=j}^{t-1}\beta_1^k
        = \sum_{j=1}^{t-1}\frac{1}{\sqrt{t-j}} \frac{\beta_1^j(1-\beta_1^{t-j})}{1-\beta_1}.
    \]
    We decompose the above into two sums to get
    \[
        S(t)
        = \frac{1}{1\!-\!\beta_1}\sum_{j=1}^{t-1}\frac{\beta^j_1}{\sqrt{t-j}} - \frac{1}{1-\beta_1}\sum_{j=1}^{t-1}\frac{\beta_1^j\beta_1^{t-j}}{\sqrt{t-j}}
        = \frac{1}{1\!-\!\beta_1}\sum_{j=1}^{t-1}\frac{\beta^j_1}{\sqrt{t-j}} - \frac{\beta_1^t}{1-\beta_1}\sum_{j=1}^{t-1}\frac{1}{\sqrt{t-j}}
        \leq \frac{1}{1-\beta_1}\sum_{j=1}^{t-1}\frac{\beta^j_1}{\sqrt{t-j}}.
    \]
    Splitting the sum above at $\lfloor t/2 \rfloor$, we get that
    \[
        S(t) \leq \frac{1}{1-\beta_1}\sum_{j=1}^{\lfloor t/2 \rfloor} \frac{\beta_1^j}{\sqrt{t-j}} + 
                \frac{1}{1-\beta_1}\sum_{j=\lfloor t/2 \rfloor +1}^{t-1} \frac{\beta_1^j}{\sqrt{t-j}}.
    \]
    In the first sum, since $j\leq \lfloor t/2 \rfloor$, we know $t-j\geq t - \frac{t}{2} = \frac{t}{2}$. Then 
    \[
        \frac{1}{1-\beta_1}\sum_{j=1}^{\lfloor t/2 \rfloor} \frac{\beta_1^j}{\sqrt{t-j}} \leq \frac{1}{1-\beta_1}\frac{\sqrt{2}}{\sqrt{t}}\sum_{j=1}^{\lfloor t/2 \rfloor}\beta_1^j \leq \frac{\beta_1\sqrt{2}}{(1-\beta_1)^2\sqrt{t}}.
    \]
    In the second sum, since $\lfloor t/2 \rfloor + 1 \leq j \leq t-1$, we know $t-j\geq 1$. Then
    \[
        \frac{1}{1-\beta_1}\sum_{j=\lfloor t/2 \rfloor +1}^{t-1} \frac{\beta_1^j}{\sqrt{t-j}} \leq \frac{1}{1-\beta_1} \sum_{j=\lfloor t/2 \rfloor +1}^{t-1}\beta_1^j \leq 
        \frac{1}{1-\beta_1} \sum_{j=\lfloor t/2 \rfloor +1}^{\infty}\beta_1^j \leq \frac{\beta_1^{\lfloor t/2 \rfloor +1}}{(1-\beta_1)^2}.
    \]
    Since $|\beta_1|<1$, $\beta_1^{\lfloor t/2 \rfloor +1}$ decays faster than $\frac{1}{\sqrt{t}}$ as $t\to\infty$. 
    Therefore, we have that, as $t\to\infty$, $S(t) = O\left( \frac{1}{\sqrt{t}} \right)$.
\end{proof}

\begin{lemma}\label{lem:ct}
    Suppose \cref{assump:constraint,assump:exact,assump:grad_bound,assump:mixing,assump:noise} hold. We have that as $t \to \infty$,
    \[
        \E\left[\left\| \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1-s_{t-k-1}) \right\|^2\right] = O\left( \frac{1}{t} \right).
    \]
\end{lemma}

\begin{proof}[Proof of \cref{lem:ct}]
    We begin by expanding the norm
    \[
        &\E\left[\left\| \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1-s_{t-k-1}) \right\|^2\right]\\
        &= \sum_{k,k'=0}^{t-1}\beta_{1}^{k+k'} \E\left[ \left(H_{t-k-1}(1-s_{t-k-1})\right)^\top
                                                    \diag(R_{t-1}^2)
                                                    \left(H_{t-k'-1}(1-s_{t-k'-1})\right)
        \right]\\
        &\leq \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{k,k'=0}^{t-1}\beta_1^{k+k'}\E\left[ (1-s_{t-k-1})^\top H_{t-k-1}^\top H_{t-k'-1} (1-s_{t-k'-1}) \right]\\
        &= \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{k=0}^{t-1}\beta_1^{2k}\E\left[ (1-s_{t-k-1})^\top H_{t-k-1}^\top H_{t-k-1} (1-s_{t-k-1}) \right]\\
        &= \frac{B^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{k=0}^{t-1}\beta_1^{2k} \E\left[\left\|H_{t-k-1} (1-s_{t-k-1})\right\|^2\right].
    \]
    In the above, the inequality is by \cref{eq:simple_R_ub}; the second last equality is due to unbiased subsampling and that when $k\neq k'$, $s_{t-k-1} \indep s_{t-k'-1}$.
    If we can show $\forall t$, $\E\left[\left\|H_{t} (1-s_{t})\right\|^2\right]$ is uniformly bounded above by some constant $C$, then we have
    \[
        &\E\left[\left\| \diag(R_{t-1})\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(1-s_{t-k-1}) \right\|^2\right]\\
        &\leq \frac{CB^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)} \sum_{k=0}^{t-1} \beta_1^{2k}\\
        &\leq \frac{CB^2}{t\epsilon(1-\beta_1)^2(1-\beta_2)}\frac{1}{1-\beta_1^2},
    \]
    where the last line is by $|\beta_1|<1$. We can therefore conclude as $t\to\infty$,
    $\E\!\left[\left\| \diag(\!R_{t-1}\!)\sum_{k=0}^{t-1}\beta_1^k H_{t-k-1}(\!1\!-\!s_{t\!-\!k\!-\!1}\!) \right\|^2\right] = O\left(\frac{1}{t}\right)$.
    It now remains to show that $\forall t$, $\E\left[\left\|H_{t} (1-s_{t})\right\|^2\right]$ is uniformly bounded above by a constant.

    By \cref{eq:gradient}, we have that
    \[
        \E\left[\left\|g(w_t, \theta_t, \mathcal{S}_t)\right\|^2\right]
        &= \E\left[ \left\| G_t (w_t - w^\star) \right\|^2 + \left\| H_t(1-s_t) \right\|^2 + 2(w_{t}-w^\star)^\top G_{t}^\top H_t(1-s_t) \right]\\
        &= \E\left[ \left\| G_t (w_t - w^\star) \right\|^2 + \left\| H_t(1-s_t) \right\|^2 + 2(w_{t}-w^\star)^\top G_{t}^\top H_t\E\left[(1-s_t) \middle | w_t, \theta_t \right] \right]\\
        &= \E\left[ \left\| G_t (w_t - w^\star) \right\|^2 + \left\| H_t(1-s_t) \right\|^2\right],
    \]
    where the last equality is due to unbiased subsampling. Together with \cref{assump:grad_bound}, we have
    \[
        \E\left[ \left\| H_t(1-s_t) \right\|^2 \right] &\leq \E\left[ \left\| G_t (w_t - w^\star) \right\|^2 + \left\| H_t(1-s_t) \right\|^2\right]
        \leq \E\left[\left\|g(w_t, \theta_t, \mathcal{S}_t)\right\|^2\right]
        \leq MU^2,
    \]
    thus concluding the proof.
\end{proof}