\subsection{Proof of Theorem \ref{thm:4}}\label{sec:thm4-proof}
We adapt the following Lemma~\ref{lm:cr} from \citet{koloskova2020unified}:

\begin{lemma}\label{lm:cr}
    (Simplify the Recursive Equations) For a bound on the distance of the cluster to the optimum, $d_t = \mathbb{E}\| \overline{\mathbf{c}}^{(t)} - \mathbf{c}^{\star}\|^2$, of the following form:
    \begin{equation}
        d_{t+1} \leq\left(1-a \eta_t\right) d_t-b \eta_t e_t+c \eta_t^2+\eta_t B \mathbf{E}_t,
    \end{equation}
    
    and for any non-negative sequences $\{\mathbf{E}_t\}_{t \geq 0}, \{e_t\}_{t \geq 0}, \{\eta_t\}_{t \geq 0}$ that satisfy the following form:
    \begin{equation}
        \mathbf{E}_t \leq\left(1-\frac{p}{2}\right) \mathbf{E}_{m \beta}+\frac{p}{16 \beta} \sum_{j=m \beta}^{t-1} \mathbf{E}_j+D \sum_{j=m \beta}^{t-1} \eta_j^2 e_j+A \sum_{j=m \beta}^{t-1} \eta_j^2,
    \end{equation}

    then, if the squared learning rates $\{ \eta_t^2 \}_{t \geq 0}$ and the weights $\{ r_t \}_{t \geq 0}$ \new{are respectively} a $\frac{8\beta}{p}$-slow decreasing sequence and a $\frac{16\beta}{p}$-slow increasing non-negative sequence, for some constant $E > 0$ and learning rates $\eta_t \leq \frac{1}{16} \sqrt{\frac{p b}{D B \beta}}$ the following holds:
    \begin{equation}
    E \sum_{t=0}^T r_t \mathbf{E}_t \leq \frac{b}{2} \sum_{t=0}^T r_t e_t+64 B A \frac{\beta}{p} \sum_{t=0}^T r_t \eta_t^2
    \end{equation}

    By combining the above equations we have:

    \begin{equation}
        \frac{1}{2 R_T} \sum_{t=0}^T b r_t e_t \leq \frac{1}{R_T} \sum_{t=0}^T\left(\frac{\left(1-a \eta_t\right) r_t}{\eta_t} d_t-\frac{r_t}{\eta_t} d_{t+1}\right)+\frac{c}{R_T} \sum_{t=0}^T r_t \eta_t+\frac{64 B A}{R_T} \frac{\beta}{p} \sum_{t=0}^T r_t \eta_t^2
    \end{equation}

    Where $R_T = \sum_{t=0}^T r_t$
\end{lemma}

Following the previous lemma, we adapt Lemma 13 from \citet{koloskova2020unified} as the following Lemma~\ref{lm:mr}.

\begin{lemma}\label{lm:mr}
    (Main Recursion) With a constant step-size $\eta_t = \eta \leq \frac{1}{h}$, the main recursion can be bounded as follows:
    \begin{equation}
    \frac{1}{2 R_T} \sum_{t=0}^T b e_t r_t+a d_{T+1} \leq \tilde{\mathcal{O}}\left(d_0 h \exp \left[-\frac{a(T+1)}{h}\right]+\frac{c}{a T}+\frac{B A}{a^2 T^2}\right)
    \end{equation}

    This bound is obtained by tuning $\eta$ in the following two cases.
    If $\frac{1}{h} \geq \frac{\ln \left(\max \left\{2, a^2 d_0 T^2 / c\right\}\right)}{a T}$, then we choose $\eta = \frac{\ln \left(\max \left\{2, a^2 d_0 T^2 / c\right\}\right)}{a T}$ and obtain:

    \begin{equation}
        \tilde{\mathcal{O}}\left(a d_0 T \exp \left[-\ln \left(\max \left\{2, a^2 d_0 T^2 / c\right\}\right)\right]\right)+\tilde{\mathcal{O}}\left(\frac{c}{a T}\right)+\tilde{\mathcal{O}}\left(\frac{B A}{a^2 T^2}\right)=\tilde{\mathcal{O}}\left(\frac{c}{a T}\right)+\tilde{\mathcal{O}}\left(\frac{B A}{a^2 T^2}\right)
    \end{equation}

    Otherwise, we choose $\eta = \frac{1}{h}$ and obtain:

    \begin{equation}
        \tilde{\mathcal{O}}\left(d_0 h \exp \left[-\frac{a(T+1)}{h}\right]+\frac{c}{h}+\frac{B A}{h^2}\right) \leq \tilde{\mathcal{O}}\left(d_0 h \exp \left[-\frac{a(T+1)}{h}\right]+\frac{c}{a T}+\frac{B A}{a^2 T^2}\right)
    \end{equation}

\end{lemma}

Combining Lemma~\ref{lm:cr} and Lemma~\ref{lm:mr} with Theorem~\ref{thm:1} and Theorem~\ref{thm:3}, we obtain the final bound.