% !TEX root = main.tex

 
 
\section{Proof of convergence of \algm} \label{appdx:VR_MOOM_nonconvex}



 

 

\begin{lem} \label{lem:update}
For general $L$-smooth functions $\{ f_s, s \in [S] \}$, if the learning rate $\eta$ is chosen such that $\eta \leq  \frac{1}{2}$, then the update $\bd_t$ of the VR-MOO-M algorithm satisfies:
    \begin{align}
   f_s(\x_{t+1})   \leq &  f_s(\x_t) + \frac{\eta}{2}  \sum_{i=0}^t \alpha^{(t-i)}\| \nabla f_s(\x_i) - \bu_i^s \|^2 - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)} \| \bd_i \|^2\notag\\&+ \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2.
\end{align}
\end{lem}


\textbf{Proof of Lemma~\ref{lem:update}.}
\begin{proof}




\begin{align}\label{eqs40}
  &  f_s(\x_{t+1}) \leq f_s(\x_t) + \left< \nabla f_s(\x_t), \x_{t+1}-\x_{t} \right> + \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2 \notag\\
  & \stackrel{(a)}{\le} f_s(\x_t) + \left< \nabla f_s(\x_t) , \alpha(\x_{t}-\x_{t-1} )\right> +\left< \nabla f_s(\x_t) , -\eta \bd_t \right>+ \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2 \notag\\
    & \stackrel{(b)}{=} f_s(\x_t) +\sum_{i=0}^t \alpha^{(t-i)} \left< \nabla f_s(\x_i), -\eta \bd_i \right> + \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2 \notag\\
    &= f_s(\x_t) - \eta \sum_{i=0}^t\alpha^{(t-i)} \left< \nabla f_s(\x_i) - \bu_i^s, \bd_i \right> - \eta \sum_{i=0}^t \alpha^{(t-i)} \left< \bu_i^s, \bd_i \right> + \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2 \notag\\
    & \stackrel{(c)}{\le} f_s(\x_t) - \eta \sum_{i=0}^t \alpha^{(t-i)} \left< \nabla f_s(\x_i) - \bu_i^s,  \bd_i \right> - \eta\sum_{i=0}^t\alpha^{(t-i)}  \| \bd_i \|^2 + \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2 \notag\\
    & \stackrel{(d)}{\le} f_s(\x_t) + \frac{\eta}{2} \sum_{i=0}^t \alpha^{(t-i)} \| \nabla f_s(\x_i) - \bu_i^s \|^2 + \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)} \| \bd_i \|^2 \notag\\&- \eta  \sum_{i=0}^t\alpha^{(t-i)}\| \bd_i\|^2 + \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2 \notag\\
    &= f_s(\x_t) + \frac{\eta}{2}  \sum_{i=0}^t \alpha^{(t-i)}\| \nabla f_s(\x_i) - \bu_i^s \|^2 - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)} \| \bd_i \|^2+ \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2.
\end{align}

(a) follows from the $L$-smoothness of the objective function $f_s$. (b) follows from the update rule of $\x_t$ shown in Line 19 of Algorithm~\ref{alg}. (c) follows from $\left< \bu_t^s, \bd_t \right> \geq \| \bd_t \|^2$, since $\bd_t$ is a general solution in the convex hull of the family of vectors $\{\bu_t^s, s \in [S] \}$ (see Lemma 2.1 in~\cite{desideri2012multiple}). (d) follows from Young's inequality, i.e., $-\left< \mathbf{a}, \mathbf{b} \right> \leq \frac{1}{2}\| \mathbf{a} \|^2 + \frac{1}{2}\| \mathbf{b} \|^2$.




\end{proof}


\textbf{Proof of Theorem~\ref{STIMULUS_M_NCRate}.}

\begin{proof}
Taking expectation on both sides of the inequality in Lemma~\ref{lem:update}, we have

\begin{align} \label{eqss35}
    & \mathbb{E} [f_s(\x_{t+1})]  \notag\\ \stackrel{(a)}{\le} &   \mathbb{E}[f_s(\x_t)] +    \frac{\eta}{2}  \sum_{i=0}^t \alpha^{(t-i)}  \mathbb{E}\| \nabla f_s(\x_i) - \bu_i^s \|^2 - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)}  \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L  \mathbb{E}\| \x_{t+1}-\x_{t} \|^2\notag\\
  \stackrel{(b)}{\le} &   \mathbb{E}[f_s(\x_t)]  - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)}  \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L  \mathbb{E}\| \x_{t+1}-\x_{t} \|^2\notag\\
   &+ \frac{\eta}{2}  \sum_{j=0}^t  \alpha^{(t-j)} [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^j \mathbb{E} \|\x_{i+1}-\x_{i}\|^2 + \mathbb{E}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2] \notag\\
    \stackrel{(c)}{=} &  \mathbb{E}[f_s(\x_t)] - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)}  \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L  \mathbb{E}\| \x_{t+1}-\x_{t} \|^2\notag\\&+ \frac{\eta}{2}  \sum_{j=0}^t  \alpha^{(t-j)}  [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^j  \mathbb{E}\|\x_{i+1}-\x_{i}\|^2],
\end{align}

where $(a)$ follows from Eq.~\eqref{eqs40}, $(b)$ follows from Lemma~\ref{lem:bounded1}, and $(c)$ follows from $\mathbb{E}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2=0$, as shown in Line 5 of Algorithm~\ref{alg}.


Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$, where $t \leq n_t q-1$, noting that $n_j=n_t$ for $\left(n_t-1\right) q \leq j \leq n_t q-1$, and letting $\eta\leq \frac{1}{4L}$, we obtain

\begin{align}
  &    \mathbb{E}[f_s(\x_{t+1})]  \notag\\ \stackrel{(a)}{\le} &     \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}   \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t   \mathbb{E}\| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [\frac{L^2}{|\mathcal{A}|} \sum_{r=\left(n_t-1\right) q}^i   \mathbb{E}\|\x_{r+1}-\x_{r}\|^2]   \notag\\
  \stackrel{(b)}{\le}&     \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}   \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t  \mathbb{E}\| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [\frac{L^2}{|\mathcal{A}|} \sum_{r=\left(n_t-1\right) q}^{n_t q -1}   \mathbb{E}\|\x_{r+1}-\x_{r}\|^2]   \notag\\
\leq &    \mathbb{E} [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}    \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t   \mathbb{E}\| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [\frac{L^2}{|\mathcal{A}|} q  \mathbb{E} \|\x_{j+1}-\x_{j}\|^2]   \notag\\
   \stackrel{(c)}{=} &    \mathbb{E} [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}  \mathbb{E} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t   \mathbb{E}\| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [ L^2  \mathbb{E} \|\x_{j+1}-\x_{j}\|^2]   \notag\\
   \stackrel{(d)}{=}  &     \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}  \mathbb{E} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t   \mathbb{E}\| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [ L^2   \mathbb{E}\|\eta \sum_{r=0}^{j} \alpha^{(j-r)} \bd_r\|^2]   \notag\\
       \stackrel{(e)}{\leq} &    \mathbb{E} [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}   \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t   \mathbb{E}\| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{2(j-i)}  [ L^2\eta ^2   \mathbb{E}\|\bd_i\|^2]   \notag\\
                 \stackrel{(f)}{\leq}  &     \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}   \mathbb{E}\| \bd_i \|^2+ \frac{1}{2}L   \sum_{j=\left(n_t-1\right) q}^t   \mathbb{E}\| \eta \sum_{i=0}^{j} \alpha^{(j-i)} \bd_j \|^2  \notag\\
           \stackrel{(g)}{\leq} &    \mathbb{E} [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{8} \sum_{j=\left(n_t-1\right) q}^t   \mathbb{E}\| \bd_j \|^2,
\end{align}
where $(a)$ follows from Eq.~\eqref{eqss35}; $(b)$ extends the upper limit of the innermost summation from $i$ to $n_t q-1$, which is valid since $i\leq n_t q -1$ and every summand is nonnegative; $(c)$ holds because $q=|\mathcal{A}|=\lceil\sqrt{n}\rceil$; $(d)$ follows from the update rule of $\x_t$ shown in Line 19 of Algorithm~\ref{alg}; $(e)$ follows from the triangle inequality; and $(f)$ and $(g)$ follow from $\eta \leq \frac{1}{2L}$ and $0 <\alpha<1 $. We continue the proof by further deriving
%
\begin{align}
  &   \mathbb{E}[f_s(\x_{T})] -   \mathbb{E}[f_s(\x_{0})]  \notag\\&
  =(   \mathbb{E}[f_s(\x_{q})] -   \mathbb{E}[f_s(\x_{0})] ) +  (   \mathbb{E}[f_s(\x_{2q})] -   \mathbb{E}[f_s(\x_{q})] ) +\cdots +  (   \mathbb{E}[f_s(\x_{T})] -   \mathbb{E}[f_s(\x_{(n_T-1)q})] )
  \notag\\&\leq  -\frac{\eta}{8} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2.
\end{align}

Note that $\mathbb{E}[ f_s\left(\x_{T}\right) ]\geq f_s^* \triangleq \inf _{\x \in \mathbb{R}^d} f_s(\x)$. Hence, we have
%
\begin{align}
  &
 \frac{\eta}{8} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2\leq   \mathbb{E}[f_s(\x_{0})] -  \mathbb{E}[f_s(\x_{T})] \leq   \mathbb{E}[f_s(\x_{0})] - f_s^*.
\end{align}


\iffalse
We next bound $ \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2$, where $\zeta$ is selected uniformly at random from $\{0, \ldots, T-1\}$. Observe that
\begin{align}
 \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2= \left\|\nabla f_s\left(\x_{\zeta}\right)-\bd_{\zeta}+\bd_{\zeta}\right\|^2 \leq 2  \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2+2  \left\|\bd_{\zeta}\right\|^2.
\end{align}
Next, we bound the two terms on the right hand side of the above inequality. First, note that
\begin{align}
 \left\|\bd_{\zeta}\right\|^2=\frac{1}{T} \sum_{i=0}^{T-1}  \left\|\bd_i\right\|^2 \leq \frac{[  [f_s(\x_{0})] -  [f_s^*]  ]}{T [\frac{\eta}{4}- \frac{\eta q}{2}  \frac{L^2}{|\mathcal{A}|}]}.
\end{align}

On the other hand, we have

\begin{align}
 \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2 \leq 
\end{align}

\fi

Based on the parameter setting $q =|\mathcal{A}|=\lceil\sqrt{n}\rceil$, we have
\begin{align}
  &
 \frac{\eta}{8} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 \leq   \mathbb{E}[f_s(\x_{0})] -f_s^*.
\end{align}




Note that $\frac{1}{T} \sum_{t=0}^{T-1}\mathbb{E} \|\bd_t \|^2$ only measures the norm of the common descent directions.
According to Definition~\ref{def:stationary} in the paper, the quantity of interest is
$\|\sum_{s \in [S]}\lambda_t^s \nabla f_s(\x_t)\|^2$, which we bound as follows:
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2
\stackrel{(a)}{\le}  ( 2S L^2 \eta ^2\frac{1}{T}  +2)  \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs21}.

Then, we can conclude that 
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 
\stackrel{(a)}{\le}
 ( 2S L^2 \eta ^2  +2)
  \frac{[  \mathbb{E}[f_s(\x_{0})] -f_s^* ]}{ \frac{\eta}{8} T},
\end{align}
where $(a)$ follows from Eqs.~\eqref{eqs21} and~\eqref{eqs20}.




Thus, we have

\begin{align}
  &
\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2 \leq \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 =\mathcal{O}(\frac{1}{T}).
\end{align}


The total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}= T\sqrt{n}+n+T\sqrt{n}=\mathcal{O}(n+ \sqrt{n} \epsilon^{-1})$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \epsilon^{-1})$.
	This completes the proof.
 
\end{proof}

\subsection{Proof of Theorem~\ref{thm:STIMULUSm_SC}}
\begin{proof}

\begin{align} \label{eqs43}
    &f_s(\x_{t+1}) \notag\\\stackrel{(a)}{\le}& f_s(\x_t) + \left< \nabla f_s(\x_t),-\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\
 \stackrel{(b)}{\le} & f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2  + \left< \nabla f_s(\x_t), -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> \notag\\& + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    = &f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    =& f_s(\x_*) + \left< \nabla f_s(\x_t)-\bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right>+ \left< \bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    \stackrel{(c)}{\le} & f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\delta}{2}\| \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2 \notag\\& + \left< \bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    \stackrel{(d)}{\le} & f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2 \notag\\&+ \left< \bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2,
\end{align}
where $(a)$ is due to the $L$-smoothness of $f_s$, $(b)$ follows from the $\mu$-strong convexity of $f_s$, and $(c)$ and $(d)$ follow from Young's inequality.

Next, we have
\begin{align} \label{eqs44}
    & \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{t+1}) - f_s(\x_*) \right]  \notag\\
    \stackrel{(a)}{\le}& \frac{1}{2\delta} \sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \left< \sum_{s \in [S]} \lambda_t^{s} \bu_t^s, -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> \notag\\& + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\
    %
    =&\frac{1}{2\delta}\sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\
    %
    \stackrel{(b)}{=}&\frac{1}{2\delta}\sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\& +\left< \bd_t , \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\
  \stackrel{(c)}{\le} &\frac{1}{2 \eta} \left( \| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - \frac{1}{2} \eta \|  \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 - \frac{\mu}{2} \| \x_t - \x_* \|^2 \notag\\& + \frac{1}{2}L \|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2
    + \frac{4}{\mu}\sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\mu}{8}\| \x_t - \x_* \|^2+\frac{\mu}{8}\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\
    =&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\&
    + \frac{4}{\mu}\sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2\notag\\
     \stackrel{(e)}{\leq}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 +\sum_{s \in [S]} \lambda_t^{s}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2)\notag\\
   \stackrel{(f)}{=}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 ).
\end{align}
where $(a)$ follows from Eq.~\eqref{eqs43}; $(b)$ follows from the definition $\bd_t=\sum_{s \in [S]} \lambda_{t}^{s} \mathbf{u}_{t}^s$ shown in Line 14 of Algorithm~\ref{alg}; $(c)$ holds because $\|\x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 = - \eta^2 \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 + 2 \left< \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i , \x_t - \x_* \right>$,
and we choose
$\delta = \frac{\mu}{8}$; and $(e)$ and $(f)$ follow from $\sum_{s \in [S]} \lambda_t^{s}=1$ and $\|\nabla f_s(\mathbf{x}_{\left(n_t-1\right) q}) - \mathbf{u}_{\left(n_t-1\right) q}^s \|^2 =0$.

Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$ where $t \leq n_t q-1$ and noting that for $\left(n_t-1\right) q \leq j \leq n_t q-1, n_j=n_t$, we obtain
\begin{align} \label{eqs45}
    & \sum_{i=\left(n_t-1\right) q}^t \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  \notag \\
   \stackrel{(a)}{=} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \sum_{t=0}^{T} \alpha^{(t-i)} \bd_i \|^2 
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_j-1\right) q}^j\|\x_{i+1}-\x_{i}\|^2 ) \notag \\
 \stackrel{(b)}{\leq} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \sum_{t=0}^{T} \alpha^{(t-i)} \bd_i \|^2 
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_t-1\right) q}^t\|\x_{i+1}-\x_{i}\|^2 )\notag \\
\stackrel{(c)}{=}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{i=\left(n_t-1\right) q }^t\|\sum_{t=0}^{T} \alpha^{(t-i)} \bd_i\|^2 ),
\end{align}
where $(a)$ follows from Eq.~\eqref{eqs44}, $(b)$ extends the upper limit of the inner summation from $j$ to $t$, and $(c)$ follows from the update rule of $\x_{t+1}$ shown in Eq.~\eqref{vrm_update}.


We continue the proof by further deriving

\begin{align}\label{eqs46}
    & \sum_{t=0}^{T} \sum_{s \in [S]} \lambda_t^{s}\left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag\\ = & \sum_{i=0}^q\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] +  \sum_{i=q}^{2q}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*)\right]+\notag\\& \sum_{i=(n_T-1)q}^{T}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag \\
  \stackrel{(a)}{\leq} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{t=0}^{T}\|\sum_{t=0}^{T} \alpha^{(t-i)} \bd_i\|^2 ),
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs45}.
Next, we have
\begin{align}
    & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_i) - f_s(\x_*) \right] \notag\\ = & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) -  f_s(\x_{i+1}) + f_s(\x_i)  \right]  \notag\\
    %
   = &\sum_{t=0}^{T}  \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  -\sum_{t=0}^{T} \sum_{s \in [S]} \lambda_t^{s}  | f_s(\x_{i+1}) - f_s(\x_i) |\notag\\
    \stackrel{(a)}{\leq} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] )\sum_{t=0}^{T}  \|\sum_{t=0}^{T} \alpha^{(t-i)} \bd_i\|^2,
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs46}.
Letting $|\mathcal{A}|=q= \lceil \sqrt{n} \rceil$ and $\eta \leq \min\{\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \}$, we have $(\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] )> \frac{\eta}{16}>0$.
    
    Thus, we have
\begin{align}
    & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_i) - f_s(\x_*) \right]\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2\right).
\end{align}

Then, we have 

\begin{align}
    & \mathbb{E}  [\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_t) - f_s(\x_*) \right] ]\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \mathbb{E} \| \x_t - \x_* \|^2 -  \mathbb{E}\| \x_{t+1} - \x_* \|^2\right).
\end{align}
Based on Assumption~\ref{assump: add}, averaging with the weights $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$, using these weights to pick the output $\x$, and applying Lemma 1 in~\cite{Karimireddy2020SCAFFOLD} with $\eta \geq \frac{1}{\mu R}$, we have

\begin{align}
    \mathbb{E}\|\x_t-\x_*\|^2  &\leq \| \x_0 - \x_* \|^2 \mu \exp( - \frac{3 \eta \mu T}{4}) \\
    &= \mathcal{O}(\mu \exp( - \mu T)).
\end{align}

Then we have the convergence rate $   \mathbb{E}\|\x_t-\x_*\|^2= \mathcal{O}(\mu \exp( - \mu T))$.
The total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}= T\sqrt{n}+n+T\sqrt{n}=\mathcal{O}(n+ \sqrt{n} \ln ({\mu/\epsilon}))$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \ln ({\mu/\epsilon}))$.
	This completes the proof.

\end{proof}