% !TEX root = main.tex

 \section{Proof of convergence of \algp} \label{appdx:VRP_nonconvex}




 

\textbf{Proof of Theorem.~\ref{thm:STIMULUSmp_nonC} [Part 1]}

\begin{proof}

Recall that $ \mathcal{N}_s =\min\{ c_{\gamma} \sigma^2(\gamma_t)^{-1}, c_{\epsilon} \sigma^2\epsilon^{-1} ,n\} $. Then we have
	\begin{align}  \label{eqs52}
		\frac{I_{( \mathcal{N}_s <n)}}{ \mathcal{N}_s}  & \leq \frac{1}{ \min\{ c_{\epsilon} \sigma^2(\epsilon)^{-1} ,c_{\gamma} \sigma^2({\gamma_t})^{-1}     \}} \notag\\
		%
		& = \max\{ \frac{{\gamma_t}}{c_{\gamma}\sigma^2}, \frac{\epsilon}{c_{\epsilon} \sigma^2}  \}  \leq \frac{{\gamma_t}}{  c_{\gamma}\sigma^2}+\frac{\epsilon}{ c_{\epsilon}\sigma^2}.
	\end{align}
	


 
From Lemma. \ref{lem:update2}, we have

\begin{align} \label{eqs53}
     [f_s(\x_{t+1})]  &\stackrel{(a)}{\leq}  [f_s(\x_t)] + \frac{\eta}{2}   \| \nabla f_s(\x_t) - \bu_t^s \|^2 -\frac{\eta}{4}  \| \bd_t \|^2\notag\\&
   \stackrel{(b)}{\leq}  [f_s(\x_t)] -\frac{\eta}{4}  \| \bd_t \|^2\notag\\&+ \frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 +\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2] \notag\\&
  \stackrel{(c)}{\leq}  [f_s(\x_t)] -\frac{\eta}{4}  \| \bd_t \|^2+ \frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \eta^2 \|\bd_i\|^2+ \frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2  ],
\end{align}
where $(a)$ follows from Lemma. \ref{lem:update2}, $(b)$ follows from Lemma. \ref{lem:bounded1}, and $(c)$ follows from the update rule shown in Eqs.~\eqref{STIMULUSP1}.

Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$, where $t \leq n_t q-1$, noting that $n_j=n_t$ for $\left(n_t-1\right) q \leq j \leq n_t q-1$, and taking expectation on both sides of the inequality in Eqs.~\eqref{eqs53}, we obtain

\begin{align}\label{eqs54}
  &   \mathbb{E}[f_s(\x_{t+1})]  \notag\\ \stackrel{(a)}{\leq}   &\mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\| \bd_j \|^2\notag\\& + \frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=\left(n_t-1\right) q}^j\eta^2 \mathbb{E}\|\bd_i\|^2 + \sum_{i=\left(n_t-1\right) q}^t\frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2 ]  \notag\\ \stackrel{(b)}{\leq}  & \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t \mathbb{E} \| \bd_j \|^2\notag\\&+ \frac{\eta}{2}  \sum_{i=\left(n_t-1\right) q}^t[\frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=\left(n_t-1\right) q}^t\eta^2 \mathbb{E}\|\bd_i\|^2 ] + \frac{\eta}{2} \sum_{i=\left(n_t-1\right) q}^t\frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2 \notag\\ =&   \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t \mathbb{E} \| \bd_j \|^2\notag\\&+ \frac{\eta^3 q}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\|\bd_j\|^2 ]+ \frac{\eta}{2}\sum_{i=\left(n_t-1\right) q}^t \frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2  \notag\\ \stackrel{(c)}{=}& \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\|\bd_j\|^2 + \frac{\eta}{2} \sum_{i=\left(n_t-1\right) q}^t(\frac{\gamma_{i}}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ),
\end{align}
where $(a)$ follows from Eqs.~\eqref{eqs53}, $(b)$ extends $j$ to $t$, and $(c)$ follows from Eqs.~\eqref{eqs52}.

Recall that $ \gamma_t= \frac{1}{q}  \sum_{i=(n_t-1)q}^{t} \|\bd_i \|^2 $.
We continue the proof by further deriving
%
\begin{align}
  & \mathbb{E}[f_s(\x_{T}) -   f_s(\x_{0})] \notag\\&
  = \mathbb{E}[(   [f_s(\x_{q})] -   [f_s(\x_{0})] ) +  (   [f_s(\x_{2q})] -   [f_s(\x_{q})] ) +\cdots +  (   [f_s(\x_{T})] -   [f_s(\x_{(n_T-1)q})] ) ]
  \notag\\&\stackrel{(a)}{\leq}   -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|} ] \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2+ \frac{\eta}{2} \sum_{t= 0 }^{T-1}(\frac{\mathbb{E}[\gamma_{t}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} )
    \notag\\&\stackrel{(b)}{\leq}   -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|} -\frac{\eta}{2 c_{\gamma}}  ] \sum_{t=0}^{T-1} \mathbb{E}\|\bd_t\|^2+ \frac{\eta}{2} T\frac{\epsilon}{c_{\epsilon}} ,
\end{align}

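where $(a)$ is from Eqs.~\eqref{eqs54}, and $(b)$ follows from $\gamma_t= \frac{1}{q}  \sum_{i=(n_t-1)q}^{t} \|\bd_i \|^2$, which implies $\sum_{t=0}^{T-1}\gamma_t \leq \sum_{t=0}^{T-1}\|\bd_t\|^2$ since each epoch contains at most $q$ iterations.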

Note that $ [ f_s\left(\x_{T}\right) ]\geq f_s^* \triangleq \inf _{\x \in \mathbb{R}^d} f_s(\x)$. Let $c_{\gamma}>4$. Hence, we have
%
\begin{align}
  &
 [\frac{\eta}{8}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}-\frac{\eta}{2 c_{\gamma}} ] \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2\leq   \mathbb{E}[  [f_s(\x_{0})] -  [f_s(\x_{T})]  ]+ \frac{\eta}{2} T\frac{\epsilon}{c_{\epsilon}}\leq   \mathbb{E}[  [f_s(\x_{0})] - f_s^*  ]+ \frac{\eta}{2} T\frac{\epsilon}{c_{\epsilon}}.
\end{align}


\iffalse
We next bound $ \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2$, where $\zeta$ is selected uniformly at random from $\{0, \ldots, T-1\}$. Observe that
\begin{align}
 \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2= \left\|\nabla f_s\left(\x_{\zeta}\right)-\bd_{\zeta}+\bd_{\zeta}\right\|^2 \leq 2  \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2+2  \left\|\bd_{\zeta}\right\|^2.
\end{align}
Next, we bound the two terms on the right hand side of the above inequality. First, note that
\begin{align}
 \left\|\bd_{\zeta}\right\|^2=\frac{1}{T} \sum_{i=0}^{T-1}  \left\|\bd_i\right\|^2 \leq \frac{[  [f_s(\x_{0})] -  [f_s^*]  ]}{T [\frac{\eta}{4}- \frac{\eta q}{2}  \frac{L^2}{|\mathcal{A}|}]}.
\end{align}

On the other hand, we have

\begin{align}
 \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2 \leq 
\end{align}

\fi

Based on the parameter setting $q=|\mathcal{A}|=\lceil\sqrt{n}\rceil$, we have 
\begin{align}
  &
 [\frac{\eta}{8}- \frac{\eta^3 L^2 }{2}  -\frac{\eta}{2 c_{\gamma}} ] \sum_{t=0}^{T-1} \mathbb{E} \|\bd_t\|^2 \leq   \mathbb{E}[  [f_s(\x_{0})] -f_s^* ]+ \frac{\eta}{2} T\frac{\epsilon}{c_{\epsilon}}.
\end{align}
Thus, we have
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 \leq   \frac{\mathbb{E}[  [f_s(\x_{0})] -f_s^* ]}{ [\frac{\eta}{8}- \frac{\eta^3 L^2 }{2}  -\frac{\eta}{2 c_{\gamma}} ] T}+ \frac{\eta}{2} \frac{\epsilon}{c_{\epsilon}}.
\end{align}

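Let $\eta\leq \frac{1}{4L}$, $c_{\gamma}\geq 8$, and $c_{\epsilon}\geq \eta$. Then we have $\frac{\eta}{8}- \frac{\eta^3 L^2 }{2}  -\frac{\eta}{2 c_{\gamma}} \geq \frac{\eta}{8}-\frac{\eta}{32}-\frac{\eta}{16} = \frac{\eta}{32}>0$, so the denominator in the bound above is positive.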






Note that $\frac{1}{T} \sum_{t=0}^{T-1}\mathbb{E} \|\bd_t \|^2$ only measures the norm of the common descent directions.
According to Definition. \ref{def:stationary} shown in the paper, the quantity of interest is 
$\|\sum_{s \in [S]}\lambda_t^s \nabla f_s(\x_t)\|^2$, which we bound as follows:
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2
\stackrel{(a)}{\le}  ( 2S L^2 \eta ^2 +2)  \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs21}.

Then, we can conclude that 
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 
\stackrel{(a)}{\le}
 ( 2S L^2 \eta ^2 +2)
(\frac{\mathbb{E}[  [f_s(\x_{0})] -f_s^* ]}{ [\frac{\eta}{8}- \frac{\eta^3 L^2 }{2}  -\frac{\eta}{2 c_{\gamma}} ] T}+ \frac{\eta}{2} \frac{\epsilon}{c_{\epsilon}}),
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs21} and Eqs. \eqref{eqs20}.




Thus, we have

\begin{align}
  &
\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2 \leq \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 =\mathcal{O}(\frac{1}{T}).
\end{align}




The total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}= T\sqrt{n}+n+T\sqrt{n}=O(n+ \sqrt{n} \epsilon^{-1})$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \epsilon^{-1})$.
	This completes the proof.
 
\end{proof}

\subsection{Proof of Theorem.~\ref{thm:STIMULUSP_SC} [Part 1]}
\begin{proof}

\begin{align} \label{eqs62}
    &f_s(\x_{t+1}) \notag\\\stackrel{(a)}{\le}& f_s(\x_t) + \left< \nabla f_s(\x_t), -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    \stackrel{(b)}{\le}& f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2  + \left< \nabla f_s(\x_t), -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
   = &f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
    =& f_s(\x_*) + \left< \nabla f_s(\x_t)-\bu_t^s, \x_t - \x_* -\eta \bd_t \right>+ \left< \bu_t^s, \x_t - \x_* -\eta \bd_t \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
   \stackrel{(c)}{\le}& f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\delta}{2}\| \x_t - \x_* -\eta \bd_t\|^2 + \left< \bu_t^s, \x_t - \x_* -\eta \bd_t \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
    \stackrel{(d)}{\le}& f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2 \notag\\&+ \left< \bu_t^s, \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2,
\end{align}
where $(a)$ follows from $L$-smoothness, $(b)$ follows from $\mu$-strong convexity, $(c)$ follows from Young's inequality, and $(d)$ follows from the triangle inequality, i.e., $\|\mathbf{a}-\mathbf{b}\|^2\leq 2\|\mathbf{a}\|^2+2\|\mathbf{b}\|^2$.

Then, we have
\begin{align} \label{eqs70}
    & \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{t+1}) - f_s(\x_*) \right]  \\
    \stackrel{(a)}{\le}& \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \left< \sum_{s \in [S]} \lambda_t^{s} \bu_t^s, -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2 \\
    %
    =&\frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2 \\
    %
     \stackrel{(b)}{\le}&\frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\& +\left< \bd_t , \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2 \\
    %
     =& \left< \bd_t , \x_t - \x_* \right> - \eta \| \bd_t \|^2 - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \eta^2 \| \bd_t \|^2 \notag\\&+ \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\
     \stackrel{(c)}{=}&\frac{1}{2 \eta} \left( \| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - \frac{1}{2} \eta \| \bd_t \|^2 - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \eta^2 \|\bd_t \|^2 \notag\\
    &+ \frac{4}{\mu}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\mu}{8}\| \x_t - \x_* \|^2+\frac{\mu}{8}\|\eta \bd_t\|^2\\
    =&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \bd_t \|^2 \notag\\&
    + \frac{4}{\mu}\| \nabla f_s(\x_t)-\bu_t^s\|^2\\
     \stackrel{(d)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \bd_t \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 +\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2)\\
     \stackrel{(e)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \bd_t \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 ) +\frac{\mu}{4}\frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2  .
\end{align}
where $(a)$ follows from Eqs.~\eqref{eqs62}, $(b)$ follows from the definition $\bd_t=\sum_{s \in [S]} \lambda_{t}^{s} \mathbf{u}_{t}^s$ as shown in Line 14 in Algorithm. \ref{alg}, $(c)$ is because $\|\x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 = - \eta^2 \| \bd_t \|^2 + 2 \left< \eta \bd_t , \x_t - \x_* \right>$ together with the choice
$\delta = \frac{\mu}{8}$, $(d)$ is from Lemma. \ref{lem:bounded1}, and $(e)$ is from Eqs.~\eqref{eqs52}.

Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$ where $t \leq n_t q-1$ and noting that for $\left(n_t-1\right) q \leq j \leq n_t q-1, n_j=n_t$, we obtain

\begin{align} \label{eqs71}
    & \sum_{i=\left(n_t-1\right) q}^t \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  \notag \\
   \stackrel{(a)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \bd_i \|^2 
    + \frac{4}{\mu  }( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_j-1\right) q}^j\|\x_{i+1}-\x_{i}\|^2 )\notag\\
    &+ \frac{\mu S}{4}\sum_{i=\left(n_t-1\right) q}^t\frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2 \notag \\
    \stackrel{(b)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \bd_i \|^2 
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_t-1\right) q}^t\|\x_{i+1}-\x_{i}\|^2 )\notag\\
    &+\frac{\mu}{4} \sum_{i=\left(n_t-1\right) q}^t\frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2 \notag \\
   \stackrel{(c)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{i=\left(n_t-1\right) q }^t\|\bd_{i}\|^2 )\notag\\
    &+ \frac{\mu}{4} \sum_{i=\left(n_t-1\right) q}^t(\frac{[\gamma_i]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ),
\end{align}
where $(a)$ follows from Eqs.~\eqref{eqs70} and the fact that $\lambda_t^s \leq 1, \forall s\in [S]$, $(b)$ extends $j$ to $t$, and $(c)$ is because $t-(n_t -1)q\leq q$, together with Eqs.~\eqref{eqs52}.
We continue the proof by further deriving

\begin{align}\label{eqs72}
    & \sum_{i=0}^{T} \sum_{s \in [S]} \lambda_t^{s}\left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag\\ = & \sum_{i=0}^q\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] +  \sum_{i=q}^{2q}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*)\right]+\notag\\&\cdots+ \sum_{i=(n_T-1)q}^{T}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag \\
\stackrel{(a)}{\le} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} -\frac{\mu}{4 c_{\gamma}} )\sum_{t=0}^{T}\|\bd_{i}\|^2 + \frac{\mu}{4} T\frac{\epsilon}{c_{\epsilon}},
\end{align}
where $(a)$ follows from Eqs.~\eqref{eqs71} and $\gamma_t= \frac{1}{q}  \sum_{i=(n_t-1)q}^{t} \|\bd_i \|^2$.


Next, we have
\begin{align}
    & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_i) - f_s(\x_*) \right] \notag\\ = & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) -  f_s(\x_{i+1}) + f_s(\x_i)  \right]  \notag\\
    %
   \le &\sum_{t=0}^{T}  \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  +\sum_{t=0}^{T} \sum_{s \in [S]} \lambda_t^{s}  | f_s(\x_{i+1}) - f_s(\x_i) |\notag\\
 \stackrel{(a)}{\le} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] -\frac{\mu}{4 c_{\gamma}} )\sum_{t=0}^{T}  \|\bd_i\|^2+\frac{\mu}{4} T\frac{\epsilon}{c_{\epsilon}},
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs72}.

Let $|\mathcal{A}|=q= \lceil\sqrt{n}\rceil $ and $\eta \leq \min\{\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \},c_{\gamma}\geq \frac{8\mu}{\eta}$, we have $(\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}]-\frac{\mu}{4 c_{\gamma}} )> \frac{\eta}{32}>0$
    
    Thus, we have
\begin{align}
    & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_i) - f_s(\x_*) \right]\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2\right).
\end{align}

Then, we have 

\begin{align}
    & \mathbb{E}  [\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_t) - f_s(\x_*) \right] ]\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \mathbb{E} \| \x_t - \x_* \|^2 -  \mathbb{E}\| \x_{t+1} - \x_* \|^2\right)+\frac{\mu}{4} T\frac{\epsilon}{c_{\epsilon}}.
\end{align}
We average the iterates using weights $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$ and use these weights to pick the output $\x$.
By using Lemma 1 in~\cite{Karimireddy2020SCAFFOLD} with $\eta \geq \frac{1}{\mu R}, c_{\epsilon}>\frac{\mu}{2}$ and Assumption. \ref{assump: add}, we have

\begin{align}
   \mathbb{E}\|\x_t-\x_*\|^2 &\leq \| \x_0 - \x_* \|^2 \mu \exp( - \frac{3 \eta \mu T}{4}) +\frac{\mu}{4} T\frac{\epsilon}{c_{\epsilon}}\\
    &= \mathcal{O}(\mu \exp( - \mu T)).
\end{align}

Then we have the convergence rate $   \mathbb{E}\|\x_t-\x_*\|^2 = \mathcal{O}(\mu \exp( - \mu T))$.


The total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}= T\sqrt{n}+n+T\sqrt{n}=\mathcal{O}(n+ \sqrt{n} \ln (\mu/\epsilon))$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \ln (\mu/\epsilon))$.
	This completes the proof.
\end{proof}

 
\section{Proof of convergence of \algmp} \label{appdx:VRMP_nonconvex}



 

 
\textbf{Proof of Theorem.~\ref{thm:STIMULUSmp_nonC} [Part 2]}

\begin{proof}
From Lemma. \ref{lem:update}, we have

\begin{align}\label{eqs78}
    & [f_s(\x_{t+1})]  \notag\\ \stackrel{(a)}{\le}&  [f_s(\x_t)] +    \frac{\eta}{2}  \sum_{i=0}^t \alpha^{(t-i)} \| \nabla f_s(\x_i) - \bu_i^s \|^2 - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)} \| \bd_i \|^2+ \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2\notag\\
\stackrel{(b)}{\le} &  [f_s(\x_t)]  - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)} \| \bd_i \|^2+ \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2\notag\\
   &+ \frac{\eta}{2}  \sum_{j=0}^t  \alpha^{(t-j)} [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^j \|\x_{i+1}-\x_{i}\|^2 +\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2] \notag\\
    \stackrel{(c)}{\le} & [f_s(\x_t)] - \frac{1}{2} \eta \sum_{i=0}^t \alpha^{(t-i)} \| \bd_i \|^2+ \frac{1}{2}L \| \x_{t+1}-\x_{t} \|^2+ \frac{\eta}{2}  \sum_{j=0}^t  \alpha^{(t-j)}  [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^j \|\x_{i+1}-\x_{i}\|^2] \notag\\&+ \frac{\eta}{2} \sum_{i=0}^t\alpha^{(t-i)} (\frac{\gamma_{i}}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ),
\end{align}
where $(a)$ follows from Lemma \ref{lem:update}. $(b)$ follows from Lemma. \ref{lem:bounded1}. $(c)$ follows from Eqs. \eqref{eqs52}.

Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$ where $t \leq n_t q-1$ and noting that for $\left(n_t-1\right) q \leq j \leq n_t q-1, n_j=n_t$ and let $\eta\leq \frac{1}{4L}$, we obtain

\begin{align}
  &   [f_s(\x_{t+1})]  \notag\\   \stackrel{(a)}{\le} &   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2  \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [\frac{L^2}{|\mathcal{A}|} \sum_{r=\left(n_t-1\right) q}^i \|\x_{r+1}-\x_{r}\|^2]  \notag\\&+ \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
\stackrel{(b)}{\le} &   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)}  \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [\frac{L^2}{|\mathcal{A}|} q \|\x_{j+1}-\x_{j}\|^2]  \notag\\&+ \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
\stackrel{(c)}{=}&   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [ L^2 \|\x_{j+1}-\x_{j}\|^2] \notag\\&+ \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
 \stackrel{(d)}{\le} &   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [ L^2 \|\eta \sum_{r=0}^{j} \alpha^{(j-r)} \bd_r\|^2]   \notag\\&+ \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
       = &   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{3(j-i)}  [ L^2\eta ^2 \|\bd_i\|^2]  \notag\\&+  \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
            \stackrel{(e)}{\le}&   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2 \notag\\& + \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=0}^j  \alpha^{(j-i)}  [ L^2\eta ^2 \|\bd_i\|^2]  \notag\\&+  \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
            \stackrel{(f)}{\le}&   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j\alpha^{(j-i)}\| \bd_i \|^2+ \frac{1}{2}L   \sum_{i=\left(n_t-1\right) q}^t \| \x_{i+1}-\x_i \|^2  \notag\\&+  \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) \notag\\
                 \stackrel{(g)}{\le}&   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j \alpha^{(j-i)} \| \bd_i \|^2+ \frac{1}{2}L   \sum_{j=\left(n_t-1\right) q}^t \| \eta \sum_{i=0}^{j} \alpha^{(j-i)} \bd_j \|^2 \notag\\&+  \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} )  \notag\\
               \stackrel{(h)}{\le} &   [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{8} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=0}^j\alpha^{(j-i)} \| \bd_i \|^2 +  \frac{\eta}{2} \sum_{j=\left(n_t-1\right) q}^t\sum_{i=0}^j \alpha^{(j-i)}(\frac{[\gamma_{i}]}{c_{\gamma}}+\frac{\epsilon}{c_{\epsilon}} ) ,
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs78}. $(b)$ follows from $i\leq n_t q$. $(c)$ follows from $q=|\mathcal{A}|=\lceil\sqrt{n}\rceil$. $(d)$ and $(g)$ follow from the update rule of $\x_t$ shown in Line 19 in Algorithm. \ref{alg}.
$(e)$ follows from $0<\alpha<1$, which gives $\alpha^{3(j-i)}\leq\alpha^{(j-i)}$. $(f)$ and $(h)$ follow from $\eta\leq \frac{1}{4L}$.
Recall that $ \gamma_t= \frac{1}{q}  \sum_{i=(n_t-1)q}^{t} \|\bd_i \|^2 $. Then, we have
%
\begin{align}
  &   \mathbb{E}[f_s(\x_{T})] -   [f_s(\x_{0})]  \notag\\&
  =\mathbb{E}[(   [f_s(\x_{q})] -   [f_s(\x_{0})] ) +  (   [f_s(\x_{2q})] -   [f_s(\x_{q})] ) +\cdots +  (   [f_s(\x_{T})] -   [f_s(\x_{(n_T-1)q})] ) ]
  \notag\\& \stackrel{(a)}{\le} -[\frac{\eta}{8}] \sum_{t=0}^{T-1}  \sum_{i=0}^j\alpha^{(j-i)} \mathbb{E} \|\bd_t\|^2 +\frac{\eta}{2 c_{\gamma}}   \sum_{t=0}^{T-1} \sum_{i=0}^j\alpha^{(j-i)} \mathbb{E} \|\bd_t\|^2+ \frac{\eta}{2} T q \frac{\epsilon}{c_{\epsilon}} 
    \notag\\& 
 \stackrel{(b)}{\le} -[\frac{\eta}{16}] \sum_{t=0}^{T-1}  \sum_{i=0}^j\alpha^{(j-i)} \mathbb{E} \|\bd_t\|^2 + \frac{\eta}{2} T q \frac{\epsilon}{c_{\epsilon}}  
    \notag\\& \stackrel{(c)}{\le}-[\frac{\eta}{16}] \sum_{t=0}^{T-1}   \mathbb{E} \|\bd_t\|^2 + \frac{\eta}{2} T q \frac{\epsilon}{c_{\epsilon}} ,
\end{align}
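where $(a)$ follows from the per-epoch bound derived above together with the definition of $\gamma_t$, $(b)$ follows from $c_{\gamma}\geq 8$, and $(c)$ follows from $0<\alpha<1$.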

Note that $ [ f_s\left(\x_{T}\right) ]\geq f_s^* \triangleq \inf _{\x \in \mathbb{R}^d} f_s(\x)$. Hence, we have
%
\begin{align}
  &
 [\frac{\eta}{16}] \sum_{t=0}^{T-1}  \|\bd_t\|^2\leq   [  [f_s(\x_{0})] -  [f_s(\x_{T})]  ]\leq   [  [f_s(\x_{0})] - f_s^*  ].
\end{align}


\iffalse
We next bound $ \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2$, where $\zeta$ is selected uniformly at random from $\{0, \ldots, T-1\}$. Observe that
\begin{align}
 \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2= \left\|\nabla f_s\left(\x_{\zeta}\right)-\bd_{\zeta}+\bd_{\zeta}\right\|^2 \leq 2  \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2+2  \left\|\bd_{\zeta}\right\|^2.
\end{align}
Next, we bound the two terms on the right hand side of the above inequality. First, note that
\begin{align}
 \left\|\bd_{\zeta}\right\|^2=\frac{1}{T} \sum_{i=0}^{T-1}  \left\|\bd_i\right\|^2 \leq \frac{[  [f_s(\x_{0})] -  [f_s^*]  ]}{T [\frac{\eta}{4}- \frac{\eta q}{2}  \frac{L^2}{|\mathcal{A}|}]}.
\end{align}

On the other hand, we have

\begin{align}
 \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2 \leq 
\end{align}

\fi

Based on the parameter setting $q=|\mathcal{A}|=\lceil\sqrt{n}\rceil$, we have 
\begin{align}
  &
 [\frac{\eta}{16}  ] \sum_{t=0}^{T-1}  \|\bd_t\|^2 \leq   [  [f_s(\x_{0})] -f_s^* ].
\end{align}
Thus, we have
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \|\bd_t\|^2 \leq   \frac{[  [f_s(\x_{0})] -f_s^* ]}{ [\frac{\eta}{16} ] T}.
\end{align}




Note that $\frac{1}{T} \sum_{t=0}^{T-1}\mathbb{E} \|\bd_t \|^2$ only measures the norm of the common descent directions.
According to Definition. \ref{def:stationary} shown in the paper, the quantity of interest is 
$\|\sum_{s \in [S]}\lambda_t^s \nabla f_s(\x_t)\|^2$, which we bound as follows:
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2
\stackrel{(a)}{\le}  ( 2S L^2 \eta ^2 +2)  \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs21}.

Then, we can conclude that 

\begin{align}
  &
\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2 \leq \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 =\mathcal{O}(\frac{1}{T}).
\end{align}




The total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}= T\sqrt{n}+n+T\sqrt{n}=O(n+ \sqrt{n} \epsilon^{-1})$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \epsilon^{-1})$.
	This completes the proof.
 
\end{proof}

\subsection{Proof of Theorem.~\ref{thm:STIMULUSP_SC} [Part 2]}
\begin{proof}

\begin{align} \label{eqs86}
    &f_s(\x_{t+1}) \notag\\  \stackrel{(a)}{\le}& f_s(\x_t) + \left< \nabla f_s(\x_t),-\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\
      \stackrel{(b)}{\le}& f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2  + \left< \nabla f_s(\x_t), -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right>\notag\\& + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    = &f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    =& f_s(\x_*) + \left< \nabla f_s(\x_t)-\bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right>+ \left< \bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
    \stackrel{(c)}{\le}& f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\delta}{2}\| \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2 \notag\\&+ \left< \bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2\notag\\
     \stackrel{(d)}{\le} & f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2 \notag\\&+ \left< \bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2,
\end{align}
where $(a)$ follows from the $L$-smoothness assumption, $(b)$ follows from $\mu$-strong convexity, $(c)$ follows from Young's inequality, and $(d)$ follows from the triangle inequality.


\begin{align} \label{eqs88}
    & \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{t+1}) - f_s(\x_*) \right]  \\
       \stackrel{(a)}{\le} & \frac{1}{2\delta} \sum_{s \in [S]} \lambda_t^{s} \| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \left< \sum_{s \in [S]} \lambda_t^{s} \bu_t^s, -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> \notag\\&+ \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2  \notag\\
    %
    =&\frac{1}{2\delta} \sum_{s \in [S]} \lambda_t^{s} \| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 \notag\\&+ \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2  \notag\\
    %
    =&\frac{1}{2\delta} \sum_{s \in [S]} \lambda_t^{s} \| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2\notag\\& +\left< \bd_t , \x_t - \x_* -\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2  \notag\\
  \stackrel{(b)}{\le} &\frac{1}{2 \eta} \left( \| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - \frac{1}{2} \eta \|  \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 - \frac{\mu}{2} \| \x_t - \x_* \|^2 \notag\\&+ \frac{1}{2}L \|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2 \notag\\
    &+ \frac{4}{\mu}\sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\mu}{8}\| \x_t - \x_* \|^2+\frac{\mu}{8}\|\eta \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i\|^2 \notag\\
   = &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\&
    + \frac{4}{\mu}\sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2 \notag\\
    \stackrel{(c)}{\leq}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 +\sum_{s \in [S]} \lambda_t^{s}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2) \notag\\
      \stackrel{(d)}{\leq} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \sum_{i=0}^{t} \alpha^{(t-i)} \bd_i \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 )+\frac{\mu S }{4}\frac{I_{(\mathcal{N}_s <n)}}{\mathcal{N}_s} \sigma^2 .
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs86}; $(b)$ follows from $\|\x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 = - \eta^2 \| \bd_t \|^2 + 2 \left< \eta \bd_t , \x_t - \x_* \right>$ and the choice
$\delta = \frac{\mu}{8}$; $(c)$ is from Lemma. \ref{lem:bounded1}; and $(d)$ is from Eqs. \eqref{eqs52} together with $0<\lambda_t^s<1, \forall s\in[S]$.



Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$ where $t \leq n_t q-1$ and noting that for $\left(n_t-1\right) q \leq j \leq n_t q-1, n_j=n_t$, we obtain

\begin{align}
    & \sum_{i=\left(n_t-1\right) q}^t \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  \notag \\
 \stackrel{(a)}{\leq}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \sum_{i=0 }^t \alpha^{(t-i)} \bd_i \|^2 
   \notag\\& + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_j-1\right) q}^j\|\x_{i+1}-\x_{i}\|^2 ) \notag \\&+\frac{\mu}{4 c_{\gamma}} \sum_{i=\left(n_t-1\right) q }^t\|\alpha^{(t-i)}\bd_{i}\|^2 +\frac{\mu}{4} \sum_{i=\left(n_t-1\right) q}^t\frac{\epsilon}{c_{\epsilon}}\notag \\
 \stackrel{(b)}{\leq} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \sum_{t=0}^{T} \alpha^{(t-i)} \bd_i \|^2 
   \notag\\& + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_t-1\right) q}^t\|\x_{i+1}-\x_{i}\|^2 )\notag\\&+\frac{\mu}{4 c_{\gamma}} \sum_{i=\left(n_t-1\right) q }^t\|\alpha^{(t-i)}\bd_{i}\|^2 +\frac{\mu}{4} \sum_{i=\left(n_t-1\right) q}^t\frac{\epsilon}{c_{\epsilon}}\notag \\
 \stackrel{(c)}{\leq}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) + \frac{\mu}{4} \sum_{i=\left(n_t-1\right) q}^t\frac{\epsilon}{c_{\epsilon}}\notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{i=\left(n_t-1\right) q }^t\|\sum_{t=0}^{T} \alpha^{(t-i)} \bd_i\|^2 \notag\\&+\frac{\mu}{4 c_{\gamma}} \sum_{i=\left(n_t-1\right) q }^t\|\alpha^{(t-i)}\bd_{i}\|^2,
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs88}, $(b)$ extends the upper limit of the inner sum from $j$ to $t$, and $(c)$ follows from $t\leq n_t q -1$.

We continue the proof by further deriving
\begin{align} \label{eqs90}
    & \sum_{i=0}^{T} \sum_{s \in [S]} \lambda_t^{s}\left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag\\ = & \sum_{i=0}^{q-1}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] +  \sum_{i=q}^{2q-1}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*)\right]+\cdots+ \sum_{i=(n_T-1)q}^{T}\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag \\
  \leq &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{i=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{t=0}^{T}\|\sum_{t=0}^{T} \alpha^{(t-i)} \bd_i\|^2 \notag\\&+\frac{\mu}{4 c_{\gamma}} \sum_{t=0}^{T}\|\alpha^{(t-i)} \bd_{i}\|^2 + \frac{\mu}{4} T\frac{\epsilon}{c_{\epsilon}}.
\end{align}

Next, we have
\begin{align}
    & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_i) - f_s(\x_*) \right] \notag\\ = & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) -  f_s(\x_{i+1}) + f_s(\x_i)  \right]  \notag\\
    %
    \leq &\sum_{t=0}^{T}  \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  +\sum_{t=0}^{T} \sum_{s \in [S]} \lambda_t^{s}  | f_s(\x_{i+1}) - f_s(\x_i) |\notag\\
     \stackrel{(a)}{\leq} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] -\frac{\mu}{4 c_{\gamma}})\sum_{t=0}^{T}  \|\alpha^{(t-i)} \bd_i\|^2+ \frac{\mu}{4} T\frac{\epsilon}{c_{\epsilon}},
\end{align}
where $(a)$ follows from Eqs. \eqref{eqs90}.
Let $|\mathcal{A}|=q= \lceil\sqrt{n}\rceil $, $\eta \leq \min\{\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \}$, $c_{\gamma}\geq \frac{8\mu}{\eta}$, and $c_{\epsilon}\geq \frac{\mu}{2}$. Then we have $(\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] -\frac{\mu}{4 c_{\gamma}})> \frac{\eta}{32}>0$.
    
    Thus, we have
\begin{align}
    & \sum_{t=0}^{T}   \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_i) - f_s(\x_*) \right]\notag\\&\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{t=0}^{T}\| \x_{i+1} - \x_* \|^2\right) +\frac{\epsilon}{2}.
\end{align}

Then, we have 

\begin{align}
    & \mathbb{E}  [\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_t) - f_s(\x_*) \right] ]\notag\\&\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \mathbb{E} \| \x_t - \x_* \|^2 -  \mathbb{E}\| \x_{t+1} - \x_* \|^2\right)+\frac{\epsilon}{2}.
\end{align}
Based on Assumption. \ref{assump: add}, we average the iterates using the weights $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$ and use these weights to pick the output $\x$.
By using Lemma 1 in \cite{Karimireddy2020SCAFFOLD} with $\eta \geq \frac{1}{\mu R}$, we have

\begin{align}
    \mathbb{E}\|\x_t-\x_*\|^2 &\leq \| \x_0 - \x_* \|^2 \mu \exp( - \frac{3 \eta \mu T}{4}) \\
    &= \mathcal{O}(\mu \exp( - \mu T)).
\end{align}

Then we have the convergence rate $   \mathbb{E}\|\x_t-\x_*\|^2  = \mathcal{O}(\mu \exp( - \mu T))$.

The total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n} \leq T\sqrt{n}+n+T\sqrt{n}=\mathcal{O}(n+ \sqrt{n} \ln ({\mu/\epsilon}))$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \ln ({\mu/\epsilon}))$.
	This completes the proof. 

\end{proof}