% !TEX root = main.tex


\newpage
 \onecolumn
\section{Proof of convergence of \algns} \label{appdx:VR_MOO_nonconvex}


 \begin{table}[htbp]
     \centering
    \caption{List of key notation.}
    \label{tab:list of notations}
    {
    \begin{tabular}{l|l}
        \hline
        Notation      & Definition                               \\ \hline
        $ n $         & Total number of samples per task         \\ \hline
        $ s $         & Objective/task index          \\ \hline
        $ S $         & Total number of objectives/tasks       \\ \hline   
        $ t $         & Iteration number index    \\ \hline
        $ T $         & Total number of iterations     \\ \hline
        $ \x \in \mathbb{R}^d $         & Model parameters in Problem~\eqref{eq: moo}       \\ \hline
        $ \x_* \in \mathbb{R}^d $         & A Pareto-optimal solution of Problem~\eqref{eq: moo}       \\ \hline
        $ \eta $         & The learning rate     \\ \hline
        $ \alpha$         & The momentum constant   \\ \hline   
$\epsilon$ & 
      Stationarity error in  Def. \ref{def:stationary}\\ \hline   
      $\mu$ & 
      Strong-convexity constant in Assumption~\ref{assump: SC}\\ \hline  
    \end{tabular}
    }
\end{table}



For clarity of notation, we drop $*$ from $\lambda$, that is, we use $\lambda_t^s$ to represent the solution of the quadratic problem for task $s$ in the $t$-th round.


\iffalse
 {\color{blue} 
\begin{lem} \label{non_nagative_metric} 
In the strongly convex case, there always exists a set $\{ \lambda_t^s>0, \forall s \in [S]: \sum _ {s=1}^{S} \lambda _ t^s =1\}$ such that $\mathbb{E}[\sum _ {s \in [S]} \lambda _ t^s [ f _ s(\mathbf{x} _ t) - f _ s(\mathbf{x} _ *) ] ] \geq 0$.
\end{lem}

\begin{proof} 


From strong convexity, we have $f _ s(\mathbf{x} _ t) \geq f _ s(\mathbf{x} _ *) + \nabla f _ s^{\top}(\mathbf{x} _ *)(\mathbf{x} _ t-\mathbf{x} _ *) +\frac{\mu}{2}||\mathbf{x} _ t-\mathbf{x} _ *||^2$ for some $\mu>0$. 

Thus, for any $\lambda _ t^s>0$, $\forall s \in [S]$ with $\sum _ {s=1}^{S} \lambda _ t^s =1$, we have

\begin{align}
   & \sum _ {s\in[S]} \lambda _ t^s[f _ s(\mathbf{x} _ t) - f _ s(\mathbf{x} _ *)]\notag \\
\geq &\sum _ {s \in [S]} \lambda _ t^s \nabla f _ s^{\top}(\mathbf{x} _ *)(\mathbf{x} _ t-\mathbf{x} _ *) + \frac{\mu}{2}||\mathbf{x} _ t-\mathbf{x} _ *||^2\notag\\=& \sum _ {s \in [S]} \lambda _ t^s \nabla f _ s^{\top}(\mathbf{x} _ *) \mathbf{d} + \frac{\mu}{2}||\mathbf{x} _ t-\mathbf{x} _ *||^2,
\end{align}

where we define $\mathbf{d} \triangleq \mathbf{x} _ t-\mathbf{x} _ *$ in the last equality for convenience. 

Since $\mathbf{x} _ *$ is Pareto-stationary and all objective functions are strongly convex, it follows that $\mathbf{x} _ *$ is also Pareto-optimal. Thus, there must exist at least one $\tilde{s} \in [S]$ such that $\nabla f _ {\tilde{s}}^{\top}(\mathbf{x} _ *) \mathbf{d} > 0$.

Now, consider the following strategy for choosing $\lambda _ t^s$, $\forall s\in [S]$: For $\tilde{s}$, we choose a $\lambda _ t^{\tilde{s}}$-value that is close to 1. For all other $s \ne \tilde{s}$, we choose a small $\lambda _ t^s$-value that is close to 0. Then, by pushing the $\lambda _ t^{\tilde{s}}$-value toward 1 and other $\lambda _ t^s$-values towards 0 (whiling maintaining $\sum _ {s=1}^{S} \lambda _ t^s =1$), we can always make $\sum _ {s\in[S]} \lambda _ t^s[f _ s(\mathbf{x} _ t) - f _ s(\mathbf{x} _ *)]$ non-negative.

This completes the proof. 
\end{proof}
}
\fi
\begin{lem} \label{lem:bounded1}
Let Assumption 1 hold. The gradient estimator $\bu_t^s$ satisfies for all $(n_t -1)q + 1 \leq t \leq  n_t q - 1$:
    \begin{align}
 \mathbb{E}_t\| \nabla f_s(\x_t) - \bu_t^s \|^2 \leq \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \mathbb{E}\|\x_{i+1}-\x_{i}\|^2 +\mathbb{E}_t\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2 .
    \end{align}
\end{lem}


\textbf{Proof of Lemma. \ref{lem:bounded1}.}
\begin{proof}
From Lemma 1 in \cite{fang2018spider}, we have
%
 \begin{align}
 \mathbb{E}_t\| \nabla f_s(\x_t)  &- \bu_t^s \|^2 \stackrel{(a)}{=}   \mathbb{E}_t\| \nabla f_s(\mathbf{x}_{t-1}) - \mathbf{u}_{t-1}^s \|^2 \notag\\+& \mathbb{E}_t \| \frac{1}{|\mathcal{A}|} \sum_{j\in \mathcal{A}}\left( \nabla f_{sj} (\mathbf{x}_{t};\xi_{sj}  ) - \nabla f_{sj} (\mathbf{x}_{t-1};\xi_{sj} )  +\nabla f_s(\mathbf{x}_{t-1})-\nabla f_s(\mathbf{x}_{t}) \right) \|^2\notag\\
\stackrel{(b)}{\le} & \mathbb{E}_t\| \nabla f_s(\mathbf{x}_{\left(n_t-1\right) q}) - \mathbf{u}_{\left(n_t-1\right) q}^s \|^2 +  L^2 \sum_{i=\left(n_t-1\right) q}^t\frac{1}{|\mathcal{A}|} \mathbb{E}\|\mathbf{x}_{i+1}-\mathbf{x}_{i}\|^2.
\end{align}

$(a)$ stems from Proposition 1 in \cite{fang2018spider}, where the expectation of the gradient difference is broken down.
$(b)$ leverages Eq. (2.3) from \cite{fang2018spider}, applying a bound based on the Lipschitz continuity of the gradient.

Telescoping the above recursion from $\left(n_t-1\right) q+1$ to $t$, where $t \leq n_t q-1$, we obtain that 
 \begin{align}
 &\mathbb{E}_t\| \nabla f_s(\x_t) - \bu_t^s \|^2  \leq \mathbb{E}_t\| \nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2 +  L^2 \sum_{i=\left(n_t-1\right) q}^t\frac{1}{|\mathcal{A}|} \mathbb{E}\|\x_{i+1}-\x_{i}\|^2
     \end{align}

Then, we have
 \begin{align}
\mathbb{E}_t \| \nabla f_s(\x_t) - \bu_t^s \|^2 \leq \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \mathbb{E}\|\x_{i+1}-\x_{i}\|^2 +\mathbb{E}_t\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2 .
    \end{align}
    
\end{proof}
 

 

\begin{lem} \label{lem:update2}
For general $L$-smooth functions $\{ f_s, s \in [S] \}$, if the learning rate $\eta$ satisfies $\eta \leq  \frac{1}{2L}$, then the update $\bd_t$ of the algorithm satisfies:
    \begin{align}
   f_s(\x_{t+1})   &\leq  f_s(\x_t) + \frac{\eta}{2}  \| \nabla f_s(\x_t) - \bu_t^s \|^2 -\frac{\eta}{4} \| \bd_t \|^2.
\end{align}
\end{lem}


\textbf{Proof of Lemma. \ref{lem:update2}.}
\begin{proof}


\begin{align}
    f_s(\x_{t+1}) &\stackrel{(a)}{\le}  f_s(\x_t) + \left< \nabla f_s(\x_t), -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    &=  f_s(\x_t) - \eta\left< \nabla f_s(\x_t) - \bu_t^s, \bd_t \right> - \eta \left< \bu_t^s, \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    &\stackrel{(b)}{\le} f_s(\x_t) -\eta\left< \nabla f_s(\x_t) - \bu_t^s,  \bd_t \right> - \eta \| \bd_t \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    &\stackrel{(c)}{\le}  f_s(\x_t) + \frac{\eta}{2} \notag\| \nabla f_s(\x_t) - \bu_t^s \|^2 + \frac{1}{2} \eta \| \bd_t \|^2 - \eta \| \bd_t \|^2 + \frac{1}{2}L \eta^2 \| \bd_t \|^2 \notag\\
    &= f_s(\x_t) + \frac{\eta}{2}  \| \nabla f_s(\x_t) - \bu_t^s \|^2 - \eta \left( \frac{1}{2}- \frac{1}{2}L \eta \right) \| \bd_t \|^2.
\end{align}
$(a)$ follows from the $L$-smoothness of the objective function $f_s$. $(b)$ follows from $\left< \bu_t^s, \bd_t \right> \geq \| \bd_t \|^2$ since $\bd_t$ is a general solution in the convex hull of the family of vectors $\{\bu_t^s, s \in [S] \}$ (see Lemma 2.1~\cite{desideri2012multiple}). $(c)$ follows from Young's inequality, $-\left< \mathbf{a}, \mathbf{b} \right> \leq \frac{1}{2}\|\mathbf{a}\|^2 + \frac{1}{2}\|\mathbf{b}\|^2$.

By setting $\left( \frac{1}{2} - \frac{L}{2} \eta \right) \geq \frac{1}{4}$, that is, $\eta \leq  \frac{1}{2L}$,  we have 
\begin{align}
   f_s(\x_{t+1})   &\leq  f_s(\x_t) + \frac{\eta}{2}  \| \nabla f_s(\x_t) - \bu_t^s \|^2 -\frac{\eta}{4} \| \bd_t \|^2.
\end{align}
\end{proof}


\textbf{Proof of Theorem.~\ref{thm:STIMULUS_nonC}}

\begin{proof}
Taking expectation on both sides of the inequality in Lemma. \ref{lem:update2}, we have

\begin{align}
  &  \mathbb{E}[f_s(\x_{t+1})] \stackrel{(a)}{\le}  \mathbb{E}[f_s(\x_t)] + \frac{\eta}{2}   \mathbb{E}\| \nabla f_s(\x_t) - \bu_t^s \|^2 -\frac{\eta}{4}  \mathbb{E}\| \bd_t \|^2\notag\\&
 \stackrel{(b)}{\le}  \mathbb{E}[f_s(\x_t)] -\frac{\eta}{4}  \mathbb{E}\| \bd_t \|^2+ \mathbb{E}\frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \mathbb{E}\|\x_{i+1}-\x_{i}\|^2 +\mathbb{E}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2] \notag\\&
\stackrel{(c)}{=}  \mathbb{E}[f_s(\x_t)] -\frac{\eta}{4} \mathbb{E} \| \bd_t \|^2+ \frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \eta^2 \mathbb{E}\|\bd_i\|^2 ].
\end{align}
$(a)$ follows from Lemma~\ref{lem:update2}. $(b)$ follows from Lemma~\ref{lem:bounded1}. $(c)$ follows from the update rule of $\x$ as shown in Eq.~\eqref{STIMULUSP1} and $\mathbb{E}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2=0$ as shown in Line 5 in Algorithm~\ref{alg}.

Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$ where $t \leq n_t q-1$ and noting that for $\left(n_t-1\right) q \leq j \leq n_t q-1, n_j=n_t$, we obtain

\begin{align} \label{eqs16}
  &   \mathbb{E}[f_s(\x_{t+1})]  \notag\\&\leq   \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\| \bd_j \|^2+ \frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=\left(n_t-1\right) q}^j\eta^2 \mathbb{E}\|\bd_i\|^2 ]  \notag\\& \stackrel{(a)}{\le}   \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\| \bd_j \|^2+ \frac{\eta}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t \sum_{i=\left(n_t-1\right) q}^t\eta^2 \mathbb{E}\|\bd_i\|^2 ] \notag\\& \stackrel{(b)}{\le}   \mathbb{E} [f_s(\x_{\left(n_t-1\right) q})] -\frac{\eta}{4} \sum_{j=\left(n_t-1\right) q}^t \mathbb{E} \| \bd_j \|^2+ \frac{\eta^3 q}{2}  [\frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\|\bd_j\|^2 ] \notag\\&= \mathbb{E}[f_s(\x_{\left(n_t-1\right) q})] -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] \sum_{j=\left(n_t-1\right) q}^t  \mathbb{E}\|\bd_j\|^2.
\end{align}

where $(a)$ extends the summation of the third term from $j$ to $t$, $(b)$ follows from the fact that
$t \leq n_t q - 1$. 


We continue the proof by further deriving
%
\begin{align}
  &   \mathbb{E}[f_s(\x_{T})] -   \mathbb{E}[f_s(\x_{0})]  \notag\\&
  =(   \mathbb{E}[f_s(\x_{q})] -   \mathbb{E}[f_s(\x_{0})] ) +  (   \mathbb{E}[f_s(\x_{2q})] -   \mathbb{E}[f_s(\x_{q})] ) +\cdots +  (   \mathbb{E}[f_s(\x_{T})] -   \mathbb{E}[f_s(\x_{(n_T-1)q})] ) 
  \notag\\&\leq  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2
\end{align}

Note that $\mathbb{E} [ f_s\left(\x_{T}\right) ]\geq f_s^* \triangleq \inf _{\x \in \mathbb{R}^d} f_s(\x)$. Hence, we have
%
\begin{align}
  &
 [\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2\leq  [  \mathbb{E}[f_s(\x_{0})] -  \mathbb{E}[f_s(\x_{T})]  ]\leq  [  [f_s(\x_{0})] - f_s^*  ].
\end{align}


\iffalse
We next bound $ \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2$, where $\zeta$ is selected uniformly at random from $\{0, \ldots, T-1\}$. Observe that
\begin{align}
 \left\|\nabla f_s\left(\x_{\zeta}\right)\right\|^2= \left\|\nabla f_s\left(\x_{\zeta}\right)-\bd_{\zeta}+\bd_{\zeta}\right\|^2 \leq 2  \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2+2  \left\|\bd_{\zeta}\right\|^2.
\end{align}
Next, we bound the two terms on the right hand side of the above inequality. First, note that
\begin{align}
 \left\|\bd_{\zeta}\right\|^2=\frac{1}{T} \sum_{i=0}^{T-1}  \left\|\bd_i\right\|^2 \leq \frac{[  [f_s(\x_{0})] -  [f_s^*]  ]}{T [\frac{\eta}{4}- \frac{\eta q}{2}  \frac{L^2}{|\mathcal{A}|}]}.
\end{align}

On the other hand, we have

\begin{align}
 \left\|\nabla f_s\left(x_{\zeta}\right)-\bd_{\zeta}\right\|^2 \leq 
\end{align}

\fi

Based on the parameter setting $q=|\mathcal{A}|=\lceil\sqrt{n}\rceil$, we have 
\begin{align}
  &
 [\frac{\eta}{4}- \frac{\eta^3 L^2 }{2}  ] \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 \leq   [  [f_s(\x_{0})] -f_s^* ].
\end{align}
Thus, we have
\begin{align}\label{eqs20}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 \leq   \frac{[  [f_s(\x_{0})] -f_s^* ]}{ [\frac{\eta}{4}- \frac{\eta^3 L^2 }{2}  ] T}.
\end{align}



Note that $\frac{1}{T} \sum_{t=0}^{T-1}\mathbb{E} \|\bd_t \|^2$ only involves the common descent directions.
According to Definition~\ref{def:stationary} shown in the paper, the quantity of interest is 
$\|\sum_{s \in [S]}\lambda_t^s \nabla f_s(\x_t)\|^2$, which we bound as follows:
\begin{align}\label{eqs21}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 \notag\\
\stackrel{(a)}{\le}   & \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)-\sum_{s\in [S]}\lambda_t^s\bu_t^s\|^2+  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\sum_{s\in [S]}\lambda_t^s \bu_t^s\|^2 \notag\\
\stackrel{(b)}{=} & \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\sum_{s\in [S]}\lambda_t^s(\nabla f_s(\x_t)-\bu_t^s)\|^2+  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2 \notag\\
\stackrel{(c)}{\le}  & \frac{1}{T} \sum_{t=0}^{T-1} 2S \sum_{s\in [S]} (\lambda_t^s)^2\mathbb{E}\|(\nabla f_s(\x_t)-\bu_t^s)\|^2+  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2 \notag\\
\stackrel{(d)}{\le}  & \frac{1}{T} \sum_{t=0}^{T-1} 2S \sum_{s\in [S]} (\lambda_t^s)^2[ \mathbb{E}_t\| \nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2 +  L^2 \sum_{i=\left(n_t-1\right) q}^t\frac{1}{|\mathcal{A}|} \mathbb{E}\|\x_{i+1}-\x_{i}\|^2]+  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2 \notag\\
=& \frac{1}{T} \sum_{t=0}^{T-1} 2S \sum_{s\in [S]} (\lambda_t^s)^2[ \mathbb{E}_t\| \nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2]\notag\\& +   2S L^2 \frac{1}{T} \sum_{t=0}^{T-1} \sum_{i=\left(n_t-1\right) q}^t\frac{1}{|\mathcal{A}|} \mathbb{E}\|\x_{i+1}-\x_{i}\|^2 +  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2  \notag\\
\stackrel{(e)}{\le}& \frac{1}{T} \sum_{t=0}^{T-1} 2S \sum_{s\in [S]} (\lambda_t^s)^2[ \mathbb{E}_t\| \nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2]\notag\\& +   2S L^2 \frac{1}{T} \sum_{t=0}^{T-1} \sum_{i=\left(n_t-1\right) q}^{n_t q -1}\frac{1}{|\mathcal{A}|} \mathbb{E}\|\x_{i+1}-\x_{i}\|^2 +  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2 \notag\\
= &   2S L^2 \frac{1}{T} \sum_{t=0}^{T-1} \frac{q}{|\mathcal{A}|} \mathbb{E}\|\x_{t+1}-\x_{t}\|^2 +  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2  \notag\\
\stackrel{(f)}{=} &   2S L^2 \eta ^2\frac{1}{T} \sum_{t=0}^{T-1} \mathbb{E}\|\bd_t\|^2 +  \frac{1}{T} \sum_{t=0}^{T-1}  2\mathbb{E}\|\bd_t\|^2 \notag\\
= &  ( 2S L^2 \eta ^2 +2)  \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\bd_t\|^2 
\end{align}
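where $(a)$ follows from $\|\mathbf{a}+\mathbf{b}\|^2 \leq 2\|\mathbf{a}\|^2 + 2\|\mathbf{b}\|^2$ and $(c)$ follows from the Cauchy--Schwarz inequality. $(b)$ follows from the definition $\bd_t=\sum_{s \in [S]} \lambda_{t}^{s} \mathbf{u}_{t}^s$ as shown in Line 14 in Algorithm~\ref{alg}. $(d)$ follows from Lemma~\ref{lem:bounded1}, and the step after $(d)$ uses $\sum_{s \in [S]} (\lambda_t^s)^2 \leq 1$. $(e)$ is because $t\leq n_t q -1$, and the step after $(e)$ uses $\mathbb{E}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2=0$ together with the fact that the inner sum ranges over the $q$ iterations of one period. $(f)$ follows from the update rule of $\x$ in Eq.~\eqref{STIMULUSP1} and the setting $q=|\mathcal{A}|=\lceil\sqrt{n}\rceil$.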

Then, we can conclude that 
\begin{align}
  &
 \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 
\stackrel{(a)}{\le}
 ( 2S L^2 \eta ^2  +2)
  \frac{[  [f_s(\x_{0})] -f_s^* ]}{ [\frac{\eta}{4}- \frac{\eta^3 L^2 }{2}  ] T},
\end{align}
where $(a)$ follows from Eqs.~\eqref{eqs21} and~\eqref{eqs20}.


Letting $\eta\leq \frac{1}{2L}$, so that $\frac{\eta}{4}- \frac{\eta^3 L^2 }{2} \geq \frac{\eta}{8}$, we have

\begin{align}
  &
\frac{1}{T}\sum_{t=0}^{T-1}\min_{\boldsymbol{\lambda} \in C} \mathbb{E} \| \boldsymbol{\lambda}^{\top} \nabla \F(\x_t) \|^2 \leq \frac{1}{T} \sum_{t=0}^{T-1}  \mathbb{E}\|\sum_{s\in [S]}\lambda_t^s\nabla f_s(\x_t)\|^2 \notag\\ \leq&   \frac{ ( 2S L^2 \eta ^2  +2)[  [f_s(\x_{0})] -f_s^*  ]}{ [\frac{\eta}{8} ] T}=\frac{ ( 2S L^2 \eta ^2 +2) \frac{8}{\eta}  [  [f_s(\x_{0})] -f_s^*  ]}{  T} =\mathcal{O}(\frac{1}{T}).
\end{align}




Lastly, to show the sample complexity, the number of samples with $\mathrm{mod}(t,q)=0$ can be calculated as $\lceil \frac{T}{q} \rceil \cdot n$.
	Also, the number of samples with $\mathrm{mod}(t,q)\neq0$ can be calculated as $T\cdot|\mathcal{A}|$.
	Hence, the total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}\leq T\sqrt{n}+n+T\sqrt{n}=\mathcal{O}(n+ \sqrt{n} \epsilon^{-1})$, where we use $T=\mathcal{O}(\epsilon^{-1})$ from the convergence rate above.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \epsilon^{-1})$.
	This completes the proof.

 
\end{proof}




\subsection{Proof of Theorem.~\ref{thm:STIMULUS_SC}}
\begin{proof}

\begin{align}\label{eq24}
    &f_s(\x_{t+1}) \notag\\\leq& f_s(\x_t) + \left< \nabla f_s(\x_t), -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
  \stackrel{(a)}{\le} & f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2  + \left< \nabla f_s(\x_t), -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
    = &f_s(\x_*) + \left< \nabla f_s(\x_t), \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
    \stackrel{(b)}{\le} & f_s(\x_*) + \left< \nabla f_s(\x_t)-\bu_t^s, \x_t - \x_* -\eta \bd_t \right>+ \left< \bu_t^s, \x_t - \x_* -\eta \bd_t \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
    \stackrel{(c)}{\le}  & f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\delta}{2}\| \x_t - \x_* -\eta \bd_t\|^2 + \left< \bu_t^s, \x_t - \x_* -\eta \bd_t \right> \notag\\&- \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2\notag\\
   \stackrel{(d)}{\le}  & f_s(\x_*) + \frac{1}{2\delta}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2 \notag\\&+ \left< \bu_t^s, \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2,
\end{align}
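where the first inequality is due to $L$-smoothness, $(a)$ follows from $\mu$-strong convexity, $(b)$ adds and subtracts $\bu_t^s$, $(c)$ follows from Young's inequality with an arbitrary constant $\delta>0$, and $(d)$ follows from $\|\mathbf{a}-\mathbf{b}\|^2 \leq 2\|\mathbf{a}\|^2+2\|\mathbf{b}\|^2$.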

According to Definition~\ref{def:stationary} shown in the paper, the quantity of interest is 
$ \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{t+1}) - f_s(\x_*) \right]  $, then we have


\begin{align} \label{eqs23}
    & \sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_{t+1}) - f_s(\x_*) \right]  \notag\\
    \stackrel{(a)}{\le}& \frac{1}{2\delta}  \sum_{s \in [S]} \lambda_t^{s} \| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \left< \sum_{s \in [S]} \lambda_t^{s} \bu_t^s, -\eta \bd_t \right> + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    %
    =&\frac{1}{2\delta}  \sum_{s \in [S]} \lambda_t^{s} \| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\& +\left< \sum_{s \in [S]} \lambda_t^{s}\bu_t^s, \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    %
     \stackrel{(b)}{\le}&\frac{1}{2\delta}  \sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\& +\left< \bd_t , \x_t - \x_* -\eta \bd_t \right> - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \| \eta \bd_t \|^2 \notag\\
    %
   = & \left< \bd_t , \x_t - \x_* \right> - \eta \| \bd_t \|^2 - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \eta^2 \| \bd_t \|^2 \notag\\&+ \frac{1}{2\delta}  \sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \delta\| \x_t - \x_* \|^2+\delta\|\eta \bd_t\|^2\notag\\
    \stackrel{(c)}{\le}&\frac{1}{2 \eta} \left( \| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - \frac{1}{2} \eta \| \bd_t \|^2 - \frac{\mu}{2} \| \x_t - \x_* \|^2 + \frac{1}{2}L \eta^2 \|\bd_t \|^2 \notag\\
    &+ \frac{4}{\mu}  \sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2+ \frac{\mu}{8}\| \x_t - \x_* \|^2+\frac{\mu}{8}\|\eta \bd_t\|^2\notag\\
     \stackrel{(d)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \bd_t \|^2 \notag\\&
    + \frac{4}{\mu} \sum_{s \in [S]} \lambda_t^{s}\| \nabla f_s(\x_t)-\bu_t^s\|^2\notag\\
    \stackrel{(e)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \bd_t \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 + \sum_{s \in [S]} \lambda_t^{s}\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2)\notag\\
    =&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4})\| \x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 \right) - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 ) \| \bd_t \|^2 \notag\\&
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{i=\left(n_t-1\right) q}^t \|\x_{i+1}-\x_{i}\|^2 ).
\end{align}
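where $(a)$ follows from Eq.~\eqref{eq24} and $\sum_{s \in [S]} \lambda_t^{s}=1$. $(b)$ follows from the definition $\bd_t=\sum_{s \in [S]} \lambda_{t}^{s} \mathbf{u}_{t}^s$ as shown in Line 14 in Algorithm~\ref{alg}. $(c)$ follows from 
$\|\x_t - \x_* \|^2 - \| \x_{t+1} - \x_* \|^2 = - \eta^2 \| \bd_t \|^2 + 2 \left< \eta \bd_t , \x_t - \x_* \right>$ 
together with the choice
$\delta = \frac{\mu}{8}$. $(d)$ collects the terms in $\| \x_t - \x_* \|^2$ and $\| \bd_t \|^2$. $(e)$ follows from Lemma~\ref{lem:bounded1}, and the last equality uses $\|\nabla f_s(\x_{\left(n_t-1\right) q}) - \bu_{\left(n_t-1\right) q}^s \|^2=0$ as shown in Line 5 in Algorithm~\ref{alg}.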

Next, telescoping the above inequality over $t$ from $\left(n_t-1\right) q$ to $t$ where $t \leq n_t q-1$ and noting that for $\left(n_t-1\right) q \leq j \leq n_t q-1, n_j=n_t$, we obtain

\begin{align} \label{eqs24}
    & \sum_{i=\left(n_t-1\right) q}^t \sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  \notag \\
    \stackrel{(a)}{\le} &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \bd_i \|^2 
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_j-1\right) q}^j\|\x_{i+1}-\x_{i}\|^2 ) \notag \\
   \stackrel{(b)}{\le}&\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2 )  \sum_{i=\left(n_t-1\right) q}^t\| \bd_i \|^2 
    + \frac{4}{\mu}( \frac{L^2}{|\mathcal{A}|} \sum_{j=\left(n_t-1\right) q}^t  \sum_{i=\left(n_t-1\right) q}^t\|\x_{i+1}-\x_{i}\|^2 )\notag \\
= &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=\left(n_t-1\right) q}^t\| \x_i - \x_* \|^2 -  \sum_{i=\left(n_t-1\right) q}^t\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{i=\left(n_t-1\right) q }^t\|\bd_{i}\|^2,
\end{align}
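where $(a)$ is from Eq.~\eqref{eqs23}. $(b)$ relaxes $j$ to $t$, since $j\leq t$. The last step uses $\x_{i+1}-\x_{i}=-\eta \bd_i$ and the fact that the outer sum over $j$ has at most $q$ terms.
We continue the proof by further deriving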

\begin{align}
    & \sum_{i=0}^{T} \sum_{s \in [S]} \lambda_i^{s}\left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag\\ = & \sum_{i=0}^{q-1}\sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] +  \sum_{i=q}^{2q-1}\sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*)\right]+\notag\\&\cdots+ \sum_{i=(n_T-1)q}^{T}\sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right] \notag \\
  \leq &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{i=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|} ) \sum_{i=0}^{T}\|\bd_{i}\|^2, 
\end{align}
where the last inequality is from Eq. \eqref{eqs16} and Eq. \eqref{eqs24}.
Next, we have
\begin{align}
    & \sum_{i=0}^{T}   \sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_i) - f_s(\x_*) \right] \notag\\ = & \sum_{i=0}^{T}   \sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) -  f_s(\x_{i+1}) + f_s(\x_i)  \right]  \notag\\
    % NOTE(review): the next step replaces -[f_s(\x_{i+1}) - f_s(\x_i)] by -|f_s(\x_{i+1}) - f_s(\x_i)|; please double-check the sign of this term.
    \leq &\sum_{i=0}^{T}  \sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_{i+1}) - f_s(\x_*) \right]  -\sum_{i=0}^{T} \sum_{s \in [S]} \lambda_i^{s}  | f_s(\x_{i+1}) - f_s(\x_i) |\notag\\
    \leq &\frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{i=0}^{T}\| \x_{i+1} - \x_* \|^2 \right) \notag\\& - (\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] )\sum_{i=0}^{T}  \|\bd_i\|^2
\end{align}

Letting $|\mathcal{A}|=q= \lceil\sqrt{n}\rceil $ and $\eta \leq \min\{\frac{1}{2\mu},\frac{1}{8L},\frac{\mu}{64L^2} \}$, we have $(\frac{1}{2} \eta-\frac{\mu}{8}\eta^2 - \frac{1}{2}L \eta^2-
    \frac{4}{\mu} \frac{L^2 q  \eta^2}{|\mathcal{A}|}  -[\frac{\eta}{4}- \frac{\eta^3 q}{2}  \frac{L^2}{|\mathcal{A}|}] )> \frac{\eta}{16}>0$.
    
    Thus, we have
\begin{align}
    & \sum_{i=0}^{T}   \sum_{s \in [S]} \lambda_i^{s} \left[ f_s(\x_i) - f_s(\x_*) \right]\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \sum_{i=0 }^{T} \| \x_i - \x_* \|^2 -  \sum_{i=0}^{T}\| \x_{i+1} - \x_* \|^2\right).
\end{align}

Then, we have 

\begin{align}
    & \mathbb{E}_t  [\sum_{s \in [S]} \lambda_t^{s} \left[ f_s(\x_t) - f_s(\x_*) \right] ]\leq \frac{1}{2 \eta} \left( (1-\frac{3\mu\eta}{4}) \mathbb{E}_t \| \x_t - \x_* \|^2 -  \mathbb{E}_t\| \x_{t+1} - \x_* \|^2\right).
\end{align}
Based on Assumption~\ref{assump: add}, averaging with the weights $w_t = ( 1 - \frac{3\mu \eta }{4})^{1-t}$, using these weights to pick the output $\x$, and applying Lemma 1 in \cite{Karimireddy2020SCAFFOLD} with $\eta \geq \frac{1}{\mu R}$, we have

\begin{align}
    \mathbb{E}\Bigl[\sum_{s \in [S]} \lambda_t^{s}\left[ f_s(\x_t) - f_s(\x_*) \right] \Bigr]  &\leq \| \x_0 - \x_* \|^2 \mu \exp( - \frac{3 \eta \mu T}{4}) \\
    &= \mathcal{O}(\mu \exp( - \mu T)).
\end{align}

Then we have the convergence rate $   \mathbb{E}\|\x_t-\x^*\|^2 = \mathcal{O}(\mu \exp( - \mu T))$.

Lastly, the total sample complexity can be calculated as:
	$\lceil \frac{T}{q} \rceil n + T\cdot |\mathcal{A}| \leq  \frac{T+q}{q}n + T\sqrt{n}\leq T\sqrt{n}+n+T\sqrt{n}=\mathcal{O}(n+ \sqrt{n} \ln (\mu/\epsilon))$.
	Thus, the overall sample complexity is $\mathcal{O}(n+ \sqrt{n} \ln (\mu/\epsilon))$.
	This completes the proof.
 
\end{proof}