% \section{Our techniques}


% We summarize our challenges and techniques here, and discuss details in later sections



% \begin{itemize}
%     \item synchronous coupling -> $W_2$
%     \item To obtain a sharper bound, instead of the standard analysis that tracks the stochastic differential equation directly, we follow~\cite{dk19} and decompose the error as
%     \begin{align*}
%          \bar\theta_{(k+1)\eta}-\theta_{k+1} =\bar\theta_{k\eta}-\theta_{k}-\eta \cdot X_k- Y_k +\eta\zeta_k.
%     \end{align*}  
%     where
%     \begin{align*}
%         X_k := & ~  \nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla f(\theta_k) \\
%         Y_k := & ~ \int_{k\eta}^{(k+1)\eta}\big(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\big)\d s
%     \end{align*}
%     \Wei{Will this analysis lead to a sharper rate?}
%     \item In particular, for federated learning, we prove an $\ell_2$ upper bound for the local clients, so that Assumption 4 of~\cite{lhy+19} is not required.
% \end{itemize}




% \subsection{Technique 1...}


% \subsection{Technique 2...}

% \paragraph{Follow \cite{lhy+19} to add discussion of the optimal local updates of $K$}
% \textbf{Optimal choice of $K$.} To ensure that the algorithm achieves precision $\epsilon$ within a total of $k_{\epsilon}$ steps, we can set
% \begin{align*}
%     &e^{-\frac{\eta m}{4} k_{\epsilon}} \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\leq \frac{\epsilon}{2}\notag\\
%     &8\sqrt{{\eta} d \kappa} \cdot \bigg(\sqrt{\eta ((K-1)^2+\kappa) H_{\tau}^2}+\sqrt{K\tau}+{\sqrt{\sigma^2/L}} \bigg)\leq \frac{\epsilon}{2},
% \end{align*}

% This readily leads to
% \begin{align*}
%     \eta\leq \min\bigg\{\frac{1}{m+L}, O\bigg(\frac{\epsilon^2}{d\kappa}(\eta K^2 H_{\tau}^2+\eta\kappa H_{\tau}^2 +K\tau+{\sigma^2}/{L})^{-1}\bigg)\bigg\},\quad k_{\epsilon}\geq \Omega\bigg(\frac{\log\big(\frac{\sqrt{d}(\mathcal{D}+\sqrt{\tau/m})}{\epsilon}\big)}{m\eta}\bigg).
% \end{align*}

% This suggests that it suffices to set $k_{\epsilon}=\Omega\big({\eta}K^2 H_{\tau}^2+{\eta}\kappa H_{\tau}^2 +K\tau+\frac{\sigma^2}{L}\big)$ to reach the precision level $\epsilon$. We observe that the number of communication rounds is around the order
% \begin{align*}
%     \frac{k_{\epsilon}}{K}=\Omega\bigg(\eta K H_{\tau}^2 +\tau+\frac{ \eta\kappa H_{\tau}^2 + \sigma^2/L}{K}\bigg),
% \end{align*}
% where the value of $\frac{k_{\epsilon}}{K}$ first decreases and then increases with respect to $K$. This indicates that setting $K$ either too large or too small may lead to a high communication cost and hurt performance. Ideally, $K$ should not be larger than $\Omega(\sqrt{k_{\epsilon}})$; a similar result was obtained in~\cite{Stich19, lhy+19}.

