\section{Simulations}


\paragraph{Experimental setups} For each client $c\in[N]$, where $N=50$, we first sample a two-dimensional point $\theta_c$ from a Gaussian distribution $N(0,\alpha I_2)$. Then, we sample $n_c$ points from  $N(\theta_c,\Sigma)$, where 
$
\Sigma=\left[\begin{matrix}
5 & -2\\
-2 & 1
\end{matrix}\right]
$. Denote each data point by $x_{c,i}$ for $i=1,\dots,n_c$. The negative log-likelihood of a data point is thus $l(\theta;x_{c,i})=\frac{1}{2}(\theta-x_{c,i})^{\top}\Sigma^{-1}(\theta-x_{c,i})+\log(2\pi |\Sigma|^{\frac{1}{2}})$, and we set $\ell^c(\theta)=\sum_{i=1}^{n_c}l(\theta;x_{c,i})$, $f(\theta)=\sum_{c=1}^N\ell^c(\theta)$, and $f^c(\theta)=\frac{1}{p_c}\ell^c(\theta)$. Fixing the temperature $\tau=1$, the target distribution $\pi$ is $N(u,\frac{1}{n}\Sigma)$ with $u=\frac{1}{n}\sum_{c=1}^N\sum_{i=1}^{n_c}x_{c,i}$. Given proper initializations, the simulated parameters are normally distributed. The $W_2$ distance between two Gaussian distributions $\mu_1$ and $\mu_2$ is
\begin{equation*}
W_2\left(\mu_1, \mu_2\right)=
\sqrt{\left\|u_{1}-u_{2}\right\|_{2}^{2}+\operatorname{Tr}\left(\Sigma_{1}+\Sigma_{2}-2\left(\Sigma_{1}^{1 / 2} \Sigma_{2} \Sigma_{1}^{1 / 2}\right)^{1 / 2}\right)},
\end{equation*}
where $u_1$ and $u_2$ denote the means, and $\Sigma_{1}$ and $\Sigma_{2}$ the covariance matrices, of $\mu_1$ and $\mu_2$, respectively.

We run Algorithms~\ref{alg:alg_main_paper_text_independent_noise}, \ref{alg:alg_main_paper_text_different_seeds}, and \ref{alg:alg_main_text_partial_main} and repeat each experiment $R=300$ times. %independently with each run consisting of $T$ steps. 
At the $k$-th communication round, %each step $k=EK$ \Wei{check} with $E\in \mathbb{N}$ representing the communication round, 
we obtain a set of $R$ simulated parameters $\{\theta_{k,j}\}_{j=1}^R$, where $\theta_{k,j}$ denotes the parameter at the $k$-th round in the $j$-th independent run. The underlying distribution $\mu_k$ at round $k$ is approximated by a Gaussian variable with the empirical mean ${u}_{k}=\frac{1}{R}\sum_{j=1}^R\theta_{k,j}$ and covariance matrix ${\Sigma}_{k}=\frac{1}{R-1}\sum_{j=1}^R (\theta_{k,j}-{u}_k)(\theta_{k,j}-{u}_k)^{\top}$. %Then we can estimate the $W_2$ distance between $\mu_k$ and the target distribution $\pi$ using $\hat{W}_2^{(k)}=W_2\left(u, \hat{u}_{k}\right)$.

% \begin{equation*}
% \hat{W}_2^{(k)}=
% \sqrt{\left\|u-\hat{u}_{k}\right\|_{2}^{2}+\operatorname{Tr}\left(\Sigma+\hat{\Sigma}_{k}-2\left(\Sigma^{1 / 2} \hat{\Sigma}_{k} \Sigma^{1 / 2}\right)^{1 / 2}\right)}.
% \end{equation*}

\paragraph{Optimal local steps} We study the optimal number of local steps $K$ for Algorithm~\ref{alg:alg_main_paper_text_independent_noise} under the different values of $\alpha$ defined at the beginning of this section, which correspond to different levels of data heterogeneity modelled by $\gamma$. Here, we choose $\alpha=0, 1, 10, 100, 1000$, and the corresponding values of $\gamma$ are around $1\times 10^{8}$, $4\times 10^{11}$, $4\times 10^{12}$, $4\times 10^{13}$, and $4\times 10^{14}$, respectively. 
%(or qualities of non i.i.d. data) controlled by the values of $\alpha$. Specifically, with the prescribed model, we can calculate the degree of the non-i.i.d. data $\gamma=\max_{c\in[N]}\|\nabla f^c(\theta_*)\|_2$ for those datasets simulated under different $\alpha$. Here, we choose $\alpha=0, 1, 10, 100, 1000$ and the calculated $\gamma$ is around $1\times 10^{8},4\times 10^{11}, 4\times 10^{12}, 4\times 10^{13}, 4\times 10^{14}$ respectively. 
We fix the learning rate $\eta=10^{-7}$. We evaluate the (log) number of communication rounds to achieve accuracy $\epsilon$ and denote it by $T_{\epsilon}$, where $T_{\epsilon}:=\min\{k\in\mathbb{N}: W_{2}(\mu_k, \pi)\le \epsilon\}$ and $\epsilon=10^{-3}$. % $\log(T_{\epsilon}/K)$ $\textup{Required rounds}$ versus the number of local steps ($K$), 
As shown in Figure~\ref{fig:optimalK}, on the one hand, setting $K$ too small easily leads to an excessive amount of communication; on the other hand, choosing a large $K$ results in large biases, which in turn require more communication rounds. We observe that there is an optimal choice of $K$ that achieves the minimal number of required communication rounds under different $\gamma$, and \emph{the communication savings can be as large as 30 times compared to the trivial scenario of $K=1$}. As $\gamma$ increases, the value of the optimal $K$ also increases naturally.%, ranging from 3000 to 3800 approximately.

\paragraph{Data heterogeneity and correlated noise} We study the impact of $\gamma$ on the convergence of Algorithm~\ref{alg:alg_main_paper_text_independent_noise} for $\gamma$ chosen from $\{1\times 10^{8}, 4\times 10^{11}, 4\times 10^{12}, 4\times 10^{13}, 4\times 10^{14}\}$. We set $K=10$. As shown in Figure~\ref{fig:alpha}, the $W_2$ distances under different $\gamma$ all converge to levels around $10^{-3}$ after sufficient computations. Nevertheless, a larger $\gamma$ does slow down the convergence, which suggests adopting more balanced data to facilitate the computations. In Figure~\ref{fig:rho}, we study the impact of $\rho$ on the convergence of Algorithm~\ref{alg:alg_main_paper_text_different_seeds}. We choose $K=100$ and $\gamma=10^8$ and observe that a larger correlation slightly accelerates the computation, although it raises privacy concerns.

\paragraph{Approximate samples} In Figure \ref{fig:empirical_density}, we plot the empirical density according to the %$R=6600$ 
samples generated %$k=1230$ 
by Algorithm~\ref{alg:alg_main_paper_text_independent_noise} with $K=10$, $\gamma=10^{8}$, and $\eta=10^{-7}$. %The estimated $W_2$ distance is around $9.6\times 10^{-3}$. 
For comparison, we show the true density plot of the target distribution in Figure~\ref{fig:true_density}. The empirical density approximates the true density very well, which indicates the potential of FA-LD for simulation in federated settings.

%We study the influence of $S$ and scheme of device-sampling for Algorithm \ref{alg:alg_main_text_partial_main} empirically. In all experiments, we fix $\tau=1$ and $N=50$. For each $c\in[N]$, we set $n_c=1000$. In each iteration $k$, we sample $m_c$ data points uniformly at random among all subsets with size $100$ of the set of those 1000 data points to estimate $\tilde{f}^c(\theta_k^c)$.

%\subsubsection{Optimal choice of $K$}
%We set $R=500$, $T=5000$, $\eta=10^{-7}$, and $N=50$. We run Algorithm \ref{alg:alg_main_paper_text_independent_noise} on the simulated datasets generated with $\alpha=0$, $10$, and $100$. In each setting, we report the curves of the estimated $W_2$ distance $\hat{W}_2\left(u_k,\pi\right)$ against the iteration $k$ for different choices of $K$. 

%When $\alpha=0$, we have that $\theta_c=(0,0)$ for any $c\in [N]$. The curves of the estimated $W_2$ distance are shown in Figure \ref{fig:Kalpha0_1} which indicates that the for $K$ ranging from 1 to 1000, the distribution of the outputs of Algorithm \ref{alg:alg_main_paper_text_independent_noise} converges to the true distribution. To further observe the rates of convergence under different values of $K$, we plot Figure \ref{fig:Kalpha0_2}. According to Figure \ref{fig:Kalpha0_2}, the optimal rate of convergence is obtained when $K\le 200$ or $K=500,900,1000$.
% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/W2alpha0_10.pdf}
%     \caption{The curves of the estimated $W_2$ distance against iterations for different choices of $K$ when $\alpha=0$.}
%     \label{fig:Kalpha0_1}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/W2alpha0_14.pdf}
%     \caption{The curves of the estimated $W_2$ distance against iterations for different choices of $K$ when $\alpha=0$.}
%     \label{fig:Kalpha0_2}
% \end{figure}

%When $\alpha=10$, the curves of the estimated $W_2$ distance are shown in Figure \ref{fig:Kalpha10_1} and \ref{fig:Kalpha10_2}. According to them, when $K$ ranging from 1 to 1000, the distributions of the outputs of Algorithm \ref{alg:alg_main_paper_text_independent_noise} all converge to the true distribution. The optimal rate of convergence is obtained when $K=400$ or 500. 
% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/W2alpha10_10.pdf}
%     \caption{The curves of the estimated $W_2$ distance against iterations for different choices of $K$ when $\alpha=10$.}
%     \label{fig:Kalpha10_1}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/W2alpha10_14.pdf}
%     \caption{The curves of the estimated $W_2$ distance against iterations for different choices of $K$ when $\alpha=10$.}
%     \label{fig:Kalpha10_2}
% \end{figure}

%When $\alpha=100$, the curves of the estimated $W_2$ distance are shown in Figure \ref{fig:Kalpha100_1} and \ref{fig:Kalpha100_2}. According to them, when $K$ ranging from 1 to 1000, the distributions of the outputs of Algorithm \ref{alg:alg_main_paper_text_independent_noise} all converge to the true distribution. The optimal rate of convergence is obtained when $K=400$ or 500. 
% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/W2alpha10_10.pdf}
%     \caption{The curves of the estimated $W_2$ distance against iterations for different choices of $K$ when $\alpha=100$.}
%     \label{fig:Kalpha100_1}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/W2alpha10_14.pdf}
%     \caption{The curves of the estimated $W_2$ distance against iterations for different choices of $K$ when $\alpha=100$.}
%     \label{fig:Kalpha100_2}
% \end{figure}


% \Wei{the above results don't make sense.}

%Figure \ref{fig:K-round} shows the curves of the logarithm of required rounds versus the number local steps $K$ for $\alpha=0,10,100,1000$.

% \begin{figure}
%     \centering
%     \includegraphics[width=\textwidth]{figures/simulation/optimal_K/K-round.pdf}
%     \caption{The curves of the logarithm of required rounds versus the number local steps $K$ for different $\alpha$.}
%     \label{fig:K-round}
% \end{figure}

% \Wei{Figure 1: show optimal local step based on different $\gamma$ (heterogeneity), our most important result, convex-like structure is the most important goal.}

% \Wei{Figure 2: pick one setup and compare the empirical samples with the ground truth}


% \Wei{Figure 3: $\rho$ effects: compare convergence rates based on $\rho=0, 0.5, 1$. three curves are good enough}

% \Wei{Figure 4: show convergence of partial device based on two schemes and compare them with the full device (there are three curves, full device \& two schemes in total)}


\input{draw}