 \begin{figure*}[htbp]
% Five side-by-side panels, each set in a 0.19\linewidth minipage: studies of
% $K$, $\gamma$, and $\rho$, followed by the true and the empirical density.
% A `%' follows every opening and closing subfigure brace so that no
% end-of-line space is typeset inside or between the panels.
%   \vspace{-0.2in}
    \centering
    \subfigure[Study of $K$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{fig:optimalK}
    % Scale to the enclosing minipage so the plot cannot overflow its panel.
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/optimalK2.pdf}
    \end{minipage}%
    }%
    \subfigure[Study of $\gamma$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{fig:alpha}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/alpha_trace2.pdf}
    \end{minipage}%
    }%
    \subfigure[Study of $\rho$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{fig:rho}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/rho2.pdf}
    \end{minipage}%
    }%
    \subfigure[True density]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{fig:true_density}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/alpha0_true_density4.pdf}
    \end{minipage}%
    }%
    \subfigure[Empirical density]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{fig:empirical_density}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/alpha0_empirical_density4.pdf}
    \end{minipage}%
    }%
%   \vskip -0.05in
  % The tie `~' keeps ``Figure'' and its number on the same line.
  \caption{Convergence of FA-LD based on full devices. In Figure~\ref{fig:optimalK}, several points for different $\gamma$ coincide with each other; e.g., the points of $\gamma=1\times 10^{8}$ and $\gamma=4\times 10^{11}$ coincide at $K=3000$.}
  \label{figure:full_device}
%   \vspace{-0.1in}
\end{figure*}



\begin{figure*}[htbp]
% Five side-by-side panels, each set in a 0.19\linewidth minipage: the
% full-device baseline ($S=50$), then Schemes I and II at $S=40$ and at $S=30$.
% A `%' follows every opening and closing subfigure brace so that no
% end-of-line space is typeset inside or between the panels.
%   \vspace{-0.2in}
    \centering
    \subfigure[Full devices: $S=50$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{full_device_baseline}
    % Scale to the enclosing minipage so the plot cannot overflow its panel.
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/FA-LD_50_50.pdf}
    \end{minipage}%
    }%
    \subfigure[Scheme I: $S=40$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{partial_device_s1_40}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/FA-LD_40_50_S1.pdf}
    \end{minipage}%
    }%
    \subfigure[Scheme II: $S=40$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{partial_device_s2_40}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/FA-LD_40_50_S2.pdf}
    \end{minipage}%
    }%
    \subfigure[Scheme I: $S=30$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{partial_device_s1_30}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/FA-LD_30_50_S1.pdf}
    \end{minipage}%
    }%
    \subfigure[Scheme II: $S=30$]{%
    \begin{minipage}[t]{0.19\linewidth}
    \centering
    \label{partial_device_s2_30}
    \includegraphics[width=\linewidth]{figures/simulation/partial_devices/FA-LD_30_50_S2.pdf}
    \end{minipage}%
    }%
%   \vskip -0.05in
  \caption{Convergence of FA-LD based on different device-sampling schemes. The full device updates adopt $S=50$ devices; the partial device settings choose $S=40$ and $S=30$ devices, respectively.}
  \label{figure:partial_device}
%   \vspace{-0.1in}
\end{figure*}

\paragraph{Partial device participation} We study the convergence of two popular device-sampling schemes, I and II. We fix the number of local steps at $K=100$ and the total number of devices at $N=50$. We sample $S$ devices under different fixed learning rates $\eta$. The full device updates are also presented for a fair evaluation. As shown in Figure~\ref{full_device_baseline}, larger learning rates converge faster but lead to larger biases; small learning rates, by contrast, yield consistently diminishing biases, which is in accordance with Theorem~\ref{main_paper_theorem}. However, in partial device scenarios, the bias becomes much less dependent on the learning rate in the long run. We observe in Figure~\ref{figure:partial_device}(b--e) that the bias caused by partial devices becomes dominant as we decrease the number of partial devices $S$ for both schemes. Unfortunately, such a phenomenon still exists even when the algorithms converge, which suggests that the proposed partial device updates may be appropriate only for the early period of training or for simulation tasks with low accuracy demands.