%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} 
% after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%%own package

\usepackage{smile}
%%my own definition
\newcommand{\shortsection}[1]{\vspace{1ex}\noindent{\bf #1.}}

\title{Efficient Privacy-Preserving Stochastic Nonconvex Optimization (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%  Lingxiao Wang$^*$, Bargav Jayaraman$^\dag$, David Evans$^\dag$, Quanquan Gu$^\S$ \\[1.5ex]
% $*$: \emph{Toyota Technological Institute at Chicago} \\
%  {\small \sf lingxw@ttic.edu} \\[1.2ex]
%  $\dag$: \emph{Department of Computer Science, University of Virginia} \\
%  {\small \sf [bj4nq, evans]@virginia.edu} \\[1.2ex]
%   $\S$: \emph{Department of Computer Science, University of California, Los Angeles} \\
%  {\small \sf qgu@cs.ucla.edu} 
\author[1]{{Lingxiao Wang}{}}
\author[2]{Bargav Jayaraman}
\author[2]{David Evans}
\author[3]{Quanquan Gu}
% Add affiliations after the authors
\affil[1]{%
    Toyota Technological Institute at Chicago
}
\affil[2]{%
Department of Computer Science, University of Virginia
}
\affil[3]{%
Department of Computer Science, University of California, Los Angeles
  }
  
  \begin{document}
  \onecolumn
\maketitle
\section{Additional Experiments}\label{sec:add_experiments}
In this section, we present additional experiment results on nonconvex logistic regression and convolutional neural
networks.

\subsection{Results on \textit{ijcnn1} dataset}
In this subsection, we present the additional experiment of our method on \textit{ijcnn1} dataset. In this dataset, we follow the same settings as before: we set the clipping thresholds  $C_1=1,C_2=0.01$, and set the momentum parameter $\gamma=C_2$. Figure~\ref{figure:ijcnn1} illustrates the objective function value and the gradient norm of different algorithms under various privacy budgets $\epsilon\in\{0.2,0.5\}$. We can see that our proposed algorithm (DP-SRM) outperforms the other three baseline algorithms (RRPSGD, DP-GD, and DP-AGD) in terms of the objective loss, gradient norm, and convergence rate by a large margin. Table~\ref{table:ijcnn1} shows the test error of different algorithms as well as the CPU time (in seconds) of the training process on \textit{ijcnn1} dataset. It demonstrates that our algorithm converges faster and can achieve a better test error on the test set than other baselines.
\begin{table*}[!th]
 \small
	\caption{Comparison of different algorithms on \textit{ijcnn1} dataset under different privacy budgets $\epsilon\in\{0.2,0.5\}$ and $\delta=10^{-5}$. Note that the non-private baseline denotes the test error of the non-private STORM algorithm [Cutkosky and Orabona,
2019]. }
	\label{table:ijcnn1}
	\centering
	\begin{tabular}{l|c|c|c|c|c|c}
		\toprule 
		Privacy&  Non-private& \multirow{2}{*}{Method}&\multirow{2}{*}{Test Error} &Data& \multirow{2}{*}{CPU time}  & \multirow{2}{*}{Gradient Norm}\\Budget& Baseline && & Passes &&\\
		%\cline{2-11}
		\midrule
		\multirow{4}{*}{$\epsilon=0.2$}        & \multirow{4}{*}{0.2096} & DP-GD &    0.3160 (0.0120) &20  &  0.5180   &    0.0184 (0.0024)  \\&&DP-AGD&0.2645 (0.0044) &346&90.05&0.0133 (0.0018)\\&&RRPSGD&0.3110 (0.0106) &8&47.64&0.0175 (0.0023)\\&(0.002)&\textbf{DP-SRM}&\textbf{0.2503 (0.0090)}&\textbf{4}&\textbf{0.4748 }&\textbf{0.0117 (0.0008)}\\
		\midrule
		\multirow{4}{*}{$\epsilon=0.5$ }     & \multirow{4}{*}{0.2096}  &DP-GD & 0.2717 (0.0081)  &20 &  0.4990  &     0.0171 (0.0024) \\&&DP-AGD&0.2416 (0.0029) &365&94.28 &0.0397 (0.0025)\\&&RRPSGD&0.3033 (0.0110)&10&59.06 &0.0160 (0.0018)\\&(0.002)&\textbf{DP-SRM}&\textbf{0.2341 (0.0042)}&\textbf{5}&\textbf{0.4368 }&\textbf{0.0082 (0.0005)}\\
% 		\midrule
		\bottomrule
	\end{tabular}
\end{table*}
\begin{figure*}[!th]%
	\centering
	%\vspace{.3in}
	\subfigure[$\epsilon=0.2$]{
		\label{fig3:subfig:1.a} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/ijcnn1_02_err_epoch_CI.pdf}}
	\subfigure[$\epsilon=0.5$]{
		\label{fig3:subfig:1.b} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/ijcnn1_05_err_epoch_CI.pdf}}
			\subfigure[$\epsilon=0.2$]{
		\label{fig3:subfig:1.c} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/ijcnn1_02_norm_epoch_CI.pdf}}
	\subfigure[$\epsilon=0.5$]{
		\label{fig3:subfig:1.d} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/ijcnn1_05_norm_epoch_CI.pdf}}

	%\vspace{.3in}
	\caption{Results for nonconvex logistic regression on \textit{ijcnn1} dataset. (a), (b) show the objective loss versus the number of epochs. (c), (d) illustrate the gradient norm versus the number of epochs. } \label{figure:ijcnn1}%% label for entire figure
\end{figure*}


 
\subsection{Additional Experiments on Convolutional Neural Networks}

In this subsection, we present additional experiment results on training convolutional neural networks. Figure~\ref{fig:mnist_add} shows the average test error (over 30 trials) and the corresponding 95\% confidence interval of different methods versus the number of iterations as well as the training time under different privacy budgets on MNIST and CIFAR-10 datasets. 
% \subsection{Results on \textit{ijcnn1} dataset}
% In this subsection, we present the additional experiment of our method on \textit{ijcnn1} dataset. In this dataset, we follow the same settings as before: we set the clipping thresholds  $C_1=1,C_2=0.01$, and set the momentum parameter $\gamma=C_2$. Figures~\ref{figure:ijcnn1} illustrates the objective function value and the gradient norm of different algorithms under various privacy budgets $\epsilon\in\{0.2,0.5\}$. We can see that our proposed algorithm (DP-SRM) outperforms the other three baseline algorithms (RRPSGD, DP-GD, and DP-AGD) in terms of the objective loss, gradient norm, and convergence rate by a large margin. Table~\ref{table:ijcnn1} shows the test error of different algorithms as well as the CPU time (in seconds) of the training process on \textit{ijcnn1} dataset. It demonstrates that our algorithm convergences faster and can achieve a better test error on the test set than other baselines.
% \begin{table*}[!th]
%  \small
% 	\caption{Comparison of different algorithms on \textit{ijcnn1} dataset under different privacy budgets $\epsilon\in\{0.2,0.5\}$ and $\delta=10^{-5}$. Note that the non-private baseline denotes the test error of the non-private STORM algorithm \citep{cutkosky2019momentum}. }
% 	\label{table:ijcnn1}
% 	\centering
% 	\begin{tabular}{l|c|c|c|c|c|c}
% 		\toprule 
% 		Privacy&  Non-private& \multirow{2}{*}{Method}&\multirow{2}{*}{Test Error} &Data& \multirow{2}{*}{CPU time}  & \multirow{2}{*}{Gradient Norm}\\Budget& Baseline && & Passes &&\\
% 		%\cline{2-11}
% 		\midrule
% 		\multirow{4}{*}{$\epsilon=0.2$}        & \multirow{4}{*}{0.2096} & DP-GD &    0.3160 (0.0120) &20  &  0.5180   &    0.0184 (0.0024)  \\&&DP-AGD&0.2645 (0.0044) &346&90.05&0.0133 (0.0018)\\&&RRPSGD&0.3110 (0.0106) &8&47.64&0.0175 (0.0023)\\&(0.002)&\textbf{DP-SRM}&\textbf{0.2503 (0.0090)}&\textbf{4}&\textbf{0.4748 }&\textbf{0.0117 (0.0008)}\\
% 		\midrule
% 		\multirow{4}{*}{$\epsilon=0.5$ }     & \multirow{4}{*}{0.2096}  &DP-GD & 0.2717 (0.0081)  &20 &  0.4990  &     0.0171 (0.0024) \\&&DP-AGD&0.2416 (0.0029) &365&94.28 &0.0397 (0.0025)\\&&RRPSGD&0.3033 (0.0110)&10&59.06 &0.0160 (0.0018)\\&(0.002)&\textbf{DP-SRM}&\textbf{0.2341 (0.0042)}&\textbf{5}&\textbf{0.4368 }&\textbf{0.0082 (0.0005)}\\
% % 		\midrule
% 		\hline
% 	\end{tabular}
% \end{table*}
% \begin{figure*}[!th]%
% 	\centering
% 	%\vspace{.3in}
% 	\subfigure[$\epsilon=0.2$]{
% 		\label{fig3:subfig:1.a} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{icml_2021/figure_CI/ijcnn1_02_err_epoch_CI.pdf}}
% 	\subfigure[$\epsilon=0.5$]{
% 		\label{fig3:subfig:1.b} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{icml_2021/figure_CI/ijcnn1_05_err_epoch_CI.pdf}}
% 			\subfigure[$\epsilon=0.2$]{
% 		\label{fig3:subfig:1.c} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{icml_2021/figure_CI/ijcnn1_02_norm_epoch_CI.pdf}}
% 	\subfigure[$\epsilon=0.5$]{
% 		\label{fig3:subfig:1.d} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{icml_2021/figure_CI/ijcnn1_05_norm_epoch_CI.pdf}}

% 	%\vspace{.3in}
% 	\caption{Results for nonconvex logistic regression on \textit{ijcnn1} dataset. (a), (b) show the objective loss versus the number of epochs. (c), (d) illustrate the gradient norm versus the number of epochs. } \label{figure:ijcnn1}%% label for entire figure
% \end{figure*}

% \subsection{Additional Results on MNIST and CIFAR-10 datasets}
% In this subsection, we present additional experiment results on convolutional neural networks. 

\begin{figure*}[!thb]%
	\centering
	%\vspace{.3in}
	\subfigure[$\epsilon=7.0$]{
		\label{fig4:subfig:1.a} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/MNIST_err_7_steps_CI.pdf}}
	\subfigure[$\epsilon=7.0$]{
		\label{fig4:subfig:1.b} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/MNIST_err_7_time_CI.pdf}}
	\subfigure[$\epsilon=8.0$]{
		\label{fig4:subfig:1.c} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_res_err_8_steps_CI.pdf}}
	\subfigure[$\epsilon=8.0$]{
		\label{fig4:subfig:1.d} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_res_err_8_time_CI.pdf}}
	\subfigure[$\epsilon=6.0$]{
		\label{fig4:subfig:1.e} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_err_6_steps_CI.pdf}}
	\subfigure[$\epsilon=6.0$]{
		\label{fig4:subfig:1.f} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_err_6_time_CI.pdf}}
			\subfigure[$\epsilon=8.0$]{
		\label{fig6:subfig:1.g} %% label for seventh subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_err_8_steps_CI.pdf}}
	\subfigure[$\epsilon=8.0$]{
		\label{fig6:subfig:1.h} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_err_8_time_CI.pdf}}
	\subfigure[$\epsilon=10.0$]{
		\label{fig6:subfig:1.i} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_err_10_steps_CI.pdf}}
			\subfigure[$\epsilon=10.0$]{
		\label{fig6:subfig:1.j} %% label for first subfigure
		\includegraphics[width=0.23\textwidth]{figure/CIFAR_err_10_time_CI.pdf}}
	%\vspace{.3in}
	\caption{Results for CNN on MNIST and CIFAR-10 datasets. (a), (b) illustrate the results on MNIST dataset. (c), (d) demonstrate the results for CNN6 on CIFAR-10 dataset. (e)-(j) show the results for CNN5 on CIFAR-10 dataset.} \label{fig:mnist_add}
\end{figure*}

\shortsection{Results on MNIST dataset}
We can see from Figure~\ref{fig4:subfig:1.a} and Figure~\ref{fig4:subfig:1.b} that our proposed method can achieve $2.91\%$ test error when $\epsilon=7.0$, which is comparable to the $2.93\%$ test error achieved by DP-SGD. Furthermore, the results show that our method is more efficient than DP-SGD in terms of iteration numbers and the training time. More specifically, our method is more than 2$\times$ faster than DP-SGD to achieve the desired test error.

% \noindent\textbf{Results on CIFAR-10 dataset}
% In this subsection, we present experiments of our method on CIFAR-10 dataset. More specifically, we consider two convolutional neural networks. The first one is a five layer CNN \footnote{\url{https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html}.} with two convolutional layers and three fully connected layers, and we call it CNN5. For CNN5, we train it from the scratch using our DP-SRM method and the DP-SGD method \citep{abadi2016deep} and compare their performances in terms of the model accuracy, iteration numbers and the training time. For the second one, we consider a similar architecture as in \citet{abadi2016deep}, which has three convolutional layers with 32, 64, 128 filters in each convolution layer and three fully connected layers, and we denote it by CNN6. For CNN6, we follow the same experiment setting as in \citet{abadi2016deep}: we use CIFAR-100 dataset as a public dataset, and first train a network with the same architecture on this dataset as the pretrained model. Then, we initialize the convolutional layers of CNN6 using the cnvolutional layers of the pretrained model, and only train the fully connected layers of CNN6 on CIFAR-10 dataset.




\shortsection{Parameters for CNN5} We choose three different privacy budgets $\epsilon\in\{6.0,8.0,10.0\}$, and set $\delta=10^{-5}$. We set the clipping parameter $C_1=2$ for the term $\|\nabla f_i(\theta^t)\|_2$. For the term $\|\nabla f_i(\theta^t)-\nabla f_i(\theta^{t-1})\|_2$, we choose the clipping parameter $C_2$ by searching the grid $\{0.01,0.1,0.3,0.5,0.7,0.9,0.99\}$. 
 For DP-SGD, we tune the batch size by searching the grid $\{32, 64, 128\}$ and the step size by $\{0.01,0.02,0.05,0.1,0.2\}$. For DP-SRM, we tune the batch size $b$ by searching the grid $\{32, 64, 128\}$, step size by $\{0.01,0.02,0.05,0.1,0.2\}$, and $b_0$ by $\{b,2b,4b\}$. In addition, we set the momentum parameter $\gamma=C_2$.

\shortsection{Results for CNN5 on CIFAR-10 dataset} Figures~\ref{fig4:subfig:1.e}--\ref{fig6:subfig:1.j} present the average test error of different methods versus the number of iterations as well as the training time under different privacy budgets for CNN5 on CIFAR-10 dataset. The CNN5 trained by the non-private SGD will have $39.5\%$ test error after 100 epochs. The results show that our proposed method has $50.3\%$, $48.2\%$ and $47.1\%$ test errors when $\epsilon=6.0$, $\epsilon=8.0$ and $\epsilon=10.0$, respectively. In contrast, DP-SGD has $51.0\%$, $50.2\%$ and $49.3\%$ test errors under the privacy budgets $\epsilon=6.0$, $\epsilon=8.0$ and $\epsilon=10.0$, which are worse than our method. Furthermore, we can see from the plots that compared with DP-SGD, our method can reduce both the iteration numbers and the training time.
%  \begin{figure*}[!thb]%
% 	\centering
% 	%\vspace{.3in}
% 	\subfigure[$\epsilon=6.0$]{
% 		\label{fig5:subfig:1.a} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_6_steps.pdf}}
% 	\subfigure[$\epsilon=6.0$]{
% 		\label{fig5:subfig:1.b} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_6_time.pdf}}
% 	\subfigure[$\epsilon=8.0$]{
% 		\label{fig5:subfig:1.c} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_8_steps.pdf}}
% 			\subfigure[$\epsilon=8.0$]{
% 		\label{fig5:subfig:1.d} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_8_time.pdf}}
% 			\subfigure[$\epsilon=10.0$]{
% 		\label{fig5:subfig:1.e} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_10_steps.pdf}}
% 	\subfigure[$\epsilon=10.0$]{
% 		\label{fig5:subfig:1.f} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_10_time.pdf}}
% 	\subfigure[$\epsilon=6.0$]{
% 		\label{fig5:subfig:1.g} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_6_time_multi.pdf}}
% 			\subfigure[$\epsilon=8.0$]{
% 		\label{fig5:subfig:1.h} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_8_time_multi.pdf}}
% 	%\vspace{.3in}
% 	\caption{Results for CNN5 on CIFAR 10 dataset. (a), (c), (e) depict the test error versus the number of iterations under different privacy budgets. (b), (d), (f) illustrate the test error versus the training time under different privacy budgets. (g), (h) present the test error versus training time with different number of parties under different privacy budgets for distributed learning} \label{fig:cifar}
% \end{figure*}

% \noindent\textbf{Parameters for CNN6.} We choose three different privacy budgets $\epsilon\in\{2.0,4.0,8.0\}$, and set $\delta=10^{-5}$. We set the clipping parameter $C_1=2$ for the term $\|\nabla f_i(\theta^t)\|_2$. For the term $\|\nabla f_i(\theta^t)-\nabla f_i(\theta^{t-1})\|_2$, we choose the clipping parameter $C_2$ by searching the grid $\{0.01,0.05,0.1,0.3,0.5,0.7,0.9,0.95,0.99\}$. 
%  For DP-SGD, we tune the batch size by searching the grid $\{64, 128, 256\}$ and the step size by $\{0.01,0.02,0.05,0.1,0.15,0.2\}$. For DP-SRM, we tune the batch size $b$ by searching the grid $\{64, 128, 256\}$, step size by $\{0.01,0.02,0.05,0.1,0.15,0.2\}$, and $b_0$ by $\{b,2b,4b\}$. In addition, we set the momentum parameter $\gamma=C_2$.
 
\shortsection{Results for CNN6 on CIFAR-10 dataset} Figure~\ref{fig4:subfig:1.c} and Figure~\ref{fig4:subfig:1.d} illustrate the average test error of different methods versus the number of iterations and the training time for CNN6 on CIFAR-10 dataset. We can see from the results that our proposed method can achieve $29.3\%$ test error given the privacy budget $\epsilon=8.0$, which is comparable to the $29.4\%$ test error of DP-SGD under the same privacy budget. However, we can see from the plots that our method can significantly reduce the iteration numbers and the training time. When $\epsilon=8$, DP-SGD takes $5.8\times 10^{4}$ iterations and 5176 seconds to achieve $29.4\%$ test error. In sharp contrast, our method only takes $2.6\times 10^{4}$ iterations and 2589 seconds to achieve $29.3\%$ test error. 
%  \begin{figure*}[!thb]%
% 	\centering
% 	%\vspace{.3in}
% 	\subfigure[$\epsilon=2.0$]{
% 		\label{fig6:subfig:1.a} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_res_err_2_steps.pdf}}
% 	\subfigure[$\epsilon=2.0$]{
% 		\label{fig6:subfig:1.b} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_res_err_2_time.pdf}}
% 	\subfigure[$\epsilon=4.0$]{
% 		\label{fig6:subfig:1.c} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_res_4_steps.pdf}}
% 			\subfigure[$\epsilon=4.0$]{
% 		\label{fig6:subfig:1.d} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_res_4_time.pdf}}
% 			\subfigure[$\epsilon=8.0$]{
% 		\label{fig6:subfig:1.e} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_res_8_steps.pdf}}
% 	\subfigure[$\epsilon=8.0$]{
% 		\label{fig6:subfig:1.f} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_err_res_8_time.pdf}}
% 	\subfigure[$\epsilon=2.0$]{
% 		\label{fig6:subfig:1.g} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_res_err_2_time_multi.pdf}}
% 			\subfigure[$\epsilon=4.0$]{
% 		\label{fig6:subfig:1.h} %% label for first subfigure
% 		\includegraphics[width=0.23\textwidth]{neurips2020_version/new_figure/CIFAR_res_err_4_time_multi.pdf}}
% 	%\vspace{.3in}
% 	\caption{Results for CNN6 on CIFAR 10 dataset. (a), (c), (e) illustrate the test error versus the number of iterations under different privacy budgets. (b), (d), (f) show the test error versus the training time under different privacy budgets. (g), (h) demonstrate the test error versus training time with different number of parties under different privacy budgets for distributed learning} \label{fig:cifar1}
% \end{figure*}



\section{Proof of Main Results}\label{proof}
In this section, we present the proofs of our main results.
\subsection{Proof of Theorem 5.1}
We will provide the privacy guarantee of Algorithm 1 in this subsection. To this end, we need the following composition rule for RDP.
\begin{lemma}[{Mironov [2017]}]\label{lemma:com_post}
	If $k$ randomized mechanisms $\cM_i:\cS^n\rightarrow\cR$ for $i\in[k]$, satisfy $(\alpha,\rho_i)$-RDP, then their composition $\big(\cM_1(S),\ldots,\cM_k(S)\big)$ satisfies $(\alpha,\sum_{i=1}^k\rho_i)$-RDP. Moreover, the input of the $i$-th mechanism can be based on the outputs of the previous $(i-1)$ mechanisms.
\end{lemma}

We will first show that our proposed algorithm satisfies RDP using Lemma 3.7 and Lemma \ref{lemma:com_post}. Then we will transform it into $(\epsilon,\delta)$-DP based on Lemma 3.9. For the given dataset $S$, we use $S^\prime$ to denote its neighboring dataset with one different example indexed by $i^\prime$ in the following discussion. According to Algorithm 1, we use the following $\cM_t$ to denote the mechanism at $t$-th iteration
% \begin{align}\label{eq:mechanismM}
%     \cM_{t}=\nabla F_{\cB_t}(\btheta^t)+(1-\gamma)\big(\vb^{t-1}_p-\nabla F_{\cB_t}(\btheta^{t-1})\big)+\ub^t.
% \end{align}
\begin{align}\label{eq:mechanismM}
    \cM_{t}=
    \left\{
	\begin{array} {ll}
		\nabla F_{\cB_t}(\btheta^t)+(1-\gamma)\big(\vb^{t-1}_p-\nabla F_{\cB_t}(\btheta^{t-1})\big)+\ub^t, & t>0,\\
		\vb^{0}+\ub^0, &  t=0.
	\end{array}
	\right.
\end{align}

 Therefore, our goal is to show the privacy guarantees of $\cM_t$ for $t=0,1,\ldots,T$.

\noindent\textbf{Case 1:} If $t=0$, we have $\vb^{0}=\nabla F_{\cB_0}(\btheta^0)$ and $\cM_0$ is equivalent to the following Gaussian mechanism 
\begin{align*}
    \cG_{0}=\nabla F_{\cB_0}(\btheta^0)+\ub^{0},
\end{align*}
where $\ub^{0}\sim N(0,\sigma^2_{0}\Ib_d)$.  Note that the mechanism $\cG_0$ is based on the subsampling, thus we will use the results of privacy-amplification by subsampling, i.e., Lemma 3.7, to show that $\cG_0$ satisfies RDP given appropriate $\ub^0$. To this end, we first consider the following Gaussian mechanism without subsampling
\begin{align*}
    \tilde \cG_{0}=\frac{1}{b_0}\sum_{i=1}^n\nabla f_i(\btheta^0)+\ub^{0}.
\end{align*}

\noindent\textbf{Sensitivity.} Consider the query on the dataset $S$ as follows $\tilde \qb_{0}(S)=\sum_{i=1}^n\nabla f_i(\btheta^0)/b_0$, where $\tilde \qb_{0}(S)$ denotes that the query is based on the dataset $S$. Thus, we have 
\begin{align*}
    \tilde \qb_{0}(S)-\tilde \qb_{0}(S^\prime)=\frac{1}{b_0}\big(\nabla f_i(\btheta^{0})-\nabla f_{i^\prime}(\btheta^{0})\big).
\end{align*}
Since each component function is $G$-Lipschitz, we can obtain the $\ell_2$-sensitivity of this query as follows
\begin{align}\label{eq:senstivity_gamma_t_1}
    \tilde \Delta_0=\frac{1}{b_0}\|\nabla f_i(\btheta^{0})-\nabla f_{i^\prime}(\btheta^{0})\|_2\leq \frac{2G}{b_0}.
\end{align}

\noindent\textbf{Privacy guarantee of $\cG_{0}$.}
By Lemma 3.7, if the Gaussian noise $\ub^0$ in $\tilde \cG_0$ has the following variance
\begin{align}\label{eq:sigma_1_RDP}
    \sigma^2_{0}=\frac{14T\alpha G^2}{\beta n^{2}\epsilon},
\end{align}
the mechanism $\tilde \cG_{0}$ satisfies $\big(\alpha,\beta\epsilon n^2/\big(7b_0^2T\big)\big)$-RDP. Therefore, according to the privacy-amplification by subsampling result in Lemma 3.7, we have that the mechanism $\cG_0$ satisfies $(\alpha,\rho_0)$-RDP, where $\rho_0=\beta \epsilon/T$. Furthermore, the variance $\sigma_0^2$ should satisfy the following condition
\begin{align*}
    \frac{\sigma_0^2}{\tilde \Delta_0^2}=\frac{\sigma_0^2b_0^2}{4G^2}=\frac{7b_0^2T\alpha}{\beta n^2\epsilon}\geq 0.7.
\end{align*}
And the parameter $\alpha$ should satisfy $\alpha\leq1+ 2(\sigma_0/\tilde \Delta_0)^2\log\big(1/\tau\alpha (1+(\sigma_0/\tilde \Delta_0)^2)\big)/3$.

\noindent\textbf{Case 2:} If $t>0$, according to the definition of $\cM_t$ in \eqref{eq:mechanismM}, we consider the following Gaussian mechanism
\begin{align*}
    \cG_t=\nabla F_{\cB_t}(\btheta^t)-(1-\gamma)\nabla F_{\cB_t}(\btheta^{t-1})+\ub^t.
\end{align*}
Now, we are going to show that $\cG_t$ satisfies RDP given appropriate $\ub^t$. Since the mechanism $\cG_t$ is based on the subsampling, we will use the similar proof procedure as in \textbf{Case 1} to show that $\cG_t$ satisfies RDP. Thus we consider the following Gaussian mechanism without subsampling
\begin{align*}
    \tilde \cG_t=\frac{1}{b}\sum_{i=1}^n\nabla f_i(\btheta^t)-(1-\gamma)\frac{1}{b}\sum_{i=1}^n\nabla f_i(\btheta^{t-1})+\ub^t.
\end{align*}

\noindent\textbf{Sensitivity.} We consider the following query without subsampling
\begin{align*}
    \tilde \qb_t(S)=\frac{1}{b}\sum_{i=1}^n\nabla f_i(\btheta^t)-(1-\gamma)\frac{1}{b}\sum_{i=1}^n\nabla f_i(\btheta^{t-1}).
\end{align*}
Thus we have
 \begin{align*}
     \tilde \qb_t(S)-\tilde \qb_t(S^\prime)&=\frac{1}{b}\big(\nabla f_i (\btheta^t)-(1-\gamma)\nabla f_i (\btheta^{t-1})-\nabla f_{i^\prime} (\btheta^t)+(1-\gamma)\nabla f_{i^\prime}(\btheta^{t-1})\big).
 \end{align*}
 As a result, we can obtain the $\ell_2$-sensitivity of the query $\tilde \qb_t$ as follows
 \begin{align*}
     \tilde \Delta_t&=\frac{1}{b}\big\|(1-\gamma)\big(\nabla f_i (\btheta^t)-\nabla f_i (\btheta^{t-1})-\nabla f_{i^\prime} (\btheta^t)+\nabla f_{i^\prime}(\btheta^{t-1})\big)\\
     &~~\qquad+\gamma\big(\nabla f_i (\btheta^{t})-\nabla f_{i^\prime} (\btheta^{t})\big) \big\|_2\\
     &\leq\frac{2L(1-\gamma)}{b}\|\btheta^t-\btheta^{t-1}\|_2+\frac{2\gamma G}{b},
 \end{align*}
where the inequality is due to $L$-Lipschitz continuous gradient and $G$-Lipschitz of each component function. Furthermore, according to the update rule of Algorithm 1 and the definition of $\eta_{t-1}$, we have
\begin{align*}
    \|\btheta^t-\btheta^{t-1}\|_2\leq \eta_{t-1}\|\vb^{t-1}_p\|_2\leq \min\bigg\{\frac{\zeta}{n_0L\|\vb^{t-1}_p\|_2},\frac{1}{2n_0L}\bigg\}\cdot\|\vb^{t-1}_p\|_2\leq \frac{\zeta}{n_0L},
\end{align*}
which implies that
\begin{align}\label{eq:senstivity_gamma_t}
   \tilde \Delta_t\leq\frac{2L(1-\gamma)}{b}\|\btheta^t-\btheta^{t-1}\|_2+\frac{2\gamma G}{b}\leq \frac{2\big((1-\gamma)\zeta/n_0+\gamma G\big)}{b}.
\end{align}

\noindent\textbf{Privacy guarantee of $\cG_t$.}
By Lemma 3.7, if the Gaussian noise $\ub^t$ in $\tilde \cG_t$ has the following variance
\begin{align}\label{eq:sigma_2_RDP}
    \sigma^2_t&=\frac{14T\alpha\big((1-\gamma)\zeta/n_0+\gamma G\big)^2}{\beta n^2\epsilon},
\end{align}
the mechanism $\tilde \cG_t$ satisfies $\big(\alpha,\beta\epsilon n^2/\big(7b^2T\big)\big)$-RDP. Thus based on the privacy-amplification by subsampling result (Lemma 3.7), we can get that the mechanism $\cG_t$ satisfies $(\alpha,\rho)$-RDP, where $\rho=\beta \epsilon/T$. In addition, the variance $\sigma_t^2$ should satisfy the following condition 
\begin{align*}
    \frac{\sigma_t^2}{\tilde \Delta_t^2}=\frac{\sigma_t^2b^2}{4\big((1-\gamma)\zeta/n_0+\gamma G\big)^2}=\frac{7b^2T\alpha}{\beta n^2\epsilon}\geq 0.7.
\end{align*}
And the parameter $\alpha$ should satisfy $\alpha\leq1+ 2(\sigma_t/\tilde \Delta_t)^2\log\big(1/\tau\alpha (1+(\sigma_t/\tilde \Delta_t)^2)\big)/3$. As a result, we show that $\cG_t$ satisfies $(\alpha,\rho)$-RDP.

\noindent\textbf{Privacy guarantee of $\cM_{t}$.}
By the definition of the mechanism $\cM_{t}$ in \eqref{eq:mechanismM}, $\cM_t$ is a composition of $\cG_{0},\ldots,\cG_{t}$, i.e., $\cM_{t}=(\cG_{0},\ldots,\cG_{t})$. According to the composition property of RDP, i.e., Lemma \ref{lemma:com_post}, we have 
$\cM_{t}$ satisfies $(\alpha,\rho_0+(t-1)\rho)$-RDP. Since $\rho_0=\rho=\beta\epsilon/T$, we have that after $T^\prime$ iterations of Algorithm 1, it satisfies $(\alpha,\beta T^\prime\epsilon/T)$-RDP.  According to Lemma 3.9 and $\alpha=\log(1/\delta)/\big((1-\beta)\epsilon\big)+1$, we have that after $T^\prime$ iterations, Algorithm 1 satisfies $(T^\prime\epsilon/T,\delta)$-DP. As a result, we have that for each $\btheta^t$, where $t=1,\ldots,T$, it satisfies $(\epsilon,\delta)$-DP. Finally, by the definition of $\tilde \btheta$, we have $\tilde \btheta$ satisfies $(\epsilon,\delta)$-DP.

\subsection{Proof of Corollary 5.3}
In this subsection, we show that by choosing a larger mini-batch size, we can get rid of the constraints in Theorem 5.1. More specifically, let $b_0^2=b^2= n^2\epsilon/T$ and $\beta=1/2$, we have $\sigma^{\prime 2}=7T\alpha b^2/(\beta n^2\epsilon)=14\alpha$. Furthermore, we have
\begin{align*}
   \tau\alpha\big(1+\sigma^{\prime2}\big) \stackrel{\mathrm{(i)}}{\leq} 15\tau\alpha^2 \stackrel{\mathrm{(ii)}}{=}15\big(2\log(1/\delta)/\epsilon+1\big)^2\sqrt{\epsilon/T},
\end{align*}
where $\mathrm{(i)}$ uses $\sigma^{\prime 2}=14\alpha$, $\mathrm{(ii)}$ uses $\tau=b/n=\sqrt{\epsilon/T}$ and $\epsilon=2\log(1/\delta)/(\alpha-1)$.
% $\tau\alpha\big(1+\sigma^{\prime2})\leq 3\tau\alpha\sigma^{\prime2}=42\alpha^2\sqrt{\epsilon/T}=42\big(2\log(1/\delta)/\epsilon+1\big)^2\sqrt{\epsilon/T}$. 
If $\epsilon\leq 2\log(1/\delta)$, we can obtain $\tau\alpha\big(1+\sigma^{\prime2}\big)\leq 1/3$ if $T$ is larger than $O\big(\log^4(1/\delta)/\epsilon^3\big)$. If $\epsilon> 2\log(1/\delta)$, we can obtain $\tau\alpha\big(1+\sigma^{\prime2}\big)\leq 1/3$ if $T$ is larger than $O(\epsilon)$. Therefore, we can get $\log\big(1/(\tau\alpha(1+\sigma^{\prime2}))\big)\geq 1$. As a result, we have $2\big(\sigma^{\prime2}\log\big(1/(\tau\alpha(1+\sigma^{\prime2}))\big)\big)/3\geq 28\alpha/3 >\alpha-1$.

\subsection{Proof of Theorem 5.4}
In this subsection, we provide the utility guarantee of our method. According to the assumption that each component function has $L$-Lipschitz continuous gradient, we can obtain that
\begin{align*}
    \|\nabla F(\xb)-\nabla F(\yb)\|_2\leq\frac{1}{n}\sum_{i=1}^n\|\nabla f_i(\xb)-\nabla f_i(\yb)\|_2\leq L\|\xb-\yb\|_2,
\end{align*}
which implies that $F(\xb)$ has $L$-Lipschitz continuous gradient. Thus we have
\begin{align*}
    F(\btheta^{t+1})&\leq F(\btheta^t)+\la\nabla F(\btheta^t),\btheta^{t+1}-\btheta^{t}\ra+\frac{L}{2}\|\btheta^{t+1}-\btheta^{t}\|_2^2\\
    &=F(\btheta^t)-\eta_t\la\nabla F(\btheta^t),\vb^t_p\ra+\frac{\eta_t^2L}{2}\big\|\vb^t_p\big\|_2^2\\
    &=F(\btheta^t)+\frac{\eta_t}{2}\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2-\frac{\eta_t}{2}\big\|\nabla F(\btheta^t)\big\|_2^2-\eta_t\bigg(\frac{1}{2}-\frac{\eta_tL}{2}\bigg)\big\|\vb^t_p\big\|_2^2,
\end{align*}
where the last equality is due to the fact that $2\la\nabla F(\btheta^t),\vb^t_p\ra=\big\|\nabla F(\btheta^t)\big\|_2^2+\big\|\vb^t_p\big\|_2^2-\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2$. Since $\eta_t\leq 1/(2n_0L)$, we can obtain that 
\begin{align*}
    F(\btheta^{t+1})&\leq F(\btheta^t)+\frac{1}{4n_0L}\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2-\frac{\eta_t}{4}\big\|\vb^t_p\big\|_2^2.
\end{align*}
 In addition, we have
\begin{align*}
    \frac{\eta_t}{4}\big\|\vb^t_p\big\|_2^2=\frac{\zeta^2}{8n_0L}\min\big\{2\big\|\vb^t_p/\zeta\big\|_2,\big\|\vb^t_p/\zeta\big\|_2^2\big\}\geq \frac{\zeta\big\|\vb^t_p\big\|_2-2\zeta^2}{4n_0L}.
\end{align*}
Thus we have
\begin{align}\label{eq:eq0}
    F(\btheta^{t+1})&\leq F(\btheta^t)+\frac{1}{4n_0L}\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2-\frac{\zeta\big\|\vb^t_p\big\|_2}{4n_0L}+\frac{\zeta^2}{2n_0L}.
\end{align}
Summing over $t=0,\ldots,T-1$ and taking expectation in \eqref{eq:eq0}, we can get
\begin{align}\label{eq:contraction}
    \frac{\zeta}{4n_0L}\sum_{t=0}^{T-1}\EE\big\|\vb^t_p\big\|_2&\leq F(\btheta^0)-\EE F(\btheta^T)+\frac{1}{4n_0L}\sum_{t=0}^{T-1}\EE\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2+\frac{T\zeta^2}{2n_0L}\nonumber\\
    &\leq F(\btheta^0)- F(\btheta^*)+\frac{1}{4n_0L}\sum_{t=0}^{T-1}\EE\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2+\frac{T\zeta^2}{2n_0L}.
\end{align}
For the term $\EE\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2$, we can bound it as follows: we first consider the conditional expectation
\begin{align}\label{eq:eq1}
    \EE_{t}\big\|\vb^t_p-\nabla F(\btheta^t)\big\|_2^2&=\EE_{t}\big\|(1-\gamma)\big(\vb^{t-1}_p-\nabla F_{\cB_t}(\btheta^{t-1})\big)+\nabla F_{\cB_t}(\btheta^t)-\nabla F(\btheta^t)+\ub^t\big\|_2^2\nonumber\\
    &=\EE_{t}\big\|(1-\gamma)\big(\vb^{t-1}_p-\nabla F(\btheta^{t-1})\big)+(1-\gamma)\nabla F(\btheta^{t-1})\nonumber\\
    &\quad\qquad-(1-\gamma)\nabla F_{\cB_t}(\btheta^{t-1})+\nabla F_{\cB_t}(\btheta^t)-\nabla F(\btheta^t)\big\|_2^2+\EE_{t}\|\ub^t\|_2^2\nonumber\\
    &=\EE_{t}\big\|(1-\gamma)\big(\vb^{t-1}_p-\nabla F(\btheta^{t-1})\big)+(1-\gamma)\big(\nabla F_{\cB_t}(\btheta^t)-\nabla F_{\cB_t}(\btheta^{t-1})\nonumber\\
    &~~\qquad+\nabla F(\btheta^{t-1})-\nabla F(\btheta^t)\big)
    +\gamma\big(\nabla F_{\cB_t}(\btheta^t)-\nabla F(\btheta^t)\big)\big\|_2^2+\EE_{t}\|\ub^t\|_2^2,
\end{align}
where $\EE_t$ denotes the expectation over the randomness of the $t$-th iteration conditioned on the observations up to the $(t-1)$-th iteration, the first equality comes from the definition of $\vb^t_p$, and the second one holds because the noise $\ub^t$ is zero-mean and independent of the other random variables. Therefore, we can obtain that 
\begin{align}\label{eq:eq2}
    \EE_{t}\big\|\vb^t_p-\nabla F(\btheta^t)\big\|_2^2&\leq(1-\gamma)^2\EE_{t}\big\|\vb^{t-1}_p-\nabla F(\btheta^{t-1})\big\|_2^2\nonumber\\
    &\qquad+2\gamma^2\EE_{t}\big\|\nabla F_{\cB_t}(\btheta^t)-\nabla F(\btheta^t)\big\|_2^2+\EE_{t}\|\ub^t\|_2^2\nonumber\\
    &\qquad+2(1-\gamma)^2\EE_{t}\big\|\nabla F_{\cB_t}(\btheta^t)-\nabla F_{\cB_t}(\btheta^{t-1})+\nabla F(\btheta^{t-1})-\nabla F(\btheta^t)\big\|_2^2,
\end{align}
where the inequality is due to the expansion of \eqref{eq:eq1} and the Cauchy--Schwarz inequality.
In addition, we have
\begin{align*}
    &\EE_{t}\big\|\nabla F(\btheta^t)-\nabla F(\btheta^{t-1})-\nabla F_{\cB_t}(\btheta^t)+\nabla F_{\cB_t}(\btheta^{t-1})\big\|_2^2\nonumber\\
    &\leq \frac{1}{b}\cdot\frac{1}{n}\sum_{i=1}^n\big\|\nabla F(\btheta^t)-\nabla F(\btheta^{t-1})-\nabla f_i(\btheta^t)+\nabla f_i(\btheta^{t-1})\big\|_2^2\nonumber\\
    &\leq\frac{1}{b}\cdot\frac{1}{n}\sum_{i=1}^n\big\|\nabla f_i(\btheta^t)-\nabla f_i(\btheta^{t-1})\big\|_2^2\nonumber\\
    &\leq \frac{L^2}{b}\|\btheta^t-\btheta^{t-1}\|_2^2,
\end{align*}
where the first inequality is due to Lemma \ref{lemma:scsg}, the second one comes from the fact that $\EE\|\bX-\EE\bX\|_2^2\leq \EE\|\bX\|_2^2$ for any random variable $\bX$, and the last one is due to the gradient Lipschitz property of each component function. According to the update rule, we have
\begin{align*}
    \|\btheta^t-\btheta^{t-1}\|_2\leq \eta_{t-1}\big\|\vb^{t-1}_p\big\|_2\leq \min\bigg\{\frac{\zeta}{n_0L\big\|\vb^{t-1}_p\big\|_2},\frac{1}{2n_0L}\bigg\}\cdot\big\|\vb^{t-1}_p\big\|_2\leq \frac{\zeta}{n_0L},
\end{align*}
which implies 
\begin{align}\label{eq:eq3}
    \EE_{t}\big\|\nabla F(\btheta^t)-\nabla F(\btheta^{t-1})-\nabla F_{\cB_t}(\btheta^t)+\nabla F_{\cB_t}(\btheta^{t-1})\big\|_2^2\leq \frac{\zeta^2}{n_0^2b}.
\end{align}

Thus plugging \eqref{eq:eq3} into \eqref{eq:eq2}, we can obtain that
\begin{align}\label{eq:eq4}
    \EE_{t}\big\|\vb^t_p-\nabla F(\btheta^t)\big\|_2^2&\leq (1-\gamma)^2\big\|\vb^{t-1}_p-\nabla F(\btheta^{t-1})\big\|_2^2+\frac{2(1-\gamma)^2L^2}{b}\|\btheta^t-\btheta^{t-1}\|_2^2\nonumber\\
    &\qquad+2\gamma^2\EE_{t}\big\|\nabla F_{\cB_t}(\btheta^t)-\nabla F(\btheta^t)\big\|_2^2+\EE_{t}\|\ub^t\|_2^2\nonumber\\
    &\leq (1-\gamma)^2\big\|\vb^{t-1}_p-\nabla F(\btheta^{t-1})\big\|_2^2+\frac{2(1-\gamma)^2\zeta^2}{n_0^2b}+\frac{2\gamma^2G^2}{b}+\EE_{t}\|\ub^t\|_2^2,
\end{align}
where the second inequality follows from the following inequality (which uses Lemma \ref{lemma:scsg}, the fact that $\EE\|\bX-\EE\bX\|_2^2\leq \EE\|\bX\|_2^2$, and the $G$-Lipschitz continuity of each component function)
\begin{align}\label{eq:varaince1}
    \EE_{t}\big\|\nabla F_{\cB_t}(\btheta^t)-\nabla F(\btheta^t)\big\|_2^2\leq \frac{1}{b}\cdot\frac{1}{n}\sum_{i=1}^n\big\|\nabla f_i(\btheta^t)\big\|_2^2\leq \frac{G^2}{b}.
\end{align}
Therefore, taking expectations over all iterations in \eqref{eq:eq4}, we can get
\begin{align}\label{eq:eq5}
     \EE\big\|\vb^t_p-\nabla F(\btheta^t)\big\|_2^2&\leq (1-\gamma)^2\EE\big\|\vb^{t-1}_p-\nabla F(\btheta^{t-1})\big\|_2^2+\frac{2(1-\gamma)^2\zeta^2}{n_0^2b}+\frac{2\gamma^2G^2}{b}+d\sigma^2.
\end{align}
Following the proof of Lemma 9 in  Yuan et al. [2020], we have
\begin{align*}
    \gamma\sum_{t=0}^{T-1}\EE\big\|\vb^t_p-\nabla F(\btheta^t)\big\|_2^2&\leq \frac{2T(1-\gamma)^2\zeta^2}{n_0^2b}+\frac{2T\gamma^2G^2}{b}+Td\sigma^2+\EE\big\|\vb^0_p-\nabla F(\btheta^0)\big\|_2^2\\
    &\leq \frac{2T(1-\gamma)^2\zeta^2}{n_0^2b}+\frac{2T\gamma^2G^2}{b}+Td\sigma^2+\frac{G^2}{b_0}+d\sigma_0^2,
\end{align*}
where the last line comes from the definition of $\vb_p^0=\nabla F_{\cB_0}(\btheta^0)+\ub^0$ and the inequality $ \EE\big\|\nabla F_{\cB_0}(\btheta^0)-\nabla F(\btheta^0)\big\|_2^2\leq G^2/b_0$ (see equation \eqref{eq:varaince1}).
Therefore, we can obtain that 
\begin{align}\label{eq:eq6}
    \sum_{t=0}^{T-1}\EE\big\|\vb^t_p-\nabla F(\btheta^t)\big\|_2^2\leq \frac{2T(1-\gamma)^2\zeta^2}{n_0^2\gamma b}+\frac{2T\gamma G^2}{b}+\frac{Td\sigma^2+d\sigma_0^2}{\gamma}+\frac{G^2}{\gamma b_0}.
\end{align}


Combining \eqref{eq:contraction} and \eqref{eq:eq6}, we can get
\begin{align*}
    \frac{\zeta}{4n_0L}\sum_{t=0}^{T-1}\EE\big\|\vb^t_p\big\|_2
    &\leq F(\btheta^0)- F(\btheta^*)+\frac{1}{4n_0L}\sum_{t=0}^{T-1}\EE\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2^2+\frac{T\zeta^2}{2n_0L}\\
    &\leq  F(\btheta^0)- F(\btheta^*)+ \frac{T(1-\gamma)^2\zeta^2}{2n_0^3L\gamma b}+\frac{T\gamma G^2}{ 2Ln_0b}\nonumber\\
    &\qquad+\frac{Td\sigma^2+d\sigma_0^2}{4n_0L\gamma}+\frac{G^2}{4L\gamma n_0b_0}+\frac{T\zeta^2}{2n_0L}.
\end{align*}
Hence we have 
\begin{align}\label{eq:eq7}
   \frac{1}{T}\sum_{t=0}^{T-1}\EE\big\|\vb^t_p\big\|_2
    &\leq \frac{4n_0L}{T\zeta}\big(F(\btheta^0)-F(\btheta^*)\big)+ \frac{2\zeta}{n_0^2\gamma b}+\frac{2\gamma G^2}{ \zeta b}+\frac{d\sigma^2+d\sigma_0^2/T}{\zeta\gamma}+\frac{G^2}{T\zeta\gamma b_0}+2\zeta\nonumber\\
    &\leq 6\zeta+\frac{2\zeta}{n_0^2\gamma b}+\frac{2\gamma G^2}{ \zeta b}+\frac{d\sigma^2+d\sigma_0^2/T}{\zeta\gamma}+\frac{G^2}{T\zeta\gamma b_0},
    % &\leq 6\zeta+4\zeta+\frac{\zeta G^2}{4LD}+\frac{2d}{\zeta\alpha}\frac{T\Delta^2\log(1/\delta)}{n^2\epsilon^2}
\end{align}
where the second inequality is due to $T= \lfloor 4n_0L\big(F(\btheta^0)-F(\btheta^*)\big)/\zeta^2\rfloor +1$. In addition, according to \eqref{eq:eq6} and Jensen's inequality, we have
\begin{align}\label{eq:eq8}
    \frac{1}{T}\sum_{t=0}^{T-1}\EE\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2\leq \frac{\sqrt{2}\zeta}{n_0\sqrt{\gamma b}}+\frac{\sqrt{2\gamma} G}{\sqrt{b}}+\frac{\sqrt{d}\sigma+\sqrt{d}\sigma_0/\sqrt{T}}{\sqrt{\gamma}}+\frac{G}{\sqrt{T\gamma b_0}}.
\end{align}
Thus by the definition of $\tilde \btheta$, we have 
\begin{align}\label{eq:eq9}
    \EE\|\nabla F(\tilde \btheta)\|_2&=\frac{1}{T}\sum_{t=0}^{T-1}\EE\|\nabla F(\btheta^t)\|_2\nonumber\\
    &\leq \frac{1}{T}\sum_{t=0}^{T-1}\EE\big\|\vb^t_p\big\|_2+\frac{1}{T}\sum_{t=0}^{T-1}\EE\big\|\nabla F(\btheta^t)-\vb^t_p\big\|_2\nonumber\\
    &\leq 6\zeta+\frac{2\zeta}{n_0^2\gamma b}+\frac{2\gamma G^2}{ \zeta b}+\frac{d\sigma^2}{\zeta\gamma}+\frac{d\sigma_0^2}{T\zeta\gamma}+\frac{G^2}{T\zeta\gamma b_0}+\frac{\sqrt{2}\zeta}{n_0\sqrt{\gamma b}}+\frac{\sqrt{2\gamma} G}{\sqrt{b}}\nonumber\\
    &\qquad+\frac{\sqrt{d}\sigma}{\sqrt{\gamma}}+\frac{\sqrt{d}\sigma_0}{\sqrt{T\gamma}}+\frac{G}{\sqrt{T\gamma b_0}},
    % \nonumber\\
    % &\leq 14\zeta+\frac{\zeta}{4}+\frac{\zeta }{4}+\frac{d\sigma^2}{\zeta\alpha}+\frac{\sqrt{2d}\sigma}{\sqrt{\alpha}}+\frac{d\sigma_0^2}{T\zeta\alpha}+\frac{\sqrt{d}\sigma_0}{\sqrt{T\alpha}},
\end{align}
where the second inequality comes from \eqref{eq:eq7} and \eqref{eq:eq8}. Let $\gamma^2=2\zeta^2/(n_0^2G^2)$, $b=G/(n_0\zeta)$, and $b_0=G^3/(\zeta LD_F)$, where $D_F=F(\btheta^0)-F(\btheta^*)$ and $F(\btheta^*)$ is the global minimum value of $F$. Then, by the definition of $T$, we can get
\begin{align}\label{eq:utility_single}
    \EE\|\nabla F(\tilde \btheta)\|_2\leq 15\zeta+\frac{d\sigma^2}{\zeta\gamma}+\frac{\sqrt{d}\sigma}{\sqrt{\gamma}}+\frac{d\sigma_0^2}{T\zeta\gamma}+\frac{\sqrt{d}\sigma_0}{\sqrt{T\gamma}}.
\end{align}
 Furthermore, we have 
\begin{align}\label{eq:noise_variances}
    \sigma^2=\frac{14T\big((1-\gamma)\zeta/n_0+\gamma G\big)^2\log(1/\delta)}{n^{2}\epsilon^2},\quad \sigma_0^2=\frac{14TG^2\log(1/\delta)}{n^2\epsilon^2}.
\end{align}
Plugging \eqref{eq:noise_variances} into \eqref{eq:utility_single}, we can obtain 
\begin{align}\label{eq:final_bound}
    \EE\|\nabla F(\tilde \btheta)\|_2&\leq 15\zeta+\frac{C_1Td G\log(1/\delta)}{n_0n^2\epsilon^2}+\frac{\sqrt{C_1T\zeta dG\log(1/\delta)}}{n\epsilon\sqrt{n_0}}+\frac{C_2dn_0G^3\log(1/\delta)}{n^2\epsilon^2\zeta^2}\nonumber\\   &\qquad+\frac{\sqrt{C_2n_0dG^3\log(1/\delta)}}{n\epsilon\sqrt{\zeta}}\nonumber\\
    &\leq 15\zeta+\frac{C_3LD_FGd\log(1/\delta)}{n^2\epsilon^2\zeta^2}+\frac{\sqrt{C_4GLD_Fd\log(1/\delta)}}{n\epsilon\sqrt{\zeta}}\nonumber\\
    &\qquad+\frac{C_5n_0dG^3\log(1/\delta)}{n^2\epsilon^2\zeta^2}+\frac{\sqrt{C_6n_0dG^3\log(1/\delta)}}{n\epsilon\sqrt{\zeta}},
\end{align}
where the second inequality is due to the fact that $T= \lfloor 4n_0LD_F/\zeta^2\rfloor +1$. Without loss of generality, we can assume $G\geq 1$ and $\zeta\leq 1$.
Therefore, let $n_0=LD_F/G^2\cdot(G/\zeta)^\kappa$ with $\kappa\in[0,1]$, and plugging $n_0$ into \eqref{eq:final_bound}, we can obtain
\begin{align}\label{eq:final_bound_f1}
  \EE\|\nabla F(\tilde \btheta)\|_2&\leq 15\zeta+ \frac{C_7LD_FGd\log(1/\delta)G^\kappa}{n^2\epsilon^2\zeta^{2+\kappa}}+\frac{C_8\sqrt{GLD_Fd\log(1/\delta)}G^{\frac{\kappa}{2}}}{n\epsilon\zeta^{\frac{1+\kappa}{2}}}.
\end{align}
Thus, choosing 
\begin{align}\label{eq:kappa}
\zeta=C_9\bigg(\frac{G^{\frac{\kappa}{2}}\sqrt{GLD_Fd\log(1/\delta)}}{n\epsilon}\bigg)^{\frac{2}{3+\kappa}},
\end{align}
we can get
\begin{align}\label{eq:final_bound_f2}
  \EE\|\nabla F(\tilde \btheta)\|_2&\leq C_{10}\bigg(\frac{G^{\frac{\kappa}{2}}\sqrt{GLD_Fd\log(1/\delta)}}{n\epsilon}\bigg)^{\frac{2}{3+\kappa}}.
\end{align}
Note that we require $\gamma\leq 1$, which gives us $n\epsilon\geq O\big(G^2(d\log(1/\delta))^{1/2}/(LD_F)\big)$.

Furthermore, according to Theorem 5.1 to achieve the desired privacy guarantee, we require $\sigma^{\prime2}=\min\{b^2\sigma^2/\big(4((1-\gamma)\zeta/n_0+\gamma G)^2\big), b_0^2\sigma_0^2/(4G^2)\}\geq 0.7$. Note that $b=G/(n_0\zeta)$, $b_0=G^3/(\zeta LD_F)$, $n_0=LD_F/G^2\cdot(G/\zeta)^\kappa$, we have $b=b_0\cdot(\zeta/G)^\kappa$. Thus, the aforementioned requirement reduces to 
\begin{align*}
    \frac{14b_0^2T\log(1/\delta)}{4n^2\epsilon^2}\cdot\frac{\zeta^{2\kappa}}{G^{2\kappa}}&=\frac{14b_0^2n_0LD_{F}\log(1/\delta)}{\zeta^2n^2\epsilon^2}\cdot\frac{\zeta^{2\kappa}}{G^{2\kappa}}\\
    &\geq \frac{14b_0n_0LD_{F}\log(1/\delta)}{\zeta^2n^2\epsilon^2}\cdot\frac{\zeta^{2\kappa}}{G^{2\kappa}}\\
    &=\frac{14GLD_{F}\log(1/\delta)}{\zeta^3n^2\epsilon^2}\cdot\frac{\zeta^{\kappa}}{G^{\kappa}}\\
    &\geq 0.7,
\end{align*}
where the first equality comes from the definition of $T$ and the first inequality is due to $b_0\geq 1$.  Therefore, we need
\begin{align}\label{eq:kappa_r}
    \zeta\leq \bigg(4\frac{G^{-\frac{\kappa}{2}}\sqrt{GLD_Fd\log(1/\delta)}}{n\epsilon}\bigg)^{\frac{2}{3-\kappa}}.
\end{align}
Combining \eqref{eq:kappa} and \eqref{eq:kappa_r}, we need to choose $\kappa=0$ in $n_0$, which gives us 
\begin{align}\label{eq:final_bound_f3}
  \EE\|\nabla F(\tilde \btheta)\|_2&\leq C_{10}\bigg(\frac{\sqrt{GLD_Fd\log(1/\delta)}}{n\epsilon}\bigg)^{\frac{2}{3}},
\end{align}
where $\{C_i\}_{i=1}^{10}$ are absolute constants. Furthermore, the requirement 
$\alpha-1=\log(1/\delta)/\big((1-\beta)\epsilon\big)\leq 2\sigma^{\prime 2}\log\big(1/\big(\tau\alpha (1+\sigma^{\prime 2})\big)\big)/3$ in Theorem 5.1 can be satisfied under our choice of parameters given large enough $n$. Since we have $\sigma^{\prime 2}\geq 0.7$, we have $2\sigma^{\prime 2}\log\big(1/\big(\tau\alpha (1+\sigma^{\prime 2})\big)\big)/3\geq 0.4\log\big(1/\big(\tau\alpha (1+\sigma^{\prime 2})\big)\big)\geq0.4\log\big(1/\big(3\tau\alpha \sigma^{\prime 2}\big)\big)$. Furthermore, we have
\begin{align*}
   \tau\alpha \sigma^{\prime 2}&=\frac{G^3}{n\zeta LD_F}\cdot\frac{\log(1/\delta)+(1-\beta)\epsilon}{(1-\beta)\epsilon}\cdot\frac{14GLD_{F}\log(1/\delta)}{\zeta^3n^2\epsilon^2}\\
   &\leq \frac{28G^4\log^2(1/\delta)}{(1-\beta)n^3\epsilon^3\zeta^4}\\
   &\leq C_{11} \frac{G^4\log^2(1/\delta)}{(n\epsilon)^3}\cdot\frac{(n\epsilon)^{8/3}}{(GLD_Fd\log(1/\delta))^{4/3}}\\
   &=C_{11}\frac{G^{8/3}\log^{2/3}(1/\delta)}{(n\epsilon)^{1/3}(LD_Fd)^{4/3}},
\end{align*}
where the first inequality comes from assuming $\epsilon\leq \log(1/\delta)$ without loss of generality, and the second inequality is due to the definition of $\zeta$. Thus we have
\begin{align*}
    \log\big(1/\big(3\tau\alpha \sigma^{\prime 2}\big)\big)\geq\log\bigg(\frac{(n\epsilon)^{1/3}(LD_Fd)^{4/3}}{3C_{11}G^{8/3}\log^{2/3}(1/\delta)}\bigg).
\end{align*}
As a result, the requirement reduces to 
\begin{align*}
    0.4\log\bigg(\frac{(n\epsilon)^{1/3}(LD_Fd)^{4/3}}{3C_{11}G^{8/3}\log^{2/3}(1/\delta)}\bigg)\geq \frac{\log(1/\delta)}{(1-\beta)\epsilon},
\end{align*}
which can be satisfied if we have 
\begin{align*}
    n\geq C_{12}\frac{G^{8}\log^{2}(1/\delta)}{(LD_Fd)^{4}\epsilon},
\end{align*}
where $C_{11},C_{12}$ are some large constants.

\textbf{Gradient Complexity.}
Since we have  $b=b_0=G^3/(\zeta LD_F)$, the total gradient complexity is 
\begin{align*}
    2(T-1)b+b_0\leq \frac{8LD_Fn_0}{\zeta^2}\cdot \frac{G^3}{LD_F\zeta}+\frac{G^3}{LD_F\zeta}.
\end{align*}
According to the definition of $\zeta$ and $n_0$, the total gradient complexity is $O\big(n^2\epsilon^2/(d\log(1/\delta))\big)$.



% According to the proof in the centralized setting, i.e., equations \eqref{eq:senstivity_gamma_t_1} and \eqref{eq:senstivity_gamma_t}, we have 
% \begin{align*}
%     \frac{1}{m\tilde n}\|\nabla f_i^j(\btheta^{0})-f_{i^\prime}^j(\btheta^{0})\|_2\leq\frac{G}{m\tilde n} \quad\text{and}\quad\frac{1}{m \bar b}\|\nabla f_i(\btheta^t)-\nabla f_i(\btheta^{t-1})\|_2\leq\frac{\zeta}{m \bar b}.
% \end{align*}
% Therefore, following the same proof as in the centralized setting, we only need to replace the parameter $n$ with $m\tilde n$ is the distributed setting to get our privacy guarantee.


\section{Proof of Lemma 3.7}

Without loss of generality, we assume $\Delta(q)=1$. According to Theorem 9 in Wang et al. [2019b], we have 
\begin{align}\label{eq:rho1}
    \rho^\prime(\alpha)\leq \frac{1}{\alpha-1}\log\bigg(1+\tau^2{\alpha \choose 2}\min\Big\{4(e^{\rho(2)}-1),2e^{\rho(2)}\Big\}+\sum_{j=3}^\alpha \tau^j{\alpha \choose j}2e^{(j-1)\rho(j)}\bigg),
\end{align}
where $\tau$ is the subsample rate, $\rho(j)=j/(2\sigma^2)$. Next, we will show that the summation term in the right hand side of the above inequality is dominated by the second term under certain conditions. First of all, when $\sigma^2$ is large, i.e., $\sigma^2\geq 0.7$, we have 
\begin{align*}
    \min\Big\{4(e^{\rho(2)}-1),2e^{\rho(2)}\Big\}\leq 6/\sigma^2,
\end{align*}
which implies that
\begin{align*}
    \tau^2{\alpha \choose 2}\min\Big\{4(e^{\rho(2)}-1),2e^{\rho(2)}\Big\}\leq \tau^2{\alpha \choose 2} 6/\sigma^2.
\end{align*}
Next, we consider the summation term in \eqref{eq:rho1}, and we have 
\begin{align*}
    \sum_{j=3}^\alpha \tau^j{\alpha \choose j}2e^{(j-1)\rho(j)}&\leq\tau^2{\alpha \choose 2}\bigg(\sum_{j=3}^\alpha\tau^{j-2}\alpha^{j-2}e^{\frac{(\alpha-1) j}{2\sigma^2}}\bigg)\\
    &\leq\tau^2{\alpha \choose 2}\frac{\tau\alpha e^{\frac{3(\alpha-1)}{2\sigma^2}}}{1-\tau\alpha e^{\frac{\alpha-1}{2\sigma^2}}},
\end{align*}
where the first inequality is due to the fact that
\begin{align*}
    e^{(j-1)\rho(j)}=e^{\frac{(j-1)j}{2\sigma^2}}\leq e^{\frac{(\alpha-1)j}{2\sigma^2}}\quad\text{and}\quad
% \end{align*}
%  and 
%  \begin{align*}
     {\alpha \choose j }=\frac{\alpha!}{j!(\alpha-j)!}\leq \frac{\alpha^2\alpha^{j-2}}{3!}.
 \end{align*}
 In addition, the last inequality comes from the condition that $\tau\alpha\exp\big((\alpha-1)/(2\sigma^2)\big)<1$ and the sum of the geometric sequence.
Therefore, as long as 
\begin{align}\label{eq:alpha_ineq}
    \alpha-1\leq\frac{2}{3}\sigma^2\log\frac{1}{\tau\alpha(1+\sigma^2)},
\end{align}
we have 
\begin{align*}
    \sum_{j=3}^\alpha \tau^j{\alpha \choose j}2e^{(j-1)\rho(j)}\leq \tau^2{\alpha \choose 2}\frac{1}{\sigma^2}.
\end{align*}
% A sufficient conditions to ensure the \eqref{eq:alpha_ineq} holds is that 
% \begin{align*}
%     \alpha \leq \log\frac{1}{\tau(1+\sigma^2)}.
% \end{align*}
In addition, we require that $\tau\alpha\exp\big((\alpha-1)/(2\sigma^2)\big)<1$. By plugging the condition of $\alpha$ into the above requirement, we can obtain that this condition can hold if $\tau<1$.

As a result, under the conditions that $\sigma^2\geq 0.7$ and $\alpha-1\leq \frac{2}{3}\sigma^2\log\big(1/\big(\tau\alpha(1+\sigma^2)\big)\big)$ (i.e., \eqref{eq:alpha_ineq}), we can obtain that
\begin{align*}
    \rho^\prime(\alpha)\leq \frac{1}{\alpha-1}\log\bigg(1+\tau^2{\alpha \choose 2}\frac{7}{\sigma^2}\bigg)\leq \frac{1}{\alpha-1}\tau^2{\alpha \choose 2}\frac{7}{\sigma^2}\leq 3.5\alpha\tau^2/\sigma^2.
\end{align*}

\section{Auxiliary Lemmas}
\begin{lemma}[Lei et al., 2017]\label{lemma:scsg}
Consider vectors $\ab_i$ satisfying $\sum_{i=1}^n\ab_i=0$. Let $\cB$ be a uniform random subset of $\{1,2,\ldots,n\}$ with size $m$. Then we have
\begin{align*}
    \EE\bigg\|\frac{1}{m}\sum_{i\in\cB}\ab_i\bigg\|_2^2\leq\frac{\ind\{|\cB|< n \}}{mn}\sum_{i=1}^n\|\ab_i\|_2^2.
\end{align*}
\end{lemma}
\end{document}
