\documentclass{article} %for full version

\usepackage[margin=1in]{geometry}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
%\usepackage{microtype}      % microtypography
\usepackage{bm}
% \usepackage{fullpage}
\usepackage{amsthm,amsmath,amsfonts,amssymb}
\usepackage{mathtools}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{subcaption}
% \usepackage{subfigure}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{hyperref}
\usepackage{mathrsfs, fancyhdr}

% \usepackage[shortlabels]{enumitem}
\usepackage{makecell}
\usepackage{rotating}
%\ \usepackage[switch]{lineno}


\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    % \renewcommand{\bibsection}{\subsubsection*{References}}
% \renewcommand{\bibname}{References}
% \renewcommand{\bibsection}{\subsubsection*{\bibname}}

% set preamble
\setlength{\parindent}{0pt}

\usepackage{xr}

% In your preamble

% \addFileDependency{<file>}: announce <file> (name including extension) as a
% dependency of this document.
%   - writes "(<file>)" to the terminal/log via \typeout
%     (presumably so that log-scanning build tools such as latexmk notice the
%     file -- TODO confirm against the build setup);
%   - passes <file> to the kernel's \@addtofilelist (the list used by \listfiles);
%   - writes "No file <file>." to the log if the file cannot be found.
% Every body line ends in % so that the macro contributes no stray spaces if it
% is ever called outside the preamble.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base>}: make the labels of the external document <base>
% (base name, no extension) available through \externaldocument (xr package,
% loaded above), and register <base>.tex and <base>.aux as file dependencies
% via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% In your preamble

\myexternaldocument{yang_122}

 
%\theoremstyle{theorem}
% \theoremstyle{definition}
% Theorem-like environments. None of them shares a counter (no optional
% [counter] argument is given), so each environment is numbered independently.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{exercise}{Exercise}
\newtheorem{claim}{Claim}
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}
\newtheorem{property}{Property}
\newtheorem{remark}{Remark}
% NOTE(review): this overrides the standard \S (section sign) so that \S
% typesets a plain letter S; \def performs no redefinition check.
\def\S{S}


\usepackage{yang-macros}
% Coloured inline annotations for co-author comments. Each one-argument macro
% prints "<Name>: [<text>]" in the given colour, with the text in sans serif and
% the brackets in bold. \zhenhuan instead colours its following argument orange.
% The obsolete two-letter font switch \rm is spelled out as \normalfont\rmfamily,
% which is what the article class expands it to in text mode.
\newcommand{\yiming}[1]{{\color{blue}{\noindent{Yiming: \bfseries [}{ \sffamily #1}{\normalfont\rmfamily\bfseries ]~}}}}
\newcommand{\yunwen}[1]{{\color{blue}{\noindent{Yunwen: \bfseries [}{ \sffamily #1}{\normalfont\rmfamily\bfseries ]~}}}}
\newcommand{\shu}[1]{{\color{red}{\noindent{shu: \bfseries [}{ \sffamily #1}{\normalfont\rmfamily\bfseries ]~}}}}
\newcommand{\zhenhuan}{\textcolor{orange}}
\newcommand{\kush}[1]{{\color{orange}{\noindent{Kush: \bfseries [}{ \sffamily #1}{\normalfont\rmfamily\bfseries ]~}}}}

% % \usepackage{aistats2022}
% \numberwithin{equation}{section}
% \numberwithin{theorem}{section}
% \numberwithin{figure}{section}
% \numberwithin{table}{section}
% \renewcommand{\thesection}{{\Alph{section}}}
% \renewcommand{\thesubsection}{\Alph{section}.\arabic{subsection}}
% \renewcommand{\thesubsubsection}{\Roman{section}.\arabic{subsection}.\arabic{subsubsection}}

% \renewcommand{\zhenhuan}{\textcolor{orange}}



% \input{example}

% \input{proof-miscellany}

% \input{proof-dp-sgda-convex}

% \input{proof-dp-sgda-nonconvex}

% \input{additional-experiments}

% \end{document}

\begin{document}


\onecolumn
\appendix


\begin{center}
\textbf{\Large Appendix for ``Differentially Private SGDA for Minimax Problems''}
\end{center}


\section{Motivating Examples}\label{sec:motivating-example}
We provide several examples that can be formulated as a stochastic minimax problem. All these examples have corresponding empirical minimax formulations. 

\textbf{AUC Maximization.} Area Under the ROC Curve (AUC) is a widely used measure for binary classification. Optimizing AUC with square loss can be formulated as
\begin{align*}
\min_{\theta \in \Theta} \Ebb_{\zbf,\zbf'}[(1 - h(\theta;\xbf) + h(\theta;\xbf'))^2|y=1, y'=-1]
\end{align*}
where $h: \Theta\times \Rbb^d\rightarrow\Rbb$ is the scoring function for the classifier. It has been shown this problem is equivalent to a minimax problem once auxiliary variables $a, b, \vbf \in \Rbb$ are introduced \citep{ying2016stochastic-supp}.
\begin{align*}
\min_{\theta, a, b}\max_{\vbf} F(\theta,a,b,\vbf) = \Ebb_\zbf[f(\theta, a, b, \vbf;\zbf)]
\end{align*}
where $f = (1-p)(h(\theta;\xbf) - a)^2\Ibb[y=1] + p(h(\theta;\xbf) - b)^2\Ibb[y=-1] + 2(1+\vbf)(ph(\theta;\xbf)\Ibb[y=-1] - (1 - p)h(\theta;\xbf)\Ibb[y=1]) - p(1-p)\vbf^2$ and $p = \Pbb[y=1]$. Such a problem is (non)convex-concave. In particular, \citet{liu2019stochastic-supp} showed that when $h$ is a one-hidden-layer neural network the objective $f$ satisfies the \PL condition. Differential privacy has been applied to learn a private classifier by optimizing AUC \citep{wang2021differentially-supp}. The proposed privacy mechanisms there are objective perturbation and output perturbation. 
 
\textbf{Generative Adversarial Networks (GANs).} GAN is introduced in \citet{goodfellow2014generative-supp} which can be regarded as a game between a generator network $G_\vbf$ and a discriminator network $D_\wbf$. The generator network produces synthetic data from random noise $\xi$, while the
discriminator network discriminates between the true data and the synthetic data.
In particular, a popular variant of GAN named as WGAN \citep{arjovsky2017wasserstein-supp} can be written as a minimax problem
\begin{align*}
\min_{\wbf}\max_{\vbf} \mathbb{E}[f(\wbf,\vbf;\zbf,\xi)] := \mathbb{E}_{\zbf} [D_\wbf(\zbf)] - \mathbb{E}_{\xi} [D_\wbf(G_\vbf(\xi))].
\end{align*}
Recently \citet{sahiner2021hidden-supp}  showed that WGAN with a two-layer discriminator and generator can be expressed as a convex-concave problem. A heuristic differentially private version of RMSProp was employed to train GANs by \citet{xie2018differentially-supp}. Recently differential privacy has been successfully applied to private synthetic data generation by the GAN framework \citep{jordon2018pate-supp, beaulieu2019privacy-supp}.


\textbf{Markov Decision Process (MDP).}  Let $\Acal$ be a finite action space. For any $a \in \Acal$, $P(a) \in [0, 1]^{n\times n}$ is the state-transition probability matrix and $\rbf(a) \in [0,1]^n$ is the vector of
expected state-transition rewards. In the infinite-horizon average-reward Markov decision problem, one aims to find a stationary policy $\pi$ to make an infinite sequence of actions and optimize the average-per-time-step reward $\bar{v}$. By the classical theory of dynamic programming \citep{puterman2014markov-supp}, finding an optimal policy is equivalent to solving the fixed-point Bellman equation
\begin{align*}
\bar{v}^* + h^*_i = \max_{a\in \Acal} \big\{ \sum_{j=1}^n (p_{ij}(a)h^*_j + p_{ij}(a)r_{ij}(a))\big\}, \quad \forall i
\end{align*}
where $\hbf \in \Rbb^n$ is the difference-of-value vector. \citet{wang2017primal-supp} showed that this problem is equivalent to the minimax problem as follows
\begin{align*}
\min_{\hbf \in \Hcal}\max_{\mu \in \Ucal} \mu^\top((P(a) - I)\hbf + \rbf(a))
\end{align*}
where $\Hcal$ and $\Ucal$ are the feasible regions chosen according to the mixing time and stationary distribution. We refer to \citet{zhang2021generalization-supp} for a discussion on the measure of population risk.

%\textbf{Adversarial Machine Learning.}  Minimax problem \citep{eq:SPP}.} In order to train models that are robust to adversarial attacks, \citep{madry2017towards} proposed to incorporate the adversarial perturbation in the training. Let $\Dcal$ be the set of allowed perturbations that formalizes the manipulative power of the adversary. For a given loss $L$, the minimax problem is formulated as\begin{align*} \min_{\wbf\in \Wcal} \Ebb_\zbf[\max_{\delta \in \Dcal} L(\wbf; \xbf+\delta, y)]\end{align*}

\textbf{Robust Optimization and Fairness.} Let $\Dcal_1, \cdots, \Dcal_m$ be $m$ different distributions on some support. The aim is to minimize the worst population risks $L$ parameterized by some  $\wbf$ among multiple scenarios: 
\begin{align*}
\min_{\wbf \in \Wcal} L(\wbf) = \max_{1 \leq i \leq m} \big\{\Ebb_{\zbf_1 \sim \Dcal_1}[\ell(\wbf; \zbf_1)], \cdots, \Ebb_{\zbf_m \sim \Dcal_m}[\ell(\wbf; \zbf_m)]\big\}    
\end{align*}
This problem can be reformulated as a zero-sum game between two players $\wbf$ and $\vbf$ as follows
\begin{align*}
\min_{\wbf \in \Wcal}\max_{\vbf\in \Delta_m} \sum_{i=1}^m v_i \Ebb_{\zbf_i \sim \Dcal_i}[\ell(\wbf; \zbf_i)] = \Ebb\Big[\sum_{i=1}^m v_i \ell(\wbf; \zbf_i)\Big]    
\end{align*}
where $\Delta_m = \bigl\{\vbf \in\Rbb^m:  v_i\geq 0, \sum_{i=1}^m v_i=1\bigr\}$ denotes the $m$-dimensional simplex. Such robust optimization formulation has been recently proposed to address fairness among subgroups \citep{mohri2019agnostic-supp} and federated learning on heterogeneous populations \citep{li2019fair-supp}. 


\section{Proofs of Theorem \ref{thm:moments-accountant-privacy} and Remark \ref{rem:choice-of-param}}\label{sec:proof-privacy}

% \begin{lemma}[\citep{liang2020exploring}]\label{lem:subsampling-numeric}
% Consider a mechanism $\Mcal: \Zcal^n \rightarrow \Wcal$ and a dataset $S$ drawn from $\Zcal^n$. The Gaussian mechanism $\Mcal(S) + \Ncal(0,\sigma^2 I)$ applied to a subset of samples that are drawn uniformly without replacement with subsampling rate $p$ satisfies $(\alpha, 3.5 p^2\alpha \Delta^2/\sigma^2)$-RDP given $\sigma^2 \geq 0.67 \Delta^2$ and $\alpha -1 \leq 2\sigma^2/(3\Delta^2)\log(1/(1+\sigma^2/\Delta^2))$ where $\Delta$ is the $\ell_2$-sensitivity of $\Mcal$.	
% \end{lemma}

% \begin{theorem}\label{thm:sgda-privacy}
% Suppose the function $f$ is $G$-Lipschitz continuous. Then Algorithm \ref{alg:dp-sgda} satisfies $(\epsilon, \delta)$-DP if there exists $\beta \in (0,1)$ such that $\frac{\sigma^2}{2G^2} \geq 0.67$ and $\alpha -1 \leq \frac{\sigma^2}{3G^2}\log(\frac{n}{\alpha(1 + \frac{\sigma^2}{2G^2})})$ hold with $\alpha = \frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1$.
% \end{theorem}

% \begin{proof}[Proof of Theorem \ref{thm:sgda-privacy}]
% Consider the mechanism $\Mcal_t = (\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t}), \nabla_\vbf f(\wbf_t, \vbf_t; \zbf_{i_t}))^\top$. By the Lipschitz continuity we know the $\ell_2$-sensitivity of $\Mcal_t$ is $G$. Now	 let
% \[
% \sigma^2 = \frac{7G^2T}{\beta n^2\epsilon}\Big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1\Big)
% \]
% Lemma \ref{lem:subsampling-numeric} with $p = \frac{1}{n}$ implies that $\Mcal_t$ satisfies $\Big(\alpha, \frac{\alpha \beta \epsilon}{T\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} +1\big)}\Big)$-RDP if the following conditions hold
% \[
% \frac{\sigma^2}{2G^2} \geq 0.67
% \]
% and
% \[
% \alpha -1 \leq \frac{\sigma^2}{3G^2}\log(\frac{n}{\alpha(1 + \frac{\sigma^2}{2G^2})}).
% \]
% Let $\alpha = \frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1$. We obtain that $\Mcal_t$ satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \frac{\beta\epsilon}{T}\big)$-RDP. Then by the post-processing property of RDP, we know $(\wbf_{t+1}, \vbf_{t+1})$ also satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \frac{\beta\epsilon}{T}\big)$-RDP for any $t=1, \cdots, T$. Furthermore, according to the adaptive composition theorem of RDP, Algorithm \ref{alg:dp-sgda} satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \beta\epsilon\big)$-RDP. Finally, by RDP to DP conversion the output of Algorithm \ref{alg:dp-sgda} satisfies $(\epsilon,\delta)$-DP. The proof is complete.

% \end{proof}

% \begin{theorem}\label{thm:agda-privacy}
% Suppose the function $f$ is $G$-Lipschitz continuous. Then Algorithm \ref{alg:dp-sgda} satisfies $(\epsilon, \delta)$-DP if there exists $\beta \in (0,1)$ such that $\frac{\sigma^2}{2G^2} \geq 0.67$ and $\alpha -1 \leq \frac{\sigma^2}{3G^2}\log(\frac{n}{\alpha(1 + \frac{\sigma^2}{2G^2})})$ hold with $\alpha = \frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1$.
% \end{theorem}

% \begin{proof}[Proof of Theorem \ref{thm:agda-privacy}]
% Consider the mechanism $\Mcal^\wbf_t = \nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t})$. By the Lipschitz continuity we know the $\ell_2$-sensitivity of $\Mcal^\wbf_t$ is $G$. Now	 let
% \[
% \sigma^2 = \frac{7G^2T}{\beta n^2\epsilon}\Big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1\Big)
% \]
% Lemma \ref{lem:subsampling-numeric} with $p = \frac{1}{n}$ implies that $\Mcal^\wbf_t$ satisfies $\Big(\alpha, \frac{\alpha \beta \epsilon}{T\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} +1\big)}\Big)$-RDP if the following conditions hold
% \[
% \frac{\sigma^2}{2G^2} \geq 0.67
% \]
% and
% \[
% \alpha -1 \leq \frac{\sigma^2}{3G^2}\log(\frac{n}{\alpha(1 + \frac{\sigma^2}{2G^2})}).
% \]
% Let $\alpha = \frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1$. We obtain that $\Mcal^\wbf_t$ satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \frac{\beta\epsilon}{T}\big)$-RDP. Then by the post-processing property of RDP, we know $\wbf_{t+1}$ also satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \frac{\beta\epsilon}{T}\big)$-RDP. 

% Similarly, by the tracking the privacy loss of $\Mcal^\vbf_t = \nabla_\wbf f(\wbf_{t+1}, \vbf_t; \zbf_{j_t})$, we can show that $\vbf_{t+1}$ also satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \frac{\beta\epsilon}{T}\big)$-RDP. Furthermore, according to the adaptive composition theorem of RDP, Algorithm \ref{alg:dp-sgda} satisfies $\big(\frac{\log(1/\delta)}{(1-\beta)\epsilon} + 1, \beta\epsilon\big)$-RDP. Finally, by RDP to DP conversion the output of Algorithm \ref{alg:dp-sgda} satisfies $(\epsilon,\delta)$-DP. The proof is complete.

% \end{proof}


In this section, we prove the privacy guarantee of DP-SGDA based on the privacy amplification by subsampling result, which is a direct application of Theorem 1 in \citet{abadi2016deep-supp}. First we introduce some necessary definitions.


\begin{definition}\label{def:sensitivity}
Given a function $g: \Zcal^n \rightarrow \Rbb^d$, we say $g$ has $\Delta(g)$ $\ell_2$-sensitivity if for any neighboring datasets $S, S'$ we have
\begin{align*}
\|g(S) - g(S')\|_2 \leq \Delta(g).    
\end{align*}
\end{definition}

\begin{definition}[\citep{abadi2016deep-supp}]\label{def:moments-accountant}
For a (randomized) algorithm $A$ and neighboring datasets $S, S'$, the $\lambda$-th moment is given as 
\[
\alpha_A(\lambda, S, S') = \log\Ebb_{O\sim A(S)}\Big[\Big(\frac{\Pbb[A(S) = O]}{\Pbb[A(S') = O]}\Big)^\lambda\Big].
\]
The moments accountant is then defined as 
\[
\alpha_A(\lambda) = \sup_{S, S'}\alpha_A(\lambda, S, S').
\]
\end{definition}

\begin{lemma}[\citep{abadi2016deep-supp}]
Consider a sequence of mechanisms $\{A_t\}_{t\in[T]}$ and the composite mechanism $A = (A_1, \cdots, A_T)$. 
\begin{enumerate}
\item[a)] [Composability]\label{lem:moments-accountant-composition} For any $\lambda$,
\[
\alpha_A(\lambda) \leq \sum_{t=1}^T \alpha_{A_t}(\lambda).
\]
\item[b)][Tail bound]\label{lem:moments-accountant-tail} For any $\epsilon$, the mechanism $A$ is $(\epsilon, \delta)$ differentially private for 
\[
\delta = \min_\lambda \exp\bigl(\alpha_A(\lambda) - \lambda \epsilon\bigr).
\]
\end{enumerate}
\end{lemma}

\begin{lemma}[\citep{abadi2016deep-supp}]\label{lem:moments-accountant-privacy}
Consider a sequence of mechanisms $A_t = g_t(S_t) + \xi_t$ where $\xi_t \sim \Ncal(0, \sigma^2I)$. Here each function $g_t: \Zcal^m \rightarrow \Rbb^d$ has $\ell_2$-sensitivity of $1$, and each $S_t$ is a subsample of size $m$ obtained by uniform sampling without replacement\footnote{In our case we use uniform sampling on each iteration to construct $I_t$ and therefore $S_t$, as opposed to the Poisson sampling in \citet{abadi2016deep-supp}. However, one can verify that similar moment estimates lead to our stated result \citep{wang2019subsampled-supp}.} from $S$, i.e., $S_t \sim (Unif(S))^m$. Then, for each $t \in [T]$,
\begin{equation*}
\alpha_{A_t}(\lambda) \leq  \frac{m^2n\lambda(\lambda + 1)}{n^2(n-m)\sigma^2} + \Ocal\Big(\frac{m^3\lambda^3}{n^3\sigma^3}\Big).
\end{equation*}
\end{lemma}

\begin{theorem}[Theorem \ref{thm:moments-accountant-privacy} restated]
There exist constants $c_1, c_2$ and $c_3$ so that for any $\epsilon < c_1 T/n^2$, Algorithm \ref{alg:dp-sgda} is $(\epsilon, \delta)$-differentially private for any $\delta > 0$ if we choose
\begin{equation*}
\sigma_\wbf \geq \frac{c_2 G_\wbf \sqrt{T\log(1/\delta)}}{n\epsilon} \text{ and } \sigma_\vbf \geq \frac{c_3 G_\vbf \sqrt{T\log(1/\delta)}}{n\epsilon}.
\end{equation*}
\end{theorem}

\begin{proof}
Let $S = \{\zbf_1, \cdots, \zbf_n\}$ and $S' = \{\zbf'_1, \cdots, \zbf'_n\}$ be two neighboring datasets. At iteration $t$, we first focus on $A_t^\wbf =  \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \xi_t$. Since $f(\cdot, \vbf; \zbf)$ is $G_\wbf$-Lipschitz continuous, it implies for any neighboring datasets $S, S'$, 
\begin{align*}
\Big\| \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \frac{1}{m}\sum_{j=1}^m \nabla_\wbf f(\wbf_t, \vbf_t; \zbf'_{i_t^j})\Big\|_2  \leq \frac{2G_\wbf}{m}.  
\end{align*}
Therefore we can define $g_t(S_t) = \frac{1}{2G_\wbf}\sum_{j=1}^m \nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j})$ such that $\Delta(g_t) = 1$. By Lemma \ref{lem:moments-accountant-composition} a) and Lemma \ref{lem:moments-accountant-privacy}, the log moment of the composite mechanism $A^\wbf = (A_1^\wbf, \cdots, A_T^\wbf)$ can be bounded as follows
\begin{align*}
\alpha_{A^\wbf}(\lambda) \leq \frac{m^2T\lambda^2}{n^2\tilde{\sigma}_\wbf^2}. 
\end{align*}
where $\tilde{\sigma}_\wbf = \sigma_\wbf / (2G_\wbf)$. Similarly, since $A_t^\vbf = \frac{1}{m}\sum_{j=1}^m\nabla_\vbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \zeta_t$ has $\ell_2$-sensitivity $2G_\vbf/m$, with $\tilde{\sigma}_\vbf = \sigma_\vbf / (2G_\vbf)$ the log moment of the final output $A = (A_1^\wbf, A_1^\vbf, \cdots, A_T^\wbf, A_T^\vbf)$ can be bounded as follows
\begin{align*}
\alpha_{A}(\lambda) \leq \alpha_{A^\vbf}(\lambda) + \alpha_{A^\wbf}(\lambda) \leq \frac{m^2T\lambda^2}{n^2\tilde{\sigma}_\wbf^2} + \frac{m^2T\lambda^2}{n^2\tilde{\sigma}_\vbf^2}. 
\end{align*} 
By Lemma \ref{lem:moments-accountant-tail} b), to guarantee $A$ to be $(\epsilon, \delta)$-differentially private, it suffices that
\begin{align*}
\frac{\lambda^2 m^2T}{n^2\tilde{\sigma}_\wbf^2} \leq \frac{\lambda \epsilon}{4}, \frac{\lambda^2 m^2T}{n^2\tilde{\sigma}_\vbf^2} \leq \frac{\lambda \epsilon}{4},
\exp(-\frac{\lambda \epsilon}{4}) \leq \delta, \lambda \leq  \tilde{\sigma}_\wbf^2\log(\frac{n}{m\tilde{\sigma}_\wbf}) \text{ and } \lambda \leq  \tilde{\sigma}_\vbf^2\log(\frac{n}{m\tilde{\sigma}_\vbf})  
\end{align*}
It is now easy to verify that when $\epsilon = c_1m^2T/n^2$, we can satisfy all these conditions by setting
\begin{equation*}
\tilde{\sigma}_\wbf \geq \frac{c_2  \sqrt{T\log(1/\delta)}}{n\epsilon} \text{ and } \tilde{\sigma}_\vbf \geq \frac{c_3  \sqrt{T\log(1/\delta)}}{n\epsilon}
\end{equation*}
for some explicit constants $c_1, c_2$ and $c_3$. The proof is complete.
\end{proof}


% \begin{proof}%\yunwen{$S_t$ is not defined}
% Let $S = \{\zbf_1, \cdots, \zbf_n\}$ and $S' = \{\zbf'_1, \cdots, \zbf'_n\}$ be two neighboring datasets. At iteration $t$, we first focus on $A_t^\wbf =  \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \xi_t$. Since $f(\cdot, \vbf; \zbf)$ is $G_\wbf$-Lipschitz continuous, it implies for any neighboring datasets $S, S'$, 
% \begin{align*}
% \Big\| \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \frac{1}{m}\sum_{j=1}^m \nabla_\wbf f(\wbf_t, \vbf_t; \zbf'_{i_t^j})\Big\|_2  \leq \frac{2G_\wbf}{m}.  
% \end{align*}
% Therefore we can define $g_t(S_t) = \frac{1}{2G_\wbf}\sum_{j=1}^m \nabla_\wbf f(\wbf_t, \vbf_t, \zbf_{i_t^j})$ such that $\Delta(g_t) = 1$. By Lemma  \ref{lem:moments-accountant-privacy}, there exist constants $c_1$ and $c_2$ so that for any $\epsilon/2 < c_1 m^2 T/n^2$, the composite mechanism $A^\wbf = (A_1^\wbf, \cdots, A_T^\wbf)$ is $(\frac{\epsilon}{2}, \delta)$-DP for any $\delta > 0$ if we choose 
% \begin{equation*}
% \frac{\sigma_\wbf}{2G_\wbf} \geq \frac{c_2  \sqrt{T\log(1/\delta)}}{n\epsilon}.
% \end{equation*}
% Similarly, since $A_t^\vbf = \nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t}) + \zeta_t$ has $\ell_2$-sensitivity $2G_\vbf/m$, then the choice of $\sigma_\vbf$ ensures $A^\vbf = (A_1^\vbf, \cdots, A_T^\vbf)$ is $(\frac{\epsilon}{2}, \delta)$-DP. According to Lemma \ref{lem:moments-accountant-composition} and \ref{lem:moments-accountant-tail} \yunwen{how it is from these two lemmas}, the final output $A = (A_1^\wbf, A_1^\vbf, \cdots, A_T^\wbf, A_T^\vbf)$ satisfies $(\epsilon,\delta)$-DP. The proof is complete.
% \end{proof}

\begin{proof}[Proof of Remark \ref{rem:choice-of-param}]
% We focus on the case of Lemma \ref{lem:moments-accountant-privacy} as it can easily extend to our Theorem. 
Without loss of generality, we consider only one $\sigma$ in the proof of Theorem \ref{thm:moments-accountant-privacy}. Then algorithm $A$ is guaranteed to be $(\epsilon, \delta)$-DP if one can find $\lambda > 0$ such that
\begin{align*}
\frac{\lambda^2 m^2T}{n^2\sigma^2} \leq \frac{\lambda \epsilon}{2},\,
\exp(-\frac{\lambda \epsilon}{2}) \leq \delta, \text{ and } \lambda \leq  \sigma^2\log(\frac{n}{m\sigma})
\end{align*}
Given $\delta = \frac{1}{n^2}$, the second inequality can be reformulated as $\lambda \geq \frac{4\log(n)}{\epsilon}$. Therefore by choosing $\sigma^2 = \frac{8m^2 T \log(n)}{n^2\epsilon^2}$, the first inequality becomes $\lambda \leq \frac{4\log(n)}{\epsilon}$, indicating $\lambda = \frac{4\log(n)}{\epsilon}$. It suffices to show such choice of $\lambda$ satisfies the third inequality, which is straightforward by the choice of $m$ and $\epsilon\leq 1$. The proof is complete.
\end{proof}

% The proof of Theorem \ref{thm:moments-accountant-privacy} is based on the concept of \Renyi differential privacy and its property, which are given in the following definition and lemmas.

% \begin{definition}[\citep{mironov2017renyi}]
% For $\alpha > 1, \rho>0$, a randomized algorithm $A$ satisfies $(\alpha, \rho)$-\Renyi differential privacy (RDP), if for all neighboring datasets $S, S'$ we have
% \begin{align*}
% D_\alpha(A(S)||A(S')):= \frac{1}{\alpha-1}\log \Ebb\Big(\frac{A(S)}{A(S')}\Big)^\alpha  \leq \rho.
% \end{align*}
% \end{definition}

% \begin{lemma}[\citep{mironov2017renyi}]
%  If a randomized algorithm $A$ satisfies $(\alpha,\rho)$-RDP, then $A$ satisfies $(\rho+\log(1/\delta)/(\alpha-1), \delta)$-DP for all $\delta\in (0,1)$.
% \end{lemma}

% \begin{lemma}[\citep{mironov2017renyi}]
%  If the randomized algorithm $A_i$ satisfies $(\alpha,\rho_i)$-RDP for each $i \in [k]$, then the composition $A(S) = (A_1(S), \cdots, A_k(S))$ satisfies $(\alpha,\sum_{i=1}^k\rho_i)$-RDP.
% \end{lemma}

% \begin{lemma}[\citep{wang2019efficient}]\label{lem:uniform-subsampling-amplification}
% Suppose the function $g: \Zcal^m \rightarrow \Rbb^d$ has $\Delta(g)$ $\ell_2$-sensitivity. Let $S_m$ be a subsample obtained by uniform sampling without replacement from $S$, i.e. $\sim (Unif(S))^m$, then $A = g(S_m) + \xi$ where $\xi \sim \Ncal(0, \sigma^2I)$ satisfies $(\alpha, 3.5m^2\Delta^2(g)\alpha/ (n^2\sigma^2))$ where $\sigma^2/\Delta^2(g) \geq 0.7$ and $\alpha \leq 2\sigma^2\log(n/m\alpha(1+\sigma^2/\Delta^2(g)))/3+1$.
% \end{lemma}

% \begin{theorem}[Theorem \ref{thm:moments-accountant-privacy} restated]
% Suppose Assumption \ref{ass:lipschitz} holds. There exist constants $\beta_1$ so that $\sigma_\wbf^2/4G_\wbf^2 \geq 0.7$, $\alpha_1-1\leq \sigma_\wbf^2/4G_\wbf^2 \log(\frac{n}{\alpha_1(1+\sigma_\wbf^2/4G_\wbf^2)})$ where $\alpha_1=\frac{\log(1/\delta)}{(1-\beta_1)\epsilon} + 1$  and $\beta_2$ so that $\sigma_\vbf^2/4G_\vbf^2 \geq 0.7$, $\alpha_2-1\leq \sigma_\vbf^2/4G_\vbf^2 \log(\frac{n}{\alpha_2(1+\sigma_\vbf^2/4G_\vbf^2)})$ where $\alpha_2=\frac{\log(1/\delta)}{(1-\beta_2)\epsilon} + 1$, the Algorithm \ref{alg:dp-sgda} is $(\epsilon, \delta)$-differentially private if we choose
% \begin{align*}
% \sigma_\wbf^2 = \frac{14G_\wbf^2T}{\beta_1 n^2\epsilon}(\frac{\log(1/\delta)}{(1-\beta_1)\epsilon}+1) \text{ and } \sigma_\vbf^2 = \frac{14G_\vbf^2T}{\beta_2 n^2\epsilon}(\frac{\log(1/\delta)}{(1-\beta_2)\epsilon}+1),
% \end{align*}
% \end{theorem}

% \begin{proof}[Proof of Theorem \ref{thm:moments-accountant-privacy}]
% At iteration $t$, we first focus on $A^t_\wbf = \nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t}) + \xi_t$. Since $f(\cdot, \vbf; \zbf)$ is $G_\wbf$-Lipschitz continuous, it implies for any neighboring datasets $S, S'$, 
% \begin{align*}
% \|\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t}) -\nabla_\wbf f(\wbf_t, \vbf_t; \zbf'_{i_t})\|_2  \leq 2G_\wbf.  
% \end{align*}
% Now let 
% \begin{align*}
% \sigma_\wbf^2 = \frac{14G_\wbf^2T}{\beta n^2\epsilon}(\frac{\log(1/\delta)}{(1-\beta)\epsilon}+1)    
% \end{align*}
% Lemma \ref{lem:uniform-subsampling-amplification} with $m=1$ implies $A_\wbf^t$ satisfies $(\alpha_1, \frac{\alpha_1\beta_1\epsilon}{T(\frac{\log(1/\delta)}{(1-\beta_1)\epsilon}+1)})$-RDP, if the following condition holds
% \begin{align*}
% \sigma_\wbf^2/4G_\wbf^2 \geq 0.7 \text{ and }\alpha_1-1\leq \sigma_\wbf^2/4G_\wbf^2 \log(\frac{n}{\alpha_1(1+\sigma_\wbf^2/4G_\wbf^2)}).
% \end{align*}
% Let $\alpha_1=\frac{\log(1/\delta)}{(1-\beta_1)\epsilon} + 1$. We know $A_\wbf^t$ satisfies $(\frac{\log(1/\delta)}{(1-\beta_1)\epsilon} + 1, \frac{\beta_1\epsilon }{T})$-RDP

% \end{proof}


\section{Proofs for the convex-concave setting in Section \ref{sec:convex}}\label{sec:proof-convex}

Recall from the error decomposition \eqref{eq:weak-err-decomp} given in Section \ref{sec:convex} that the weak PD risk can be decomposed as follows: 
\begin{align*}\label{eq:weak-err-decomp-1}
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) = \triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) + \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T), \numberthis
\end{align*}
where the term $\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T)$ is the generalization error and the term $\triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T)$ is the optimization error.

The proof of Theorem \ref{thm:sgda-utility} involves the estimation of the optimization error and the generalization error, which are carried out in the following two subsections, respectively. 

\subsection{Estimation of Optimization Error}\label{sec:cc-opt}

We start by studying the optimization error for Algorithm \ref{alg:dp-sgda}. This is obtained as a direct corollary of \citet{nemirovski2009robust-supp}, additionally accounting for the variance of the Gaussian noise and the mini-batch sampling. Recall that $d = \max\{d_1, d_2\}.$
\begin{lemma}\label{lem:sgda-opt-gap}
Suppose \textbf{(A1)} holds, and $F_S$ is convex-concave. Let the stepsizes $\eta_{\wbf, t} = \eta_{\vbf, t} = \eta$, $t \in [T]$ for some $\eta > 0$. Then Algorithm \ref{alg:dp-sgda} satisfies
\[
\sup_{\vbf \in \Vcal} \Ebb_A[F_S(\bar{\wbf}_T,\vbf)]  - \inf_{\wbf\in \Wcal} \Ebb_A[F_S(\wbf,\bar{\vbf}_T)]\leq \frac{\eta (G_\wbf^2+G_\vbf^2)}{2} + \frac{D_\wbf^2 + D_\vbf^2}{\eta T} + \frac{(D_\wbf G_\wbf + D_\vbf G_\vbf)}{\sqrt{mT}} + \eta d(\sigma_\wbf^2 + \sigma_\vbf^2).
\]
\end{lemma}

\begin{proof}
According to the non-expansiveness of projection and update rule of Algorithm \ref{alg:dp-sgda}, for any $\wbf \in \Wcal$, we have
\begin{align*}
& \|\wbf_{t+1} - \wbf\|_2^2 \leq \Big\|\wbf_t - \wbf - \frac{\eta}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \eta \xi_t\Big\|_2^2 \\
\leq & \|\wbf_t - \wbf\|_2^2 + 2\eta\Big\langle \wbf - \wbf_t, \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \xi_t \Big\rangle + \eta^2 \Big\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j})\Big\|_2^2 + \eta^2\|\xi_t\|_2^2\\
& + 2\eta^2\Big\langle \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}), \xi_t\Big\rangle\\
\leq & \|\wbf_t - \wbf\|_2^2 + 2\eta\langle \wbf - \wbf_t, \nabla_\wbf F_S(\wbf_t, \vbf_t)\rangle  + 2\eta\Big\langle \wbf - \wbf_t, \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big\rangle\\
& + \eta^2 G_\wbf^2 + \eta^2\|\xi_t\|_2^2 + 2\eta^2\Big\langle \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}), \xi_t\Big\rangle +  2\eta\langle \wbf - \wbf_t, \xi_t\rangle,
\end{align*}
% \yunwen{it seems that the term $2\eta\langle\wbf-\wbf_t,\xi_t\rangle$ is missing. I also can not see why the inequality $\Big\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j})\Big\|_2^2\leq \frac{G_{\wbf}^2}{m}$ holds} 
where in the last inequality we have used $f(\cdot, \vbf_t, \zbf_{i_t^j})$ is $G_\wbf$-Lipschitz continuous. According to the convexity of $F_S(\cdot, \vbf_t)$ we know
\begin{align*}
2\eta(F_S(\wbf_t, \vbf_t) \!-\! F_S(\wbf, \vbf_t)) \leq & \|\wbf_t \!-\! \wbf\|_2^2 \!-\! \|\wbf_{t+1} \!-\! \wbf\|_2^2 \!+\! 2\eta\Big\langle \wbf \!-\! \wbf_t, \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) \!-\! \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big\rangle \\
& + \eta^2 G_\wbf^2 + \eta^2\|\xi_t\|_2^2 + 2\eta^2\Big\langle \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}), \xi_t\Big\rangle +  2\eta\langle \wbf - \wbf_t, \xi_t\rangle.
\end{align*}
Taking a summation of the above inequality from $t=1$ to $T$ we derive
\begin{multline*}
2\eta\sum_{t=1}^T(F_S(\wbf_t, \vbf_t) - F_S(\wbf, \vbf_t)) \leq \|\wbf_1 - \wbf\|_2^2 + 2\eta\sum_{t=1}^T\Big\langle \wbf - \wbf_t, \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big\rangle \\
+ T\eta^2 G_\wbf^2 + \eta^2\sum_{t=1}^T\|\xi_t\|_2^2 + 2\eta^2\sum_{t=1}^T\Big\langle \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}), \xi_t\Big\rangle +  2\eta\sum_{t=1}^T\langle \wbf - \wbf_t, \xi_t\rangle.
\end{multline*}
It then follows from the concavity of $F_S(\wbf, \cdot)$ and the Cauchy--Schwarz inequality that
\begin{multline*}\label{eq:opt-before-expectation}
2\sum_{t=1}^T\eta(F_S(\wbf_t, \vbf_t) - F_S(\wbf, \bar{\vbf}_T)) \leq  2D_\wbf^2 - 2\eta\sum_{t=1}^T\Big\langle\wbf_t, \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big\rangle\\
 + 2D_\wbf \eta\Big\|\sum_{t=1}^T\Big(\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big)\Big\|_2\\
 + T\eta^2 G_\wbf^2 + \eta^2\sum_{t=1}^T\|\xi_t\|_2^2 + 2\eta^2\sum_{t=1}^T\Big\langle \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}), \xi_t\Big\rangle +  2\eta\sum_{t=1}^T\langle \wbf - \wbf_t, \xi_t\rangle. \numberthis
\end{multline*}
We can take expectations on the randomness of $A$ over both sides of \eqref{eq:opt-before-expectation} and get
\begin{align*}
2\eta\sum_{t=1}^T\Ebb_A[F_S(\wbf_t, \vbf_t) \!-\!F_S(\wbf, \bar{\vbf}_T)] \leq & 2D_\wbf^2  \!+\! 2D_\wbf\eta \Ebb_A\Big[\Big\|\sum_{t=1}^T \Big(\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) \!-\! \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big)\Big\|_2\Big]\\
& + T\eta^2 G_\wbf^2 + T\eta^2d_1\sigma_\wbf^2, 
\end{align*}
where we used that the variance $\Ebb_A[\|\xi_t\|_2^2] = d_1\sigma_\wbf^2$, the unbiasedness $\Ebb_A[\langle\wbf_t, \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\rangle] = 0$, the independence  $\Ebb_A[\langle \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}), \xi_t\rangle] = 0$ and $\Ebb_A[\langle \wbf - \wbf_t, \xi_t\rangle]=0$.
Since the above inequality holds for all $\wbf$, we further get
\begin{align*}\label{eq:opt-before-variance}
2\eta\sum_{t=1}^T\Big(\Ebb_A[F_S(\wbf_t, \vbf_t)] \!-\! \inf_{\wbf \in \Wcal}\Ebb_A[F_S(\wbf, \bar{\vbf}_T)]\Big) \leq & 2D_\wbf^2  \!+\! 2D_\wbf\eta \Ebb_A\Big[\Big\|\sum_{t=1}^T \Big(\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) \!-\! \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big)\Big\|_2\Big]\\
& + T\eta^2 G_\wbf^2 + T\eta^2d_1\sigma_\wbf^2. \numberthis
\end{align*}
According to Jensen's inequality and $G_\wbf$-Lipschitz continuity we further derive
\begin{align*}
& \Big(\Ebb_A\Big[\Big\|\sum_{t=1}^T \Big(\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) \!-\! \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big)\Big\|_2\Big]\Big)^2\\
\leq &  \Ebb_A\Big[\Big\|\sum_{t=1}^T (\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) \!-\! \nabla_\wbf F_S(\wbf_t, \vbf_t))\Big\|_2^2\Big] = \sum_{t=1}^T \Ebb_A\Big[\Big\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\Big\|_2^2\Big]\\
\leq & \frac{TG_\wbf^2}{m}.
\end{align*}
Plugging the above estimate into \eqref{eq:opt-before-variance} we arrive
\[
2\eta\Big(\sum_{t=1}^T\Ebb_A[F_S(\wbf_t, \vbf_t)] - T\inf_{\wbf\in \Wcal}\Ebb_A[F_S(\wbf, \bar{\vbf}_T)]\Big) \leq 2D_\wbf^2  + \frac{2D_\wbf \eta G_\wbf\sqrt{T}}{\sqrt{m}} + T\eta^2 G_\wbf^2 + T\eta^2d_1\sigma_\wbf^2.
\]
Dividing both sides by $2\eta T$, we have
\begin{equation}\label{eq:opt-w}
\frac{1}{T}\sum_{t=1}^T\Ebb_A[F_S(\wbf_t, \vbf_t)] - \inf_{\wbf\in \Wcal}\Ebb_A[F_S(\wbf, \bar{\vbf}_T)] \leq \frac{D_\wbf^2}{\eta T}  + \frac{D_\wbf G_{\wbf}}{\sqrt{mT}} + \frac{\eta G^2_{\wbf}}{2} + \frac{\eta d_1\sigma_\wbf^2}{2}.
\end{equation}
In a similar way, we can show that 
\begin{equation}\label{eq:opt-v}
\frac{1}{T}\sum_{t=1}^T\sup_{\vbf \in \Vcal}\Ebb_A[F_S(\bar{\wbf}_T, \vbf)] - \Ebb_A[F_S(\wbf_t, \vbf_t)] \leq \frac{D_\vbf^2}{\eta T}  + \frac{D_\vbf G_\vbf}{\sqrt{mT}} + \frac{\eta G_\vbf^2}{2} + \frac{\eta d_2\sigma_\vbf^2}{2}.
\end{equation}
The stated bound then follows from \eqref{eq:opt-w} and \eqref{eq:opt-v} and the fact that $d = \max\{d_1, d_2\}.$
\end{proof}

\subsection{Estimation of Generalization Error}\label{sec:cc-gen}
Next we move on to the generalization error. Firstly, we introduce a lemma that bridges the generalization and the stability. We say the randomized algorithm $A$ is  {\em $\varepsilon$-weakly-stable} if,  for any neighboring datasets $S, S'$,  there holds 
\begin{align*}
\sup_\zbf\Big(\sup_{\vbf \in \Vcal}\Ebb_{A}[f(A_\wbf(S), \vbf; \zbf) - f(A_\wbf(S'), \vbf; \zbf)] + \sup_{\wbf \in \Wcal}\Ebb_{A}[f(\wbf, A_\vbf(S); \zbf) - f(\wbf, A_\vbf(S'); \zbf)]\Big) \leq \varepsilon.     
\end{align*}

\begin{lemma}[\citep{lei2021stability-supp}]\label{lem:weak-gen-via-weak-stab}
If $A$ is $\varepsilon$-weakly-stable, then there holds
\[
    \triangle^w(A_{\wbf}(S),A_{\vbf}(S))-\triangle^w_S(A_{\wbf}(S),A_{\vbf}(S))\leq\varepsilon.
\]
\end{lemma}

We also need the following standard lemma before we prove the stability of DP-SGDA.
\begin{lemma}[\citep{rockafellar1976monotone-supp}]\label{lem:monotone}
Let $f$ be a convex-concave function. Then
\begin{equation*}
\left\langle \begin{pmatrix} \wbf - \wbf'\\ \vbf - \vbf' \end{pmatrix},  \begin{pmatrix} \nabla_\wbf f(\wbf, \vbf) - \nabla_\wbf f(\wbf', \vbf')\\ \nabla_\vbf f(\wbf', \vbf') - \nabla_\vbf f(\wbf, \vbf) \end{pmatrix}\right\rangle \geq 0.
\end{equation*}
\end{lemma}

The stability analysis is given in the following lemma. This lemma is an extension of the uniform argument stability results in \citet{lei2021stability-supp} to the case of mini-batch DP-SGDA.

\begin{lemma}\label{lem:sgda-gen-gap}
Suppose the function $F_S$ is convex-concave. Let the stepsizes $\eta_{\wbf, t} = \eta_{\vbf, t} = \eta$ for some $\eta > 0$. 
\begin{enumerate}
\item[a)] Assume \textbf{(A1)} and \textbf{(A3)} hold, then Algorithm \ref{alg:dp-sgda} satisfies
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq \frac{4\sqrt{e(T+T^2/n)}(G_\wbf+G_\vbf)^2\eta\exp(L^2T\eta^2/2)}{\sqrt{n}}.
\]
\item[b)] Assume \textbf{(A1)} holds, then Algorithm \ref{alg:dp-sgda} satisfies
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq 4\sqrt{2}\eta (G_\wbf + G_\vbf)^2\Big(\sqrt{T} + \frac{T}{n}\Big).
\]
\end{enumerate}
\end{lemma}

\begin{proof}
Without loss of generality, let $S=\{\zbf_1,\cdots,\zbf_n\},S'=\{\zbf_1',\cdots,\zbf_n'\}$ be neighboring datasets differing by the last element, i.e. $\zbf_n \neq \zbf'_n$. Let $\{\wbf_t,\vbf_t\},\{\wbf_t',\vbf_t'\}$ be the sequence produced by Algorithm \ref{alg:dp-sgda} w.r.t. $S$ and $S'$, respectively. We first prove Part a). In the case $n\not\in I_t$, by the non-expansiveness of projection, we have
\begin{align*}
  &\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2 \leq \left\|\begin{pmatrix}                 \wbf_t-\frac{\eta}{m}\sum_{j=1}^m\nabla_{\wbf}f(\wbf_t,\vbf_t;z_{i_t^j}) - \eta \xi_t -\wbf_t'+\frac{\eta}{m}\sum_{j=1}^m\nabla_{\wbf}f(\wbf_t',\vbf'_t;z_{i_t^j}) + \eta \xi_t \\  
                   \vbf_t+\frac{\eta}{m}\sum_{j=1}^m\nabla_{\vbf}f(\wbf_t,\vbf_t;z_{i_t^j}) + \eta \zeta_t-\vbf_t'-\frac{\eta}{m}\sum_{j=1}^m\nabla_{\vbf}f(\wbf_t',\vbf'_t;z_{i_t^j}) - \eta\zeta_t
                 \end{pmatrix}\right\|_2^2\\
        & =  \left\|\begin{pmatrix}
           \wbf_t-\wbf_t' \\
           \vbf_t-\vbf_t'
         \end{pmatrix}\right\|_2^2 - \frac{2\eta}{m}\sum_{j=1}^m \left\langle \begin{pmatrix} \wbf_t - \wbf'_t\\ \vbf_t - \vbf'_t \end{pmatrix},  \begin{pmatrix} \nabla_{\wbf}f(\wbf_t,\vbf_t;z_{i_t^j}) - \nabla_{\wbf}f(\wbf_t',\vbf'_t;z_{i_t^j})\\ \nabla_{\vbf}f(\wbf'_t,\vbf'_t;z_{i_t^j}) - \nabla_{\vbf}f(\wbf_t,\vbf_t;z_{i_t^j}) \end{pmatrix}\right\rangle\\
         & + \left\|\begin{pmatrix}
                   \frac{\eta}{m}\sum_{j=1}^m(\nabla_{\wbf}f(\wbf_t,\vbf_t;z_{i_t^j})-\nabla_{\wbf}f(\wbf_t',\vbf'_t;z_{i_t^j})) \\
                   \frac{\eta}{m}\sum_{j=1}^m(\nabla_{\vbf}f(\wbf_t,\vbf_t;z_{i_t^j})-\nabla_{\vbf}f(\wbf_t',\vbf'_t;z_{i_t^j}))
                 \end{pmatrix}\right\|_2^2\\
         & \leq 
         (1+L^2\eta^2) \left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2,
\end{align*}
where the last inequality follows from Lemma \ref{lem:monotone} and the $L$-smoothness assumption. If $n \in I_t$, then it follows that
\begin{align*}\label{stab-gda-2}
  & \left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2
         \leq\left\|\begin{pmatrix}                 \wbf_t-\frac{\eta}{m}\sum_{j=1}^m\nabla_{\wbf}f(\wbf_t,\vbf_t;z_{i_t^j}) - \eta \xi_t -\wbf_t'+\frac{\eta}{m}\sum_{j=1}^m\nabla_{\wbf}f(\wbf_t',\vbf'_t;z'_{i_t^j}) + \eta \xi_t \\  
                   \vbf_t+\frac{\eta}{m}\sum_{j=1}^m\nabla_{\vbf}f(\wbf_t,\vbf_t;z_{i_t^j}) + \eta \zeta_t-\vbf_t'-\frac{\eta}{m}\sum_{j=1}^m\nabla_{\vbf}f(\wbf_t',\vbf'_t;z'_{i_t^j}) - \eta\zeta_t
                 \end{pmatrix}\right\|_2^2\\
        & \leq\frac{1}{m}\sum_{i_t^j \in I_t, i_t^j \neq n}\left\|\begin{pmatrix}
                   \wbf_t-\eta\nabla_{\wbf}f(\wbf_t,\vbf_t;z_{i_t^j})-\wbf_t'+\eta\nabla_{\wbf}f(\wbf_t',\vbf'_t;z'_{i_t^j}) \\
                   \vbf_t+\eta\nabla_{\vbf}f(\wbf_t,\vbf_t;z_{i_t^j})-\vbf_t'-\eta\nabla_{\vbf}f(\wbf_t',\vbf'_t;z'_{i_t^j})
                 \end{pmatrix}\right\|_2^2\\
        & + \frac{1}{m}\left\|\begin{pmatrix}
                   \wbf_t-\eta\nabla_{\wbf}f(\wbf_t,\vbf_t;z_n)-\wbf_t'+\eta\nabla_{\wbf}f(\wbf_t',\vbf'_t;z'_n) \\
                   \vbf_t+\eta\nabla_{\vbf}f(\wbf_t,\vbf_t;z_n)-\vbf_t'-\eta\nabla_{\vbf}f(\wbf_t',\vbf'_t;z'_n)
                 \end{pmatrix}\right\|_2^2\\
         & \leq\frac{m-1}{m}(1+L^2\eta^2) \left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2 + \frac{1+p}{m}\left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2
        \\ & +\frac{1+1/p}{m}\eta^2\left\|\begin{pmatrix}
                                 \nabla_{\wbf}f(\wbf_t,\vbf_t;z_n)-\nabla_{\wbf}f(\wbf_t',\vbf_t';z'_n) \\
                                 \nabla_{\vbf}f(\wbf_t,\vbf_t;z_n)-\nabla_{\vbf}f(\wbf_t',\vbf_t';z'_n)
                               \end{pmatrix}\right\|_2^2,\numberthis
\end{align*}
where in the last inequality we used the elementary inequality $(a+b)^2\leq(1+p)a^2+(1+1/p)b^2$ ($p>0$). Since $I_t$ are drawn uniformly at random with replacement, the event $n\not\in I_t$ happens with probability $1-m/n$ and the event $n\in I_t$ happens with probability $m/n$. Therefore,
we know
\begin{align*}
  \Ebb_{i_t}\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right] & \leq \frac{(n-m)(1+L^2\eta^2)}{n} \left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2 + \frac{m(1+L^2\eta^2)}{n}\frac{m-1}{m} \left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2\\
         & +  \frac{m}{n}\frac{1+p}{m}\left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2+\frac{m}{n}\frac{4(1+1/p)}{m}\eta^2(G_\wbf^2 + G_\vbf^2)\\
         & \leq \Big(1+L^2\eta^2+p/n\Big)\left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2+\frac{4(1+1/p)}{n}\eta^2(G_\wbf^2 + G_\vbf^2).
\end{align*}
Applying this inequality recursively, we derive
\[
\Ebb_A\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right]\leq
         \frac{4(1+1/p)}{n}(G_\wbf^2 + G_\vbf^2)\sum_{k=1}^{t}\eta^2\prod_{j=k+1}^{t}\Big(1+L^2\eta^2+p/n\Big).
\]
By the elementary inequality $1+a\leq\exp(a)$, we further derive
\begin{align*}
\Ebb_A\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right] & \leq \frac{4(1+1/p)}{n}(G_\wbf^2 + G_\vbf^2)\sum_{k=1}^{t}\eta^2\prod_{j=k+1}^{t}\exp\Big(L^2\eta^2+p/n\Big)\\
         & = \frac{4(1+1/p)}{n}(G_\wbf^2 + G_\vbf^2)\sum_{k=1}^{t}\eta^2\exp\Big(L^2\sum_{j=k+1}^{t}\eta^2+p(t-k)/n\Big)\\
         & \leq \frac{4(1+1/p)}{n}(G_\wbf^2 + G_\vbf^2)\exp\Big(L^2\sum_{j=1}^{t}\eta^2+pt/n\Big)\sum_{k=1}^{t}\eta^2.
\end{align*}
By taking $p=n/t$ we get
\[
\Ebb_A\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right]
         \leq
         \frac{4e(G_\wbf^2 + G_\vbf^2)(1+t/n)}{n}\exp\Big(L^2\sum_{j=1}^{t}\eta^2\Big)\sum_{k=1}^{t}\eta^2.
\]
Now by the Lipschitz continuity and Jensen's inequality we have
\begin{align*}
& \sup_\zbf\Big(\sup_{\vbf \in \Vcal}\Ebb_{A}[f(A_\wbf(S), \vbf; \zbf) - f(A_\wbf(S'), \vbf; \zbf)] + \sup_{\wbf \in \Wcal}\Ebb_{A}[f(\wbf, A_\vbf(S); \zbf) - f(\wbf, A_\vbf(S'); \zbf)]\Big)\\
\leq & G_\wbf \Ebb_A[\|\bar{\wbf}_T - \bar{\wbf}'_T\|_2] + G_\vbf \Ebb_A[\|\bar{\vbf}_T - \bar{\vbf}'_T\|_2] \leq \frac{4\sqrt{e(T+T^2/n)}(G_\wbf+G_\vbf)^2\eta\exp(L^2T\eta^2/2)}{\sqrt{n}}.
\end{align*}
According to Lemma \ref{lem:weak-gen-via-weak-stab} we know 
\begin{align*}
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq \frac{4\sqrt{e(T+T^2/n)}(G_\wbf+G_\vbf)^2\eta\exp(L^2T\eta^2/2)}{\sqrt{n}}.
\end{align*}

Next we focus on Part b). We consider two cases at the $t$-th iteration. If $n\not\in I_t$, then analogous to the discussions in \citet{lei2021stability-supp} we can show
\begin{align}
  \left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2
         &\leq\left\|\begin{pmatrix}
                   \wbf_t-\frac{\eta}{m}\sum_{j=1}^m\nabla_{\wbf}f(\wbf_t,\vbf_t;z_{i_t^j}) - \eta \xi_t-\wbf_t'+\frac{\eta}{m}\sum_{j=1}^m\nabla_{\wbf}f(\wbf_t',\vbf'_t;z_{i_t^j}) + \eta \xi_t \\
                   \vbf_t+\frac{\eta}{m}\sum_{j=1}^m\nabla_{\vbf}f(\wbf_t,\vbf_t;z_{i_t^j})+ \eta \zeta_t -\vbf_t'-\frac{\eta}{m}\sum_{j=1}^m\nabla_{\vbf}f(\wbf_t',\vbf'_t;z_{i_t^j}) - \eta \zeta_t
                 \end{pmatrix}\right\|_2^2\notag \\
         & \leq \left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2+4(G_\wbf^2 + G_\vbf^2)\eta^2.\label{stab-gda-1}
\end{align}
Combining the preceding inequality with \eqref{stab-gda-2} and using the probability of $n\not\in I_t$, we derive
\begin{align*}
 & \Ebb_{i_t}\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right]  \leq \frac{n-1}{n}\left(\left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2+4(G_\wbf^2 + G_\vbf^2)\eta^2\right)  \\ & + \frac{1+p}{n}\left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2+\frac{4(1+1/p)}{n}(G_\wbf^2 + G_\vbf^2)\eta^2 \\
         & = (1+p/n)\left\|\begin{pmatrix}
           \wbf_{t}-\wbf_{t}' \\
           \vbf_{t}-\vbf_{t}'
         \end{pmatrix}\right\|_2^2+4(G_\wbf^2 + G_\vbf^2)\eta^2(1+1/(np)).
\end{align*}
Applying this inequality recursively implies that 
\begin{align*}
 &  \Ebb_A\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right]  \leq 4(G_\wbf^2 + G_\vbf^2)\eta^2\big(1+1/(np)\big)\sum_{k=1}^{t}\Big(1+\frac{p}{n}\Big)^{t-k}
   \\ & = 4(G_\wbf^2 + G_\vbf^2)\eta^2\Big(1+\frac{1}{np}\Big)\frac{n}{p}\Big(\Big(1+\frac{p}{n}\Big)^t-1\Big) = 4(G_\wbf^2 + G_\vbf^2)\eta^2\Big(\frac{n}{p}+\frac{1}{p^2}\Big)\Big(\Big(1+\frac{p}{n}\Big)^t-1\Big).
\end{align*}
By taking $p=n/t$ in the above inequality and using $(1+1/t)^t\leq e$, we get
\[
\Ebb_A\left[\left\|\begin{pmatrix}
           \wbf_{t+1}-\wbf_{t+1}' \\
           \vbf_{t+1}-\vbf_{t+1}'
         \end{pmatrix}\right\|_2^2\right]\leq 16(G_\wbf^2 + G_\vbf^2)\eta^2\Big(t+\frac{t^2}{n^2}\Big).
\]
Now by the Lipschitz continuity and Jensen's inequality we have
\begin{align*}
& \sup_\zbf\Big(\sup_{\vbf \in \Vcal}\Ebb_{A}[f(A_\wbf(S), \vbf; \zbf) - f(A_\wbf(S'), \vbf; \zbf)] + \sup_{\wbf \in \Wcal}\Ebb_{A}[f(\wbf, A_\vbf(S); \zbf) - f(\wbf, A_\vbf(S'); \zbf)]\Big)\\
\leq & G_\wbf \Ebb_A[\|\bar{\wbf}_T - \bar{\wbf}'_T\|_2] + G_\vbf \Ebb_A[\|\bar{\vbf}_T - \bar{\vbf}'_T\|_2] \leq 4\sqrt{2}(G_\wbf + G_\vbf)^2\eta\Big(\sqrt{T}+\frac{T}{n}\Big).
\end{align*}
According to Lemma \ref{lem:weak-gen-via-weak-stab} we know 
\begin{align*}
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq 4\sqrt{2}\eta(G_\wbf + G_\vbf)^2\Big(\sqrt{T}+\frac{T}{n}\Big).
\end{align*} 
\end{proof}


\subsection{Proof of Theorem  \ref{thm:sgda-utility}}
Finally we are ready to present the proof of Theorem \ref{thm:sgda-utility}.

\begin{theorem}[Theorem \ref{thm:sgda-utility} restated]
Suppose the function $F_S$ is convex-concave. Let the stepsizes $\eta_{\wbf, t} = \eta_{\vbf, t} = \eta$, $t \in [T]$, for some $\eta > 0$. 
\begin{enumerate}
\item[a)] Assume \textbf{(A1)} and \textbf{(A3)} hold. If we choose $T \asymp n$ and $\eta \asymp 1/\Big(L\max\{\sqrt{n}, \sqrt{d\log(1/\delta)}/\epsilon\}\Big)$, then Algorithm \ref{alg:dp-sgda} satisfies
\[
\triangle^w(\bar{\wbf}_T,\bar{\vbf}_T) = \Ocal\Big(\max\{G_\wbf^2 + G_\vbf^2, (G_\wbf + G_\vbf)^2, D_\wbf^2 + D_\vbf^2, D_\wbf G_\wbf + D_\vbf G_\vbf\} \max\Big\{\frac{1}{\sqrt{n}}, \frac{\sqrt{d\log(1/\delta)}}{n\epsilon}\Big\}\Big).
\]
\item[b)] Assume \textbf{(A1)} holds. If we choose $T \asymp n^2$ and $\eta \asymp 1/\Big(n\max\{\sqrt{n}, \sqrt{d\log(1/\delta)}/\epsilon\}\Big)$, then Algorithm \ref{alg:dp-sgda} satisfies
\[
\triangle^w(\bar{\wbf}_T,\bar{\vbf}_T) = \Ocal\Big(\max\{G_\wbf^2 + G_\vbf^2, (G_\wbf + G_\vbf)^2, D_\wbf^2 + D_\vbf^2, D_\wbf G_\wbf + D_\vbf G_\vbf\}\max\Big\{\frac{1}{\sqrt{n}}, \frac{\sqrt{d\log(1/\delta)}}{n\epsilon}\Big\}\Big).
\]
\end{enumerate}
\end{theorem}

\begin{proof}[Proof of Theorem \ref{thm:sgda-utility}]
We first focus on Part a). According to Part a) of  Lemma \ref{lem:sgda-gen-gap}  we know
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq \frac{4\sqrt{e(T+T^2/n)}(G_\wbf + G_\vbf)^2\eta\exp(L^2T\eta^2/2)}{\sqrt{n}}
\]
and by Lemma \ref{lem:sgda-opt-gap} we know
\[
\triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq \frac{\eta (G_\wbf^2 + G_\vbf^2)}{2} + \frac{D_\wbf^2 + D_\vbf^2}{2\eta T} + \frac{D_\wbf G_\wbf + D_\vbf G_\vbf}{\sqrt{mT}} + \eta d(\sigma_\wbf^2 + \sigma_\vbf^2).
\] 	
Combining the above two quantities we have
\begin{align*}\label{eq:risk-before-noise}
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) \leq & \frac{4\sqrt{e(T+T^2/n)}(G_\wbf + G_\vbf)^2\eta\exp(L^2T\eta^2/2)}{\sqrt{n}} + \frac{\eta (G_\wbf^2 + G_\vbf^2)}{2} + \frac{D_\wbf^2 + D_\vbf^2}{2\eta T} \\
& + \frac{D_\wbf G_\wbf + D_\vbf G_\vbf}{\sqrt{mT}} + \eta d(\sigma_\wbf^2 + \sigma_\vbf^2). \numberthis
\end{align*}
Furthermore, by Theorem \ref{thm:moments-accountant-privacy}, we know
\[
\sigma_\wbf^2 = \Ocal\Big(\frac{G_\wbf^2T\log(1/\delta)}{n^2\epsilon^2}\Big), \quad \sigma_\vbf^2 = \Ocal\Big(\frac{G_\vbf^2T\log(1/\delta)}{n^2\epsilon^2}\Big).
\]
Plugging it back into \eqref{eq:risk-before-noise} we have
\begin{multline*}
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) = \Ocal\Big(\frac{\sqrt{(T+T^2/n)}(G_\wbf + G_\vbf)^2\eta\exp(L^2T\eta^2)}{\sqrt{n}}\\
+ \frac{\eta (G_\wbf^2 + G_\vbf^2)}{2} + \frac{D_\wbf^2 + D_\vbf^2}{2\eta T} + \frac{D_\wbf G_\wbf + D_\vbf G_\vbf}{\sqrt{mT}} + \frac{\eta (G_\wbf^2 + G_\vbf^2) Td\log(1/\delta)}{n^2\epsilon^2}\Big).
\end{multline*}
By picking $T \asymp n$ and $\eta \asymp 1/\Big(L\max\{\sqrt{n}, \sqrt{d\log(1/\delta)}/\epsilon\}\Big)$ we have $L^2T\eta^2 = \Ocal\Big(\min\{1, \frac{n\epsilon^2}{d\log(1/\delta)}\}\Big) = \Ocal(1)$, hence $\exp(L^2T\eta^2) = \Ocal(1)$,
and
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) = \Ocal\Big(\max\{G_\wbf^2 + G_\vbf^2, (G_\wbf + G_\vbf)^2, D_\wbf^2 + D_\vbf^2, D_\wbf G_\wbf + D_\vbf G_\vbf\} \max\Big\{\frac{1}{\sqrt{n}}, \frac{\sqrt{d\log(1/\delta)}}{n\epsilon}\Big\}\Big).
\]
We now turn to Part b). According to Lemma \ref{lem:sgda-gen-gap} Part b) we know 
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) - \triangle^w_S(\bar{\wbf}_T, \bar{\vbf}_T) \leq 4\sqrt{2}\eta (G_\wbf + G_\vbf)^2 \Big(\sqrt{T} + \frac{T}{n}\Big).
\]
Similar to Part a) we have
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) \!=\! \Ocal\Big(\eta (G_\wbf + G_\vbf)^2 \Big(\sqrt{T} + \frac{T}{n}\Big) + \frac{\eta (G_\wbf^2 \!+\! G_\vbf^2)}{2} + \frac{D_\wbf^2 \!+\! D_\vbf^2}{2\eta T} + \frac{D_\wbf G_\wbf \!+\! D_\vbf G_\vbf}{\sqrt{mT}} + \frac{\eta (G_\wbf^2 \!+\! G_\vbf^2) Td\log(1/\delta)}{n^2\epsilon^2}\Big).
\]
By picking $T \asymp n^2$ and $\eta \asymp 1/\Big(n\max\{\sqrt{n}, \sqrt{d\log(1/\delta)}/\epsilon\}\Big)$ we have
\[
\triangle^w(\bar{\wbf}_T, \bar{\vbf}_T) = \Ocal\Big(\max\{G_\wbf^2 + G_\vbf^2, (G_\wbf + G_\vbf)^2, D_\wbf^2 + D_\vbf^2, D_\wbf G_\wbf + D_\vbf G_\vbf\}\max\Big\{\frac{1}{\sqrt{n}}, \frac{\sqrt{d\log(1/\delta)}}{n\epsilon}\Big\}\Big).
\]
The proof is complete.
\end{proof}

% \subsection{Clarifying Remark \ref{rem:compare-extragradient}}\label{sec:proof-bug}

% The same optimal utility bound in terms of VI gap was claimed in Theorem 5.4 and Theorem 7.4 of \citep{boob2021optimal}, i.e.
% \[\Ebb[\sup_{w \in \Wcal} \langle F(w), u - w\rangle] = \Ocal\Big(\max\Big\{\frac{1}{\sqrt{n}}, \frac{\sqrt{d\log(1/\delta)}}{n\epsilon}\Big\}\Big).\] By the monotonicity assumption, their results imply the same bound on the primal-dual risk $\triangle(A_\wbf(\S), A_\vbf(\S)) = \Ebb[\max_\vbf F(A_\wbf(\S), \vbf) - \min_\wbf F(\wbf, A_\vbf(\S))]$ of the minimax problem \eqref{eq:SSP}. Such measure of generalization seems to be stronger than ours since
% $ 
% \triangle^w(A_\wbf(\S), A_\vbf(\S)) \leq \triangle(A_\wbf(\S), A_\vbf(\S)).$
% However, the discussion between stability and generalization there is not rigorous in Theorem 4.1 (i.e. stability implies generalization for SVI). Indeed, borrowing their notations, equation (4.3) from their paper, i.e., 
% \[
% \Ebb_{\beta_i}\langle F(u; \beta_i), \Acal(\S) - u\rangle \leq \Ebb_{\beta_j}\langle F(u; \beta_j), \Acal(\S) - u\rangle\\
% + M \Ebb_{\beta'_j}[\|\Acal(\S_j^i) - \Acal(\S^j)\|],\]
% only holds when $u$ is independent of $\beta_j$. But applying (4.3) with $u = \hat{u}_i = \arg\max_{u \in \Wcal} \Ebb_{\beta_i}\langle F(u; \beta_i), \Acal(\S) - u\rangle$ to get the second inequality in (4.5) of their paper is problematic because now $u=\hat{i}_i$ involves sample $S$ and after taking expectation of $\beta_i$, it depends on $\beta_j$. 

\section{Proofs for the nonconvex-strongly-concave setting in Section \ref{sec:nonconvex-strongly-concave}}\label{sec:proof-nonconvex}

In this section, we will provide the proofs for the theorems in Section \ref{sec:nonconvex-strongly-concave}. Recall that we define $R^*_\S = \min_{\wbf\in\Wcal} R_\S(\wbf), \text{ and } R^* = \min_{\wbf\in\Wcal} R(\wbf).$ Then, for any $\wbf^* \in \arg\min_\wbf R(\wbf)$ we have the error decomposition:
\begin{align*}
\Ebb[R(\wbf_T) - R^*] = & \Ebb[R(\wbf_T) - R_\S(\wbf_T)] + \Ebb[R_\S(\wbf_T) - R_\S^*] +  \Ebb[R_\S^* - R_\S(\wbf^*)] + \Ebb[R_\S(\wbf^*) - R(\wbf^*)]\\
\leq & \Ebb[R(\wbf_T) - R_\S(\wbf_T)] + \Ebb[R_\S(\wbf^*) - R(\wbf^*)] + \Ebb[R_\S(\wbf_T) - R_\S^*].
\end{align*}
The term $\Ebb[R_\S(\wbf_T) - R_\S^*]$ is the {\em optimization error} which characterizes the discrepancy between the primal empirical risk of an output of Algorithm \ref{alg:dp-sgda} and the least possible one. The term $\Ebb[R(\wbf_T) - R_\S(\wbf_T)]  + \Ebb[R_\S(\wbf^*) - R(\wbf^*)]$ is called the {\em generalization error} which measures the discrepancy  between the primal population risk and the empirical one. The estimations for these two errors are described as follows. 

\subsection{Proof of Theorem \ref{thm:sgda-primal-opt}}\label{sec:sgda-primal-opt}

To prove Theorem \ref{thm:sgda-primal-opt}, i.e., optimization error,  we introduce several necessary lemmas. The first lemma is an application of Danskin's Theorem.

\begin{lemma}[\citep{lin2020gradient-supp}]\label{lem:primal-smoothness}
Assume \textbf{(A3)}  holds and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Assume $\Vcal$ is a convex and bounded set. Then the function $R_S(\wbf)$ is $(L + L^2/\rho)$-smooth and $\nabla R_S(\wbf) = \nabla_\wbf F_S(\wbf, \hat{\vbf}_S(\wbf))$, where $\hat{\vbf}_S(\wbf) = \arg\max_{\vbf \in \Vcal} F_S(\wbf, \vbf)$. Moreover, $\hat{\vbf}_S(\wbf)$ is $L/\rho$-Lipschitz continuous. 
\end{lemma}

The second lemma shows that $R_S$ also satisfies the PL condition whenever $F_S$ does. 
\begin{lemma}\label{lem:primal-pl}
Assume \textbf{(A3)}  holds. Assume $F_S(\cdot, \vbf)$ satisfies PL condition with constant $\mu$ and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Then the function $R_S(\wbf)$ satisfies the PL condition with $\mu$.
\end{lemma}

\begin{proof}
From Lemma \ref{lem:primal-smoothness}, $\|\nabla R_S(\wbf)\|_2^2 = \|\nabla_\wbf F_S(\wbf, \hat{\vbf}_S(\wbf))\|_2^2$. Since $F_S$ satisfies PL condition with constant $\mu$, we get 
\begin{equation}\label{eq:intermediate-pl}
\|\nabla R_S(\wbf)\|_2^2 \geq 2\mu  \big(F_S(\wbf, \hat{\vbf}_S(\wbf)) - \min_{\wbf' \in \Wcal} F_S(\wbf', \hat{\vbf}_S(\wbf))\big).  
\end{equation}
Also, since $F_S(\wbf', \hat{\vbf}_S(\wbf)) \leq \max_{\vbf \in \Vcal} F_S(\wbf', \vbf)$, we have
\begin{equation}\label{eq:intermediate-mm}
\min_{\wbf' \in \Wcal}F_S(\wbf', \hat{\vbf}_S(\wbf)) \leq \min_{\wbf' \in \Wcal}\max_{\vbf \in \Vcal} F_S(\wbf', \vbf) =   \min_{\wbf' \in \Wcal} R_S(\wbf')
\end{equation}
Combining equation \eqref{eq:intermediate-pl} and \eqref{eq:intermediate-mm}, we have
\begin{equation*}
\|\nabla R_S(\wbf)\|_2^2 \geq 2\mu  \big(R_S(\wbf) - \min_{\wbf' \in \Wcal} R_S(\wbf')\big).  
\end{equation*}
The proof is complete.
\end{proof}
% \begin{lemma}\label{lem:dual-pl}
% The function $-f(\wbf, \cdot)$ satisfies the Pl condition with $\mu_\vbf$.
% \end{lemma}

Now we present two key lemmas for the convergence analysis. The next lemma characterizes the descent behavior of $R_S(\wbf_t)$. 

\begin{lemma}\label{lem:primal-gap-coupled}
Assume \textbf{(A2)}  and \textbf{(A3)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies the $\mu$-PL condition and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. For Algorithm \ref{alg:dp-sgda}, the iterates $\{\wbf_t, \vbf_t\}_{t \in [T]}$ satisfy the following inequality 
\begin{align*}
\Ebb[R_S(\wbf_{t+1}) - R_S^*] \leq & (1 - \mu\eta_{\wbf, t})\Ebb[R_S(\wbf_t) - R_S^*] + \frac{L^2\eta_{\wbf, t}}{2} \Ebb[\|\hat{\vbf}_S(\wbf_t) - \vbf_t\|_2^2]\\
& + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2).     
\end{align*}
\end{lemma}

\begin{proof}
Because $R_S$ is $L + L^2/\rho$-smooth by Lemma \ref{lem:primal-smoothness}, we have
\begin{align*}
R_S(\wbf_{t+1}) - R_S^* \leq & R_S(\wbf_t) - R_S^* + \langle\nabla R_S(\wbf_t), \wbf_{t+1} - \wbf_t\rangle + \frac{L+L^2/\rho}{2}\|\wbf_{t+1} - \wbf_t\|_2^2 \\
= & R_S(\wbf_t) - R_S^* - \eta_{\wbf, t}  \langle\nabla R_S(\wbf_t), \frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \xi_t \rangle\\
& + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \xi_t\|_2^2.
\end{align*}
We denote by $\Ebb_t$ the conditional expectation given $\wbf_t$ and $\vbf_t$. Taking this conditional expectation on both sides, we get
\begin{align*}
\Ebb_t[R_S(\wbf_{t+1}) - R_S^*] = & R_S(\wbf_{t}) - R_S^* - \eta_{\wbf, t} \langle\nabla R_S(\wbf_t), \nabla_\wbf F_S(\wbf_t, \vbf_t)\rangle\\
& + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2} \Ebb_t\Big[\Big\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t) + \nabla_\wbf F_S(\wbf_t, \vbf_t) + \xi_t\Big\|_2^2\Big]\\
\leq & R_S(\wbf_{t}) - R_S^* - \eta_{\wbf, t} \langle\nabla R_S(\wbf_t), \nabla_\wbf F_S(\wbf_t, \vbf_t)\rangle\\
& + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2} \|\nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2 + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2} (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\\
\leq & R_S(\wbf_t) - R_S^* - \frac{\eta_{\wbf, t}}{2}  \|\nabla R_S(\wbf_t)\|_2^2  + \frac{\eta_{\wbf, t}}{2}  \|\nabla R_S(\wbf_t) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2\\
& + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2),
\end{align*}
where in the first inequality we use $\Ebb_t[\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2] = \frac{1}{m^2}\sum_{j=1}^m\Ebb_t[\|\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2] \leq \frac{B_\wbf^2}{m}$ and $\Ebb_t[\|\xi_t\|_2^2] = d_1\sigma_\wbf^2 \leq d\sigma_\wbf^2$, and in the last inequality we use $\eta_{\wbf, t} \leq 1/(L+L^2/\rho)$. Because $R_S$ satisfies the PL condition with constant $\mu$ by Lemma \ref{lem:primal-pl}, we have
\begin{align*}
\Ebb_t[R_S(\wbf_{t+1}) - R_S^*] \leq & (1 - \mu\eta_{\wbf, t})(R_S(\wbf_t) - R_S^*) + \frac{\eta_{\wbf, t}}{2}  \|\nabla R_S(\wbf_t) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2\\
& + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\\
\leq &  (1 - \mu\eta_{\wbf, t})(R_S(\wbf_t) - R_S^*) + \frac{L^2\eta_{\wbf, t}}{2}  \|\hat{\vbf}_S(\wbf_t) - \vbf_t\|_2^2 + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2),
\end{align*}
where in the second inequality we use that $F_S$ is $L$-smooth. Now taking expectation of both sides yields the claimed bound. The proof is complete.
\end{proof}

The next lemma characterizes the descent behavior of $\vbf_t$.
\begin{lemma}\label{lem:dual-point-coupled}
Assume \textbf{(A2)}  and \textbf{(A3)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies the PL condition with constant $\mu$ and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave.  Let $\hat{\vbf}_S(\wbf) = \arg\max_{\vbf \in \Vcal} F_S(\wbf, \vbf)$. For Algorithm \ref{alg:dp-sgda} and any $\epsilon > 0$, the iterates $\{\wbf_t, \vbf_t\}$ satisfy the following inequality
\begin{align*}
\Ebb[\|\vbf_{t+1} \!-\! \hat{\vbf}_S(\wbf_{t+1})\|_2^2] \leq &((1\!+\!\frac{1}{\epsilon})2 L^4/\rho^2 \eta_{\wbf,t}^2 \!+\! (1\!+\!\epsilon)(1 \!-\! \rho\eta_{\vbf, t})) \Ebb[\|\vbf_t \!-\! \hat{\vbf}_S(\wbf_t)\|_2^2]  \!+\! (1\!+\!\frac{1}{\epsilon})\eta_{\wbf,t}^2L^2/\rho^2 (\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2)\\
& + (1+\frac{1}{\epsilon}) 4 L^2/\rho^2(L+L^2/\rho) \eta_{\wbf,t}^2\Ebb[R_S(\wbf_t) - R_S^*] + (1+\epsilon)\eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2).  
\end{align*}
\end{lemma}

\begin{proof}
By Young's inequality, we have
\begin{equation*}
\|\vbf_{t+1} - \hat{\vbf}_S(\wbf_{t+1})\|_2^2 \leq (1 + \epsilon) \|\vbf_{t+1} - \hat{\vbf}_S(\wbf_t)\|_2^2 + (1 + \frac{1}{\epsilon}) \|\hat{\vbf}_S(\wbf_t) - \hat{\vbf}_S(\wbf_{t+1})\|_2^2.
\end{equation*}
For the term $\|\hat{\vbf}_S(\wbf_t) - \hat{\vbf}_S(\wbf_{t+1})\|_2^2$, since $\hat{\vbf}_S(\cdot)$ is $L/\rho$-Lipschitz by Lemma \ref{lem:primal-smoothness}, taking conditional expectation, we have
\begin{multline*}
\Ebb_t[\|\hat{\vbf}_S(\wbf_{t+1}) - \hat{\vbf}_S(\wbf_t)\|_2^2] \leq L^2/\rho^2\Ebb_t[\|\wbf_{t+1} - \wbf_t\|_2^2] = L^2/\rho^2 \eta_{\wbf,t}^2\Ebb_t[\|\frac{1}{m}\sum_{j=1}^m\nabla_\wbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \xi_t\|_2^2]\\
\leq L^2/\rho^2  \eta_{\wbf,t}^2\|\nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2 + L^2/\rho^2 \eta_{\wbf,t}^2 (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\\
\leq 2 L^2/\rho^2  \eta_{\wbf,t}^2\|\nabla R_S(\wbf_t) - \nabla_\wbf F_S(\wbf_t, \vbf_t)\|_2^2 + 2 L^2/\rho^2  \eta_{\wbf,t}^2\|\nabla R_S(\wbf_t) \|_2^2 + L^2/\rho^2 \eta_{\wbf,t}^2 (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\\
\leq 2 L^4/\rho^2 \eta_{\wbf,t}^2\|\hat{\vbf}_S(\wbf_t) - \vbf_t\|_2^2 + 2 L^2/\rho^2  \eta_{\wbf,t}^2\|\nabla R_S(\wbf_t) \|_2^2 + L^2/\rho^2 \eta_{\wbf,t}^2(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2),
\end{multline*}
where the last step uses the fact that $F_S$ is $L$-smooth. Because $R_S$ is $L + L^2/\rho$-smooth by Lemma \ref{lem:primal-smoothness} we have
$\frac{1}{2(L+L^2/\rho)}\|\nabla R_S(\wbf_t) \|_2^2 \leq R_S(\wbf_t) - R_S^*$.
Therefore
\begin{align*}\label{eq:eq:dual-opt-pt}
\Ebb_t[\|\hat{\vbf}_S(\wbf_{t+1}) - \hat{\vbf}_S(\wbf_t)\|_2^2] \leq &2 L^4/\rho^2 \eta_{\wbf,t}^2\|\hat{\vbf}_S(\wbf_t) - \vbf_t\|_2^2 + 4 L^2/\rho^2 (L+L^2/\rho) \eta_{\wbf,t}^2(R_S(\wbf_t) - R_S^*)\\
& + L^2/\rho^2 \eta_{\wbf,t}^2(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2). \numberthis
\end{align*}
For the term $\|\vbf_{t+1} - \hat{\vbf}_S(\wbf_t)\|_2^2$, by the contraction of projection, we have
\begin{multline*}
\Ebb_t[\|\vbf_{t+1} - \hat{\vbf}_S(\wbf_t)\|_2^2] \leq \Ebb_t[\|\vbf_t + \eta_{\vbf,t} (\frac{1}{m}\sum_{j=1}^m\nabla_\vbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \zeta_t) - \hat{\vbf}_S(\wbf_t)\|_2^2]  \\
\leq \|\vbf_t - \hat{\vbf}_S(\wbf_t)\|_2^2 + 2\eta_{\vbf,t}\Ebb_t[\langle\vbf_t - \hat{\vbf}_S(\wbf_t),  \frac{1}{m}\sum_{j=1}^m\nabla_\vbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) \rangle] + \eta_{\vbf, t}^2 \Ebb_t[\|\frac{1}{m}\sum_{j=1}^m\nabla_\vbf f(\wbf_t, \vbf_t; \zbf_{i_t^j}) + \zeta_t\|_2^2]\\
\leq \|\vbf_t - \hat{\vbf}_S(\wbf_t)\|_2^2 + 2\eta_{\vbf,t}\langle\vbf_t - \hat{\vbf}_S(\wbf_t),  \nabla_\vbf F_S(\wbf_t, \vbf_t)\rangle + \eta_{\vbf, t}^2 \|\nabla_\vbf F_S(\wbf_t, \vbf_t)\|_2^2 + \eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2)\\
\leq (1 - \rho\eta_{\vbf, t})\|\vbf_t - \hat{\vbf}_S(\wbf_t)\|_2^2 + 2\eta_{\vbf,t}(F_S(\wbf_t, \vbf_t) - F_S(\wbf_t, \hat{\vbf}_S(\wbf_t))) + \eta_{\vbf, t}^2 \|\nabla_\vbf F_S(\wbf_t, \vbf_t)\|_2^2 + \eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2),
\end{multline*}
where in the last inequality we use that $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Since $F_S$ is $L$-smooth, by choosing $\eta_{\vbf, t} \leq 1/L$, we have
\begin{align*}\label{eq:dual-pt-conv}
\Ebb_t[\|\vbf_{t+1} \!-\! \hat{\vbf}_S(\wbf_t)\|_2^2] \leq & (1 \!-\! \rho\eta_{\vbf, t})\|\vbf_t \!-\! \hat{\vbf}_S(\wbf_t)\|_2^2 \!-\! \frac{\eta_{\vbf,t}}{L}\|\nabla_\vbf F_S(\wbf_t, \vbf_t)\|_2^2 \!+\! \eta_{\vbf, t}^2 \|\nabla_\vbf F_S(\wbf_t, \vbf_t)\|_2^2 \!+\! \eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} \!+\! d\sigma_\vbf^2)\\
\leq & (1 - \rho\eta_{\vbf, t})\|\vbf_t - \hat{\vbf}_S(\wbf_t)\|_2^2 + \eta_{\vbf, t}^2(\frac{B_\vbf^2}{m} + d\sigma_\vbf^2). \numberthis
\end{align*}
Combining \eqref{eq:dual-pt-conv} and \eqref{eq:eq:dual-opt-pt} we have
\begin{align*}
\Ebb_t[\|\vbf_{t+1} \!-\! \hat{\vbf}_S(\wbf_{t+1})\|_2^2] \leq &((1\!+\!\frac{1}{\epsilon})2 L^4/\rho^2 \eta_{\wbf,t}^2 \!+\! (1\!+\!\epsilon)(1 \!-\! \rho\eta_{\vbf, t})) \|\vbf_t \!-\! \hat{\vbf}_S(\wbf_t)\|_2^2 \!+\! (1\!+\!\frac{1}{\epsilon})\eta_{\wbf,t}^2L^2/\rho^2  (\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2)\\
& + (1+\frac{1}{\epsilon}) 4 L^2/\rho^2(L+L^2/\rho) \eta_{\wbf,t}^2(R_S(\wbf_t) - R_S^*) + (1+\epsilon)\eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2).
\end{align*}
% Since $\eta_{\vbf, t} \leq 1/L$, we can pick $\epsilon = \frac{\rho\eta_{\vbf, t}}{2(1 - \rho\eta_{\vbf, t})}$. Then we have $1 + \frac{1}{\epsilon} \leq \frac{2}{\rho \eta_{\vbf,t}}$ and 
% \begin{align*}
% \Ebb[\|\vbf_{t+1} - \hat{\vbf}_S(\wbf_{t+1})\|_2^2] \leq &(1 - \frac{\rho\eta_{\vbf, t}}{2 } + \frac{4 L^4/\rho^3 \eta_{\wbf,t}^2}{ \eta_{\vbf, t}}) \|\vbf_t - \hat{\vbf}_S(\wbf_t)\|_2^2 + \frac{2L^2/\rho^3\eta_{\wbf,t}^2 G_\wbf^2}{\eta_{\vbf, t}}\\
% & + \frac{8 L^2/\rho^3(L+L^2/\rho) \eta_{\wbf,t}^2}{\eta_{\vbf, t}}(R_S(\wbf_t) - R_S(\wbf^*))  + \frac{(2 - \rho\eta_{\vbf, t})\eta_{\vbf, t}^2 G_\vbf^2}{2 - 2\rho\eta_{\vbf,t}}  
% \end{align*}
Taking expectation on both sides yields the desired bound. The proof is complete.
\end{proof}

\begin{lemma}\label{lem:coupled-recursive}
Assume \textbf{(A2)}  and \textbf{(A3)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies PL condition with constant $\mu$ and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Define $a_t = \Ebb[R_S(\wbf_t) - R_S(\wbf^*)]$ and $b_t = \Ebb[\|\hat{\vbf}_S(\wbf_t) - \vbf_t\|_2^2]$. For Algorithm \ref{alg:dp-sgda}, if $\eta_{\wbf, t} \leq 1/(L+L^2/\rho)$ and $\eta_{\vbf, t} \leq 1/L$, then for any non-increasing sequence $\{\lambda_t>0\}$ and $\epsilon > 0$,  the iterates $\{\wbf_t, \vbf_t\}_{t \in [T]}$ satisfy the following inequality 
\begin{multline*}
a_{t+1} + \lambda_{t+1} b_{t+1} \leq k_{1, t} a_t + k_{2, t} \lambda_t b_t\\
+  \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) + 2(1 + \frac{1}{\epsilon})\lambda_t L^2/\rho^2\eta_{\wbf,t}^2 (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) + \lambda_t(1 + \epsilon)\eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2),
\end{multline*}
where 
\begin{align*}
k_{1,t} = & (1-\mu \eta_{\wbf, t}) + \lambda_t (1 + \frac{1}{\epsilon})4L^2/\rho^2(L+L^2/\rho)\eta_{\wbf,t}^2,  \\
k_{2,t} = & \frac{L^2\eta_{\wbf, t}}{2\lambda_t} + (1 + \epsilon)(1 - \rho\eta_{\vbf, t}) + (1 + \frac{1}{\epsilon})2 L^4/\rho^2 \eta_{\wbf,t}^2.
\end{align*}
\end{lemma}

\begin{proof}
Combining Lemma \ref{lem:primal-gap-coupled} and Lemma \ref{lem:dual-point-coupled}, for any $\lambda_{t+1} > 0$ we have
\begin{align*}
a_{t+1} + \lambda_{t+1} b_{t+1} \leq & ((1-\mu \eta_{\wbf, t}) + \lambda_{t+1} (1 + \frac{1}{\epsilon})4L^2/\rho^2(L+L^2/\rho)\eta_{\wbf,t}^2 )a_t\\
& + (\frac{L^2\eta_{\wbf, t}}{2} + \lambda_{t+1} (1 + \epsilon)(1 - \rho\eta_{\vbf, t}) + \lambda_{t+1}(1 + \frac{1}{\epsilon})2 L^4/\rho^2 \eta_{\wbf,t}^2) b_t \\
& \!+\! \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2) \!+\! 2(1 \!+\! \frac{1}{\epsilon})\lambda_{t+1} L^2/\rho^2\eta_{\wbf,t}^2 (\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2) \!+\! \lambda_{t+1}(1 \!+\! \epsilon)\eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} \!+\! d\sigma_\vbf^2) \\
\leq & ((1-\mu \eta_{\wbf, t}) + \lambda_t (1 + \frac{1}{\epsilon})4L^2/\rho^2(L+L^2/\rho)\eta_{\wbf,t}^2 )a_t\\
& + (\frac{L^2\eta_{\wbf, t}}{2} + \lambda_t (1 + \epsilon)(1 - \rho\eta_{\vbf, t}) + \lambda_t(1 + \frac{1}{\epsilon})2 L^4/\rho^2 \eta_{\wbf,t}^2) b_t \\
& \!+\! \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2) \!+\! 2(1 \!+\! \frac{1}{\epsilon})\lambda_t L^2/\rho^2\eta_{\wbf,t}^2 (\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2) \!+\! \lambda_t(1 \!+\! \epsilon)\eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} \!+\! d\sigma_\vbf^2) \\
= & ((1-\mu \eta_{\wbf, t}) + \lambda_t (1 + \frac{1}{\epsilon})4L^2/\rho^2(L+L^2/\rho)\eta_{\wbf,t}^2 )a_t \\
& + \lambda_t (\frac{L^2\eta_{\wbf, t}}{2\lambda_t} + (1 + \epsilon)(1 - \rho\eta_{\vbf, t}) + (1 + \frac{1}{\epsilon})2 L^4/\rho^2 \eta_{\wbf,t}^2) b_t\\
& +\! \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2) \!+\! 2(1 \!+\! \frac{1}{\epsilon})\lambda_t L^2/\rho^2\eta_{\wbf,t}^2 (\frac{B_\wbf^2}{m} \!+\! d\sigma_\wbf^2) \!+\! \lambda_t(1 \!+\! \epsilon)\eta_{\vbf, t}^2 (\frac{B_\vbf^2}{m} \!+\! d\sigma_\vbf^2).
\end{align*}
Here, in the second inequality, we used $\lambda_{t+1} \leq \lambda_t$. The proof is complete.
\end{proof}

% \begin{theorem}\label{thm:sgda-conv-1/3}
% For SGDA, if $\eta_{\wbf, t} = \Ocal(\frac{1}{t})$ and $\eta_{\vbf, t} = \Ocal(\frac{1}{t^{2/3}})$, then for any $\lambda > 0$, the iterates $\{\wbf_t, \vbf_t\}_{t \in [T]}$ satisfies the following inequality
% \begin{align*}
% \Ebb[R_S(\Wbf_T) - R_S(\wbf^*)] + \lambda \Ebb[\|\hat{\vbf}_S(\wbf_t) - \vbf_T\|_2^2] = \Ocal(\frac{1}{T^{1/3}}). 
% \end{align*}
% \end{theorem}

% \begin{proof}
% Since $\eta_{\vbf, t} \leq 1/L$, we can pick $\epsilon = \frac{\rho\eta_{\vbf, t}}{2(1 - \rho\eta_{\vbf, t})}$. Then we have $(1 + \epsilon)(1 - \rho\eta_{\vbf, t})=1 - \frac{ \rho\eta_{\vbf, t}}{2}$ and $1 + \frac{1}{\epsilon} \leq \frac{2}{\rho \eta_{\vbf,t}}$. Therefore
% \begin{align*}
% k_{1, t} \leq & (1-\mu \eta_{\wbf, t}) + \lambda\frac{8 L^2/\rho^2(L+L^2/\rho)\eta_{\wbf,t}^2}{\rho \eta_{\vbf,t}}  \\
% k_{2, t} \leq & \frac{L^2\eta_{\wbf, t}}{2\lambda} + 1 - \frac{\rho\eta_{\vbf, t}}{2} + \frac{4 L^4/\rho^2 \eta_{\wbf,t}^2}{\rho\eta_{\vbf,t}}    
% \end{align*}
% If we choose $\eta_{\wbf, t} \leq \min\{\frac{\mu\eta_{\vbf,t}}{16\lambda \kappa^2(\kappa+\kappa^2)}, \frac{\lambda(1+\kappa)\rho\eta_{\vbf,t}}{2(1+\kappa)(\lambda\mu + L^2)+\mu L)}\}$, then we have
% \begin{align*}
% \max\{k_{1, t}, k_{2, t}\} \leq 1 - \frac{\mu\eta_{\wbf,t}}{2}
% \end{align*}
% By Lemma \ref{lem:coupled-recursive}, we have
% \begin{align*}
% a_{t+1} + \lambda b_{t+1} \leq (1 - \frac{\mu\eta_{\wbf,t}}{2})(a_t + \lambda b_t) +   \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}G_\wbf^2 + \frac{4\lambda L^2/\rho^2\eta_{\wbf,t}^2}{\rho \eta_{\vbf,t}} G_\wbf^2 + \frac{\lambda(2-\rho\eta_{\vbf,t})\eta_{\vbf, t}^2}{2(1 - \rho\eta_{\vbf,t})} G_\vbf^2   
% \end{align*}
% \end{proof}

We are now ready to state the convergence theorem of Algorithm \ref{alg:dp-sgda}. 

\begin{theorem}[Theorem \ref{thm:sgda-primal-opt} restated]\label{thm:sgda-conv}
Assume \textbf{(A2)}  and \textbf{(A3)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies PL condition with constant $\mu$ and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Assume $\mu\leq 2L^2$ and let $\kappa = \frac{L}{\rho}$. For Algorithm \ref{alg:dp-sgda}, if $\eta_{\wbf, t} = \Ocal(\frac{1}{\mu t})$ and $\eta_{\vbf, t} = \Ocal(\frac{\kappa^2\max\{1, \sqrt{\kappa/\mu}\}}{\mu t^{2/3}})$, then the iterates $\{\wbf_t, \vbf_t\}_{t \in [T]}$ satisfy the following inequality
\begin{align*}\label{eq:sgda-primal-opt}
\Ebb[R_S(\wbf_{T+1}) - R_S^*] = \Ocal(\min\Big\{\frac{1}{L}, \frac{1}{\mu}\Big\}(\frac{B_{\wbf}^2/m + d\sigma_\wbf^2}{T^{2/3}}) + \max\Big\{1, \sqrt{\frac{L\kappa}{\mu}}\Big\}\frac{L\kappa^3}{\mu^2}(\frac{B_{\vbf}^2/m+d\sigma_\vbf^2}{T^{2/3}})). \numberthis
\end{align*}
Furthermore, if $\sigma_\wbf, \sigma_\vbf$ are given by \eqref{eq:sigma-sigma}, we have
\begin{align*}\label{eq:D82}
& \Ebb[R_S(\wbf_{T+1}) - R_S^*]\\
= & \Ocal(\min\Big\{\frac{1}{L}, \frac{1}{\mu}\Big\}(\frac{B_{\wbf}^2}{mT^{2/3}} + \frac{G_{\wbf}^2d T^{1/3}\log(1/\delta)}{n^2\epsilon^2}) + \max\Big\{1, \sqrt{\frac{L\kappa}{\mu}}\Big\}\frac{L\kappa^3}{\mu^2}(\frac{B_{\vbf}^2}{mT^{2/3}} + \frac{G_{\vbf}^2d T^{1/3}\log(1/\delta)}{n^2\epsilon^2})).\numberthis
\end{align*}
% \begin{align*}
% a_T + \frac{4L\kappa\eta_{\wbf, T}}{\eta_{\vbf, T}} b_T \leq & \Big(\prod_{t=1}^T (1 - \frac{\mu\eta_{\wbf,t}}{2})\Big)(a_1 + \frac{4L\kappa\eta_{\wbf, 1}}{\eta_{\vbf, 1}} b_1) + \sum_{t=1}^T \Big(\prod_{j=t+1}^T (1 - \frac{\mu\eta_{\wbf,j}}{2})\Big) \\
% & \times\Big( \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(G_\wbf^2 + d\sigma_\wbf^2) + \frac{16L^4/\rho^3\eta_{\wbf,t}^3}{\rho \eta_{\vbf,t}^2} (G_\wbf^2 + d\sigma_\wbf^2) + \frac{4L^2(2-\rho\eta_{\vbf,t})\eta_{\wbf,t}\eta_{\vbf, t}}{2\rho(1 - \rho\eta_{\vbf,t})} (G_\vbf^2 + d\sigma_\vbf^2)\Big)
% \end{align*}
\end{theorem}

\begin{proof}
Since $\eta_{\vbf, t} \leq 1/L$, we can pick $\epsilon = \frac{\rho\eta_{\vbf, t}}{2(1 - \rho\eta_{\vbf, t})}$. Then we have $(1 + \epsilon)(1 - \rho\eta_{\vbf, t})=1 - \frac{ \rho\eta_{\vbf, t}}{2}$ and $1 + \frac{1}{\epsilon} \leq \frac{2}{\rho \eta_{\vbf,t}}$. Therefore Lemma \ref{lem:coupled-recursive} can be simplified as 
\begin{align*}
k_{1, t} \leq & (1-\mu \eta_{\wbf, t}) + \lambda_t\frac{8 L^2/\rho^2(L+L^2/\rho)\eta_{\wbf,t}^2}{\rho \eta_{\vbf,t}},  \\
k_{2, t} \leq & \frac{L^2\eta_{\wbf, t}}{2\lambda_t} + 1 - \frac{\rho\eta_{\vbf, t}}{2} + \frac{4 L^4/\rho^2 \eta_{\wbf,t}^2}{\rho\eta_{\vbf,t}}.
\end{align*}
If we choose $\lambda_t = \frac{4L^2\eta_{\wbf,t}}{\rho\eta_{\vbf,t}}$  and $\eta_{\wbf, t} \leq \min\{\frac{\sqrt{\mu}}{8 \kappa^2\sqrt{L+L^2/\rho}}, \frac{1}{4\sqrt{2}\kappa^2}\}\eta_{\vbf, t}$, then further we have $k_{1, t} \leq 1 - \frac{\mu\eta_{\wbf,t}}{2}$ and $k_{2, t} \leq 1 - \frac{\rho\eta_{\vbf, t}}{4}$. By Lemma \ref{lem:coupled-recursive} we have
\begin{align*}
a_{t+1} + \lambda_{t+1} b_{t+1} \leq & (1 - \min\{\frac{\mu}{2}, L^2\}\eta_{\wbf, t})(a_t + \lambda_t b_t) + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\\     
&  + \frac{16L^4/\rho^3\eta_{\wbf,t}^3}{\rho \eta_{\vbf,t}^2} (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) + \frac{4L^2(2-\rho\eta_{\vbf,t})\eta_{\wbf,t}\eta_{\vbf, t}}{2\rho(1 - \rho\eta_{\vbf,t})} (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2)\\
\leq & (1 - \frac{\mu\eta_{\wbf, t}}{2})(a_t + \lambda_t b_t) + \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\\     
&  + \frac{16L^4/\rho^3\eta_{\wbf,t}^3}{\rho \eta_{\vbf,t}^2} (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) + \frac{4L^2(2-\rho\eta_{\vbf,t})\eta_{\wbf,t}\eta_{\vbf, t}}{2\rho(1 - \rho\eta_{\vbf,t})} (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2), 
\end{align*}
where we used $\mu \leq 2L^2$. Taking $\eta_{\wbf, t} = \frac{2}{\mu t}$ and $\eta_{\vbf, t} = \max\{8 \kappa^2\sqrt{(L+L^2/\rho)/\mu}, 4\sqrt{2}\kappa^2\}\frac{2}{\mu t^{2/3}}$ and multiplying the preceding inequality with $t$ on both sides,  there holds
\begin{multline*}
t(a_{t+1} + \lambda_{t+1}b_{t+1}) \leq (t-1) (a_t + \lambda_t b_t) +   \frac{2(L+L^2/\rho)}{\mu^2 t}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) \\
+ \frac{32L^4/\rho^3\min\{\frac{\sqrt{\mu}}{8 \kappa^2\sqrt{L+L^2/\rho}}, \frac{1}{4\sqrt{2}\kappa^2}\}^2}{\mu\rho t^{2/3}} (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) + \frac{16L^2\max\{8 \kappa^2\sqrt{(L+L^2/\rho)/\mu}, 4\sqrt{2}\kappa^2\}}{2\mu^2\rho t^{2/3}} (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2).       
\end{multline*}
Applying the preceding inequality inductively from $t=1$ to $T$, we have
\begin{align*}
T(a_{T+1} + \lambda_{T+1}b_{T+1}) \leq & \frac{2(L+L^2/\rho)}{\mu^2}(\frac{B_\wbf^2}{m} + d\sigma_\wbf^2)\log(T) + \frac{32L^4/\rho^3\min\{\frac{\sqrt{\mu}}{8 \kappa^2\sqrt{L+L^2/\rho}}, \frac{1}{4\sqrt{2}\kappa^2}\}^2}{\mu\rho} (\frac{B_\wbf^2}{m} + d\sigma_\wbf^2) T^{1/3}\\
& + \frac{16L^2\max\{8 \kappa^2\sqrt{(L+L^2/\rho)/\mu}, 4\sqrt{2}\kappa^2\}}{2\mu^2\rho} (\frac{B_\vbf^2}{m} + d\sigma_\vbf^2)    T^{1/3}.   
\end{align*}
Consequently, 
\begin{align*}\label{eq:sgda-conv-before-sigma}
\Ebb[R_S(\wbf_{T+1}) - R_S^*] \leq & a_{T+1} + \lambda_{T+1}b_{T+1}\\
\leq & \frac{2(L+L^2/\rho)(B_\wbf^2/m + d\sigma_\wbf^2)}{\mu^2}\frac{\log(T)}{T}
\!+\! \frac{32 (B_\wbf^2/m \!+\! d\sigma_\wbf^2)L^4\!/\!\rho^3\!\min\{\frac{\sqrt{\mu}}{8 \kappa^2\!\sqrt{L\!+\!L^2/\rho}}, \frac{1}{4\sqrt{2}\kappa^2}\}^2}{\mu\rho} \frac{1}{T^{2\!/\!3}}\\
& \!+\! \frac{16 (B_\vbf^2/m \!+\! d\sigma_\vbf^2)L^2\!\max\{8 \kappa^2\!\sqrt{(L\!+\!L^2\!/\!\rho)/\mu}, 4\sqrt{2}\kappa^2\}}{2\mu^2\rho} \frac{1}{T^{2\!/\!3}}. \numberthis
\end{align*}
% Applying the preceding inequality recursively from $t=1$ to $T$, we have
% \begin{align*}
% a_{T+1} + \lambda_{T+1} b_{T+1} \leq & \Big(\prod_{t=1}^T (1 - \frac{\mu\eta_{\wbf,t}}{2})\Big)(a_1 + \lambda_1 b_1) + \sum_{t=1}^T \Big(\prod_{j=t+1}^T (1 - \frac{\mu\eta_{\wbf,j}}{2})\Big) \\
% & \times\Big( \frac{(L+L^2/\rho)\eta_{\wbf, t}^2}{2}(G_\wbf^2 + d\sigma_\wbf^2) + \frac{16L^4/\rho^3\eta_{\wbf,t}^3}{\rho \eta_{\vbf,t}^2} (G_\wbf^2 + d\sigma_\wbf^2) + \frac{4L^2(2-\rho\eta_{\vbf,t})\eta_{\wbf,t}\eta_{\vbf, t}}{2\rho(1 - \rho\eta_{\vbf,t})} (G_\vbf^2 + d\sigma_\vbf^2)\Big)  
% \end{align*}
Therefore, the estimation \eqref{eq:sgda-primal-opt} follows from the fact that  $\kappa= L/\rho.$ 


The result in Theorem \ref{thm:sgda-primal-opt} follows by observing $\max\Big\{1, \sqrt{\frac{L\kappa}{\mu}}\Big\}\frac{L\kappa^3}{\mu^2} \geq \min\Big\{\frac{1}{L}, \frac{1}{\mu}\Big\}$. Substituting  the values of $\sigma_\wbf, \sigma_\vbf$, i.e.,  $\sigma_\wbf \!=\! \frac{c_2 G_\wbf \sqrt{T\log(\frac{1}{\delta})}}{n\epsilon}$ and $  \sigma_\vbf \!=\! \frac{c_3 G_\vbf \sqrt{T\log(\frac{1}{\delta})}}{n\epsilon}$,  into \eqref{eq:sgda-primal-opt} yields the desired estimation  \eqref{eq:D82}.  
\end{proof}

\subsection{Proof of Theorem \ref{thm:sgda-primal-gen} (Generalization Error)}\label{sec:sgda-primal-gen}

We first focus on the generalization error $\Ebb[R(\wbf_T) - R_S(\wbf_T)]$. To this end, we introduce a lemma that bridges generalization and uniform argument stability. We modify the lemma so that it suits our needs.

\begin{lemma}[\citep{lei2021stability-supp}]\label{lem:stab-gen}
Let $A$ be a randomized algorithm and $\varepsilon>0$. Suppose that for all neighboring datasets $S, S'$, there holds
\begin{align*}
\Ebb_A[\|A_\wbf(S) - A_\wbf(S')\|_2] \leq \varepsilon.    
\end{align*}
Furthermore, if the function $F(\wbf,\cdot)$ is $\rho$-strongly-concave and Assumptions \ref{ass:lipschitz}, \textbf{(A3)}  hold, then the primal generalization error satisfies
  \begin{align*}
      \Ebb_{S,A}\Big[R(A_{\wbf}(S))-R_S(A_{\wbf}(S))\Big]\leq \big(1+L/\rho\big)G_\wbf\varepsilon.
  \end{align*}%\begin{equation}\label{stab-gen-b}
%\end{equation}
\end{lemma}

The next proposition states that the $\vbf$-component of the saddle points is unique when $F_S(\wbf, \cdot)$ is strongly concave.

\begin{proposition}\label{lem:unique-v}
Assume $F_S(\wbf, \cdot)$ is $\rho$-strongly concave with $\rho > 0$. Let $(\hat{\wbf}_S, \hat{\vbf}_S)$ and $(\hat{\wbf}'_S, \hat{\vbf}'_S)$ be two saddle points of $F_S$. Then we have $\hat{\vbf}_S = \hat{\vbf}'_S$.
\end{proposition}

\begin{proof}
Given $\hat{\wbf}_S$, by the strong concavity, we have
\begin{align*}
F_S(\hat{\wbf}_S, \hat{\vbf}_S) \geq  F_S(\hat{\wbf}_S, \hat{\vbf}'_S) + \langle \nabla_\vbf F_S(\hat{\wbf}_S, \hat{\vbf}_S) , \hat{\vbf}_S - \hat{\vbf}'_S\rangle + \frac{\rho}{2}\|\hat{\vbf}_S - \hat{\vbf}'_S\|_2^2 .
\end{align*}
Since $(\hat{\wbf}_S, \hat{\vbf}_S)$ is a saddle point of $F_S$, it implies $\hat{\vbf}_S$ attains maximum of $F_S(\hat{\wbf}_S, \cdot)$. By the first order optimality we know $\langle \nabla_\vbf F_S(\hat{\wbf}_S, \hat{\vbf}_S) , \hat{\vbf}_S - \hat{\vbf}'_S\rangle \geq 0$ and therefore 
\begin{align*}\label{eq:strong-saddle-pt}
F_S(\hat{\wbf}_S, \hat{\vbf}_S) \geq  F_S(\hat{\wbf}_S, \hat{\vbf}'_S) +  \frac{\rho}{2}\|\hat{\vbf}_S - \hat{\vbf}'_S\|_2^2 \geq F_S(\hat{\wbf}'_S, \hat{\vbf}'_S) +  \frac{\rho}{2}\|\hat{\vbf}_S - \hat{\vbf}'_S\|_2^2, \numberthis
\end{align*}
where in the second inequality we used the fact that $(\hat{\wbf}'_S, \hat{\vbf}'_S)$ is also a saddle point of $F_S$. Similarly, given $\hat{\wbf}'_S$ we can show
\begin{align}\label{eq:strong-saddle-pt2}
F_S(\hat{\wbf}'_S, \hat{\vbf}'_S) \geq  F_S(\hat{\wbf}_S, \hat{\vbf}_S) +  \frac{\rho}{2}\|\hat{\vbf}_S - \hat{\vbf}'_S\|_2^2.
\end{align}
Adding   \eqref{eq:strong-saddle-pt} and \eqref{eq:strong-saddle-pt2} together implies that $\rho\|\hat{\vbf}_S - \hat{\vbf}'_S\|_2^2 \le 0.$
This implies $\hat{\vbf}_S = \hat{\vbf}'_S$ which  completes the proof.
\end{proof}


Recall that $\pi_S:\Wcal \rightarrow \Wcal$ is the projection onto the set of saddle points $\Omega_S = \{\hat{\wbf}_S: (\hat{\wbf}_S, \hat{\vbf}_S) \in \arg\min_\wbf\max_\vbf F_S(\wbf, \vbf)\}$, i.e., $\pi_S(\wbf) = \arg\min_{\hat{\wbf}_S \in \Omega_S} \frac{1}{2}\|\wbf - \hat{\wbf}_S\|_2^2$. Proposition \ref{lem:unique-v} makes sure the projection is well-defined. The next lemma shows that the PL condition implies the quadratic growth (QG) condition. The proof follows straightforwardly from \citet{karimi2016linear-supp} and we omit it for brevity. % Below we give a self-contained proof. 
\begin{lemma}\label{lem:pl-to-qg}
Suppose the function $F_S(\cdot, \vbf)$ satisfies $\mu$-PL condition. Then $F_S$ satisfies the QG condition with respect to $\wbf$ with constant $4\mu$, i.e.
\begin{equation*}
F_S(\wbf, \vbf) -  F_S(\pi_S(\wbf), \vbf) \geq 2\mu\|\wbf - \pi_S(\wbf)\|_2^2, \quad \forall \vbf \in \Vcal
\end{equation*}
% where $(\pi_S(\wbf), \pi_S(\vbf)) \in \arg\min_\wbf\max_\vbf F_S(\wbf,\vbf)$ given $\wbf, \vbf$. 
\end{lemma}

% \begin{proof}
% Define the function $G_S(\wbf) = \sqrt{F_S(\wbf, \pi_S(\vbf)) - \min_{\wbf} F_S(\wbf, \pi_S(\vbf))}$. Then $G_S(\wbf) \geq 0$. Furthermore, by the PL-condition, for any $\wbf \not\in \arg\min_\wbf F_S(\wbf, \pi_S(\vbf))$, we have
% \begin{align*}
% \|\nabla G_S(\wbf)\|_2^2 = \|\frac{1}{\sqrt{F_S(\wbf, \pi_S(\vbf)) - \min_\wbf F_S(\wbf, \pi_S(\vbf))}} \nabla_\wbf F_S(\wbf, \pi_S(\vbf))\|_2^2     \geq \frac{\mu}{2}
% \end{align*}
% which implies $\|\nabla G_S(\wbf)\|_2 \geq \sqrt{\frac{\mu}{2}}$. Consider the differentially equation
% \begin{align*}
% \frac{\mathrm{d} \wbf(t)}{\mathrm{d}t} = -\nabla G_S(\wbf(t)), \quad \wbf(t=0) = \wbf_0
% \end{align*}
% for $\wbf(t) \not\in \arg\min F_S(\wbf, \pi_S(\vbf))$. We claim that there exists some $T>0$ such that $\wbf_T = \wbf(t=T) \in \arg\min F_S(\wbf, \pi_S(\vbf))$. Therefore, the length of the orbit $L(\wbf_0)$ is given by 
% \begin{align*}
% L(\wbf_0) = \int_0^T \|\frac{\mathrm{d} \wbf(t)}{\mathrm{d}t}\|_2   \mathrm{d}t =  \int_0^T \|\nabla G_S(\wbf(t))\|_2\mathrm{d} t \geq \|\wbf_0 - \pi_S(\wbf_0)\|_2
% \end{align*}
% where the last inequality is due to the orbit must be at least as long as the projection distance. Now by the gradient theorem for line integrals, we have
% \begin{align*}
% G_S(\wbf_0) - G_S(\wbf_T) = &  \int_{\wbf_T}^{\wbf_0}     \langle\nabla G_S(\wbf),  \mathrm{d}\wbf\rangle\\
% = & - \int_{\wbf_0}^{\wbf_T}     \langle\nabla G_S(\wbf),  \mathrm{d}\wbf\rangle\\
% = & - \int_0^T    \langle\nabla G_S(\wbf(t)),  \frac{\mathrm{d} \wbf(t)}{\mathrm{d}t}\rangle \mathrm{d} t\\
% = &  \int_0^T  \|\nabla G_S((\wbf(t)))\|_2^2 \mathrm{d} t\\
% \geq & \sqrt{\frac{\mu}{2}} \int_0^T  \|\nabla G_S((\wbf(t)))\|_2 \mathrm{d} t\\
% \geq & \sqrt{\frac{\mu}{2}}\|\wbf_0 - \pi_S(\wbf_0)\|_2
% \end{align*}
% Since $G_S(\wbf_T) = 0$, this proves the desired bound. Note that the claim holds because $G_S$ and $\|\nabla G_S((\wbf(t)))\|_2$ are both positive and bounded from below, $\wbf(t)$ will move towards the optimal set. Furthermore, we have
% \begin{align*}
% G_S(\wbf_0) - G_S(\wbf_T)=  \int_0^T  \|\nabla G_S((\wbf(t)))\|_2^2 \mathrm{d}t \geq \frac{\mu}{2}T.
% \end{align*}
% As $ G_S(\wbf_T) \geq 0$, we can find such $T$ when $T\leq 2G_S(\wbf_0)/\mu$.
% \end{proof}

% The next technical assumption is imposed as a realizability condition, which was firstly proposed in \citep{charles2018stability}.
% \begin{assumption}\label{ass:unique-projection}
% For an (randomized) algorithm $A$, let $(\hat{\wbf}_S, \hat{\vbf}_S) = (\pi_S(A_\wbf(S)), \hat{\vbf}_S)$ and $(\hat{\wbf}_{S'}, \hat{\vbf}_{S'}) = (\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S)$. Then these two empirical risk saddle points for $F_S$ and $F_{S'}$ satisfy $\pi_S(\hat{\wbf}_{S'}) = \hat{\wbf}_S$ and $\pi_{S'}(\hat{\wbf}_S) = \hat{\wbf}_{S'}$.
% \end{assumption}  
With the help of Assumption \textbf{(A4)} and the preceding lemmas, we can derive the uniform argument stability.  

\begin{lemma}\label{lem:pl-stability}%[Lemma \ref{lem:pl-stability} 
Assume \textbf{(A1)}, \textbf{(A3)} and \textbf{(A4)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies PL condition with constant $\mu$ and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Let $A$ be a randomized algorithm. If for any $S$, $\Ebb[\|A_\wbf(S) - \pi_S(A_\wbf(S))\|_2] = \Ocal(\varepsilon_A)$, then we have
\begin{equation*}
\Ebb[\|A_\wbf(S) - A_\wbf(S')\|_2] \leq \Ocal(\varepsilon_A) + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}.
\end{equation*}
\end{lemma}

\begin{proof} 
Let $(\pi_S(A_\wbf(S)), \hat{\vbf}_S) \in \arg\min_\wbf\max_\vbf F_S(\wbf,\vbf)$, and let $(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_{S'})$ be defined analogously for $F_{S'}$. By the triangle inequality we have
\begin{align*}
\Ebb[\|A_\wbf(S) \!-\! A_\wbf(S')\|_2] \leq & \Ebb[\|A_\wbf(S) \!-\! \pi_S(A_\wbf(S))\|_2] \!+\! \|\pi_S(A_\wbf(S)) \!-\! \pi_{S'}(A_\wbf(S'))\|_2 \!+\! \Ebb[\|A_\wbf(S') \!-\! \pi_{S'}(A_\wbf(S'))\|_2]\\
= & \|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2 + \Ocal(\varepsilon_A).
\end{align*}
Note that $\pi_S(A_\wbf(S)) \in \arg\min_{\wbf \in \Wcal} F_S(\wbf, \hat{\vbf}_S)$, and by Assumption \textbf{(A4)} we know that $\pi_S(A_\wbf(S))$ is the closest optimal point of $F_S$ to $\pi_{S'}(A_\wbf(S'))$. Since $\hat{\vbf}_S$ is fixed, Lemma \ref{lem:pl-to-qg} yields
\begin{align*}
2\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 \leq &  F_S(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S) - F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_S).
\end{align*}
Similarly, we have 
\begin{align*}
2\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 \leq & F_{S'}(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'}) - F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_{S'}).
\end{align*}
Summing up the above two inequalities we have
\begin{align*}\label{eq:qg-two-times}
4\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 \leq & F_S(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S) - F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_S) \\
& + F_{S'}(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'}) - F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_{S'}). \numberthis
\end{align*}
On the other hand, by the $\rho$-strong concavity of $F_S(\wbf, \cdot)$ and $\hat{\vbf}_S = \arg\max_{\vbf\in \Vcal} F_S(\pi_S(A_\wbf(S)), \vbf)$, we have
\begin{align*}
\frac{\rho}{2}\|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2^2 \leq & F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_S) - F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'}).
\end{align*}
Similarly, we have
\begin{align*}
\frac{\rho}{2}\|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2^2 \leq & F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_{S'}) - F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S).
\end{align*}
Summing up the above two inequalities we have
\begin{align*}\label{eq:sc-two-times}
\rho\|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2^2 \leq & F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_S) - F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'})\\
& + F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_{S'}) - F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S). \numberthis
\end{align*}
Summing up \eqref{eq:qg-two-times} and \eqref{eq:sc-two-times} and rearranging terms, we have
\begin{align*}
& 4\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 + \rho\|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2^2\\
\leq & F_S(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S) - F_{S'}(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S) + F_{S'}(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'}) - F_S(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'})\\
= & \frac{1}{n} \big(f(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S; \zbf) - f(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_S; \zbf') + f(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'}; \zbf') - f(\pi_S(A_\wbf(S)), \hat{\vbf}_{S'}; \zbf)\big) \\
\leq & \frac{2G_\wbf}{n} \|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2 +  \frac{2G_\vbf}{n} \|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2 \\
\leq & \frac{1}{n}\sqrt{\frac{G_\wbf^2}{\mu} + \frac{4G_\vbf^2}{\rho}} \times \sqrt{4\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 + \rho\|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2^2},
\end{align*}
where the second inequality is due to the Lipschitz continuity of $f$ and the third inequality is due to the Cauchy--Schwarz inequality. Therefore
\begin{align*}
2\sqrt{\mu}\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2 \leq \sqrt{4\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2 ^2 + \rho\|\hat{\vbf}_S - \hat{\vbf}_{S'}\|_2^2} \leq \frac{1}{n}\sqrt{\frac{G_\wbf^2}{\mu} + \frac{4G_\vbf^2}{\rho}}.
\end{align*}
The proof is complete.
\end{proof}

% \begin{lemma}[Substitute for Lemma \ref{lem:pl-stability}]
% Assume \textbf{(A1)}  and \textbf{(A3)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies PL condition with constant $\mu$ and $F_S(\wbf, \cdot)$ is $\rho$-strongly concave. Assume for any $\wbf, \vbf, \zbf$, there exists $C>0$ such that $|f(\wbf, \vbf; \zbf)|\leq C$. Let $A$ be a randomized algorithm. If for any $S$, $\Ebb[\|A_\wbf(S) - \pi_S(A_\wbf(S))\|_2] = \Ocal(\varepsilon_A)$, then we have
% \begin{equation*}
% \Ebb[\|A_\wbf(S) - A_\wbf(S')\|_2] \leq \Ocal(\varepsilon_A) + \sqrt{\frac{C}{\mu n}}.
% \end{equation*}
% \end{lemma}

% \begin{proof}
% Let $(\pi_S(A_\wbf(S)), \hat{\vbf}_S) \in \arg\min_\wbf\max_\vbf F_S(\wbf,\vbf)$ and $(\pi_{S'}(A_\wbf(S')), \hat{\vbf}_{S'})$ defined in the similar way. By triangle inequality we have
% \begin{align*}
% \|A_\wbf(S) - A_\wbf(S')\|_2 \leq & \|A_\wbf(S) - \pi_S(A_\wbf(S))\|_2 + \|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2 + \|A_\wbf(S') - \pi_{S'}(A_\wbf(S'))\|_2\\
% = & \|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2 + \Ocal(\varepsilon_A)
% \end{align*}
% Note that by Assumption \ref{ass:unique-projection}, we know that $\pi_S(A_\wbf(S))$ is the closest optimal point of $F_S$ to $\pi_{S'}(A_\wbf(S'))$. By Lemma \ref{lem:pl-to-qg}, we have
% \begin{align*}
% 2\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 \leq &  R_S(\pi_{S'}(A_\wbf(S'))) - R_S(\pi_S(A_\wbf(S))).
% \end{align*}
% Similarly, we have 
% \begin{align*}
% 2\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 \leq & R_{S'}(\pi_S(A_\wbf(S))) - R_{S'}(\pi_{S'}(A_\wbf(S'))).
% \end{align*}
% Summing up the above two inequalities we have
% \begin{align*}\label{eq:qg-two-times}
% 4\mu\|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2^2 \leq & R_S(\pi_{S'}(A_\wbf(S'))) - R_S(\pi_S(A_\wbf(S))) + R_{S'}(\pi_S(A_\wbf(S))) - R_{S'}(\pi_{S'}(A_\wbf(S')))\\
% = & \max_{\vbf\in \Vcal}\frac{1}{n}\sum_{i=1}^n f(\pi_{S'}(A_\wbf(S')) \vbf; \zbf_i) - \max_{\vbf\in \Vcal}\frac{1}{n}\sum_{i=1}^n f(\pi_S(A_\wbf(S)) \vbf; \zbf_i)\\
% & + \max_{\vbf\in \Vcal}\frac{1}{n}\sum_{i=1}^n f(\pi_S(A_\wbf(S)) \vbf; \zbf'_i) - \max_{\vbf\in \Vcal}\frac{1}{n}\sum_{i=1}^n f(\pi_{S'}(A_\wbf(S')) \vbf; \zbf'_i)\\
% \leq & \frac{1}{n} \max_{\vbf\in \Vcal} \{ f(\pi_{S'}(A_\wbf(S')), \vbf; \zbf_n) - f(\pi_{S'}(A_\wbf(S')), \vbf; \zbf'_n) \}\\
% & + \frac{1}{n} \max_{\vbf\in \Vcal} \{ f(\pi_S(A_\wbf(S)), \vbf; \zbf'_n) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_n) \}\\
% \leq & \frac{4C}{n}\numberthis
% \end{align*}
% where in the second inequality we used the subadditivity of $\max$ and in the last inequality we used the boundedness of $f$. Therefore
% \begin{align*}
% \|\pi_S(A_\wbf(S)) - \pi_{S'}(A_\wbf(S'))\|_2  \leq \sqrt{\frac{C}{\mu n}}
% \end{align*}
% The proof is complete.
% \end{proof}

We are now ready to present the generalization error of Algorithm \ref{alg:dp-sgda} in terms of $\wbf_T$.

\begin{theorem}\label{thm:sgda-gen}% [Theorem \ref{thm:sgda-gen} 
Assume \textbf{(A1)}, \textbf{(A3)} and \textbf{(A4)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies PL condition with constant $\mu$ and $f(\wbf, \cdot; \zbf)$ is $\rho$-strongly concave. For Algorithm \ref{alg:dp-sgda},  the iterates $\{\wbf_t, \vbf_t\}$ satisfy the following inequality
\begin{align*}
\Ebb[R(\wbf_T) - R_S(\wbf_T)] \leq (1 + \frac{L}{\rho})G_\wbf \Big(\sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}\Big).
\end{align*}
\end{theorem}

\begin{proof}
Let $\varepsilon_T$ denote an upper bound on the optimization error $\Ebb[R_S(\wbf_T) - R_S^*]$ (cf. Theorem \ref{thm:sgda-conv}). Since $R_S$ satisfies $\mu$-PL, by Lemma \ref{lem:pl-to-qg} and Theorem \ref{thm:sgda-conv}, we have 
\begin{align*}
\Ebb[\|\wbf_T - \pi_S(\wbf_T)\|_2] \leq \sqrt{\Ebb[\|\wbf_T - \pi_S(\wbf_T)\|_2^2]} \leq \sqrt{\Ebb[\frac{1}{2\mu} (R_S(\wbf_T) - R_S^*)]} \leq \sqrt{\frac{\varepsilon_T}{2\mu}}.
\end{align*}
By Lemma \ref{lem:pl-stability}, we have
\begin{align*}
\Ebb[\|\wbf_T - \wbf'_T\|_2] \leq \sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}.     
\end{align*}
By Lemma \ref{lem:stab-gen}, we have
\begin{align*}
\Ebb[R(\wbf_T) - R_S(\wbf_T)] \leq (1 + \frac{L}{\rho})G_\wbf \Big(\sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}\Big).
\end{align*}
The proof is complete.
\end{proof}

% \begin{proof}[Proof of Theorem \ref{thm:sgda-gen}]
% Let $\Delta_t = \|\wbf_t - \wbf'_t\|_2 + \|\vbf_t - \vbf'_t\|_2$. Note that the projection step is nonexpansive. We consider two cases at the $t$-th iteration. If $i_t \neq n$, then it follows from Assumption \textbf{(A3)}  that
% \begin{align*}
% & \|\wbf_{t+1} - \wbf'_{t+1}\|_2 \\
% \leq &	\|\wbf_t - \eta_{\wbf,t}\nabla_\wbf f(\wbf_t, \vbf_t, z_{i_t}) - \wbf'_t + \eta_{\wbf,t}\nabla_\wbf f(\wbf'_t, \vbf'_t, z_{i_t})\|_2\\
% \leq  & 	\|\wbf_t - \eta_{\wbf,t}\nabla_\wbf f(\wbf_t, \vbf_t, z_{i_t}) - \wbf'_t + \eta_{\wbf,t}\nabla_\wbf f(\wbf'_t, \vbf_t, z_{i_t})\|_2 + \|\eta_{\wbf,t}\nabla_\wbf f(\wbf'_t, \vbf_t, z_{i_t}) - \eta_{\wbf,t}\nabla_\wbf f(\wbf'_t, \vbf'_t, z_{i_t})\|_2\\
% \leq & (1 + L\eta_{\wbf,t}) \|\wbf_t - \wbf'_t\|_2 + L\eta_{\wbf,t}\|\vbf_t - \vbf'_t\|_2.
% \end{align*}
% If $i_t = n$, then it follows from \textbf{(A1)}  that
% \begin{align*}
% \|\wbf_{t+1} - \wbf'_{t+1}\|_2 \leq &	\|\wbf_t - \eta_{\wbf,t}\nabla_\wbf f(\wbf_t, \vbf_t, z_{i_t}) - \wbf'_t + \eta_{\wbf,t}\nabla_\wbf f(\wbf'_t, \vbf'_t, z_{i_t})\|_2\\
% \leq & \|\wbf_t - \wbf'_t\|_2 + 2G\eta_{\wbf,t}.
% \end{align*}
% According to the distribution of $i_t$, we have
% \begin{align}
% \Ebb_A[\|\wbf_{t+1}-\wbf_{t+1}'\|_2] \leq & \frac{n-1}{n}\Ebb_A\Big[(1+\eta_{\wbf,t} L)\|\wbf_t -\wbf_t'\|_2 +  L\eta_{\wbf,t}\|\vbf_t - \vbf'_t\|_2\Big] + \frac{1}{n}(\|\wbf_t -\wbf_t'\|_2 + 2 \eta_{\wbf,t} G)\nonumber\\
% \leq & (1+\eta_{\wbf,t} L)\Ebb_A[\|\wbf_t -\wbf_t'\|_2] + L\eta_{\wbf,t}\Ebb_A\big[\|\vbf_t - \vbf'_t\|_2\big] + \frac{2 \eta_{\wbf,t} G}{n}. \label{eq:agda-w}
% \end{align}
% Similarly, for $\vbf$ we also have
% \begin{align*}\label{eq:agda-v}
% \Ebb_A[\|\vbf_{t+1}-\vbf_{t+1}'\|_2] \leq & (1+\eta_{\vbf,t} L)\Ebb_A[\|\vbf_t -\vbf_t'\|_2] + L\eta_{\vbf,t}\Ebb_A\big[\|\wbf_t - \wbf'_t\|_2\big] + \frac{2 \eta_{\vbf,t} G}{n}. \numberthis
% \end{align*}
% Combining \eqref{eq:agda-w} and \eqref{eq:agda-v} we have
% \begin{align*}
% \Ebb_A[\Delta_{t+1}] \leq & (1+ (\eta_{\wbf,t} + \eta_{\vbf,t}) L)\Ebb_A\big[\Delta_t\big] + \frac{2 (\eta_{\wbf,t} + \eta_{\vbf,t}) G}{n}.
% \end{align*}
% According to \textbf{(A1)} , we know
% \begin{equation}\label{stab-gen-non-convex-1}
%   f(\wbf_T,\vbf';z) - f(\wbf'_T,\vbf';z) + f(\wbf',\vbf_T;z) - f(\wbf',\vbf'_T;z)\leq G\sqrt{2}\Delta_T.
% \end{equation}
% Let $\Ecal$ denote the event that $\Delta_{t_0} = 0$. Then we have
% \begin{align*}
% & \Ebb[f(\wbf_T,\vbf';z) - f(\wbf'_T,\vbf';z) + f(\wbf',\vbf_T;z) - f(\wbf',\vbf'_T;z)]\\
% = & \Pbb[\Ecal]\Ebb[f(\wbf_T,\vbf';z) - f(\wbf'_T,\vbf';z) + f(\wbf',\vbf_T;z) - f(\wbf',\vbf'_T;z) | \Ecal] \\
% & + \Pbb[\Ecal^c]\Ebb[f(\wbf_T,\vbf';z) - f(\wbf'_T,\vbf';z) + f(\wbf',\vbf_T;z) - f(\wbf',\vbf'_T;z) | \Ecal^c]\\
% \leq & \sqrt{2}G\Ebb[\Delta_T | \Ecal]  + 4\Pbb[\Ecal^c],
% \end{align*}
% where in the last step we have used \eqref{stab-gen-non-convex-1} and the condition $|f(\cdot,\cdot, z)| \leq 1$.
% Using the union bound on the outcome $i_t = n$ we obtain that
% \begin{align*}
% \Pbb[\Ecal^c] \leq \sum_{t=1}^{t_0}\Pbb[i_t = n] = \frac{t_0}{n}.
% \end{align*}
% The proof is complete by combining the above two inequalities together.
% Recalling the event $\Ecal$ that $\Delta_{t_0} = 0$, we apply the above equation recursively from $t=t_0+1$ to $T$, then
% \begin{align*}
% \Ebb_A\big[\|\wbf_{t+1}-\wbf_{t+1}'\|_2 + \|\vbf_{t+1}-\vbf_{t+1}'\|_2\big|\Delta_{t_0} = 0\big] \leq & \frac{2G}{n}\sum_{t=t_0+1}^T(\eta_{\wbf,t} + \eta_{\vbf,t}) \prod_{k=t+1}^T (1+ (\eta_{\wbf,k} + \eta_{\vbf,k}) L).
% \end{align*}
% By the elementary inequality $1+x \leq \exp(x)$ and $\eta_{\wbf,t} + \eta_{\vbf,t} \leq \frac{c}{t}$, we have
% \begin{align*}
% & \Ebb_A\big[\|\wbf_{t+1}-\wbf_{t+1}'\|_2 + \|\vbf_{t+1}-\vbf_{t+1}'\|_2\big|\Delta_{t_0} = 0\big]\\
% \leq & \frac{2cG}{n}\sum_{t=t_0+1}^T\frac{1}{t}\prod_{k=t+1}^T\exp\Big(\frac{cL}{k}\Big) = \frac{2cG}{n}\sum_{t=t_0+1}^T\frac{1}{t}\exp\Big(\sum_{k=t+1}^T\frac{cL}{k}\Big)\\
% \leq & \frac{2cG}{n}\sum_{t=t_0+1}^T\frac{1}{t}\exp\Big(cL\log\Big(\frac{T}{t}\Big)\Big) \leq \frac{2cGT^{cL}}{n}\sum_{t=t_0+1}^T\frac{1}{t^{cL+1}} \leq \frac{2G}{Ln}\Big(\frac{T}{t_0}\Big)^{cL}.
% \end{align*}
% By Lemma \ref{lem:stab-gen-nonconvex-2} we have
% \begin{align}\label{new-eq-bound}
% \Ebb[f(\wbf_T,\vbf';z) - f(\wbf'_T,\vbf';z) + f(\wbf',\vbf_T;z) - f(\wbf',\vbf'_T;z)]  \leq & \frac{8t_0}{n} + \frac{2G^2}{Ln}\Big(\frac{T}{t_0}\Big)^{cL}.
% \end{align}
% The right hand side of the above inequality is approximately minimized when
% \begin{align*}
% t_0 = \Big(\frac{G^2}{4L}\Big)^{\frac{1}{cL+1}}T^{\frac{cL}{cL+1}}.
% \end{align*}
% Plugging it into Eq. \eqref{new-eq-bound} we have (for simplicity we assume the above $t_0$ is an integer)
% \begin{align*}
% \Ebb[f(\wbf_T,\vbf';z) - f(\wbf'_T,\vbf';z) + f(\wbf',\vbf_T;z) - f(\wbf',\vbf'_T;z)]   \leq & 16\Big(\frac{G^2}{4L}\Big)^{\frac{1}{cL+1}}n^{-1}T^{\frac{cL}{cL+1}}.
% \end{align*}
% Since the above bound holds for all $z, S, S'$ and $\wbf',\vbf'$, we immediately get the same upper bound on the weak stability. Finally the theorem holds by calling Lemma \ref{lem:stab-gen}, Part 2.
% \end{proof}

The next theorem establishes a generalization bound for the empirical maximizer of a strongly concave objective, i.e., a bound on $\Ebb[R_S(\wbf^*) - R(\wbf^*)]$. The proof follows the argument of \citet{shalev2009stochastic-supp}.

\begin{theorem}\label{thm:sc-stab-gen}
Assume \textbf{(A1)} holds. Assume that for any $\wbf$ and $S$, the function $\vbf \mapsto F_S(\wbf,\vbf)$ is $\rho$-strongly concave. Then
\begin{align*}
% \Ebb[F_S(\wbf,\hat{\vbf}^*_S) - F(\wbf,\hat{\vbf}^*_S)] 
\Ebb\big[R_S(\wbf^*) - R(\wbf^*)\big] \leq \frac{4G_\vbf^2}{\rho n}.
\end{align*}
\end{theorem}

\begin{proof}
We decompose the term $\Ebb[R_S(\wbf^*) - R(\wbf^*)]$ as
\[
\Ebb\big[R_S(\wbf^*) - R(\wbf^*)\big] = \Ebb\big[F_S(\wbf^*,\hat{\vbf}^*_S) - F(\wbf^*,\vbf^*)\big] = \Ebb\big[F_S(\wbf^*,\hat{\vbf}^*_S) - F(\wbf^*,\hat{\vbf}^*_S)\big] + \Ebb\big[F(\wbf^*,\hat{\vbf}^*_S) - F(\wbf^*,\vbf^*)\big],
\]
where $\hat{\vbf}^*_S=\arg\max_{\vbf\in \Vcal}F_S(\wbf^*,\vbf)$.
The second term satisfies $\Ebb\big[F(\wbf^*,\hat{\vbf}^*_S) - F(\wbf^*,\vbf^*)\big] \leq 0$ since $(\wbf^*,\vbf^*)$ is a saddle point of $F$. Hence it suffices to bound $\Ebb\big[F_S(\wbf^*,\hat{\vbf}^*_S) - F(\wbf^*,\hat{\vbf}^*_S)\big]$. Let $S'=\{z'_1,\ldots,z'_n\}$ be drawn independently from the data distribution $\Dcal$. For any $i\in[n]$, define $S^{(i)}=\{z_1,\ldots,z_{i-1},z_i',z_{i+1},\ldots,z_n\}$. Denote $\hat{\vbf}^*_{S^{(i)}} = \arg\max_{\vbf\in \Vcal}F_{S^{(i)}}(\wbf^*,\vbf)$. Then
\begin{align*}
F_S(\wbf^*,\hat{\vbf}^*_S) - F_S(\wbf^*,\hat{\vbf}^*_{S^{(i)}}) = & \frac{1}{n}\sum_{j\neq i}\Big(f(\wbf^*,\hat{\vbf}^*_S;z_j) - f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z_j) \Big)   + \frac{1}{n}\Big(f(\wbf^*,\hat{\vbf}^*_S;z_i) - f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z_i)\Big)\\
= &  \frac{1}{n}\Big( f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z'_i)-f(\wbf^*,\hat{\vbf}^*_S;z'_i) \Big) + \frac{1}{n}\Big(f(\wbf^*,\hat{\vbf}^*_S;z_i) - f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z_i)\Big)\\
& + F_{S^{(i)}}(\wbf^*,\hat{\vbf}^*_S) - F_{S^{(i)}}(\wbf^*,\hat{\vbf}^*_{S^{(i)}})\\
\leq & \frac{1}{n}\Big( f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z'_i)-f(\wbf^*,\hat{\vbf}^*_S;z'_i)\Big) + \frac{1}{n}\Big(f(\wbf^*,\hat{\vbf}^*_S;z_i) - f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z_i)\Big)\\
\leq & \frac{2G_\vbf}{n}\big\|\hat{\vbf}^*_S - \hat{\vbf}^*_{S^{(i)}}\big\|_2, \numberthis\label{eq:sc-stab}
\end{align*}
where the first inequality follows from the fact that $\hat{\vbf}^*_{S^{(i)}}$ is the maximizer of $F_{S^{(i)}}(\wbf^*,\cdot)$ and the second inequality follows from the Lipschitz continuity of $f$ with respect to $\vbf$. Since $F_S(\wbf^*,\cdot)$ is $\rho$-strongly concave and $\hat{\vbf}^*_S$ maximizes $F_S(\wbf^*,\cdot)$, we know
\begin{align*}
\frac{\rho}{2}\big\|\hat{\vbf}^*_S - \hat{\vbf}^*_{S^{(i)}}\big\|_2^2 \leq F_S(\wbf^*,\hat{\vbf}^*_S) - F_S(\wbf^*,\hat{\vbf}^*_{S^{(i)}}).
\end{align*}
Combining it with \eqref{eq:sc-stab} we get $\big\|\hat{\vbf}^*_S - \hat{\vbf}^*_{S^{(i)}}\big\|_2 \leq 4G_\vbf/(\rho n)$. By Lipschitz continuity, the following inequality holds for any $z$
\begin{align*}
\big|f(\wbf^*,\hat{\vbf}^*_S ;z) - f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z)\big|  \leq \frac{4G_\vbf^2}{\rho n}.
\end{align*}
Since $z_i$ and $z'_i$ are i.i.d., we have
\begin{align*}
\Ebb\big[F(\wbf^*,\hat{\vbf}^*_S)\big]   =   \frac{1}{n}\sum_{i=1}^n\Ebb\big[F(\wbf^*,\hat{\vbf}^*_{S^{(i)}})\big] =  \frac{1}{n}\sum_{i=1}^n\Ebb\big[f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z_i)\big],
\end{align*}
where the last identity holds since $z_i$ is independent of $\hat{\vbf}^*_{S^{(i)}}$.
Therefore
\begin{align*}
\Ebb\big[F_S(\wbf^*,\hat{\vbf}^*_S) - F(\wbf^*,\hat{\vbf}^*_S)\big] = \frac{1}{n}\sum_{i=1}^n \Ebb\big[f(\wbf^*,\hat{\vbf}^*_S;z_i) - f(\wbf^*,\hat{\vbf}^*_{S^{(i)}};z_i)\big] \leq \frac{4G_\vbf^2}{\rho n}.
\end{align*}
The proof is complete.
\end{proof}


\begin{theorem}[Theorem \ref{thm:sgda-primal-gen} restated]
Assume the function $f(\wbf, \cdot; \zbf)$ is $\rho$-strongly concave and $F_S(\cdot, \vbf)$ satisfies the $\mu$-PL condition. Suppose \textbf{(A1)} and \textbf{(A3)} hold. If $\Ebb[R_S(\wbf_T) - R_S^*] \leq \varepsilon_T$, then
\begin{align*}
\Ebb[R(\wbf_T) - R_S(\wbf_T)]  \leq (1+\kappa)G_\wbf\Big(\sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}} \Big),
\end{align*}
and
\begin{align*}
\Ebb[R_S(\wbf^*) - R(\wbf^*)] \leq   \frac{4G_\vbf^2}{\rho n}.
\end{align*}
 
\end{theorem}

\begin{proof}
The result follows directly from Theorems~\ref{thm:sgda-gen} and~\ref{thm:sc-stab-gen}.
\end{proof}


\subsection{Proof of Theorem \ref{thm:utility-nonconvex}}\label{sec:agda-utility}


\begin{theorem}[Theorem \ref{thm:utility-nonconvex} restated]
Assume \textbf{(A1)}, \textbf{(A3)} and \textbf{(A4)}  hold. Assume $F_S(\cdot, \vbf)$ satisfies the PL condition with constant $\mu$ and $f(\wbf, \cdot; \zbf)$ is $\rho$-strongly concave. For SGDA, if $\Ebb[R_S(\wbf_T) - R_S^*] = \Ocal(\varepsilon_T)$, then the iterates $\{\wbf_t, \vbf_t\}$ satisfy the following bound
\begin{equation*}
\Ebb [R(\wbf_T) - R^*] = \Ocal\bigg(\varepsilon_T + \Big(1 + \frac{L}{\rho}\Big)G_\wbf \Big(\sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}\Big) +  \frac{4G_\vbf^2}{\rho n}\bigg).
\end{equation*}
Furthermore, if we choose $T = \Ocal(n)$, $\eta_{\wbf, t} = \Ocal(\frac{1}{\mu t})$ and $\eta_{\vbf, t} = \Ocal(\frac{\kappa^2\max\{1, \sqrt{\kappa/\mu}\}}{\mu t^{2/3}})$, then
\begin{align*}
\Ebb[R(\wbf_T) - R^*] = \Ocal\bigg(\frac{\kappa^{2.75}}{\mu^{1.75}}\Big(\frac{1}{n^{1/3}} + \frac{\sqrt{d\log(1/\delta)}}{n^{5/6}\epsilon}\Big)\bigg).
\end{align*}
\end{theorem}

\begin{proof}
For any $\wbf^* \in \arg\min_\wbf R(\wbf)$, recall that we have the error decomposition \eqref{eq:err-decomp}, which is
\begin{align*}
\Ebb[R(\wbf_T) - R^*] = & \Ebb[R(\wbf_T) - R_S(\wbf_T)] + \Ebb[R_S(\wbf_T) - R_S^*] + \Ebb[R_S^* - R_S(\wbf^*)] + \Ebb[R_S(\wbf^*) - R(\wbf^*)]\\
\leq & \Ebb[R(\wbf_T) - R_S(\wbf_T)] + \Ebb[R_S(\wbf_T) - R_S^*] + \Ebb[R_S(\wbf^*) - R(\wbf^*)],
\end{align*}
where the inequality is by $R_S^* - R_S(\wbf^*) \leq 0$. 
By Theorem \ref{thm:sgda-gen}, we have
\begin{align*}
\Ebb[R(\wbf_T) - R_S(\wbf_T)] \leq (1 + \frac{L}{\rho})G_\wbf \Big(\sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}\Big).
\end{align*}
And by Theorem \ref{thm:sc-stab-gen}, we have
\begin{align*}
\Ebb[R_S(\wbf^*) - R(\wbf^*)] \leq \frac{4G_\vbf^2}{\rho n}.
\end{align*}
We can plug the above two inequalities into \eqref{eq:err-decomp}, and get
\begin{equation*}
\Ebb [R(\wbf_T) - R^*] = \Ocal\bigg(\varepsilon_T + \Big(1 + \frac{L}{\rho}\Big)G_\wbf \Big(\sqrt{\frac{\varepsilon_T}{2\mu}} + \frac{1}{n}\sqrt{\frac{G_\wbf^2}{4\mu^2} + \frac{G_\vbf^2}{\rho\mu}}\Big) +  \frac{4G_\vbf^2}{\rho n}\bigg).
\end{equation*}
Now, by the choice of $\eta_{\wbf,t}$ and $\eta_{\vbf,t}$ and by Theorem~\ref{thm:sgda-primal-opt}, we have $\varepsilon_T = \Ocal(\frac{\kappa^{3.5}}{\mu^{2.5}}\frac{1/m + d(\sigma_\wbf^2 + \sigma_\vbf^2)}{T^{2/3}})$. Assume $m$ is a constant. Plugging $\varepsilon_T$ into the preceding bound and letting $T = \Ocal(n)$ yields the second statement.
\end{proof}

% \subsection{New Generalization Bound}

% Let $\S = \{\zbf_1, \cdots, \zbf_n\}$ and $\tilde{\S} = \{\tilde{\zbf}_1, \cdots, \tilde{\zbf}_n\}$ be drawn independently from $\Dcal$. For each $i \in [n]$, denote $\S^{(i)} = \{\tilde{\zbf}_1, \cdots, \zbf_{i-1}, \tilde{\zbf}_i, \zbf_{i+1}, \cdots, \tilde{\zbf}_n\}$. Let $\vbf^*_\S = \arg\max_{\vbf \in \Vcal} F(A_\wbf(\S), \vbf)$ and $\vbf^*_{\S^{(i)}} = \arg\max_{\vbf \in \Vcal} F(A_\wbf(\S^{(i)}), \vbf)$. We say a minimax algorithm $A$ has $\epsilon$ on-average stability if 
% \[
% \frac{1}{n}\sum_{i=1}^n \Ebb_{\S, \tilde{\S}, A}[f(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}}; \zbf_i) - f(A_\wbf(\S), \vbf^*_\S; \zbf_i)] \leq \epsilon.
% \]

% The next lemma establishes the key connection between the primal generalization error and on-average stability measure. 
% \begin{lemma}
% If a minimax algorithm $A$ has $\epsilon$ on average stability, then 
% \begin{align*}
% \Ebb_{\S, A}[R(A_\wbf(\S)) - R_S(A_\wbf(\S))] \leq \epsilon.  
% \end{align*}
% \end{lemma}

% \begin{proof}
% According to the symmetry between $\zbf_i$ and $\tilde{\zbf}_i$ we know
% \begin{align*}\label{eq:stab-gen-1}
% \Ebb[\max_{\vbf \in \Vcal} F(A_\wbf(\S), \vbf)] & =  \frac{1}{n}\sum_{i=1}^n \Ebb[\max_{\vbf \in \Vcal} F(A_\wbf(\S^{(i)}), \vbf)] \\
% & = \frac{1}{n}\sum_{i=1}^n \Ebb[F(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}})] = \frac{1}{n}\sum_{i=1}^n \Ebb[f(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}}; \zbf_i)], \numberthis
% \end{align*}
% where the last identity holds since $\zbf_i$ is independent of $A_\wbf(\S^{(i)})$ and $\vbf^*_{\S^{(i)}}$. On the other hand, 
% \begin{align*}\label{eq:stab-gen-2}
% \Ebb[\max_{\vbf \in \Vcal} F_S(A_\wbf(\S), \vbf)] = & \Ebb[\max_{\vbf \in \Vcal}\frac{1}{n}\sum_{i=1}^n f(A_\wbf(\S^{(i)}), \vbf; \zbf_i)]\\
% \geq & \Ebb[\frac{1}{n}\sum_{i=1}^n f(A_\wbf(\S^{(i)}), \vbf^*_\S; \zbf_i)] = \frac{1}{n}\sum_{i=1}^n\Ebb[ f(A_\wbf(\S^{(i)}), \vbf^*_\S; \zbf_i)]. \numberthis
% \end{align*}
% Combining \eqref{eq:stab-gen-1} and \eqref{eq:stab-gen-2} we have
% \begin{align*}
% \Ebb[R(A_\wbf(\S)) - R_S(A_\wbf(\S))] & = \Ebb[\max_{\vbf \in \Vcal} F(A_\wbf(\S), \vbf) - \max_{\vbf \in \Vcal} F_S(A_\wbf(\S), \vbf)]\\
% & \leq   \frac{1}{n}\sum_{i=1}^n \Ebb[f(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}}; \zbf_i) - f(A_\wbf(\S), \vbf^*_\S; \zbf_i)] \leq \epsilon.
% \end{align*}
% The proof is complete.
% \end{proof}

% In the following lemma, we derive the on-average stability bounds under the PL condition. 

% \begin{lemma}
% Assume \textbf{(A1)} and \textbf{(A2)} hold, then $A$ has on-average stability satisfying
% \begin{align*}
% \epsilon \leq     
% \end{align*}
% \end{lemma}

% \begin{proof}
% We decompose $f(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}}; \zbf_i) - f(A_\wbf(\S), \vbf^*_\S; \zbf_i)$ as follows
% \begin{align*}
% f(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}}; \zbf_i) - f(A_\wbf(\S), \vbf^*_\S; \zbf_i) = & \big(f(A_\wbf(\S^{(i)}), \vbf^*_{\S^{(i)}}; \zbf_i) - f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \pi_{\S^{(i)}}(\vbf^*_{\S^{(i)}}); \zbf_i)\big)\\   
% & + \big(f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \pi_{\S^{(i)}}(\vbf^*_{\S^{(i)}}); \zbf_i) - f(\pi_\S(A_\wbf(\S)), \pi_\S(\vbf^*_\S); \zbf_i)\big)\\
% & + \big(f(\pi_\S(A_\wbf(\S)), \pi_\S(\vbf^*_\S); \zbf_i) - f(A_\wbf(\S), \vbf^*_\S; \zbf_i)\big).
% \end{align*}
% We now address the above three terms separately.
% \end{proof}

% We say a minimax algorithm $A$ has weak on average stability $\epsilon$ if 
% \begin{align*} 
% \max_{\vbf \in \Vcal}\Big\{\frac{1}{n}\sum_{i=1}^n\Ebb[f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)]\Big\}  + \max_{\wbf \in \Wcal}\Big\{\frac{1}{n}\sum_{i=1}^n\Ebb[f(\wbf, A_\vbf(\S^{(i)}); \zbf_i) - f(\wbf, A_\vbf(\S); \zbf_i)]\Big\} \leq \varepsilon.
% \end{align*}

% The next lemma establishes the key connection between the weak generalization error and weak on-average stability measure. 

% \begin{lemma}
% If a minimax algorithm $A$ has $\epsilon$ weak on average stability, then 
% \begin{align*}
% \Ebb_{\S, A}[\triangle^w(A_\wbf(\S), A_\vbf(\S)) - \triangle^w_S(A_\wbf(\S), A_\vbf(\S))] \leq \epsilon.  
% \end{align*}
% \end{lemma}

% \begin{proof}
% By the definition of $\triangle^w(A_\wbf(\S), A_\vbf(\S))$ and $\triangle^w_S(A_\wbf(\S), A_\vbf(\S))$, we know
% \begin{align*}
% & \triangle^w(A_\wbf(\S), A_\vbf(\S)) - \triangle^w_S(A_\wbf(\S), A_\vbf(\S))\\
% = & \max_{\vbf \in \Vcal} \Ebb[F(A_\wbf(\S), \vbf)]  - \min_{\wbf \in \Wcal} \Ebb[F(\wbf, A_\vbf(\S))] - \max_{\vbf \in \Vcal} \Ebb[F_S(A_\wbf(\S), \vbf)]  + \min_{\wbf \in \Wcal} \Ebb[F_S(\wbf, A_\vbf(\S))]  \\
% \leq & \max_{\vbf \in \Vcal} \Ebb[F(A_\wbf(\S), \vbf) - F_S(A_\wbf(\S), \vbf)]  + \max_{\wbf \in \Wcal} \Ebb[F(\wbf, A_\vbf(\S)) - F_S(\wbf, A_\vbf(\S))]
% \end{align*}
% where the last inequality holds by the subadditivity of $\max$ and $\min$ operators. According to the symmetry we know
% \begin{align*}
% \Ebb[F(A_\wbf(\S), \vbf) - F_S(A_\wbf(\S), \vbf)] = & \frac{1}{n}\sum_{i=1}^n \Ebb[F(A_\wbf(\S^{(i)}), \vbf) - F_S(A_\wbf(\S), \vbf)]   \\
% = & \frac{1}{n}\sum_{i=1}^n \Ebb[f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)].
% \end{align*}
% In a similar way, we have
% \begin{align*}
% \Ebb[F(\wbf, A_\vbf(\S)) - F_S(\wbf, A_\vbf(\S))] 
% = & \frac{1}{n}\sum_{i=1}^n \Ebb [f(\wbf, A_\vbf(\S^{(i)}); \zbf_i) - f(\wbf, A_\vbf(\S); \zbf_i)].
% \end{align*}
% Combining the above three inequalities we have
% \begin{align*}
% & \triangle^w(A_\wbf(\S), A_\vbf(\S)) - \triangle^w_S(A_\wbf(\S), A_\vbf(\S)) \\
% \leq & \max_{\vbf \in \Vcal}\Big\{\frac{1}{n}\sum_{i=1}^n \Ebb[f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)]\Big\} + \max_{\wbf \in \Wcal}\Big\{\frac{1}{n}\sum_{i=1}^n \Ebb [f(\wbf, A_\vbf(\S^{(i)}); \zbf_i) - f(\wbf, A_\vbf(\S); \zbf_i)]\Big\}.
% \end{align*}
% The proof is complete.
% \end{proof}

% In the following lemma, we derive the weak on-average stability under the PL condition.

% \begin{lemma}
% Assume \textbf{(A1)} and \textbf{(A2)} hold, then $A$ has on-average stability satisfying
% \begin{align*}
% \epsilon \leq     
% \end{align*}
% \end{lemma}

% \begin{proof}
% We decompose $f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)$ as follows
% \begin{align*}\label{eq:stab-intermediate}
% f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i) = & \big(f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i)\big)\\   
% & + \big(f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{S^{(i)}}; \zbf_i) - f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i)\big)\\
% & + \big(f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)\big). \numberthis
% \end{align*}
% We now address the above three terms separately. We first address $f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i) - f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i)$. We know
% \begin{align*}\label{eq:stab-intermediate-0}
% & \Ebb[f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i) - f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i)] = n \Ebb[F_\S(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}) - F_\S(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S)]\\
% = & n\Ebb[F_\S(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}) - \min_{\wbf \in \Wcal}F_\S(\wbf, \hat{\vbf}_{\S^{(i)}})] + n\Ebb[\min_{\wbf \in \Wcal}F_\S(\wbf, \hat{\vbf}_{\S^{(i)}}) - F_\S(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S)] \\
% \leq & \frac{n}{2\mu}\Ebb[\|\nabla_\wbf F_\S(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}) \|_2^2] \numberthis
% \end{align*}
% where we used PL condition of $F_\S$ and $\min_{\wbf \in \Wcal}F_\S(\wbf, \hat{\vbf}_{\S^{(i)}}) - F_S(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S) \leq 0$. According to the definition of $\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}})$ we know $\nabla_\wbf F_{\S^{(i)}}(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}) = 0$ and therefore
% \begin{align*}
% & \|\nabla_\wbf F_\S(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}})  \|_2^2\\
% = & \Big\|\nabla_\wbf F_{\S^{(i)}}(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}) - \frac{1}{n}\nabla_\wbf f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \tilde{\zbf}_i) + \frac{1}{n}\nabla_\wbf f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i)\Big\|_2^2  \\
% \leq & \frac{4G_\wbf^2}{n^2}
% \end{align*}
% where we have used the $G_\wbf$ Lipschitz condition of $f$. Combined with \eqref{eq:stab-intermediate-0}, gives
% \begin{align*}\label{eq:stab-intermediate-1}
% \Ebb[f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i) - f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i)] \leq \frac{2G_\wbf^2}{\mu n}.   \numberthis 
% \end{align*}
% We then address $f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i)$. We know
% \begin{align*}\label{eq:stab-intermediate-2}
% \Ebb[f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}}; \zbf_i)] = & \Ebb[F(A_\wbf(\S^{(i)}), \vbf) - F(\pi_{\S^{(i)}}(A_\wbf(\S^{(i)})), \hat{\vbf}_{\S^{(i)}})]   \\
% = & \Ebb[F(A_\wbf(\S), \vbf) - F(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S)]. \numberthis
% \end{align*}
% Finally, we address $f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)$. We know
% \begin{align*}\label{eq:stab-intermediate-3}
% \Ebb[f(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)]
% = \Ebb[F_S(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S) - F_S(A_\wbf(\S), \vbf)]. \numberthis
% \end{align*}
% Plugging \eqref{eq:stab-intermediate-1}, \eqref{eq:stab-intermediate-2} and \eqref{eq:stab-intermediate-3} back into \eqref{eq:stab-intermediate}, we derive
% \begin{align*}
% & \frac{1}{n}\sum_{i=1}^n \Ebb[f(A_\wbf(\S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(\S), \vbf; \zbf_i)] \\
% \leq & \frac{4G_\wbf^2}{n^2} + \Ebb[F(A_\wbf(\S), \vbf) - F(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S)] + \Ebb[F_S(\pi_\S(A_\wbf(\S)), \hat{\vbf}_\S) - F_S(A_\wbf(\S), \vbf)]
% \end{align*}
% \end{proof}

% \section{Weak PD Risk for Nonconvex Objectives (Pending)}

% We first presents a lemma that connects the weak generalization with the stability. This stability definition is analogous to the on-average stability in minimization problem.

% \begin{lemma}\label{lem:weak-gen-via-avg-stab}
% Let $A$ be a randomized algorithm, if there holds
% \begin{align*}
% \sup_{\vbf \in \Vcal}\Big\{\frac{1}{n}\sum_{i=1}^n \Ebb_{S, S', A}[f(A_\wbf(S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(S), \vbf; \zbf_i)]\Big\} + \sup_{\wbf \in \Wcal}\Big\{\frac{1}{n}\sum_{i=1}^n\Ebb_{S, S', A}[f(\wbf, A_\vbf(S); \zbf) - f(\wbf, A_\vbf(S^{(i)}); \zbf)]\Big\} \leq \varepsilon    
% \end{align*}
% then the weak PD generalization error of $(A_\wbf(S), A_\vbf(S))$ satisfies
% \begin{align*}
%     \triangle^w(A_{\wbf}(S),A_{\vbf}(S))-\triangle^w_S(A_{\wbf}(S),A_{\vbf}(S))\leq\varepsilon.
% \end{align*}
% \end{lemma}

% \begin{proof}
% By the subadditivity of supremum and infimum we have
% \begin{multline*}
%   \triangle^w(A_{\wbf}(S),A_{\vbf}(S))-\triangle^w_S(A_{\wbf}(S),A_{\vbf}(S)) \leq \sup_{\vbf'\in\Vcal}\Ebb[F(A_{\wbf}(S),\vbf')-F_S(A_{\wbf}(S),\vbf')]\\+
%   \sup_{\wbf'\in\Wcal}\Ebb[F_S(\wbf',A_{\vbf}(S))-F(\wbf',A_{\vbf}(S))].
% \end{multline*}
% According to the symmetry between $z_i$ and $z_i'$ we know
% \begin{align*}
%   \Ebb[F(A_{\wbf}(S),\vbf')-F_S(A_{\wbf}(S),\vbf')] & =  \frac{1}{n}\sum_{i=1}^{n} \Ebb[F(A_{\wbf}(S^{(i)}),\vbf')]-\Ebb[F_S(A_{\wbf}(S),\vbf')]\\
%   &=  \frac{1}{n}\sum_{i=1}^{n} \Ebb\big[f(A_{\wbf}(S^{(i)}),\vbf';\zbf_i)-f(A_{\wbf}(S),\vbf';\zbf_i)\big],
% \end{align*}
% where the second identity holds since $z_i$ is not used to train $A_{\wbf}(S^{(i)})$.
% In a similar way, we can prove
% \[
% \Ebb[F_S(\wbf',A_{\vbf}(S))-F(\wbf',A_{\vbf}(S))]=
% \frac{1}{n}\sum_{i=1}^{n}\big[f(\wbf',A_{\vbf}(S);\zbf_i)-f(\wbf',A_{\vbf}(S^{(i)});\zbf_i)\big].
% \]
% As a combination of the above three inequalities we get
% \begin{multline*}
%     \triangle^w(A_{\wbf}(S),A_{\vbf}(S))-\triangle^w_S(A_{\wbf}(S),A_{\vbf}(S)) \leq  \sup_{\vbf'\in\Vcal}\Big[\frac{1}{n}\sum_{i=1}^{n} \Ebb\big[f(A_{\wbf}(S^{(i)}),\vbf';\zbf_i)-f(A_{\wbf}(S),\vbf';\zbf_i)\big]\Big]+\\
%     \sup_{\wbf'\in\Wcal}\Big[\frac{1}{n}\sum_{i=1}^{n}\big[f(\wbf',A_{\vbf}(S);\zbf_i)-f(\wbf',A_{\vbf}(S^{(i)});\zbf_i)\big]\Big].
% \end{multline*}
% The stated bound in Part (a) then follows directly from the definition of stability.
% \end{proof}

% \begin{theorem}

% \end{theorem}

% \begin{proof}
% For each $i \in [n]$, we decompose $f(A_\wbf(S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(S), \vbf; \zbf_i)$ as follows
% \begin{multline*}
% f(A_\wbf(S^{(i)}), \vbf; \zbf_i) - f(A_\wbf(S), \vbf; \zbf_i) =  \Big(f(A_\wbf(S^{(i)}), \vbf; \zbf_i) - f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i)\Big)  \\
% + \Big(f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)\Big) + \Big(f(\pi_S(A_\wbf(S)), \vbf; \zbf_i) - f(A_\wbf(S), \vbf; \zbf_i)\Big)
% \end{multline*}
% For the term $f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)$, we have
% \begin{align*}
% f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) = n F_S(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf) - n F_{S^{(i)}}(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf) + f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf'_i).
% \end{align*}
% Since $\zbf_i$ and $\zbf'_i$ follow from the same distribution, we have $\Ebb[f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf'_i)] = \Ebb[f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)]$. Rearranging terms of the preceding identity and taking expectation we derive
% \begin{align*}
% \Ebb[f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)] = & n \Ebb[F_S(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf) - F_{S^{(i)}}(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf)]\\
% = & \Ebb[F_S(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf) - \min_{\wbf\in\Wcal} F_S(\wbf, \vbf)]
% \end{align*}
% where we used the symmetry between $\zbf_i$ and $\zbf'_i$. By the PL condition we have 
% \begin{align*}\label{eq:exp-pl}
% \Ebb[f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)] \leq \frac{n}{2\mu}\Ebb[\|\nabla_\wbf F_S(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf)\|_2^2]. \numberthis
% \end{align*}
% We further decompose $\|\nabla_\wbf F_S(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf)\|_2^2$ as follows
% \begin{align*}
% & \|\nabla_\wbf F_S(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf)\|_2^2\\
% = & \|\nabla_\wbf F_{S^{(i)}}(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf) - \frac{1}{n} \nabla_\wbf f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf'_i) + \frac{1}{n} \nabla_\wbf f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i)\|_2^2    \\
% \leq &\frac{2}{n^2}  \|\nabla_\wbf f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf'_i)\|_2^2 + \frac{2}{n^2} \|\nabla_\wbf f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i)\|_2^2 \leq \frac{4G_\wbf^2}{n^2}
% \end{align*}
% where in the first inequality we used $\nabla_\wbf F_{S^{(i)}}(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf) = 0$ and triangle inequality, in the second inequality we used the Lipschitz continuity. Combining the preceding inequality with \eqref{eq:exp-pl} we have
% \begin{align*}\label{eq:opt-f-stab}
% \Ebb[f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)] \leq \frac{2G_\wbf^2}{\mu n}. \numberthis
% \end{align*}
% Next we address the term $f(A_\wbf(S^{(i)}), \vbf; \zbf_i) - f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i)$. Since $A_\wbf(S^{(i)})$ and $\pi_{S^{(i)}}(A_\wbf(S^{(i)}))$ are independent of $\zbf_i$, we have 
% \begin{align*}\label{eq:f-conv}
% \Ebb[f(A_\wbf(S^{(i)}), \vbf; \zbf_i) - f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i)] = & \Ebb[F(A_\wbf(S^{(i)}), \vbf) - F(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf)]\\
% = & \Ebb[F(A_\wbf(S), \vbf) - F(\pi_S(A_\wbf(S)), \vbf)] \numberthis
% \end{align*}
% where we used the symmetry between $\zbf_i$ and $\zbf'_i$. Finally, we consider the term $f(\pi_S(A_\wbf(S)), \vbf; \zbf_i) - f(A_\wbf(S), \vbf; \zbf_i)$. By the definition of $\pi_S(A_\wbf(S))$, we have
% \begin{align*}
% \sum_{i=1}^n f(\pi_S(A_\wbf(S)), \vbf; \zbf_i) - f(A_\wbf(S), \vbf; \zbf_i) = \min_{\wbf \in \Wcal} F_S(\wbf,\vbf) - F_S(A_\wbf(S), \vbf).
% \end{align*}
% Plugging the preceding identity and \eqref{eq:opt-f-stab}, \eqref{eq:f-conv} into \eqref{eq:exp-pl}, we derive
% \begin{align*}
% & \frac{1}{n}\sum_{i=1}^n\Ebb[f(\pi_{S^{(i)}}(A_\wbf(S^{(i)})), \vbf; \zbf_i) - f(\pi_S(A_\wbf(S)), \vbf; \zbf_i)]\\
% \leq & \frac{2G_\wbf^2}{\mu n} + \Ebb[F(A_\wbf(S), \vbf) - F(\pi_S(A_\wbf(S)), \vbf)] + \Ebb[\min_{\wbf \in \Wcal} F_S(\wbf,\vbf) - F_S(A_\wbf(S), \vbf)].
% \end{align*}
% Therefore we have 
% \begin{align*}\label{eq:F-F_S-mid}
% \Ebb[F(A_{\wbf}(S),\vbf)-F_S(A_{\wbf}(S),\vbf)] \leq   \frac{2G_\wbf^2}{\mu n} + \Ebb[F(A_\wbf(S), \vbf) - F(\pi_S(A_\wbf(S)), \vbf)] + \Ebb[\min_{\wbf \in \Wcal} F_S(\wbf,\vbf) - F_S(A_\wbf(S), \vbf)]  \numberthis
% \end{align*}
% from which we derive
% \begin{align*}\label{eq:F-F_S}
% \Ebb[F(\pi_S(A_\wbf(S)), \vbf)]-\min_{\wbf \in \Wcal} F_S(\wbf,\vbf)] \leq   \frac{2G_\wbf^2}{\mu n}. \numberthis
% \end{align*}
% By $L$-smoothness we have for any $\gamma > 0$,
% \begin{align*}\label{eq:F-F}
% F(A_\wbf(S), \vbf) - F(\pi_S(A_\wbf(S)), \vbf) \leq & \langle\nabla_\wbf F(\pi_S(A_\wbf(S)), \vbf), A_\wbf(S) - \pi_S(A_\wbf(S)) \rangle  +  \frac{L}{2}\|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2^2\\
% \leq & \|\nabla_\wbf F(\pi_S(A_\wbf(S)), \vbf)\|_2 \|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2  +  \frac{L}{2}\|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2^2\\
% \leq & \frac{1}{4\gamma} \|\nabla_\wbf F(\pi_S(A_\wbf(S)), \vbf)\|_2^2 + (\gamma + \frac{L}{2})\|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2^2 \numberthis
% \end{align*}
% where we used the Cauchy-Schwartz inequality. Let $\wbf^* \in \arg\min_{\wbf\in\Wcal} F(\wbf, \vbf)$, by $L$-smoothness again we have
% \begin{align*}
% \Ebb[\|\nabla_\wbf F(\pi_S(A_\wbf(S)), \vbf)\|_2^2] \leq & 2L  \Ebb[F(\pi_S(A_\wbf(S)), \vbf) -  F(\wbf^*, \vbf)]\\
% \leq & 2L  \Ebb[F(\pi_S(A_\wbf(S)), \vbf) - \min_{\wbf \in \Wcal} F_S(\wbf, \vbf)] \leq \frac{4LG_\wbf^2}{\mu n}
% \end{align*}
% where in the second inequality we used $\Ebb[F(\wbf^*,\vbf)] = \Ebb[F_S(\wbf^*, \vbf)] \geq \min_{\wbf \in \Wcal} F_S(\wbf, \vbf)$ and in the last inequality we used \eqref{eq:F-F_S}. Plugging the above inequality back into \eqref{eq:F-F} and taking $\gamma = L/2$ we derive
% \begin{align*}
% F(A_\wbf(S), \vbf) - F(\pi_S(A_\wbf(S)), \vbf) \leq & \frac{2G_\wbf^2}{\mu n}  +L\|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2^2
% \end{align*}
% Plugging the above inequality back into \eqref{eq:F-F_S-mid} and rearranging, we derive
% \begin{align*}
% \Ebb[F(A_{\wbf}(S),\vbf)-\min_{\wbf \in \Wcal} F_S(\wbf,\vbf)] \leq   \frac{4G_\wbf^2}{\mu n} + L\Ebb[\|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2^2]
% \end{align*}
% and since $\min_{\wbf \in \Wcal} F_S(\wbf,\vbf) \leq F_S(A_{\wbf}(S),\vbf)$ we have
% \begin{align*}
% \Ebb[F(A_{\wbf}(S),\vbf)- F_S(A_{\wbf}(S),\vbf)] \leq   \frac{4G_\wbf^2}{\mu n} + L\Ebb[\|A_\wbf(S) - \pi_S(A_\wbf(S)) \|_2^2]
% \end{align*}
% \end{proof}

\section{Additional Experimental Details}\label{sec:add-details}
\subsection{Source Code}
For the purpose of double-blind peer-review, the source code is accessible in the supplementary file.

\subsection{Computing Infrastructure Description}
All algorithms are implemented in Python 3.6 and trained and tested on an Intel(R) Xeon(R) CPU W5590
@3.33GHz with 48GB of RAM and an NVIDIA Quadro RTX 6000 GPU with 24GB memory. The PyTorch version
is 1.6.0.

\subsection{Description of Datasets}
In the experiments, we use three benchmark datasets: the ijcnn1 dataset from the LIBSVM repository, the MNIST dataset from \citet{lecun1998gradient-supp}, and the Fashion-MNIST dataset from \citet{xiao2017fashion-supp}. The details of these datasets are shown in Table~\ref{tab:datasets}. For the ijcnn1 dataset, we normalize the features into $[0,1]$. For the MNIST and Fashion-MNIST datasets, we first normalize the features into $[0,1]$ and then standardize them using the mean and standard deviation.
\begin{table*}[th!]
% \captionsetup{font=footnotesize}
\centering
\setlength\tabcolsep{2.5pt}
% \scriptsize{
\begin{tabular}{c|cccc}
\hline
Dataset &  \#Classes & \#Training Samples & \#Testing Samples & \#Features \\ \hline
ijcnn1 & 2 & 39,992  & 9,998 & 22 \\ 
MNIST & 10 & 60,000 & 10,000 & 784 \\ 
Fashion-MNIST & 10 & 60,000 & 10,000 & 784 \\ \hline
\end{tabular}
\caption{\small \it Statistical information of each dataset for AUC optimization.}
\label{tab:datasets}
% }
\end{table*}

\subsection{Training Settings}
The training settings for NSEG and DP-SGDA on all datasets are shown in Table \ref{tab:training-settings}.
% % \vspace{-1em}
\begin{table*}[th!]
% \captionsetup{font=footnotesize}
\centering
\setlength\tabcolsep{2.5pt}
% \scriptsize{
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|}
\hline
\multirow{3}{*}{Methods} & \multirow{3}{*}{Datasets} & \multirow{3}{*}{Batch Size} & \multicolumn{4}{c|}{Learning Rate}                         & \multicolumn{2}{c|}{Epochs}                 & \multicolumn{2}{c|}{Projection Size}                 \\ \cline{4-11} 
                  &                   &                   & \multicolumn{2}{c|}{Ori} & \multicolumn{2}{c|}{DP} & \multirow{2}{*}{Ori} & \multirow{2}{*}{DP} & \multirow{2}{*}{Ori} & \multirow{2}{*}{DP} \\ \cline{4-7}
                  &                   &                   &      $\wbf$     &     $\vbf$      &     $\wbf$     &     $\vbf$      &                   &                   &                   &                   \\ \hline
\multirow{3}{*}{NSEG} &ijcnn1& 64 &300&300&350&350 &1000 &15& 100&100 \\ \cline{2-11} 
                  &         MNIST          & 64 &11&11&5&5&100 &15&2&2 \\ \cline{2-11} 
                  &         Fashion-MNIST          &64 & 11&11 &5 &5 &100 &15&3&3 \\ \hline
\multirow{3}{*}{\makecell{DP-SGDA\\(Linear)}} &ijcnn1&64 &300&300&350&350&100&15&10&10  \\ \cline{2-11} 
                  &         MNIST          &64 &11&11&5&5&100&15&2&2 \\ \cline{2-11} 
                  &         Fashion-MNIST          &64 &11&11&5&5&100&15&3&3\\ \hline
\multirow{3}{*}{\makecell{DP-SGDA\\(MLP)}} &ijcnn1&64 & 3000&3001&500&501&10&10 &100&100 \\ \cline{2-11} 
                  &         MNIST          &64 &900&1000&100&210&10&10&2&2 \\ \cline{2-11} 
                  &         Fashion-MNIST          &64 &900&1000&100&210&10&10&2&2\\ \hline
\end{tabular}
% % \vspace{-1em}
\caption{\small \it Training settings for each model and each dataset.}
\label{tab:training-settings}
% }
\end{table*}

\subsection{DP-SGDA for AUC Maximization}


\begin{algorithm}[ht]
\caption{DP-SGDA for AUC Maximization\label{alg:dp-auc}}
\begin{algorithmic}[1]
\STATE {\bf Inputs:} Private dataset $S = \{\zbf_i: i \in [n]\}$, privacy budget $\epsilon, \delta$, number of iterations $T$, learning rates $\{\gamma_t, \lambda_t\}_{t=1}^T$, initial points $(\theta_0, a_0, b_0, \vbf_0)$
\STATE Compute $n_+ = \sum_{i=1}^n \Ibb[y_i=1]$ and $n_- = \sum_{i=1}^n \Ibb[y_i=-1]$
\STATE Compute noise parameters $\sigma_1$ and  $\sigma_2$ based on Eq. \eqref{eq:sigma-sigma}
\FOR{$t=1$ to $T$}
\STATE Randomly select a batch $S_t$ of size $m$ and let $I_t$ denote the set of indices in $S_t$
\STATE For each $j \in I_t$, compute gradients $\nabla_\theta f(\theta_t, a_t, b_t, \vbf_t; \zbf_j), \nabla_a f(\theta_t, a_t, b_t, \vbf_t; \zbf_j), \nabla_b f(\theta_t, a_t, b_t, \vbf_t; \zbf_j)$ and $\nabla_\vbf f(\theta_t, a_t, b_t, \vbf_t; \zbf_j)$ based on Eq. \eqref{eq:auc-grad}
\STATE Sample independent noises $\xi_t \sim \Ncal(0, \sigma_1^2 I_{d+2})$ and $\zeta_t \sim \Ncal(0, \sigma_2^2)$
\STATE Update \begin{align*}
\begin{pmatrix}\theta_{t+1}\\a_{t+1}\\b_{t+1}\end{pmatrix} = & \Pi\Bigg\{ \begin{pmatrix}\theta_t\\a_t\\b_t\end{pmatrix} - \gamma_t \Big( \frac{1}{m}\sum_{j \in I_t}\begin{pmatrix}\nabla_\theta f(\theta_t, a_t, b_t, \vbf_t; \zbf_j)\\\nabla_a f(\theta_t, a_t, b_t, \vbf_t; \zbf_j)\\\nabla_b f(\theta_t, a_t, b_t, \vbf_t; \zbf_j)\end{pmatrix} + \xi_t\Big)\Bigg\}  \\
\vbf_{t+1} = & \Pi\Big\{\vbf_t + \lambda_t (\frac{1}{m}\sum_{j \in I_t} \nabla_\vbf f(\theta_t, a_t, b_t, \vbf_t; \zbf_j) + \zeta_t)\Big\}
\end{align*}
\ENDFOR
\STATE {\bf Outputs:} $(\theta_T, a_T, b_T, \vbf_T)$ or $(\bar{\theta}_T, \bar{a}_T, \bar{b}_T, \bar{\vbf}_T)$
\end{algorithmic}
\end{algorithm}

In this section, we provide details of using DP-SGDA to solve the AUC maximization problem. AUC maximization with the square loss can be reformulated as
\begin{multline*}
	F(\theta,a,b,\vbf) = \Ebb_\zbf[(1-p)(h(\theta; \xbf) - a)^2\Ibb[y=1] + p(h(\theta; \xbf) - b)^2\Ibb[y=-1]\\ + 2(1+\vbf)(ph(\theta;\xbf)\Ibb[y=-1] - (1 - p)h(\theta; \xbf)\Ibb[y=1]) - p(1-p)\vbf^2]
\end{multline*}
where $\zbf= (\xbf,y)$ and $p = \Pbb[y=1]$. The empirical risk formulation is given as 
\begin{multline*}
F_S(\theta, a, b, \vbf) = \frac{1}{n}\sum_{i=1}^n\Big\{\frac{1}{n_+}(h(\theta; \xbf_i) - a)^2\Ibb[y_i=1] + \frac{1}{n_-}(h(\theta; \xbf_i) - b)^2\Ibb[y_i=-1]\\
 + 2(1+\vbf)\Big(\frac{1}{n_-}h(\theta; \xbf_i)\Ibb[y_i=-1] - \frac{1}{n_+}h(\theta; \xbf_i)\Ibb[y_i=1]\Big) - \frac{1}{n}\vbf^2\Big\}
\end{multline*}

For any subset $S_t$ of size $m$, let $I_t$ denote the set of indices in $S_t$. The gradients for any $j \in I_t$
% $\nabla f(\theta, a, b, c; \zbf_j)$ 
are given by
\begin{align*}
\nabla_\theta f(\theta, a, b, \vbf; \zbf_j) = & \frac{2}{n_+} (h(\theta; \xbf_j) - a) \nabla h(\theta; \xbf_j)\Ibb[y_j=1] + \frac{2}{n_-} (h(\theta; \xbf_j) - b) \nabla h(\theta; \xbf_j)\Ibb[y_j=-1] \\
& + 2(1+\vbf)\Big(\frac{1}{n_-}\nabla h(\theta; \xbf_j)\Ibb[y_j=-1] - \frac{1}{n_+}\nabla h(\theta; \xbf_j)\Ibb[y_j=1]\Big) \\
\nabla_a f(\theta, a, b, \vbf; \zbf_j) = & \frac{2}{n_+} (a - h(\theta; \xbf_j))\Ibb[y_j=1], \ \ \ \ 
\nabla_b f(\theta, a, b, \vbf; \zbf_j) =  \frac{2}{n_-} (b - h(\theta; \xbf_j))\Ibb[y_j=-1] \\
\nabla_\vbf f(\theta, a, b, \vbf; \zbf_j) = & 2\Big(\frac{1}{n_-}h(\theta; \xbf_j)\Ibb[y_j=-1] - \frac{1}{n_+}h(\theta; \xbf_j)\Ibb[y_j=1]\Big) -\frac{2}{n} \vbf \numberthis\label{eq:auc-grad}
\end{align*}
The pseudo-code can be found in Algorithm \ref{alg:dp-auc}.


% \begin{algorithm}[t]
% \caption{NSEG for AUC Maximization\label{alg:dp-auc-extragradient}}
% \begin{algorithmic}[1]
% \STATE {\bf Inputs:} Private dataset $S = \{\zbf_i: i \in [n]\}$, privacy budget $\epsilon, \delta$, number of iterations $T$, learning rates $\{\gamma_t, \lambda_t\}_{t=1}^T$, initial points $(\theta'_0, a'_0, b'_0, c'_0)$
% \STATE Compute $n_+ = \sum_{i=1}^n \Ibb[y_i=1]$ and $n_- = \sum_{i=1}^n \Ibb[y_i=-1]$
% \STATE Compute noise parameter $\sigma$ based on Eq. \eqref{eq:sigma}
% \FOR{$t=1$ to $T$}
% \STATE Randomly select a batch $S_t^1$
% \STATE Sample independent noises $\xi^1_j \sim \Ncal(0, \sigma^2 I_{d+3})$
% \STATE Update \begin{align*}
% \begin{pmatrix}\theta_{t+1}\\a_{t+1}\\b_{t+1}\\c_{t+1}\end{pmatrix} = & \Pi\Big\{ \begin{pmatrix}\theta'_t\\a'_t\\b'_t\\c'_t\end{pmatrix} - \frac{ \gamma_t}{m} \Big( \sum_{j \in I_t}\begin{pmatrix}\nabla_\theta f(\theta'_t, a'_t, b'_t, c'_t; \zbf_j)\\\nabla_a f(\theta'_t, a'_t, b'_t, c'_t; \zbf_j)\\\nabla_b f(\theta'_t, a'_t, b'_t, c'_t; \zbf_j)\\-\nabla_c f(\theta'_t, a'_t, b'_t, c'_t; \zbf_j)\end{pmatrix} + \xi^1_j\Big)\Big\}
% \end{align*}
% \STATE Randomly select a batch $S_t^2$
% \STATE Sample independent noises $\xi^2_j \sim \Ncal(0, \sigma^2 I_{d+3})$
% \STATE Update \begin{align*}
% \begin{pmatrix}\theta'_{t+1}\\a'_{t+1}\\b'_{t+1}\\c'_{t+1}\end{pmatrix} = & \Pi\Big\{ \begin{pmatrix}\theta'_t\\a'_t\\b'_t\\c'_t\end{pmatrix} - \frac{ \gamma_t}{m} \Big( \sum_{j \in I_t}\begin{pmatrix}\nabla_\theta f(\theta_{t+1}, a_{t+1}, b_{t+1}, c_{t+1}; \zbf_j)\\\nabla_a f(\theta_{t+1}, a_{t+1}, b_{t+1}, c_{t+1}; \zbf_j)\\\nabla_b f(\theta_{t+1}, a_{t+1}, b_{t+1}, c_{t+1}; \zbf_j)\\-\nabla_c f(\theta_{t+1}, a_{t+1}, b_{t+1}, c_{t+1}; \zbf_j)\end{pmatrix} + \xi^2_j\Big)\Big\}
% \end{align*}
% \ENDFOR
% \STATE {\bf Outputs:} $(\bar{\theta}_T, \bar{a}_T, \bar{b}_T, \bar{c}_T)$
% \end{algorithmic}
% \end{algorithm}

% \textbf{Regularized Generative Adversarial Networks.} Wasserstein generative adversarial networks (WGANs) admits the formulation
% \[
% F(w,\theta) = \Ebb_{x \sim \Pbb_{data}}[d_w(x)] - \Ebb_{z \sim \Pbb_{z}}[d_w(g_\theta(z))]
% \]
% where $d_w(x)$ denotes a Lipschitz continuous function parameterized by $x$ corresponding to the discriminator. $g_y(z)$ demotes the parameterized function corresponding to the generator. 




\section{Additional Experimental Results}\label{sec:additional-exp}
% \subsection{General Performance in All Settings}
We report the detailed performance of NSEG and DP-SGDA (Linear and MLP settings) for five different privacy budgets $\epsilon\in\{0.1,0.5,1,5,10\}$ and three different values $\delta\in \{\text{1e-4},\text{1e-5},\text{1e-6}\}$ in Table~\ref{tab:general_performance}. From Table~\ref{tab:general_performance}, we can see that the performance decreases as $\delta$ decreases under the same $\epsilon$ setting. The reason is that a smaller $\delta$ corresponds to a larger value of $\sigma$ according to Theorem~\ref{thm:moments-accountant-privacy}. A larger $\sigma$ means that larger noise is added to the gradients during the training updates. Therefore, the AUC performance decreases as $\delta$ decreases. On the other hand, we can see that our DP-SGDA (Linear) outperforms NSEG under the same settings. This is because NSEG adds larger noise to the gradients than DP-SGDA during training, as discussed in Section~\ref{experiments:results}.

We also compare the $\sigma$ values of NSEG and DP-SGDA on all datasets in Figure~\ref{fig:sigma_delta4_and_5}, with (a) $\delta$=1e-5 and (b) $\delta$=1e-4. From the figure, it is clear that the $\sigma$ of NSEG is larger than ours for all $\epsilon$ settings. This implies that the noise added by NSEG is also larger than ours.


\begin{table*}[th!]
% \captionsetup{font=footnotesize}
\centering
\setlength\tabcolsep{2.5pt}
% \scriptsize{
\begin{tabular}{|c|c|cc|c|cc|c|cc|c|}
\hline
\multicolumn{2}{|c|}{Dataset}                  & \multicolumn{3}{c|}{ijcnn1}         & \multicolumn{3}{c|}{MNIST}         & \multicolumn{3}{c|}{Fashion-MNIST}         \\ \hline
\multicolumn{2}{|c|}{\multirow{2}{*}{Algorithm}} & \multicolumn{2}{c|}{Linear} &  MLP  & \multicolumn{2}{c|}{Linear} &   MLP    & \multicolumn{2}{c|}{Linear} &    MLP   \\ \cline{3-11} 
\multicolumn{2}{|c|}{}                  &    NSEG       &    DP-SGDA       &    DP-SGDA   &    NSEG       &    DP-SGDA       &    DP-SGDA       &    NSEG       &    DP-SGDA       &    DP-SGDA       \\ \hline
\multicolumn{2}{|c|}{Original}                  &    92.191 &  92.448 &  96.609  & 93.306  & 93.349  &99.546  & 96.552 &    96.523    &   98.020   \\ \hline
\multirow{5}{*}{$\delta$=1e-4}           &    \makecell{$\epsilon$=0.1}       &   90.231        &     91.229     &   94.020   &    91.285  & 91.962&  98.300 &   95.490 &  95.637 &  96.312   \\ \cline{2-11} 
                            &      \makecell{$\epsilon$=0.5}     &   90.352  &  91.366   &  96.108    &  91.328   &  92.067&  98.703 &  95.533&   95.829 &  97.098  \\ \cline{2-11} 
                            &     \makecell{$\epsilon$=1}      &  90.358       &  91.376      &  96.316   &   91.331 &  92.073&  98.722&    95.536 &   95.840 & 97.143  \\ \cline{2-11} 
                            &    \makecell{$\epsilon$=5}       &   90.363     &   91.385      &  96.326    &   91.334  &  92.079&  98.746&   95.539&  95.849 &   97.208 \\ \cline{2-11} 
                            &     \makecell{$\epsilon$=10}      &  90.363     &  91.387  & 96.329   &  91.335   &   92.080& 98.750 &   95.539 &  95.850  &  97.219\\ \hline
\multirow{5}{*}{$\delta$=1e-5}           &    \makecell{$\epsilon$=0.1}       &  90.168   &  91.169  & 93.274  &  91.266  &  91.910 & 98.092 &  95.468 &  95.535    &  95.989 \\ \cline{2-11} 
                            &     \makecell{$\epsilon$=0.5}      &  90.349  &  91.362  & 96.029 &  91.326  &  92.063&  98.675 &   95.531 &   95.823   &   97.031 \\ \cline{2-11} 
                            &    \makecell{$\epsilon$=1}       &  90.357  & 91.373   & 96.209 &   91.330 &   92.071&  98.714&   95.535  &  95.837  &  97.122 \\ \cline{2-11} 
                            &     \makecell{$\epsilon$=5}      & 90.363   &  91.384 &  96.300   &   91.334 &    92.079&  98.743&   95.538  &   95.848  &  97.200 \\ \cline{2-11} 
                            &     \makecell{$\epsilon$=10}      & 90.363   &  91.386  &  96.301  &   91.334 &  92.080& 98.747 &   95.539 &  95.850 &  97.213 \\ \hline
\multirow{5}{*}{$\delta$=1e-6}           &     \makecell{$\epsilon$=0.1}     & 90.106    &  91.110   &  92.763 &   91.247  &  91.858 &  97.878&  95.446  &  95.468   & 95.692\\ \cline{2-11} 
                            &      \makecell{$\epsilon$=0.5}    &  90.346 & 91.357   &  95.840 &   91.324 &  92.058 &  98.656 &  95.530&  95.816    & 96.988   \\ \cline{2-11} 
                            &     \makecell{$\epsilon$=1}     & 90.355 & 91.371 &  96.167 &   91.330 &  92.070 & 98.705&  95.534 &  95.834  & 97.102\\ \cline{2-11} 
                            &     \makecell{$\epsilon$=5}   & 90.363 & 91.383 &  96.294 &  91.334  &  92.078 & 98.742 & 95.538&   95.848 & 97.198\\ \cline{2-11} 
                            &     \makecell{$\epsilon$=10}  & 90.363& 91.386 &  96.297  &   91.334 &  92.080 &  98.747& 95.539&    95.850  & 97.213  \\ \hline
\end{tabular}
\caption{\small \it Comparison of the AUC performance of NSEG and DP-SGDA (Linear and MLP settings) on three datasets with different $\epsilon$ and different $\delta$. ``Original'' means that no noise is added in the algorithms ($\epsilon=\infty$).}
\label{tab:general_performance}
% }
% \vspace{-1em}
\end{table*}


% \subsection{General Performance in All Settings}

\begin{figure*}[ht]
\begin{subfigure}[t]{0.48\linewidth}
    \includegraphics[width=\linewidth]{figs/noise_size_delta-5.pdf}
    % \caption{}
\end{subfigure}%
    \hfill%
\begin{subfigure}[t]{0.48\linewidth}
    \includegraphics[width=\linewidth]{figs/noise_size_delta-4.pdf}
    % \caption{}
\end{subfigure}
 % \vspace{-1em}
\caption{\em  Comparison of $\sigma$ in NSEG and DP-SGDA (with Linear setting) on three datasets with different $\epsilon$ and (a) $\delta$=1e-5 and (b) $\delta$=1e-4.}
% \vspace{-1em}
\label{fig:sigma_delta4_and_5}
\end{figure*}

\bibliography{yang_122-supp}

\end{document}
