
\documentclass[accepted]{uai2023} 
\usepackage[american]{babel}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage[switch]{lineno}
\usepackage{adjustbox}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{blindtext}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{subcaption}
\usepackage{graphicx}
\theoremstyle{definition}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{claim}{Claim}
\newtheorem{fact}{Fact}
\newtheorem{property}{Property}
\newtheorem{proposition}{Proposition}
\usepackage{color}
\usepackage{xcolor}         % colors
\urlstyle{same}
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\usepackage{natbib} % has a nice set of citation styles and commands
  \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xr} 
\externaldocument{su_604}
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Differentially Private Stochastic Convex Optimization in \\ (Non)-Euclidean Space Revisited\\(Supplementary material)
}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{{Jinyan Su}{}}
\author[2]{Changhong Zhao}
\author[3,4,5]{Di Wang}
% Add affiliations after the authors
\affil[1]{%
   Mohamed bin Zayed University of Artificial Intelligence
}
\affil[2]{%
   Department of Information Engineering,\\
The Chinese University of Hong Kong
}
\affil[3]{%
   Provable Responsible AI and Data Analytics Lab 
  }
  \affil[4]{%
 Computational Bioscience Research Center
  }
  \affil[5]{%
  Division of CEMSE, 
    King Abdullah University of Science and Technology 
  }

  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Notation summary}
\begin{table*}[h]
\centering
\resizebox{1\linewidth}{!}{%	
\begin{tabular}{l|l}
\toprule
$\mathcal{C}$: constraint set & $G_{\mathcal{C}}$: Gaussian width of set $\mathcal{C}$\\
\hline
$d$: dimension & $n$: sample size\\
\hline
$\epsilon,\delta$: privacy parameters& $\ell$: convex loss function\\
\hline
$L$: Lipschitz constant& $\beta$: smoothness constant\\
\hline
$\lambda$: regularization parameter & $\alpha$: optimization accuracy\\
\hline
$\ell_p^d$: Normed space corresponds to $\|\cdot\|_p$, where $\|x\|_p=(\sum_{j=1}^d|x_j|^p)^{1/p}$&$\mathcal{L}(\theta)$: population risk\\ 
\hline
$\hat{\mathcal{L}}(\theta,D)$: empirical risk& $\kappa$: $\kappa$-regular space\\\hline
$||\cdot||_{\mathcal{C}}$: Minkowski norm, $\|\cdot\|_{\mathcal{C}}=\min\{r\in \mathbb{R}^{+}:v\in r\mathcal{C}\}$& $\|\cdot\|_{\mathcal{C}^{*}}$: dual norm of $\|\cdot\|_{\mathcal{C}}$\\\hline
$\sigma$: the variance of Gaussian noise&$\|\cdot\|_{+}$: the smooth norm for $(\mathbf{E}, \|\cdot\|_{*})$ \\
\bottomrule
\end{tabular}}
\caption{Notation summary of the paper.}
\label{tab:notation}
\end{table*}
\section{Omitted Proofs in Section \ref{sec:euclidean}}
\subsection{Proof of Theorem \ref{thm:1}}
\begin{algorithm}
	\caption{$\mathcal{A}_{\text{ObjP}}$: Objective perturbation }
	\begin{algorithmic}[1]
		\State {\bfseries Input:} Dataset $D$, loss function $\ell$, regularization parameter $\lambda$.
		
		
		
	\State  Sample $\mathbf{G}\sim\mathcal{N}(0, \sigma_1^2\mathbb{I}_d)$ where $\sigma_1^2  = \frac{32L^2\log(1/\delta)}{\epsilon^2}$. Set $\lambda\geq \frac{r\beta}{2\epsilon n}$, where $r = \min\{d, 2\cdot \text{rank}(\nabla^2\ell(\theta, x))\}$ with $\text{rank}(\nabla^2\ell(\theta,x))$ being the maximal rank of the Hessian of $\ell$ for all $\theta \in \mathcal{C}$ and $x\sim \mathcal{P}$.
	
	\State Let $\mathcal{J}(\theta,D) = \hat{\mathcal{L}}(\theta,D)+\frac{\langle \mathbf{G},\theta\rangle}{n}+\lambda||\theta||_2^2$.\\
	 \Return $\theta_1 = \underset{\theta\in\mathcal{C}}{\arg\min}\mathcal{J}(\theta, D)$.
	\end{algorithmic}
	\label{alg5}
\end{algorithm}
\begin{proof}
Let $\theta_1 = \underset{\theta\in\mathcal{C}}{\arg\min} \mathcal{J}(\theta, D)$, where $\mathcal{J}(\theta, D)=\hat{\mathcal{L}}(\theta, D)+\frac{\langle \mathbf{G},\theta\rangle}{n}+\lambda ||\theta||_2^2$. Let $\theta_2 = \mathcal{O}(\mathcal{J},\alpha)$ where $\mathcal{O}$ is the optimizer defined in the algorithm. Notice that one can compute $\hat{\theta}$ from tuple $(\theta_1, \theta_2-\theta_1+\mathbf{H})$ by simple post-processing. Furthermore, the algorithm that outputs $\theta_1$ is $(\epsilon,\delta)$-DP by the following theorem.
\begin{lemma}[Theorem 1 in \citep{iyengar2019towards}]
Suppose Assumption \ref{ass1} holds and the smoothness parameter satisfies $\beta \leq \frac{\epsilon n \lambda}{r}$. Then the algorithm $\mathcal{A}_{\text{ObjP}}$ (Algorithm \ref{alg5}) that outputs $\theta_1=\underset{\theta\in\mathcal{C}}{\arg\min} \mathcal{J}(\theta,D)$ is $(\epsilon, \delta)$-DP.
\end{lemma}

Next, we will bound the term $||\theta_2 -\theta_1||$ to make $(\theta_2-\theta_1+\mathbf{H})$ differentially private, conditioned on $\theta_1$. As $\mathcal{J}(\theta, D)$ is $\lambda$-strongly convex, we have $\mathcal{J}(\theta_2, D)\geq \mathcal{J}(\theta_1,D) +\frac{\lambda}{2}||\theta_2-\theta_1||_2^2$, which implies that
\begin{equation}\label{eq13}
       ||\theta_2-\theta_1||_2\leq\sqrt{\frac{2}{\lambda}(\mathcal{J}(\theta_2,D)-\mathcal{J}(\theta_1, D))}\leq \sqrt{\frac{2\alpha}{\lambda}}.
\end{equation}
Thus, conditioned on $\theta_1$, $\theta_2 - \theta_1$ has the $l_2$ sensitivity of $\sqrt{\frac{8\alpha}{\lambda}}$. Therefore, $(\theta_2-\theta_1)+\mathbf{H}$ is $(\epsilon/2, \delta/2)$-DP. By the standard composition in \cite{dwork2014algorithmic}, the tuple $(\theta_1, \theta_2-\theta_1+\mathbf{H})$ satisfies $(\epsilon,\delta)$-DP and hence $\hat{\theta}$ satisfies $(\epsilon,\delta)$-DP.
\end{proof}
\subsection{Proof of Theorem \ref{th4}}
\begin{proof}
Let $\theta_1$ be the exact minimizer of $\mathcal{J}(\theta,D)$. We split the objective $\mathbb{E}[\mathcal{L}(\hat{\theta})]-\mathcal{L}(\theta^{*})$ into two parts and bound them separately.
\begin{equation}\label{eq1}
    \mathbb{E}[\mathcal{L}(\hat{\theta})] - \mathcal{L}(\theta^{*}) = \mathbb{E}[\mathcal{L}(\hat{\theta}) -\mathcal{L}(\theta_1)] + \mathbb{E}[\mathcal{L}(\theta_1)]-\mathcal{L}(\theta^{*}).
\end{equation}
In the following, we bound the term $\mathbb{E}[\mathcal{L}(\hat{\theta}) -\mathcal{L}(\theta_1)]$ and the term $\mathbb{E}[\mathcal{L}(\theta_1)]-\mathcal{L}(\theta^{*})$ separately. 
To bound the term $\mathbb{E}[\mathcal{L}(\theta_1)]-\mathcal{L}(\theta^{*})$, we need the following two lemmas. The first lemma states the excess empirical risk of $\theta_1$ while the second lemma states the stability property of regularized empirical risk minimization.
\begin{lemma}[Excess empirical loss of $\theta_1$ in $\mathcal{A}_{\text{ObjP}}$]\label{le1}
Let $D\sim\mathcal{P}^n$, under Assumption \ref{ass1}, the excess empirical loss of $\theta_1$ satisfies
\begin{equation}\label{eq12}
\mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)]-\underset{\theta\in \mathcal{C}}{\min} \hat{\mathcal{L}}(\theta,D)\leq O\left( \frac{LG_{\mathcal{C}}\sqrt{\log (1/\delta)}}{\epsilon n} +\lambda ||\mathcal{C}||_2^2
\right),
\end{equation}
where the expectation is taken over the randomness induced by Gaussian noise.
\end{lemma}
\begin{lemma}[\citep{shalev2014understanding}]\label{le2}
Let $f:\mathcal{C}\times D\rightarrow \mathbb{R}$ be a convex, $\rho$-Lipschitz loss function where $D=\{x_1,\cdots,x_n\}\sim\mathcal{P}^n$.  Let $\mathcal{A}$ be an algorithm that outputs $\tilde{\theta} = \underset{\theta\in\mathcal{C}}{\arg\min} \{\hat{F}(\theta, D)+\lambda ||\theta||^2\}$  with $\lambda>0$ where $\hat{F}(\theta, D)= \frac{1}{n}\sum_{i =1}^n f(\theta, x_i)$, then $\mathcal{A}$ is $\frac{2\rho^2 }{\lambda n}$-uniformly stable, i.e., for all neighboring datasets $D\sim D'$ we have 
\begin{equation*}
    \sup_{z}|\mathbb{E}[f(\mathcal{A}(D),  z)- f(\mathcal{A}(D'),  z)]|\leq \frac{2\rho^2 }{\lambda n}. 
\end{equation*}
\end{lemma}
The property of uniform stability is described  by the following lemma.
\begin{lemma}[\cite{bousquet2002stability}]\label{le6}
Let $\mathcal{A}:\mathcal{X}^n\rightarrow \mathcal{C}$ be an $\alpha$-uniformly stable algorithm w.r.t. loss $\ell: \mathcal{C}\times \mathcal{X}\rightarrow \mathbb{R}$. Let $D\sim\mathcal{P}^n$ where $\mathcal{P}$ is the distribution over $\mathcal{X}$. Then,
\begin{equation*}
\underset{D\sim\mathcal{P}^n,\mathcal{A}}{\mathbb{E}}[\mathcal{L}(\mathcal{A}(D))-\hat{\mathcal{L}}(\mathcal{A}(D),D)]\leq \alpha.
\end{equation*}
\end{lemma}
Now we begin to bound the term $\mathcal{L}(\theta_1)-\mathcal{L}(\theta^{*})$ using the above three lemmas.
Fix any realization of the noise vector $\mathbf{G}$, we define $f_{\mathbf{G}}(\theta, x) =\ell(\theta, x) +\frac{\langle \mathbf{G},\theta\rangle}{n}$, then $f_{\mathbf{G}}$ is $\left(L+\frac{||\mathbf{G}||_2}{n}\right)$-Lipschitz.

Define $\hat{F}_{\mathbf{G}}(\theta, D) =\frac{1}{n}\sum_{i =1}^n f_{\mathbf{G}}(\theta, x_i)$, and we have $\theta_1 = \underset{\theta\in\mathcal{C}}{\arg\min} \hat{F}_{\mathbf{G}}(\theta, D) +\lambda ||\theta||_2^2$, so from Lemma \ref{le2}, the algorithm that outputs $\theta_1$ is $\frac{2\left(L+\frac{||\mathbf{G}||_2}{n}\right)^2}{\lambda  n }$-uniformly stable.
Denote $F_{\mathbf{G}}(\theta) = \underset{x\sim\mathcal{P}}{\mathbb{E}}[f_{\mathbf{G}}(\theta, x)]$, according to Lemma \ref{le6}, we have 
\begin{equation*}
    \underset{D\sim\mathcal{P}^n}{\mathbb{E}}[\mathcal{L}(\theta_1)-\hat{\mathcal{L}}(\theta_1, D)] = \underset{D\sim\mathcal{P}^n}{\mathbb{E}}[F_{\mathbf{G}}(\theta_1)-\hat{F}_{\mathbf{G}}(\theta_1,D)]\leq \frac{2\left(L+\frac{||\mathbf{G}||_2}{n}\right)^2}{\lambda n}.
\end{equation*}
Taking the expectation w.r.t. $\mathbf{G}\sim \mathcal{N}(0, \frac{32L^2\log(1/\delta)}{\epsilon^2}\mathbb{I}_d)$ as well, we get 
\begin{equation}\label{eq10}
\begin{aligned}
\mathbb{E}[\mathcal{L}(\theta_1)-\hat{\mathcal{L}}(\theta_1, D)]\leq O\left(\frac{L^2\cdot \left(1+\frac{\sqrt{d\log(1/\delta)}}{\epsilon n}\right)^2}{\lambda n}\right)\leq O\left(\frac{L^2}{\lambda n}\right),
\end{aligned}
\end{equation}
 where we assume $n\geq O(\frac{\sqrt{d \log(1/\delta)}}{\epsilon})$.
 
Thus
\begin{equation}\label{eq14}
    \begin{aligned}
    \mathbb{E}[\mathcal{L}(\theta_1)] -\mathcal{L}(\theta^{*})& =  \mathbb{E}[\mathcal{L}(\theta_1)] -\underset{\theta\in\mathcal{C}}{\min} \mathcal{L}(\theta)\\
    &\leq \mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)-\underset{\theta\in\mathcal{C}}{\min}\hat{\mathcal{L}}(\theta, D)] + \mathbb{E}[\mathcal{L}(\theta_1)-\hat{\mathcal{L}}(\theta_1,D)]\\
    & \leq O\left(\frac{L\cdot G_{\mathcal{C}}\cdot\sqrt{\log(1/\delta)}}{\epsilon n}+\lambda ||\mathcal{C}||_2^2 +\frac{L^2}{\lambda n}
    \right),
    \end{aligned}
\end{equation}
where we use the fact that $\underset{D\sim \mathcal{P}^n}{\mathbb{E}}[\underset{\theta\in \mathcal{C}}{\min}\hat{\mathcal{L}}(\theta, D)]\leq \underset{\theta \in \mathcal{C}}{\min}\underset{D\sim \mathcal{P}^n}{\mathbb{E}}[\hat{\mathcal{L}}(\theta,D)] =\underset{\theta \in\mathcal{C}}{\min} \mathcal{L}(\theta)$ and the last bound is directly from Eq.(\ref{eq12}) and Eq.(\ref{eq10}).


Now we bound the term $\mathbb{E}[\mathcal{L}(\hat{\theta})]-\mathcal{L}(\theta_1)$.
Recall that $\theta_2 =\mathcal{O}(\mathcal{J},\alpha)$ and 
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})]-\mathcal{L}(\theta_1) = \mathbb{E}[\mathcal{L}(\hat{\theta})] -\mathcal{L}(\theta_2) +\mathcal{L}(\theta_2)-\mathcal{L}(\theta_1).
\end{equation*}

Note the term $\mathcal{L}(\theta_2)-\mathcal{L}(\theta_1)\leq L\cdot ||\theta_1-\theta_2||_2\leq L\cdot \sqrt{\frac{2\alpha}{\lambda}}$ (From Eq.(\ref{eq13})), and the term $\mathbb{E}[\mathcal{L}(\hat{\theta})] -\mathcal{L}(\theta_2)\leq L\cdot\mathbb{E}[||\hat{\theta}-\theta_2||_2]$. 

Also note that $\hat{\theta} =\text{Proj}_{\mathcal{C}}(\theta_2+\mathbf{H})$. Let $q$ be the line through $\theta_2$ and $\hat{\theta}$, and let $p$ be the projection of $\theta_3 = \theta_2+\mathbf{H}$ onto $q$. The key observation is that $p$ lies on the ray from $\hat{\theta}$ to infinity; otherwise, $p$ would be a point in $\mathcal{C} $ that is closer to $\theta_3 $ than $\hat{\theta}$. Thus we have
\begin{equation*}
    \begin{aligned}
    \mathbb{E}[||\hat{\theta}-\theta_2||_2^2]&= \mathbb{E}[\langle \hat{\theta}-\theta_2 ,\hat{\theta}-\theta_2\rangle]\\
    & \leq \mathbb{E}[\langle \hat{\theta}-\theta_2,\theta_3-\theta_2\rangle]\\
    & =\mathbb{E}[\langle \mathbf{H},\hat{\theta}-\theta_2\rangle]\\
    &\leq 2\cdot\underset{\theta\in\mathcal{C}}{\max}\mathbb{E}[\langle \mathbf{H}, \theta\rangle]\\
    & \leq O(\mathbb{E}[\max |\langle \mathbf{H},\theta\rangle|])\\
    & =O\left(\sqrt{\frac{\alpha \log(1/\delta)}{\lambda}}\cdot \frac{G_{\mathcal{C}}}{\epsilon}\right),
    \end{aligned}
\end{equation*}
where the last equation is from the definition of Gaussian width.

So we have 
    \begin{equation}\label{eq15}
    \begin{aligned}
    \mathbb{E}[\mathcal{L}(\hat{\theta})]-\mathcal{L}(\theta_1)&\leq   
     L\cdot \sqrt{\frac{2\alpha}{\lambda}}+L\cdot \mathbb{E}[||\hat{\theta}-\theta_2||_2]\\
    &\leq O\left(L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)}{\lambda}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}}+L\sqrt{\frac{\alpha}{\lambda}}\right).
    \end{aligned}
    \end{equation}
    In total, combining Eq.(\ref{eq14}) and Eq.(\ref{eq15}), we can bound Eq. (\ref{eq1}) by
    \begin{equation*}
    \begin{aligned}
        \mathbb{E}[\mathcal{L}(\hat{\theta})]- \mathcal{L}(\theta^{*})
        &=\mathbb{E}[\mathcal{L}(\hat{\theta}) -\mathcal{L}(\theta_1)] + \mathbb{E}[\mathcal{L}(\theta_1)]-\mathcal{L}(\theta^{*})\\
        &\leq O\left( L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)}{\lambda}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}}+L\sqrt{\frac{\alpha}{\lambda}}+\frac{L\cdot G_{\mathcal{C}}\cdot\sqrt{\log(1/\delta)}}{\epsilon n}+\lambda ||\mathcal{C}||_2^2 +\frac{L^2}{\lambda n}\right).
        \end{aligned}
    \end{equation*}
    Since $\alpha\leq \min\left\{
\frac{L||\mathcal{C}||_2}{n^{\frac{3}{2}}}, \frac{\epsilon^2 L||\mathcal{C}||_2^3}{G^2_{\mathcal{C}}\log(1/\delta) n^{\frac{5}{2}}}
\right\}$, we have $\sqrt{L\cdot||\mathcal{C}||_2\sqrt{n}\alpha}\leq \frac{L\cdot ||\mathcal{C}||_2}{\sqrt{n}}$ and $L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)}{\lambda}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}}\leq \frac{L\cdot ||\mathcal{C}||_2}{\sqrt{n}}$. Let $\lambda = \frac{L}{\sqrt{n}||\mathcal{C}||_2}$,
    then 
    \begin{equation*}
        \mathbb{E}[\mathcal{L}(\hat{\theta})]-\mathcal{L}(\theta^{*}) \leq O\left(\frac{L\cdot G_{\mathcal{C}}\cdot\sqrt{\log(1/\delta)}}{\epsilon n}+\frac{L||\mathcal{C}||_2}{\sqrt{n}}\right).
    \end{equation*}
    Note that we need $\lambda =\frac{L}{\sqrt{n}||\mathcal{C}||_2}\geq \frac{r \beta}{\epsilon n }$, namely, $n\geq \frac{r^2\beta^2 ||\mathcal{C}||_2^2}{\epsilon^2 L^2}$.
\end{proof}
\begin{proof}[{\bf Proof of Lemma \ref{le1}}]
Let $\bar{\mathcal{L}}(\theta, D) = \hat{\mathcal{L}}(\theta,D) + \lambda ||\theta||_2^2$ and $\bar{\theta}=\underset{\theta\in\mathcal{C}}{\arg\min} \bar{\mathcal{L}}(\theta, D)$. So $\mathcal{J}(\theta, D) = \bar{\mathcal{L}}(\theta, D) +\frac{\langle \mathbf{G}, \theta\rangle}{n}$. Since $\theta_1 $ minimizes $ \mathcal{J}(\theta, D)$, we have $\mathcal{J}(\bar{\theta}, D)\geq \mathcal{J}(\theta_1, D)$, namely,
\begin{equation*}
    \bar{\mathcal{L}}(\bar{\theta}, D) +\frac{\langle \mathbf{G}, \bar{\theta}\rangle}{n} \geq \bar{\mathcal{L}}(\theta_1,D) +\frac{\langle \mathbf{G},\theta_1\rangle}{n}.
\end{equation*}
Recall that $\mathbf{G}\sim \mathcal{N}(0, \frac{32L^2\log(1/\delta)}{\epsilon^2} \mathbb{I}_d)$. Rearranging the inequality and taking the expectation on both sides, we get
\begin{equation*}
    \begin{aligned}
      \mathbb{E}[\bar{\mathcal{L}}(\theta_1,D)-\bar{\mathcal{L}}(\bar{\theta},D)]&\leq \mathbb{E}[\frac{\langle\mathbf{G},\bar{\theta}-\theta_1\rangle}{n}]\\
      & \leq 2 \cdot\underset{\theta\in\mathcal{C}}{\max}\mathbb{E} \left[\frac{\langle \mathbf{G}, \theta\rangle}{n}\right]\\
      &\leq 2\cdot \mathbb{E}\left[\underset{\theta\in\mathcal{C}}{\max}\left|\frac{\langle \mathbf{G},\theta\rangle}{n}\right|\right]\\
      &=O\left(\frac{L\cdot G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{\epsilon n}\right),
    \end{aligned}
\end{equation*}
where the last bound is from the definition of Gaussian width.

Thus 
\begin{equation*}
    \begin{aligned}
    \mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)-\hat{\mathcal{L}}(\theta^{*},D)] &= \mathbb{E}[\bar{\mathcal{L}}(\theta_1, D) - \bar{\mathcal{L}}(\theta^{*},D)+\lambda ||\theta^{*}||_2^2 -\lambda ||\theta_1||_2^2]\\
    &\leq \mathbb{E}[\bar{\mathcal{L}}(\theta_1, D) - \bar{\mathcal{L}}(\theta^{*},D)+\lambda ||\theta^{*}||_2^2]\\
    & \leq \mathbb{E}[\bar{\mathcal{L}}(\theta_1, D) - \bar{\mathcal{L}}(\bar{\theta},D)+\lambda ||\theta^{*}||_2^2]\\
    & \leq O\left(\frac{L\cdot G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{\epsilon n} +\lambda ||\mathcal{C}||_2^2\right).
    \end{aligned}
\end{equation*}
\end{proof}
\subsection{Proof of Theorem \ref{thm:3}}
\begin{proof}
The proof is similar to the convex case. Note that $\mathcal{J}(\theta, D)$ is a  $\frac{r\beta}{\epsilon n}$-strongly convex function.
\end{proof}
\subsection{Proof of Theorem \ref{th5}}
\begin{proof}
By the assumptions we made about $n$, we have $\Delta\geq \frac{L\cdot ||\mathcal{C}||_2}{\sqrt{n}}$ and $\frac{L}{\sqrt{n}||\mathcal{C}||_2}\geq \frac{r \beta}{\epsilon n}$. 

The loss function is $\Delta$-strongly convex with respect to $||\cdot||_{\mathcal{C}}$, which implies that it is $\frac{\Delta}{||\mathcal{C}||_2^2}$-strongly convex w.r.t. $||\cdot||_2$ and thus  $\frac{L}{\sqrt{n}||\mathcal{C}||_2}$-strongly convex w.r.t. $||\cdot||_{2}$, where we use the facts that $\Delta\geq \frac{L\cdot ||\mathcal{C}||_2}{\sqrt{n}}$ and $||v||_{\mathcal{C}}\geq \frac{||v||_2}{||\mathcal{C}||_2}$ for any vector $v\in\mathcal{C}$.


Since $\Delta \geq \frac{L}{\sqrt{n}||\mathcal{C}||_2}\geq \frac{r \beta}{\epsilon n}
$, we have $\lambda= \max\left\{
	\frac{r \beta}{\epsilon n}-\Delta,0
	\right\}=0$.
 
The population loss can be disassembled as the following two parts, and we bound them separately. 
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})] - \mathcal{L}(\theta^{*}) = \mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_1)] + \mathbb{E}[\mathcal{L}(\theta_1)] - \mathcal{L}(\theta^{*}).
\end{equation*}
We first bound $\mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_1)]$.
Note that
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_1)] = \mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_2)] +\mathbb{E}[\mathcal{L}(\theta_2)-\mathcal{L}(\theta_1)].
\end{equation*}
For the term $\mathbb{E}[\mathcal{L}(\theta_2)-\mathcal{L}(\theta_1)]$, recall that $\mathcal{L}$ is $\Delta$-strongly convex w.r.t. $||\cdot||_{\mathcal{C}}$ and thus $\frac{\Delta}{||\mathcal{C}||_2^2}$-strongly convex w.r.t. $||\cdot||_2$. So by the definition of strong convexity of $\mathcal{L}$, we have
\begin{equation*}
    \alpha \geq \mathcal{L}(\theta_2)-\mathcal{L}(\theta_1)\geq \frac{\Delta}{2||\mathcal{C}||_2^2}||\theta_2-\theta_1||_2^2,
\end{equation*}
where $\alpha$ is the optimization accuracy.

Thus,
\begin{equation*}
    ||\theta_2-\theta_1||_2\leq \sqrt{\frac{2\alpha ||\mathcal{C}||_2^2}{\Delta}}.
\end{equation*}

So using the definition of $L$-Lipschitz,
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\theta_2)-\mathcal{L}(\theta_1)] \leq L \cdot \mathbb{E}[||\theta_2-\theta_1||_2]\leq L\cdot \sqrt{\frac{2\alpha ||\mathcal{C}||_2^2}{\Delta}}.
\end{equation*}




For term $\mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_2)]$, it is similar to the convex case, and we have
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_2)]\leq O\left(L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)||\mathcal{C}||_2^2}{\Delta}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}}\right).
\end{equation*}
Thus, 
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})-\mathcal{L}(\theta_1)]\leq O\left(L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)||\mathcal{C}||_2^2}{\Delta}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}} + L\cdot \sqrt{\frac{2\alpha ||\mathcal{C}||_2^2}{\Delta}}
    \right).
\end{equation*}
Next we bound $\mathbb{E}[\mathcal{L}(\theta_1)] - \mathcal{L}(\theta^{*})$.
Note that 
\begin{equation*}
   \mathbb{E} [\mathcal{L}(\theta_1)]-\mathcal{L}(\theta^{*})\leq \mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)-\underset{\theta\in\mathcal{C}}{\min}\hat{\mathcal{L}}(\theta, D)] + \mathbb{E}[\mathcal{L}(\theta_1)-\hat{\mathcal{L}}(\theta_1,D)],
\end{equation*}
where we used the fact that $\mathbb{E}[\underset{\theta \in \mathcal{C}}{\min }\hat{\mathcal{L}}(\theta, D)]\leq \underset{\theta \in \mathcal{C}}{\min } \mathbb{E}[\hat{\mathcal{L}}(\theta, D)]=\mathcal{L}(\theta^{*})$.


For term $\mathbb{E}[\mathcal{L}(\theta_1)-\hat{\mathcal{L}}(\theta_1,D)]$, note that with $\lambda = 0$,  $f_{\mathbf{G}}(\theta, x) = \ell(\theta, x)+\frac{\langle \mathbf{G},\theta\rangle}{n}$ would be $\frac{\Delta}{||\mathcal{C}||_2^2}$ strongly convex w.r.t. $||\cdot||_2$. Using the same notation as in the convex case, where $\hat{F}_{\mathbf{G}}(\theta,D) = \frac{1}{n}\sum_{i=1}^n f_{\mathbf{G}}(\theta,x_i)$ and $F_{\mathbf{G}}(\theta)=\underset{x\sim \mathcal{P}}{\mathbb{E}}[f_{\mathbf{G}}(\theta,x)]$, we have
\begin{equation*}
\begin{aligned}
    \mathbb{E}[\mathcal{L}(\theta_1)-\hat{\mathcal{L}}(\theta_1,D)]& = \mathbb{E}[F_{\mathbf{G}}(\theta_1)-\hat{F}_{\mathbf{G}}(\theta_1,D)]\\
    &\leq \frac{\left(L+\frac{||\mathbf{G}||_2}{n}\right)^2||\mathcal{C}||_2^2}{n\Delta}~(\text{According to Lemma \ref{le2}})\\
    &\leq O\left(\frac{L^2||\mathcal{C}||_2^2}{n\Delta}
    \right) (\text{since} ~n\geq O\left(\frac{\sqrt{d\log(1/\delta)}}{\epsilon}\right)).
    \end{aligned}
\end{equation*}
Let $\theta^{'} = \underset{\theta\in\mathcal{C}}{\arg\min}\hat{\mathcal{L}}(\theta, D)$. In the following, we bound 
the term $\mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)-\underset{\theta\in\mathcal{C}}{\min}\hat{\mathcal{L}}(\theta, D)]=\mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)-\hat{\mathcal{L}}(\theta^{'}, D)]$.


By the definition of strong convexity,
\begin{equation*}
\begin{aligned}
   \hat{\mathcal{L}}(\theta_1,D)&\geq \hat{\mathcal{L}}(\theta^{'},D)+\frac{\Delta}{2} ||\theta_1-\theta^{'}||_{\mathcal{C}}^2,\\
   \Leftrightarrow \hat{\mathcal{L}}(\theta_1,D)+\frac{\langle \mathbf{G},\theta_1\rangle}{n}-\frac{\langle \mathbf{G},\theta_1\rangle}{n}&\geq \hat{\mathcal{L}}(\theta^{'},D)+\frac{\langle \mathbf{G},\theta^{'}\rangle}{n}-\frac{\langle \mathbf{G},\theta^{'}\rangle}{n}+\frac{\Delta}{2} ||\theta_1-\theta^{'}||_{\mathcal{C}}^2, \\
   \Leftrightarrow \mathcal{J}(\theta_1,D)-\frac{\langle \mathbf{G},\theta_1\rangle}{n}&\geq \mathcal{J}(\theta^{'},D)-\frac{\langle \mathbf{G},\theta^{'}\rangle}{n}+\frac{\Delta}{2} ||\theta_1-\theta^{'}||_{\mathcal{C}}^2.
   \end{aligned}
\end{equation*}
So,
\begin{equation*}
     \mathcal{J}(\theta_1,D)-\mathcal{J}(\theta^{'},D)+\frac{\langle \mathbf{G},\theta^{'}-\theta_1\rangle}{n}\geq \frac{\Delta}{2} ||\theta_1-\theta^{'}||_{\mathcal{C}}^2.
\end{equation*}
Since $\mathcal{J}(\theta_1,D)-\mathcal{J}(\theta^{'},D)\leq 0$ (due to the optimality condition), we get
\begin{equation}\label{eq16}
\begin{aligned}
    \frac{\langle \mathbf{G},\theta^{'}-\theta_1\rangle}{n}&\geq \frac{\Delta}{2} ||\theta_1-\theta^{'}||_{\mathcal{C}}^2,\\
    \Rightarrow &||\theta_1-\theta^{'}||_{\mathcal{C}}\leq \frac{2\cdot \langle \mathbf{G},\frac{\theta^{'}-\theta_1}{||\theta^{'}-\theta_1||_{\mathcal{C}}}\rangle }{n\Delta},
   \\
     \Rightarrow & ||\theta_1-\theta^{'}||_{\mathcal{C}}\leq 2\cdot \underset{\theta \in \mathcal{C}} {\max}\frac{\langle \mathbf{G},\theta\rangle}{n\Delta} = \frac{2||\mathbf{G}||_{\mathcal{C}^{*}}}{n\Delta}.
    \end{aligned}
\end{equation}
Using $\mathcal{J}(\theta_1,D)-\mathcal{J}(\theta^{'},D)\leq 0$ again, and taking the expectation on both sides, 
\begin{equation*}
    \mathcal{L}(\theta^{'})+\mathbb{E}[\frac{\langle \mathbf{G},\theta^{'}\rangle}{n}] \geq \mathcal{L}(\theta_1)+ \mathbb{E}[\frac{\langle\mathbf{G},\theta_1\rangle}{n}].
\end{equation*}
Thus
\begin{equation*}
\begin{aligned}
    \mathcal{L}(\theta_1)-\mathcal{L}(\theta^{'})&
    \leq \mathbb{E}[\frac{\langle \mathbf{G},\theta^{'}-\theta_1\rangle}{n}]\\
    &\leq \mathbb{E}\left[\frac{||\mathbf{G}||_{\mathcal{C}^{*}}}{n}\cdot ||\theta_1 -\theta^{'}||_{\mathcal{C}}\right]~\text{(H\"older's inequality)}\\
    & \leq \mathbb{E}\left[\frac{2||\mathbf{G}||^2_{\mathcal{C}^{*}}}{n^2 \Delta}
   \right]~\text{(according to Eq.(\ref{eq16}))}\\
    &\leq O\left(
    \frac{G_{\mathcal{C}}^2L^2 \log(1/\delta)}{\Delta n^2 \epsilon^2}
    \right).
    \end{aligned}
\end{equation*}
Thus $\mathbb{E}[\hat{\mathcal{L}}(\theta_1,D)-\underset{\theta\in\mathcal{C}}{\min}\hat{\mathcal{L}}(\theta, D)]\leq  O\left(\frac{L^2||\mathcal{C}||_2^2}{n\Delta}+
    \frac{G_{\mathcal{C}}^2L^2 \log(1/\delta)}{\Delta n^2 \epsilon^2}
    \right)$.
    So
    \begin{equation*}
         \mathbb{E}[\mathcal{L}(\hat{\theta})] - \mathcal{L}(\theta^{*})\leq O\left(
         \frac{L^2||\mathcal{C}||_2^2}{n\Delta}+
    \frac{G_{\mathcal{C}}^2L^2 \log(1/\delta)}{\Delta n^2 \epsilon^2}
        + L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)||\mathcal{C}||_2^2}{\Delta}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}} + L\cdot \sqrt{\frac{2\alpha ||\mathcal{C}||_2^2}{\Delta}}
         \right).
    \end{equation*}
    When $\alpha\leq O\left(\min\left\{
\frac{L^2 ||\mathcal{C}||_2^2}{\Delta n^2},\frac{L^4\cdot ||\mathcal{C}||_2^6\epsilon^2}{\Delta^3 n^4 G_{\mathcal{C}}^2\log(1/\delta)}
\right\}
\right)$, we have $ L\cdot \sqrt{\frac{2\alpha ||\mathcal{C}||_2^2}{\Delta}}\leq  \frac{L^2||\mathcal{C}||_2^2}{n\Delta} $ and $ L\cdot\sqrt[4]{\frac{\alpha \log(1/\delta)||\mathcal{C}||_2^2}{\Delta}}\cdot \sqrt{\frac{G_{\mathcal{C}}}{\epsilon}}\leq  \frac{L^2||\mathcal{C}||_2^2}{n\Delta}$.

Thus,
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})] - \mathcal{L}(\theta^{*})\leq O\left( \frac{L^2||\mathcal{C}||_2^2}{n\Delta} +  \frac{G_{\mathcal{C}}^2L^2 \log(1/\delta)}{\Delta n^2 \epsilon^2}
    \right).
\end{equation*}
    

\end{proof}

\subsection{Proof of Theorem \ref{th1}}
\begin{proof}
To show the proof, we first prove  the following theorem on the lower bound of excess empirical risk and then use reduction  from Private ERM to Private SCO to get the lower bound for excess population risk.
\begin{theorem}\label{th2}
Let $\mathcal{C}$ be a symmetric body contained in the unit Euclidean ball $\mathcal{B}_2^d$ in $\mathbb{R}^d$ satisfying $\|\mathcal{C}\|_2=1$. For any $ n=O(\frac{\sqrt{d\log(1/\delta)}}{\epsilon})$, $\epsilon=O(1)$ and  $2^{-\Omega(n)}\leq \delta\leq 1/n^{1+\Omega(1)}$, there exists a loss $\ell$ which is $1$-Lipschitz w.r.t. $\|\cdot\|_2$ and $\mathcal{C}^2_{\min}$-strongly convex w.r.t. $\|\cdot\|_\mathcal{C}$, and a dataset $D=\{x_1,\cdots,x_n\}\subseteq \mathcal{C}$ such that for any $(\epsilon,\delta)$-differentially private algorithm $\mathcal{A}$, its output 
  satisfies 
	\[
	\mathbb{E}[\hat{\mathcal{L}}(\mathcal{A},D)]-\underset{\theta\in\mathcal{C}}{\min} \hat{\mathcal{L}}(\theta,D)=\Omega \left(\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2}\right),\]
  where the expectation is taken over the internal randomness of the algorithm $\mathcal{A}$.
\end{theorem}
\begin{theorem}[Reduction from private ERM to private SCO \citep{bassily2019private}]\label{th8}
For any $\gamma>0$, suppose there is a $\left(\frac{\epsilon}{4 \log(1/\delta)}, \frac{e^{-\epsilon}\delta}{8\log (2/\delta)}\right)$-DP algorithm $\mathcal{A}$ such that for any distribution on domain $\mathcal{X}$, $\mathcal{A}$  yields expected population loss $\mathbb{E}_{\mathcal{A}}[\mathcal{L}(\mathcal{A})]-\min_w \mathcal{L}(w)< \gamma$. Then, there is a $(\epsilon,\delta)$-DP algorithm $\mathcal{B}$ that given any dataset  $D\in \mathcal{X}^n$, it yields expected excess empirical loss $\mathbb{E}_{\mathcal{B}}[\hat{\mathcal{L}}(\mathcal{B},D)]-\min_w \hat{\mathcal{L}}(w,D)< \gamma$.


\end{theorem}
From Theorem \ref{th8}, for any dataset $D$ and any 1-Lipschitz, $\mathcal{C}_{\min}^2$-strongly convex loss $\ell$, if there exists an algorithm with excess population loss
\begin{equation*}
	\mathbb{E}[{\mathcal{L}}(\theta^{priv})]-\underset{\theta\in\mathcal{C}}{\min} {\mathcal{L}}(\theta)=o \left(\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2}\right),
\end{equation*}
then there exists an algorithm $\mathcal{B}$ such that the excess empirical loss $
	\mathbb{E}[\hat{\mathcal{L}}(\mathcal{B},D)]-\underset{\theta\in\mathcal{C}}{\min} \hat{\mathcal{L}}(\theta,D)=o \left(\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2}\right)
	$, which contradicts Theorem  \ref{th2}.


Thus, $\forall n=O(\frac{\sqrt{d\log(1/\delta)}}{\epsilon})$, there exists a dataset $D=\{x_1,\cdots,x_n\}\subseteq \mathcal{C}$ and a strongly convex loss function $\ell$ such that for any output $\theta^{priv}$, the excess population loss
	$
	\mathbb{E}[{\mathcal{L}}(\theta^{priv})]-\underset{\theta\in\mathcal{C}}{\min} {\mathcal{L}}(\theta)=\Omega \left(\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2}\right)$.



As a result, we have
\begin{equation*}
	\mathbb{E}[{\mathcal{L}}(\theta^{priv})]-\underset{\theta\in\mathcal{C}}{\min} {\mathcal{L}}(\theta)=\Omega \left(\max\left\{\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2},\frac{1}{n}\right\}\right),
\end{equation*}
where the first term is the lower bound on excess empirical loss and the second term is the lower bound on excess population loss in the non-private setting.
	
\end{proof}
\begin{proof}[{\bf Proof of Theorem \ref{th2}}]
Before starting our proof, we give some background on the mean point problem.

Let $\bar{x}=\frac{1}{n}\sum\limits_{i=1}^nx_i$ be the mean of the database $D$, where $D=\{x_1,\cdots,x_n\}$ is a multiset of points in $\mathcal{C}$.
The sample complexity of the mean point problem to achieve an error $\alpha$ with respect to an algorithm $\mathcal{A}$ is defined as 
\begin{equation*}
SC_{mp}(\mathcal{C},\mathcal{A},\alpha)=\min \{n:\underset{D}{\sup}~(\mathbb{E}||\mathcal{A}(D)-\bar{x}||_2^2)^{1/2}\leq \alpha\},
\end{equation*}
where the supremum is taken over the database $D$ consisting of at most $n$ points from $\mathcal{C}$ and the expectation is taken over the randomness of the algorithm $\mathcal{A}$.

The sample complexity of solving the mean point problem with error $\alpha$ under $(\epsilon,\delta)$-differential privacy over convex set $\mathcal{C}$ is defined as the minimum number of samples among all the differentially private algorithms $\mathcal{A}$:
\begin{equation*}
SC_{mp}(\mathcal{C},\alpha)=\min \{	SC_{mp}(\mathcal{C},\mathcal{A},\alpha):\mathcal{A} ~\text{is}~ (\epsilon,\delta)\text{-differentially private} \}.
\end{equation*}
Previous work \cite{kattis2016lower} shows that we can characterize sample complexity $SC_{mp}(\mathcal{C},\alpha)$ as a natural property of convex set $\mathcal{C}$.



\begin{lemma}[\cite{kattis2016lower}]\label{le7}
	Let $\mathcal{C}$ be a symmetric convex body contained in the unit Euclidean ball $\mathcal{B}_2^d$ in $\mathbb{R}^{d}$. Let $c$ be an absolute constant, then for any $\epsilon=O(1), 2^{-\Omega(n)}\leq \delta\leq 1/n^{1+\Omega(1)}$ and any $\alpha\leq \frac{G_{\mathcal{C}}}{c\sqrt{d}(\log2d)^2}$, 
	\begin{equation}\label{eq17}
	SC_{mp}(\mathcal{C},\alpha)=\Omega\left(\frac{G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{(\log 2d)^2\alpha\epsilon}\right),
	\end{equation}
	\begin{equation*}
	SC_{mp}(\mathcal{C},\alpha)=O\left(\min\left\{\frac{G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{\alpha^2\epsilon},\frac{\sqrt{d\log(1/\delta)}}{\alpha\epsilon}\right\}\right).
	\end{equation*}
	
	When $G_{\mathcal{C}}=\Omega(\sqrt{d})$, then $SC_{mp}(\mathcal{C},\alpha)=\Theta\left(\frac{\sigma(\epsilon, \delta)\sqrt{d}}{\alpha}\right)$ for any $\alpha\leq 1/c$.
\end{lemma}

Now we start our proof with the help of the above lemma.

	Let $\ell(\theta;x)=\frac{1}{2} ||\theta-x||_2^2$ be half of the squared $\ell_2$-distance between $\theta\in \mathcal{C}\subseteq \mathcal{B}_2^d$ and $x\in \mathcal{C}$, which is $1$-Lipschitz and $1$-strongly convex w.r.t. $\|\cdot\|_2$. Actually, based on the following lemma we can easily show it is $\mathcal{C}^2_{\min}$-strongly convex w.r.t. $\|\cdot\|_\mathcal{C}$. 

 \begin{lemma}
     For any $x$, we have $\|x\|_2\geq \|x\|_\mathcal{C} \cdot \mathcal{C}_{\min}$.
 \end{lemma}
\begin{proof}
    By the definition of $\|x\|_\mathcal{C}$, it is sufficient to show that $x\in \frac{\|x\|_2}{\mathcal{C}_{\min} } \mathcal{C}$. Since $\mathcal{C}$ is symmetric and $\mathcal{C}_{\min}$ is the minimal distance from the origin to the boundary of $\mathcal{C}$, the set $\frac{\mathcal{C}}{\mathcal{C}_{\min} }$ contains the unit $\ell_2$-norm ball, indicating that $x\in \frac{\|x\|_2}{\mathcal{C}_{\min} } \mathcal{C}$. 
\end{proof}

 
	The strongly convex decomposable loss function is defined as $\hat{\mathcal{L}}(\theta;D)=\frac{1}{n}\sum\limits_{i=1}^n\ell(\theta;x_i)=\frac{1}{2n}\sum\limits_{i=1}^n||\theta-x_i||_2^2$. 
	Notice that the minimizer of $\hat{\mathcal{L}}(\cdot;D)$ over $\mathcal{B}_2^d$ is $\theta^{*}=\frac{1}{n}\sum\limits_{i=1}^n x_i\in\mathcal{C}$, and the excess empirical risk can be written as:
	\begin{equation*}
	\mathbb{E}[\hat{\mathcal{L}}(\theta^{priv};D)]-\hat{\mathcal{L}}(\theta^{*};D)=\frac{1}{2}\mathbb{E}||\theta^{priv}-\theta^{*}||_2^2=\frac{1}{2}\mathbb{E}||\theta^{priv}-\frac{1}{n}\sum\limits_{i=1}^n x_i||_2^2.
	\end{equation*}
	We prove the theorem by contradiction.
		Assume Theorem \ref{th2} is false. Then there exists an $(\epsilon,\delta)$-differentially private algorithm $\mathcal{A}$ such that, for some 
		$n=O(\frac{\sqrt{d\log(1/\delta)}}{\epsilon})$ and any dataset $D$ of size $n$, it outputs $\theta^{priv}$ such that $\mathbb{E}[\hat{\mathcal{L}}(\theta^{priv};D)]-\hat{\mathcal{L}}(\theta^{*};D)=\frac{1}{2}\mathbb{E}||\theta^{priv}-\frac{1}{n}\sum\limits_{i=1}^n x_i||_2^2=o\left(\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2}\right)$.
		
		In  Lemma \ref{le7}, 
  \begin{equation*}
		\begin{aligned}
		    SC_{mp}(\mathcal{C},\alpha)=&\min\{n:\underset{D}{\sup}~ (\mathbb{E}||\theta^{priv}-\bar{x}||_2^2)\leq \alpha^2\}\\
		    =& \Omega\left(\frac{G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{(\log 2d)^2\alpha\epsilon}\right) ~~\text{(using Eq.~(\ref{eq17}))}\\
		    =&\omega(n)~~~~~\left(\text{by letting }\alpha=o\left(\frac{G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{(\log(2d))^2\epsilon n}\right)\right),
		    \end{aligned}
		\end{equation*}
	that is, more than $n$ samples are required to achieve error $\alpha$, which contradicts the assumption that $\mathcal{A}$ achieves this error with $n$ samples.
\end{proof}
\section{Omitted Proofs in Section \ref{sec:regular}}
\subsection{Proof of Theorem \ref{thm:6}}
\begin{proof}
Note that for any neighboring datasets $D$ and $D^{'}$, we have $||\nabla \hat{\mathcal{L}}(w_t,D)-\nabla \hat{\mathcal{L}}(w_t,D^{'})||_{*}\leq \frac{2L}{n}$ by the Lipschitz assumption.
Since for $\ell_p^d$-space, $||\cdot||_{*} = ||\cdot||_{\frac{p}{p-1}}$, the space $(\mathbf{E},||\cdot||_{*})$ is $\kappa$-regular with $\kappa = \min\{\frac{p}{p-1}-1, 2\ln d \}=\min\{\frac{1}{p-1},2\ln d\}$, so using the privacy guarantee provided by generalized Gaussian mechanism and the advanced composition theorem, the algorithm is $(\epsilon, \delta)$-DP.
\end{proof}

\subsection{Proof of Theorem \ref{th:7}}
\begin{proof}

Observe that $\Phi(x) =\frac{\kappa}{2}||x||_{\kappa_{+}}^2$, where $\kappa=\min\{\frac{1}{p-1},2\ln d\}$ and $\kappa_{+}=\frac{\kappa}{\kappa-1}$, is 1-strongly convex w.r.t. $||\cdot||$ by the definition of $||\cdot||_{\kappa_{+}}$ and the duality between strong convexity and smoothness.
We recall the following lemma showing that adding regularization may impair smoothness, but it also induces good properties such as relative smoothness and relative strong convexity.

\begin{lemma}\label{le3}(Lemma 14 in \cite{attia2022uniform})
Let $f(x)$ be a convex and $\beta$-smooth function w.r.t. $||\cdot||$ and $\Phi(x)$ be 1-strongly convex w.r.t. $||\cdot||$, then $f^{\alpha}(x) = f(x)+\alpha \cdot \Phi(x)$ for $\alpha>0$ is $(\alpha+\beta)$-smooth relative to $\Phi(x)$ as well as $\alpha$-strongly convex relative to $\Phi(x)$.
\end{lemma}

Let $w_{\alpha}^{*} = \underset{w\in\mathbf{E}}{\arg\min}\hat{\mathcal{L}}(w,D)+\alpha \Phi(w)$, $w^{*}=\underset{w\in\mathbf{E}}{\arg\min}\mathcal{L}(w)$ and $\tilde{w}^{*}=\tilde{w}^{*}(D)=\underset{w\in\mathbf{E}}{\arg\min}\hat{\mathcal{L}}(w,D)$, and $C_D=\Phi^{\frac{1}{2}}(\tilde{w}^{*})$.
Based on the optimality of $w_{\alpha}^{*}$ for the regularized objective function $\hat{\mathcal{L}}(w,D)+\alpha \Phi(w)$, along with the optimality of $\tilde{w}^{*}$ for the objective $\hat{\mathcal{L}}(w,D)$,
we have

\begin{align}
    \hat{\mathcal{L}}(w_{\alpha}^{*},D)+\alpha \Phi(w_{\alpha}^{*})&\leq \hat{\mathcal{L}}(\tilde{w}^{*},D)+\alpha \Phi(\tilde{w}^{*}),\notag\\
  \implies     \Phi(\tilde{w}^{*})-\Phi(w^{*}_{\alpha}) &\geq  \frac{
    \hat{\mathcal{L}}(w_{\alpha}^{*},D)-\hat{\mathcal{L}}(\tilde{w}^{*},D)
    }{\alpha} \geq 0, \notag\\
  \implies  \Phi(\tilde{w}^{*})&\geq\Phi(w_{\alpha}^{*}). \label{eq18}
    \end{align}

Since $w_1 =0= \underset{w\in\mathbf{E}}{\arg\min}\Phi(w)$, from the first-order optimality of $w_1$, we have $\langle\nabla \Phi(w_1),w_1-w_{\alpha}^{*}\rangle \leq 0$ and thus
\begin{equation*}
\begin{aligned}
    D_{\Phi}(w_{\alpha}^{*},w_1) &=\Phi(w_{\alpha}^{*})-\Phi(w_1)-\langle\nabla \Phi(w_1), w_{\alpha}^{*}-w_1\rangle\\
    &\leq \Phi(w_{\alpha}^{*})-\Phi(w_1)\\
    &\leq \Phi(\tilde{w}^{*})-\Phi(w_1)(~\text{From Eq.( \ref{eq18})})\\
    &\leq C_D^2 ~(\text{Let}~ C_D^2 = \Phi(\tilde{w}^*)).
    \end{aligned}
\end{equation*}
Now we  rewrite our objectives in Algorithm \ref{alg3}:
\begin{equation*}
    \begin{aligned}
    &\langle \nabla \hat{\mathcal{L}}(w_t,D)+g_t, w-w_t\rangle +\beta\cdot D_{\Phi }(w,w_t)+\alpha \Phi(w)\\
    =&\langle \nabla \hat{\mathcal{L}}(w_t,D)+g_t, w-w_t\rangle +(\beta+\alpha)\cdot D_{\Phi }(w,w_t)+\alpha \Phi(w)-\alpha \cdot D_{\Phi }(w,w_t)
    \\
    =&\langle \nabla \hat{\mathcal{L}}(w_t,D)+g_t, w-w_t\rangle +(\alpha+\beta)\cdot D_{\Phi }(w,w_t) +\alpha \Phi(w) -\alpha\cdot(\Phi(w)-\Phi(w_t)-\langle\nabla \Phi(w_t),w-w_t\rangle)\\
    =&\langle \nabla \hat{\mathcal{L}}(w_t,D)+\alpha \nabla \Phi(w_t)+g_t, w-w_t\rangle +(\alpha+\beta)\cdot D_{\Phi }(w,w_t) +\alpha \Phi(w_t)\\
    =&\langle \nabla \hat{\mathcal{L}}^{(\alpha)}(w_t,D)+g_t, w-w_t\rangle +(\alpha+\beta)\cdot D_{\Phi }(w,w_t) +\alpha \Phi(w_t).
    \end{aligned}
\end{equation*}
where  $\hat{\mathcal{L}}^{(\alpha)} (w,D)\triangleq \hat{\mathcal{L}}(w,D)+\alpha \cdot\Phi(w)$ and note that $\hat{\mathcal{L}}^{(\alpha)} (w,D)$ is $(\alpha+\beta)$-smooth relative to $\Phi(w)$ as well as $\alpha$-strongly convex relative to $\Phi(w)$  according to Lemma \ref{le3}. Next, we 
recall the following ``three-point property'':
\begin{lemma}\label{le4}(\textbf{Three point property}) \cite{tseng2008accelerated}.
Let $\phi(x)$ be a convex function and $D_{\Phi}(\cdot,\cdot)$ be the Bregman divergence for $\Phi(\cdot)$. For given $z$, let $z^{*}=\underset{x\in\mathbf{E}}{\arg\min}\{\phi(x)+D_{\Phi}(x,z)\}$, then for all $x\in \mathbf{E}$ we have 
\begin{equation*}
    \phi(x)+D_{\Phi}(x,z)\geq \phi(z^{*})+D_{\Phi}(z^{*},z)+D_{\Phi}(x,z^{*}).
\end{equation*}


\end{lemma}
Let $\phi(w) = \frac{1}{\alpha+\beta}\cdot\langle\nabla f(w_t)+g_t,w-w_t\rangle$ where $f(w) = \hat{\mathcal{L}}(w,D)+\alpha \cdot \Phi(w)$, set $z=w_t$ in Lemma \ref{le4}, we get
\begin{equation*}
\frac{1}{\alpha+\beta}\cdot\langle\nabla f(w_t)+g_t,w-w_t\rangle +D_{\Phi}(w,w_t)\geq \frac{1}{\alpha+\beta}\cdot\langle\nabla f(w_t)+g_t,w_{t+1}-w_t\rangle+D_{\Phi}(w_{t+1},w_t)+D_{\Phi}(w,w_{t+1}), 
\end{equation*}
which implies 
\begin{equation*}
(\alpha+\beta)\cdot D_{\Phi}(w_{t+1},w_t) \leq \langle \nabla f(w_t)+g_t,w-w_{t+1}\rangle +(\alpha+\beta)\cdot (D_{\Phi}(w,w_t)-D_{\Phi}(w,w_{t+1})). 
\end{equation*}
Since $f(w)$ is $(\alpha+\beta)$-smooth relative to $\Phi(w)$, we have
\begin{equation}\label{eq2}
\begin{aligned}
f(w_{t+1})\leq &f(w_t)+\langle \nabla f(w_t), w_{t+1}-w_t\rangle +(\alpha+\beta)\cdot D_{\Phi}(w_{t+1},w_t)\\
\leq & f(w_t)+\langle \nabla f(w_t), w-w_t\rangle +(\alpha+\beta)\cdot (D_{\Phi}(w,w_t)-D_{\Phi}(w,w_{t+1}))+\langle g_t, w-w_{t+1}\rangle.
\end{aligned}
\end{equation}
Since $f(w)$ is $\alpha$-strongly convex relative to $\Phi(w)$, from the definition, we have
\begin{equation*}
f(w_t)+ \langle \nabla f(w_t), w-w_t\rangle \leq f(w) -\alpha \cdot D_{\Phi}(w,w_t).
\end{equation*}
So inequality (\ref{eq2}) becomes 
\begin{equation}\label{eq3}
\begin{aligned}
f(w_{t+1})&\leq f(w) -\alpha \cdot D_{\Phi}(w,w_t) +(\alpha+\beta)\cdot (D_{\Phi}(w,w_t)-D_{\Phi}(w,w_{t+1}))+\langle g_t, w-w_{t+1}\rangle\\
&\leq f(w) +\beta\cdot D_{\Phi}(w,w_t)-(\alpha+\beta)\cdot D_{\Phi}(w,w_{t+1})+\langle g_t, w-w_{t+1}\rangle.
\end{aligned}
\end{equation}
Note that  for any constant $a>0$
\begin{equation*}
\begin{aligned}
\langle g_t, w-w_{t+1}\rangle \leq & a\cdot ||g_t||_{*}^2 +\frac{1}{4a}\cdot ||w-w_{t+1}||^2\\
\leq & a\cdot ||g_t||_{*}^2 +\frac{1}{2a} \cdot D_{\Phi}(w, w_{t+1}),
\end{aligned}
\end{equation*}
where the last inequality is due to $\Phi$ being 1-strongly convex w.r.t. $\|\cdot\|$.  Now inequality (\ref{eq3}) can be written as
\begin{equation}\label{eq4}
f(w_{t+1})\leq f(w) +\beta\cdot D_{\Phi}(w,w_t)-(\alpha+\beta-\frac{1}{2a})\cdot D_{\Phi}(w,w_{t+1})+a\cdot ||g_t||_{*}^2.
\end{equation}
Setting $w=w_{\alpha}^* = \arg\min_{w} f(w)$ and $a = \frac{1}{\alpha}$ in Eq. (\ref{eq4}), and using $f(w_{t+1})\geq f(w_{\alpha}^*)$, we have

\begin{equation*}
\begin{aligned}
D_{\Phi}(w_{\alpha}^{*}, w_{t+1})\leq &\frac{\beta}{\alpha+\beta-\frac{1}{2a}} \cdot D_{\Phi}(w_{\alpha}^{*},w_t) +O\left(\frac{a}{\alpha+\beta -\frac{1}{2a}} \cdot \|g_t||_{*}^2\right)\\
\leq & \frac{1}{1+\frac{\alpha}{2\beta}} \cdot D_{\Phi}(w_{\alpha}^{*}, w_t) +O\left(\frac{1}{\alpha\beta}\cdot ||g_t||_{*}^2\right).
\end{aligned}
\end{equation*}
Applying this inequality recursively for $t = 1,2,\cdots, T$ and taking expectation, we have
\begin{equation*}
\begin{aligned}
\mathbb{E}[D_{\Phi}(w_{\alpha}^{*},w_{T+1})]\leq &\left(\frac{1}{1+\frac{\alpha}{2\beta}}\right)^T \cdot D_{\Phi}(w_{\alpha}^{*},w_1)+O\left(\frac{1}{\alpha^2}\cdot g^2\right)\\
= & \left(1+\frac{\alpha}{2\beta}\right)^{-T} \cdot D_{\Phi}(w_{\alpha}^{*},w_1)+O\left(\frac{1}{\alpha^2}\cdot g^2\right)\\
\leq &2^{-\frac{\alpha T}{2\beta}}\cdot D_{\Phi}(w_{\alpha}^{*},w_1)+O\left(\frac{1}{\alpha^2}\cdot g^2\right)\\
\leq &2^{-\frac{\alpha T}{2\beta}}\cdot C_D^2+O\left(\frac{1}{\alpha^2}\cdot g^2\right),
\end{aligned}
\end{equation*}
where the expectation is taken over all $g_1, \cdots, g_T$ and $g^2 = \mathbb{E}[||g_t||_{*}^2]$. The second inequality utilizes the fact that $(1+\frac{1}{x})^x\geq 2$ for all $x\geq 1$ (note that $\frac{2\beta}{\alpha}\geq 1$), and the last inequality uses $D_{\Phi}(w_{\alpha}^{*},w_1)\leq C_D^2$. 
Since $\Phi$ is strongly convex, we also have 
\begin{equation*}
\frac{1}{2}\mathbb{E}[||w_{\alpha}^{*}-w_{T+1}||^2]\leq \mathbb{E}[D_{\Phi}(w_{\alpha}^{*},w_{T+1})] \leq 2^{-\frac{\alpha T}{2\beta}}\cdot C_D^2+O\left(\frac{1}{\alpha^2}\cdot g^2\right).
\end{equation*}
Thus, we have
\begin{equation*}
\mathbb{E}[||w_{\alpha}^{*}-w_{T+1}||] \leq O\left( 2^{-\frac{\alpha T}{4\beta}}\cdot C_D +\frac{1}{\alpha}\cdot g \right).
\end{equation*}
Now we consider a neighboring dataset $D'$ of $D$, where they differ in the $i$-th entry. Denote $w_{\alpha}^{*'}=\underset{w\in\mathbf{E}}{\arg\min}~\hat{\mathcal{L}}(w,D')+\alpha \cdot \Phi(w)$ and $w^{'}_{T+1}$ as the corresponding iterate of the algorithm on $D'$. Then, similar to the previous case we can get 
\begin{equation*}
\mathbb{E}[||w_{\alpha}^{*'}-w^{'}_{T+1}||] \leq O\left( 2^{-\frac{\alpha T}{4\beta}}\cdot C_D +\frac{1}{\alpha}\cdot g \right).
\end{equation*}
Next, we will bound the term $||w_{\alpha}^{*}-w_{\alpha}^{*'}||$ by the following lemma.
\begin{lemma}\label{le5}
Let $f_1,f_2:\mathbf{E}\rightarrow \mathbb{R}$ be convex and $\alpha$-strongly convex (relatively). Let $x_1=\underset{x\in\mathbf{E}}{\arg\min} f_1(x)$ and $x_2=\underset{x\in\mathbf{E}}{\arg\min } f_2(x)$, then
\begin{equation*}
||x_2-x_1||\leq \frac{2}{\alpha}||\nabla (f_2-f_1)({x_1})||_{*}.
\end{equation*}
\end{lemma}
From the above lemma, let $f_1(w) = \hat{\mathcal{L}}(w,D)+\alpha \cdot \Phi(w)$ and $f_2(w) = \hat{\mathcal{L}}(w,D^{'})+\alpha \cdot \Phi(w)$, we can get 
\begin{equation*}
||w_{\alpha}^{*}-w_{\alpha}^{*'}||\leq \frac{2||{\nabla \ell(w_{\alpha}^{*};x_i)-\nabla \ell(w_{\alpha}^{*};x_i^{'})}||_{*}}{n\alpha}\leq \frac{4L}{n\alpha}.
\end{equation*}


In total
\begin{equation*}
\begin{aligned}
\mathbb{E}[||w_{T+1}^{'}-w_{T+1}||]\leq & O\left(  2^{-\frac{\alpha T}{4\beta}} \cdot C_D +\frac{L}{n\alpha}+\frac{g}{\alpha}\right)\\
=& O\left(  2^{-\frac{\alpha T}{4\beta}} \cdot C_D +\frac{L}{n\alpha}+\frac{L\sqrt{\log(1/\delta) d \kappa T}}{\alpha n \epsilon}\right).
\end{aligned}
\end{equation*}
Similarly, we can also show that for any $t$ we have 
\begin{equation*}
\begin{aligned}
\mathbb{E}[||w_{t+1}^{'}-w_{t+1}||]\leq & O\left(  2^{-\frac{\alpha t}{4\beta}} \cdot C_D +\frac{L}{n\alpha}+\frac{g}{\alpha}\right)\\
=& O\left(  2^{-\frac{\alpha t}{4\beta}} \cdot C_D +\frac{L}{n\alpha}+\frac{L\sqrt{\log(1/\delta) d \kappa T}}{\alpha n \epsilon}\right).
\end{aligned}
\end{equation*}


Now we go back to Eq. (\ref{eq4}),
\begin{equation*}
\begin{aligned}
f(w_{t+1})-f(w_{\alpha}^{*})\leq & \beta\cdot D_{\Phi}(w_{\alpha}^{*},w_t)-(\alpha+\beta-\frac{1}{2a})\cdot D_{\Phi}(w_{\alpha}^{*},w_{t+1})+a\cdot ||g_t||_{*}^2\\
\leq  & \beta\cdot D_{\Phi}(w_{\alpha}^{*},w_t)-(\beta+\frac{\alpha}{2})\cdot D_{\Phi}(w_{\alpha}^{*},w_{t+1})+O\left(\frac{1}{\alpha}\cdot ||g_t||_{*}^2\right).
\end{aligned}
\end{equation*}
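Multiplying both sides by $\left(\frac{2\beta+\alpha}{2\beta}\right)^t$, summing over $t=1,\cdots,T$ and taking expectation, we obtain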
\begin{equation*}
\begin{aligned}
&\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t \cdot \mathbb{E}[f(w_{t+1})-f(w_{\alpha}^{*})]\\
\leq &\beta \left[\sum_{t=1}^T \left( \frac{2\beta+\alpha}{2\beta}\right)^t   \cdot D_{\Phi}(w_{\alpha}^{*},w_t)- \sum_{t=1}^T \left( \frac{2\beta+\alpha}{2\beta}\right)^{t +1}   \cdot D_{\Phi}(w_{\alpha}^{*},w_{t+1})\right]+ O\left( \sum_{t=1}^T\left(\frac{2\beta+\alpha}{2\beta}\right)^{t}\cdot \frac{1}{\alpha} g^2   \right)\\
= & \beta\left[\frac{2\beta+\alpha}{2\beta}\cdot D_{\Phi}(w_{\alpha}^{*},w_1) -\left(\frac{2\beta+\alpha}{2\beta}\right)^{T+1} \cdot D_{\Phi}(w_{\alpha}^{*},w_{T+1})\right]+ O\left( \sum_{t=1}^T\left(\frac{2\beta+\alpha}{2\beta}\right)^{t}\cdot \frac{1}{\alpha} g^2   \right)\\
\leq & \frac{2\beta+\alpha}{2}\cdot D_{\Phi}(w_{\alpha}^{*},w_1) + O\left( \sum_{t=1}^T\left(\frac{2\beta+\alpha}{2\beta}\right)^{t}\cdot \frac{1}{\alpha} g^2   \right).
\end{aligned}
\end{equation*}
Let 
\begin{equation*}
\hat{w}=\frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t  \cdot w_{t+1}}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t }.
\end{equation*}
And we have

\begin{align}
\mathbb{E}[f(\hat{w})-f(w_{\alpha}^{*})] &= \mathbb{E}\left[f\left(\frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t  \cdot w_{t+1}}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t }\right)-f(w_{\alpha}^{*})\right]\notag\\
& \leq \mathbb{E} \left[ \frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t \cdot f(w_{t+1})}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t  }   -f(w_{\alpha}^{*})\right]\notag\\
& =\frac{\mathbb{E}\left[ \sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t \cdot\left(f(w_{t+1})-f(w^{*}_{\alpha})\right)\right]}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t }\notag\\
& = \frac{\sum_{t=1}^T\left( \frac{2\beta +\alpha}{2\beta}  \right)^t \cdot \mathbb{E}[f(w_{t+1})-f(w^{*}_{\alpha})]}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t }\notag\\
& \leq \frac{(2\beta+\alpha)\cdot D_{\Phi}(w_{\alpha}^{*},w_1)}{2\cdot\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t } +O\left( \frac{1}{\alpha} g^2 \right)\notag\\
&=\frac{\alpha \cdot D_{\Phi}(w_{\alpha}^{*},w_1)}{2\left[\left(\frac{2\beta+\alpha}{2\beta} \right)^T-1\right]} +O\left( \frac{1}{\alpha} g^2\right)\notag\\
& \leq \frac{\alpha}{2} \cdot D_{\Phi}(w_{\alpha}^{*},w_1)+O\left( \frac{1}{\alpha} g^2 \right)\label{eq5}\\
& \leq O\left(\alpha \cdot D_{\Phi}(w_{\alpha}^{*},w_1)+\frac{1}{\alpha} g^2\right),\notag
\end{align}

where we used the fact that when $T\geq \frac{2\beta}{\alpha}$,
\begin{equation*}
\left(\frac{2\beta+\alpha}{2\beta} \right)^T=
(1+\frac{\alpha}{2\beta})^T
\geq 2
\end{equation*}
in inequality (\ref{eq5}).



Denote $\tilde{w}^{*}=\underset{w\in \mathbf{E}}{\arg\min} \hat{\mathcal{L}}(w,D)$, we have
\begin{equation*}
\begin{aligned}
\mathbb{E}[\hat{\mathcal{L}}(\hat{w},D) -\hat{\mathcal{L}}(\tilde{w}^{*},D)] &= \mathbb{E}[\hat{\mathcal{L}}^{(\alpha)}(\hat{w},D)-\hat{\mathcal{L}}^{(\alpha)}(\tilde{w}^{*},D)]+\alpha \cdot \Phi(\tilde{w}^{*})-\alpha \cdot \Phi(\hat{w})\\
& \leq \mathbb{E}[\hat{\mathcal{L}}^{(\alpha)}(\hat{w},D) -\hat{\mathcal{L}}^{(\alpha)}(w_{\alpha}^{*},D)]+\alpha \cdot \Phi(\tilde{w}^{*})-\alpha \cdot \Phi(\hat{w})\\
&\leq  O \left(\alpha \cdot D_{\Phi}(w_{\alpha}^{*},w_1)\right)+O\left( \frac{1}{\alpha} g^2 \right)+ \alpha \cdot \Phi(\tilde{w}^{*})-\alpha \cdot \Phi(\hat{w})\\
&\leq  O \left(\alpha \cdot D_{\Phi}(\tilde{w}^{*},w_1)\right)+O\left( \frac{1}{\alpha} g^2 \right)+\alpha \cdot C_D^2\\
&\leq  O(\alpha\cdot C_D^2+ \frac{1}{\alpha} g^2 ).
\end{aligned}
\end{equation*}
Now we bound the sensitivity of $\hat{w}$:
\begin{equation}\label{eq7}
\begin{aligned}
\mathbb{E}[||\hat{w}-\hat{w}^{'}||]&\leq \frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t \mathbb{E}[||w_{t+1}-w_{t+1}^{'}||]}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t}\\
& \leq O\left( \frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t2^{-\frac{\alpha t}{4\beta}} \cdot C_D}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t}  +\frac{L}{n\alpha}+\frac{L\sqrt{\log(1/\delta) d \kappa T}}{\alpha n \epsilon}\right).
\end{aligned}
\end{equation}
We bound the first term above:
\begin{equation} \label{eq6}
\begin{aligned}
\frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t2^{-\frac{\alpha t}{4\beta}} \cdot C_D}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t} = &\frac{ C_D \cdot\sum_{t=1}^T \left[\frac{2\beta+\alpha}{2\beta} \cdot \left(\frac{1}{2}\right)^{\frac{\alpha }{4\beta}}\right]^t }{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t} \\
=& C_D \cdot \frac{1-\frac{2\beta+\alpha}{2\beta}}{\frac{2\beta+\alpha}{2\beta}\cdot \left[1-\left(\frac{2\beta+\alpha}{2\beta}\right)^T\right]} \cdot \frac{\frac{2\beta+\alpha}{2\beta}\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}} \cdot \left( 1-\left[\frac{2\beta+\alpha}{2\beta}\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}} 
\right]^T
\right)  }{1-\frac{2\beta+\alpha}{2\beta}\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}}}\\
=& C_D\cdot  \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}} \cdot\frac{\alpha}{(2\beta+\alpha)\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}}-2\beta} \cdot \frac{\left[\frac{2\beta+\alpha}{2\beta}\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}} 
\right]^T-1}{\left(\frac{2\beta+\alpha}{2\beta}\right)^T-1}.
\end{aligned}
\end{equation}
Consider function $f(x)=(1+x)\cdot a^x$. Its derivative $f'(x)=\ln a \cdot a^x+a^x+\ln a\cdot x\cdot a^x=a^x(\ln a+1+\ln a \cdot x)$, let $a=\frac{1}{\sqrt{2}}$, then $f'(x)>0$ for $x\in [0,1]$. Thus we have $(1+x)\cdot (\frac{1}{\sqrt{2}})^x>1$.
Let $x=\frac{\alpha}{2\beta}$, we have $(1+\frac{\alpha}{2\beta})\cdot (\frac{1}{2})^{\frac{\alpha}{4\beta}}>1$, namely $(2\beta+\alpha)\cdot (\frac{1}{2})^{\frac{\alpha}{4\beta}}-2\beta>0$.

In the following, we bound the term $\frac{\alpha}{(2\beta+\alpha)\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}}-2\beta}$.
\begin{equation*}
\begin{aligned}
\frac{\alpha}{(2\beta+\alpha)\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}}-2\beta} &=\frac{\alpha}{(2\beta+\alpha)\cdot \left((\frac{1}{2})^{\frac{\alpha}{4\beta}}-1\right)+\alpha}\\
&\leq \frac{\alpha}{(2\beta+\alpha)\cdot(-\frac{\alpha}{4\beta})+\alpha}\\
&=\frac{1}{\frac{1}{2}-\frac{\alpha}{4\beta}}\leq 4 ~(\text{Assume} ~\frac{\alpha}{\beta}\leq 1),
\end{aligned}
\end{equation*}
where we use the fact that $(\frac{1}{2})^{\frac{\alpha}{4\beta}}-1\geq -\frac{\alpha}{4\beta}$. (To prove this is to prove that $2^{\frac{\alpha}{4\beta}}(1-\frac{\alpha}{4\beta})\leq 1$. Let $f(x)=a^x(1-x)$. The derivative $f'(x)=\ln a \cdot a^x-\ln a\cdot x\cdot a^x-a^x=a^x\cdot(\ln a-x \cdot \ln a -1)<0$ when $a<e$. So $f(x)$ decreases in $[0,1]$, and thus $f(x)\leq 1$, $\forall x\in [0,1]$. Let $a=2$  and $x=\frac{\alpha}{4\beta}$, and we will get $2^{\frac{\alpha}{4\beta}}\cdot (1-\frac{\alpha}{4\beta})\leq 1$.)

Now we bound the term $\frac{\left[\frac{2\beta+\alpha}{2\beta}\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}} 
\right]^T-1}{\left(\frac{2\beta+\alpha}{2\beta}\right)^T-1}$.
\begin{equation*}
\begin{aligned}
\frac{\left[\frac{2\beta+\alpha}{2\beta}\cdot \left(\frac{1}{2}\right)^{\frac{\alpha}{4\beta}} 
\right]^T-1}{\left(\frac{2\beta+\alpha}{2\beta}\right)^T-1}=&\frac{\left(\frac{2\beta+\alpha}{2\beta}\right)^T\cdot (\frac{1}{2})^{\frac{\alpha T}{4\beta}}-(\frac{1}{2})^{\frac{\alpha T}{4\beta}}+(\frac{1}{2})^{\frac{\alpha T}{4\beta}}-1
}{\left(\frac{2\beta+\alpha}{2\beta}\right)^T-1}\\
=& \left(\frac{1}{2}\right)^{\frac{\alpha T}{4\beta}} +\frac{(\frac{1}{2})^{\frac{\alpha T}{4\beta}}-1}{\left(\frac{2\beta+\alpha}{2\beta}\right)^T-1}\\
<&\left(\frac{1}{2}\right)^{\frac{\alpha T}{4\beta}}.
\end{aligned}
\end{equation*}
Thus, Eq. (\ref{eq6}) becomes
\begin{equation*}
\frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t2^{-\frac{\alpha t}{4\beta}} \cdot C_D}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t} = O\left(C_D \cdot \left(\frac{1}{2}\right)^{\frac{\alpha (T+1)}{4\beta}}\right).
\end{equation*}
Bring this back to Eq.(\ref{eq7}) and we can get
\begin{equation*}
\mathbb{E}[||\hat{w}-\hat{w}'||]
\leq O\left( C_D \cdot2^{\frac{-\alpha (T+1)}{4\beta}} +\frac{L}{n\alpha}+\frac{L\sqrt{\log(1/\delta) d \kappa T}}{\alpha n \epsilon}\right).
\end{equation*}
Since the loss is $L$-Lipschitz w.r.t $\|\cdot\|$, we can see the generalization error $\mathbb{E}[\mathcal{L}(\hat{w})-\hat{\mathcal{L}}(\hat{w},D)] \leq L \cdot O\left( C_D \cdot2^{\frac{-\alpha (T+1)}{4\beta}} +\frac{L}{n\alpha}+\frac{L\sqrt{\log(1/\delta) d \kappa T}}{\alpha n \epsilon}\right).$





Take $\alpha=\frac{4\beta}{T+1}\log_2 \frac{n}{T}$,

\begin{equation*}
\begin{aligned}
\mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(w^{*})&=\mathbb{E}[\mathcal{L}(\hat{w})-\hat{\mathcal{L}}(\hat{w},D)] + \mathbb{E}[\hat{\mathcal{L}}(\hat{w},D)-\hat{\mathcal{L}}(w^{*},D)]\\
&\leq L\cdot \mathbb{E}[||\hat{w}-\hat{w}'||]+\mathbb{E}[\hat{\mathcal{L}}(\hat{w},D)-\hat{\mathcal{L}}(\tilde{w}^{*},D)]\\
&=O\left( L\cdot 2^{\frac{-\alpha(T+1)}{4\beta}}\cdot \mathbb{E}[C_D]+\frac{L^2}{n\alpha}+\frac{L^2\sqrt{\log(1/\delta)d \kappa T}}{\alpha n \epsilon}
+\alpha\cdot \mathbb{E}[C_D^2]+\frac{1}{\alpha}\cdot \frac{L^2 \log(1/\delta)d \kappa T}{n^2\epsilon^2}
\right)\\
&=\tilde{O}\left(\frac{T\sqrt{\kappa}}{n}+\frac{T^{\frac{3}{2}}\sqrt{d\log(1/\delta)\kappa}}{n\epsilon}+\frac{T^2 d\log (1/\delta)\kappa}{n^2\epsilon^2}+\frac{\kappa }{T}
\right) ~(\text{By substituting} ~\alpha=\frac{4\beta}{T+1}\log_2 \frac{n}{T})\\
&=\tilde{O}\left(\frac{T\sqrt{\kappa}}{n}+\frac{T^{\frac{3}{2}}\sqrt{d\log(1/\delta)\kappa}}{n\epsilon}+\frac{\kappa}{T} 
\right)\\
&\leq\tilde{O}\left(\frac{T^{\frac{3}{2}}\sqrt{d\log(1/\delta)\kappa}}{n\epsilon}+\frac{\kappa}{T} 
\right) ~(\text{Since $T=O\left(\sqrt{n\sqrt{\kappa}}\right)$})\\
&=  \tilde{O}\left( \kappa^ \frac{4}{5}\left(\frac{\sqrt{d\log(1/\delta) }}{n\epsilon}\right)^{\frac{2}{5}}
\right)~(\text{By letting }~T=\Theta\left(\left(\frac{n\epsilon\sqrt{\kappa}}{\sqrt{d\log(1/\delta)}}\right)^{\frac{2}{5}}\right)),
\end{aligned}
\end{equation*}
where $\tilde{O}$ hides a factor of $\mathbb{E}[\tilde{C}_D^2]$ with $\tilde{C}_D^2 = \|\tilde{w}^*\|_{\kappa_+}^2$ and $\tilde{w}^{*}=\underset{w\in\mathbf{E}}{\arg\min} \hat{\mathcal{L}}(w,D)$. 

(Note that since we assume $n=\Omega\left( \frac{\epsilon^4}{(d \log (1/\delta))^2\kappa^{1/2}}\right)$,  the constraint $T=O\left(\sqrt{n\sqrt{\kappa}}\right)$ comes for free when  letting $T=\Theta\left(\left(\frac{n\epsilon\sqrt{\kappa}}{\sqrt{d\log(1/\delta)}}\right)^{\frac{2}{5}}\right)$).
\subsection{Proof of Theorem \ref{thm:8}}
To be self-contained, we first review the Phased DP-SGD algorithm in \cite{feldman2020private}. Since we are concerned about the unconstrained case, we slightly modify the original Phased DP-SGD algorithm by  eliminating the projection step.
	\begin{algorithm}
	\caption{Phased-DP-SGD algorithm \cite{feldman2020private} \label{alg:6}}
	\begin{algorithmic}[1]
		\State {\bfseries Input:}  	Dataset $S=\{x_1,\cdots,x_n\} $, convex loss $\ell$, step size $\eta$ (will be specified later), privacy parameter $\epsilon$ and (or) $\delta$.
		\State 	Set $k= \lceil \log_2 n\rceil$. Partition the whole dataset $S$ into $k$ subsets $\{S_1,\cdots,S_k\}$. Denote $n_i$ as the number of samples in $S_i$, {\em i.e.,} $|S_i|=n_i$, where $n_i=\lfloor 2^{-i}n \rfloor$. Moreover, set $w_0=0$. 
		\For {$i=1,\cdots ,k$}
		\State  Let $\eta_i=4^{-i}\eta$, $w_i^1=w_{i-1}$. 
		\For{$t= 1,\cdots,n_i$}
		\State Update $w_i^{t+1}=w_i^{t}-\eta_i\nabla \ell(w_{i}^{t},x_i^t)$, where $x_i^t$ is the $t$-th sample of the set $S_i$. 
		
		\EndFor
		\State Set $\overline{w}_i=\frac{1}{n_i+1}\sum \limits_{t=1}^{n_i+1} w_i^t$. 
		\State For $(\epsilon,\delta)$-DP, $w_i=\overline{w}_i+\zeta_i$, where $\zeta_i \sim \mathcal{N}(0,\sigma_i^2\mathbb{I}_d)$ with $\sigma_i=\frac{4L\eta_i\sqrt{\log (1/\delta)}}{\epsilon}$. 
		\EndFor \\
		\Return $w_k$
		\end{algorithmic}
\end{algorithm}
		\begin{lemma}\label{alemma:14}(Modification of Theorem 4.4 in \cite{feldman2020private})
		Let  $\ell(\cdot, x)$ be $\beta$-smooth, convex and $L$-Lipschitz function over $\mathbb{R}^d$ for each $x$. If we set $\eta=\frac{1}{L}\min\{\frac{4}{\sqrt{n}}, \frac{\epsilon}{2\sqrt{d\log(1/\delta)}}\}$  and if $\eta\leq \frac{1}{\beta}$ ({\em i.e.,} $n$ is sufficiently large), then Algorithm \ref{alg:6} will be $(\epsilon,\delta)$-DP for all $\epsilon\leq 2\log (1/\delta)$. The output satisfies
		\begin{equation*}
	\mathbb{E}[\mathcal{L}(w_k)]-\mathcal{L}(\theta^*) \leq O\left(L\|\theta^*\|_2^2\left(\frac{1}{\sqrt{n}}+\frac{\sqrt{d\log(1/\delta)}}{\epsilon n}\right)\right). 
		\end{equation*}
	\end{lemma} 
\begin{proof}
First, we have the following result, which can be found in the standard convergence bounds for SGD:
    \begin{lemma}\label{alemma:3}
Consider the Gradient Descent method with initial parameter $w_0$, fixed stepsize $\eta$ and iteration number $T$, and denote by $w_t$ the iterate in the $t$-th iteration. Then for any $w$ we have 
\begin{equation}
   \mathcal{L}(\bar{w}_T, D)-\mathcal{L}(w, D)\leq O(\frac{\|w_0-w\|_2^2}{\eta T}+\eta L^2),
\end{equation}
where $\bar{w}_T=\frac{w_0+ w_1+w_2+\cdots+w_T}{T+1}$. 
\end{lemma}

Now we focus on the $i$-th epoch, by Lemma \ref{alemma:3} we have for any $w$
\begin{equation}\label{eq:a.3}
     \mathbb{E} [\mathcal{L}(\bar{w}_i)]-\mathcal{L}(w)\leq O(\frac{\mathbb{E}[\|w_{i-1}-w\|_2^2]}{\eta_i n_i}+\eta_i L^2).
\end{equation}
Now we return to our proof. We have (denote $\theta^*=\arg\min_{w\in \mathbb{R}^d } \mathcal{L}(w)$)
\begin{align*}
      & \mathcal{L}(w_k)-\mathcal{L}(\theta^*)=  \underbrace{ \mathcal{L}(w_k)-\mathcal{L}(\bar{w}_k)}_{A}
      + \underbrace{\sum_{i=2}^k (\mathcal{L}(\bar{w}_i)-\mathcal{L}(\bar{w}_{i-1}))}_{B}+\underbrace{\mathcal{L}(\bar{w}_1)-\mathcal{L}(\theta^*)}_{C}
\end{align*}
For term $A$, by the Lipschitz property we have 
\begin{align*}
    \mathbb{E} [\mathcal{L}(w_k)]-\mathcal{L}(\bar{w}_k) &\leq L \mathbb{E}[\|w_k-\bar{w}_k\|_2]\leq L\mathbb{E}\|\zeta_k\|_2. 
\end{align*}
For each term of $B$ by (\ref{eq:a.3}) and take $w=\bar{w}_{i-1}$ we have 
\begin{align}\label{eq:a.4}
   \mathbb{E} [\mathcal{L}(\bar{w}_i)]-\mathcal{L}(\bar{w}_{i-1}) &\leq O(\frac{\mathbb{E}[\|w_{i-1}- \bar{w}_{i-1}\|^2_2]}{\eta_i n_i}+\eta_i L^2) = O(\frac{\mathbb{E} [\|\zeta_i\|^2_2]}{\eta_i n_i}+\eta_i L^2)
\end{align}
For term $C,$ by (\ref{eq:a.3}) and take $w=\theta^*$ we have 
\begin{align}\label{eq:a.5}
    \mathbb{E}[\mathcal{L}(\bar{w}_1)]-\mathcal{L}(\theta^*)& \leq O(\frac{\|\theta^*\|^2_2}{\eta_1 n_1}+\eta_1 L^2).
\end{align}
Thus, combining the bound on term $A$ with (\ref{eq:a.4}) and (\ref{eq:a.5}), we have 
\begin{align}\label{eq:a.6}
   \mathbb{E} [\mathcal{L}(w_k)]-\mathcal{L}(\theta^*)\leq  O\left(L\mathbb{E}[\|\zeta_k\|_2]+ \frac{\|\theta^*\|^2_2}{\eta_1 n_1}+\eta_1 L^2 +\sum_{i=2}^k \left(\frac{\mathbb{E} [\|\zeta_i\|^2_2]}{\eta_i n_i}+\eta_i L^2\right)\right).
\end{align}
Now, we analyze the case of $(\epsilon, \delta)$-DP, it is almost the same for $\epsilon$-DP. Specifically, we have $\mathbb{E}[\|\zeta_i\|_2^2]=O(\frac{dL^2\eta_i^2\log (1/\delta)}{\epsilon^2})$. Thus, 
\begin{align*}
   L\mathbb{E}[\|\zeta_k\|_2]\leq 
   L\sqrt{ \mathbb{E}\|\zeta_k\|_2^2}&= L^2\cdot \frac{\sqrt{d\log(1/\delta)}\eta_k}{\epsilon}\\
   &=
   O(\frac{\sqrt{d\log (1/\delta)}\eta L^2}{n^2\epsilon})\\
   &=O(L(\frac{\sqrt{d\log (1/\delta)}}{n^{2.5}\epsilon}+\frac{1}{n^2}))  . 
\end{align*}
where the last step is due to $\eta=\frac{1}{L}\min\{\frac{4}{\sqrt{n}}, \frac{\epsilon}{2\sqrt{d\log(1/\delta)}}\}$. And 
\begin{align*}
    \frac{\|\theta^{*}\|^2_2}{\eta_1 n_1}+\eta_1 L^2 &=O( \frac{\|\theta^*\|^2_2}{\eta n}+ \eta L^2) \\
    &=O(\|\theta^*\|^2_2 L(\frac{1}{n}\max \{\sqrt{n}, \frac{\sqrt{d\log (1/\delta)}}{\epsilon}\}+\frac{1}{\sqrt{n}}))\\
    & \leq O(\|\theta^*\|^2_2 L(\frac{1}{\sqrt{n}}+ \frac{\sqrt{d\log (1/\delta)}}{n\epsilon})),
\end{align*}
where the second equality is due to $\eta=\frac{1}{L}\min\{\frac{4}{\sqrt{n}}, \frac{\epsilon}{2\sqrt{d\log(1/\delta)}}\}$. Finally,
\begin{align*}
    \sum_{i=2}^k (\frac{\mathbb{E} \|\zeta_i\|^2_2}{\eta_i n_i}+\eta_i L^2) &=O(  \sum_{i=2}^k (\frac{dL^2\eta_i^2\log (1/\delta)}{\eta_i n_i \epsilon^2}+\eta_iL^2)) \\
    &=O(  \sum_{i=2}^k (\frac{ 2^{-i}}{n\eta }+4^{-i}\frac{L}{\sqrt{n}}) )\\
    &= O(  \sum_{i=2}^k 2^{-i} (\frac{1}{n\eta}+ \frac{L}{\sqrt{n}}))\\
    &\leq O(  \sum_{i=2}^\infty 2^{-i}L (\frac{1}{n}\max \{\sqrt{n}, \frac{\sqrt{d\log (1/\delta)}}{\epsilon}\}+ \frac{1}{\sqrt{n}}))\\
    &\leq O(L(\frac{1}{\sqrt{n}}+ \frac{\sqrt{d\log (1/\delta)}}{n\epsilon})).
\end{align*}
Thus, combining with the previous three bounds into (\ref{eq:a.6}), we have our result. 

\end{proof}
Next, we will prove  Theorem \ref{thm:8} via Lemma \ref{alemma:14}. Specifically, we have the following result. 
\begin{theorem}
    Consider the $\ell_p^d$ space with $1<p<2$ and suppose Assumption \ref{as3} holds. Then Algorithm \ref{alg:6} will be $(\epsilon,\delta)$-DP for all $\epsilon\leq 2\log (1/\delta)$. If we set $\eta=\frac{1}{L}\min\{\frac{4}{\sqrt{n}}, \frac{\epsilon}{2\sqrt{d\log(1/\delta)}}\}$, the output satisfies 
\begin{equation}
   \mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq  O\left(Ld^{1-\frac{2}{p}}\|\theta^*\|^2\left(\frac{1}{\sqrt{n}}+\frac{\sqrt{d\log(1/\delta)}}{\epsilon n}\right)\right). 
\end{equation}
\end{theorem}
\begin{proof}
We bound the $\|\cdot\|_2$-diameter and Lipschitz constant for the $\ell_p^d$-setting. First we have that $\|\theta^*\|_2\leq d^{\frac{1}{2}-\frac{1}{p}}\|\theta^*\|$. Moreover, since $\ell$ is $L$-Lipschitz w.r.t. $\|\cdot\|$, we can see it is $L$-Lipschitz w.r.t. $\|\cdot\|_2$ as $\|\nabla \ell(w, x)\|_2\leq \|\nabla \ell(w, x)\|_*\leq L$. Moreover, since $\ell$ is $\beta$-smooth w.r.t. $\|\cdot\|$, we have $\|\nabla \ell(w, x)-\nabla \ell(w', x)\|_2\leq \|\nabla \ell(w, x)-\nabla \ell(w', x)\|_*\leq \beta \|w-w'\|\leq \beta\|w-w'\|_2$, indicating that it is $\beta$-smooth w.r.t. $\|\cdot\|_2$. Thus, we have 
\begin{equation}
   \mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq  O\left(Ld^{1-\frac{2}{p}}\|\theta^*\|^2\left(\frac{1}{\sqrt{n}}+\frac{\sqrt{d\log(1/\delta)}}{\epsilon n}\right)\right). 
\end{equation}
\end{proof}
\subsection{Proof of Theorem \ref{thm:9}}
\begin{proof}
We first recall the following lemma:
\begin{lemma}\citep{feldman2022hiding}
For a domain $\mathcal{D}$, let $\mathcal{R}^{(i)}: \mathcal{S}^{(1)}\times \cdots \times \mathcal{S}^{(i-1)}\times \mathcal{D}\rightarrow \mathcal{S}^{(i)}$ for $i \in [n]$ be a sequence of algorithms such that $\mathcal{R}^{(i)}(z_{1:i-1},\cdot)$ is an $(\epsilon_0,\delta_0)$-DP local randomizer for all values of auxiliary inputs $z_{1:i-1}\in \mathcal{S}^{(1)}\times \cdots \times \mathcal{S}^{(i-1)}$. Let $\mathcal{A}_{\mathcal{S}}:\mathcal{D}^n \rightarrow \mathcal{S}^{(1)}\times \cdots \times \mathcal{S}^{(n)}$ be the algorithm that, given a dataset $x_{1:n}\in\mathcal{D}^n$, samples a uniformly random permutation $\pi$, then sequentially computes $z_i = \mathcal{R}^{(i)}(z_{1:i-1},x_{\pi(i)})$ for $i\in [n]$, and then outputs $z_{1:n}$. Then for any $\delta \in [0,1]$ such that $\epsilon_0\leq \log \left(\frac{n}{16\log (2/\delta)}\right)$, $\mathcal{A}_{\mathcal{S}}$ is $(\epsilon, \delta+O(e^{\epsilon}\delta_0 n))$-DP where $\epsilon = O\left((1-e^{-\epsilon_0})\cdot (\frac{\sqrt{e^{\epsilon_0}\log (1/\delta)}}{\sqrt{n}}+\frac{e^{\epsilon_0}}{n})\right)$.
\end{lemma}
Now let's get back to the proof.
Note that by the Generalized Gaussian mechanism, we can see $\mathcal{R}(x) = g_x+\mathcal{GG}_{||\cdot||_{+}}(\sigma^2)$ with $\sigma^2 =O\left(\frac{\kappa  (\beta M+\lambda)^2 \log(1/\delta_0)}{ \epsilon_0^2}\right)$ will be an $(\epsilon_0,\delta_0)$-DP local randomizer. The output can be viewed as a post-processing of the shuffled outputs of $\mathcal{R}(x)$. Thus, the algorithm will be $(\hat{\epsilon},\hat{\delta}+O(e^{\hat{\epsilon}}\delta_0 n))$-DP where $\hat{\epsilon}=O\left((1-e^{-\epsilon_0})\cdot (\frac{\sqrt{e^{\epsilon_0}\log (1/\hat{\delta})}}{\sqrt{n}}+\frac{e^{\epsilon_0}}{n})\right)$.

Now, assume that $\epsilon_0\leq \frac{1}{2}$, then $\exists c_1>0$, s.t.,
\begin{equation*}
\begin{aligned}
\hat{\epsilon}&\leq c_1(1-e^{-\epsilon_0})\cdot \left(\frac{\sqrt{e^{\epsilon_0}\log(1/\hat{\delta})}}{\sqrt{n}}+\frac{e^{\epsilon_0}}{n}\right)\\
& \leq c_1\cdot\left( (e^{\epsilon_0/2}-e^{-\epsilon_0/2})\cdot\sqrt{\frac{\log(1/\hat{\delta})}{n}}  +\frac{e^{\epsilon_0}  -1}{n}  \right)\\
& \leq c_1 \cdot\left( \left((1+\epsilon_0)-(1-\frac{\epsilon_0}{2}  )\right) \cdot \sqrt{\frac{\log(1/\hat{\delta})}{n}}+\frac{(1+2\epsilon_0)-1}{n}\right)\\
& =c_1 \cdot \epsilon_0\cdot \left( \frac{3}{2}\sqrt{\frac{\log(1/\hat{\delta})}{n}}+\frac{2}{n}\right).
\end{aligned}
\end{equation*}
Set $\hat{\delta}=\frac{\delta}{2}$, $\delta_0 =c_2\cdot\frac{\delta}{e^{\hat{\epsilon}}n}$ for some constant $c_2>0$ and replace $\epsilon_0=\frac{c_3\cdot \kappa (\beta M+\lambda)\cdot\sqrt{\log(1/\delta_0)}}{\sigma_1}$:
\begin{equation*}
\begin{aligned}
\hat{\epsilon}&\leq c_1\cdot c_3\cdot \frac{\kappa(\beta M+\lambda)\cdot \sqrt{\log(1/\delta_0)}}{\sigma_1}\cdot \left( \frac{3}{2}\sqrt{\frac{\log(1/\hat{\delta})}{n}}+\frac{2}{n}\right)\\
& \leq O\left(\frac{\kappa (\beta M+\lambda)\cdot \sqrt{\log(1/\delta_0)\log (1/\hat{\delta})}}{\sigma_1 \sqrt{n}}\right)\\
& \leq O\left(\frac{\kappa (\beta M+\lambda)\cdot \sqrt{\log(1/\delta)\log (e^{\hat{\epsilon}}n/\delta)}}{\sigma_1 \sqrt{n}}\right).
\end{aligned}
\end{equation*}
For any $\epsilon\leq 1$, if we set $\sigma = O\left(\frac{\kappa (\beta M+\lambda)\sqrt{\log(1/\delta)\log(n/\delta)}}{\epsilon \sqrt{n}}\right)$, then we have $\hat{\epsilon}\leq \epsilon$. Furthermore, we need $\epsilon_0 = O\left(\frac{\kappa (\beta M+\lambda)\sqrt{\log(1/\delta_0)}}{\sigma}
\right)\leq \frac{1}{2}$, which would be ensured if we set $\epsilon = O\left(\sqrt{\frac{\log(n/\delta)}{n}}\right)$. This implies that for $\sigma=O\left(\frac{\kappa (\beta M+\lambda)\cdot \log (n/\delta)}{\epsilon\sqrt{n}}\right)$, algorithm \ref{alg4} satisfies $(\epsilon,\delta)$-DP as long as $\epsilon=O\left(\sqrt{\frac{\log(n/\delta)}{n}}\right)$.
\end{proof}
\subsection{Proof of Theorem \ref{thm:10}}
\begin{proof}
Denote $y_t = \frac{1}{|B_t|}\sum_{x\in B_t} g_x$, $z_t =\frac{1}{|B_t|}\sum_{x\in B_t}Z_x^t$ and  $\tilde{y}_t= y_t+z_t$.  The optimality condition for $w_t=\underset{w\in\mathcal{C}}{\arg\min} \left\{ \langle \frac{\sum_{x\in B_t} g_x + Z_x^t}{|B_t|},w\rangle +\gamma_t \cdot D_{\Phi}(w,w_{t-1})\right\}$ has the form:
\begin{equation*}
\langle \tilde{y}_t +\gamma_t (\nabla \Phi(w_t)-\nabla \Phi(w_{t-1})),z-w_t\rangle \geq 0, \forall z \in \mathcal{C}.
\end{equation*}
Equivalently, we have 
\begin{equation*}
\begin{aligned}
\langle \tilde{y}_t, w_t -z\rangle & \leq \gamma_t\langle \nabla \Phi (w_t)-\nabla \Phi (w_{t-1}), z-w_t\rangle \\
& = \gamma_t(D_{\Phi}(z,w_{t-1})-D_{\Phi}(z,w_{t})-D_{\Phi}(w_t, w_{t-1})), ~\forall z\in\mathcal{C}.
\end{aligned}
\end{equation*}
Let $\xi_t =y_t-\nabla \mathcal{L}(w_{t-1})+z_t=\tilde{y}_t-\nabla \mathcal{L}(w_{t-1})$, then we have 
\begin{equation*}
\langle \nabla \mathcal{L}(w_{t-1}),w_t -z\rangle \leq \gamma_t(D_{\Phi}(z,w_{t-1})-D_{\Phi}(z,w_{t})-D_{\Phi}(w_t, w_{t-1}))-\langle \xi_t,w_t-z\rangle.
\end{equation*}
On the other hand, we know that 
\begin{align}
\mathcal{L}(w_t) -\mathcal{L}(z)& =( \mathcal{L}(w_t) - \mathcal{L}(w_{t-1}))+(\mathcal{L}(w_{t-1})-\mathcal{L}(z))\notag\\
& \leq \langle \nabla \mathcal{L}(w_{t-1}),w_t-w_{t-1}\rangle +\beta \cdot D_{\Phi}(w_t,w_{t-1})+\langle \nabla \mathcal{L}(w_{t-1}), w_{t-1}-z\rangle\label{eq8}\\
& \leq \langle \nabla\mathcal{L}(w_{t-1}),w_t -z\rangle +\frac{\gamma_t}{2} D_{\Phi}(w_t, w_{t-1})\label{eq9}\\
& \leq \gamma_t (D_{\Phi}(z, w_{t-1})-D_{\Phi}(z, w_t)-\frac{1}{2} D_{\Phi}(w_t, w_{t-1}))-\langle \xi_t, w_t-z\rangle,\notag
\end{align}
where Eq. (\ref{eq8}) uses the fact that $D_{\Phi}(w_t,w_{t-1})\geq \frac{1}{2}||w_t-w_{t-1}||^2$ and $\mathcal{L}$ is smooth as well as the convexity of $\mathcal{L}$ while Eq. (\ref{eq9}) is because $\gamma_t \geq 2\beta$.

Due to the strong convexity of $D_{\Phi}(\cdot, w_{t-1})$, we have 
\begin{align*}
&\langle \xi_t, w_{t-1}-w_t\rangle \leq \frac{\gamma_t\|w_{t-1}-w_t\|^2}{4}+ \frac{||\xi_t||_{*}^2}{\gamma_t}\\
\implies  & \langle \xi_t, w_{t-1}-w_t\rangle \leq \frac{\gamma_t}{2}D_{\Phi}(w_t, w_{t-1})+ \frac{||\xi_t||_{*}^2}{\gamma_t}\\
\implies &\langle \xi_t, z-w_t\rangle -\frac{\gamma_t}{2}D_{\Phi}(w_t, w_{t-1})\leq \langle \xi_t, z-w_{t-1}\rangle +\frac{||\xi_t||_{*}^2}{\gamma_t}.
\end{align*}
Thus,
\begin{equation*}
\begin{aligned}
&\mathcal{L}(w_t)-\mathcal{L}(z)\leq \gamma_t (D_{\Phi}(z, w_{t-1})-D_{\Phi}(z, w_{t}))-\langle \xi_t, w_{t-1}-z\rangle +\frac{||\xi_t||_{*}^2}{\gamma_t}\\
\Rightarrow&\frac{1}{\gamma_t}(\mathcal{L}(w_t)-\mathcal{L}(z))\leq D_{\Phi}(z, w_{t-1})-D_{\Phi}(z, w_{t})-\frac{\langle \xi_t, w_{t-1}-z\rangle }{\gamma_t}+\frac{||\xi_t||_{*}^2}{\gamma_t^2}.\\
\end{aligned}
\end{equation*}

Thus, summing over $t=1,\cdots, T$, 
\begin{equation*}
\begin{aligned}
&\sum_{t=1}^T (\gamma_t^{-1})\cdot (\mathcal{L}(w_t)-\mathcal{L}(z))\leq D_{\Phi}(z, w_0)-D_{\Phi}(z, w_T)+\sum_{t=1}^T \left(
\frac{\langle \xi_t, z-w_{t-1} \rangle}{\gamma_t}+\frac{||\xi_t||_{*}^2}{\gamma_t^2}
\right)\\
\Rightarrow&(\sum_{t=1}^T \gamma_t^{-1})\cdot (\mathcal{L}(\frac{\sum_{t=1}^T \gamma_{t}^{-1} w_t}{\sum_{t=1}^T \gamma_{t}^{-1} })  -\mathcal{L}(z))\leq D_{\Phi}(z, w_0)-D_{\Phi}(z, w_T)+\sum_{t=1}^T \left(
\frac{\langle \xi_t, z-w_{t-1} \rangle}{\gamma_t}+\frac{||\xi_t||_{*}^2}{\gamma_t^2}
\right)\\
\Rightarrow&(\sum_{t=1}^T \gamma_t^{-1})\cdot (\mathcal{L}(\hat{w})  -\mathcal{L}(z))\leq D_{\Phi}(z, w_0)+\sum_{t=1}^T \left(
\frac{\langle \xi_t, z-w_{t-1} \rangle}{\gamma_t}+\frac{||\xi_t||_{*}^2}{\gamma_t^2}\right).
\end{aligned}
\end{equation*}
Taking the expectation over the randomness of the noise, we get
\begin{equation*}
(\sum_{t=1}^T \gamma_t^{-1})\cdot (\mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(z))\leq D_{\Phi}(z, w_0)+\sum_{t=1}^T 
\frac{\mathbb{E}[\langle \xi_t, z-w_{t-1} \rangle]}{\gamma_t}+\sum_{t=1}^T\frac{\mathbb{E}[||\xi_t||_{*}^2]}{\gamma_t^2}.
\end{equation*}
To bound the term $\sum_{t=1}^T\frac{\mathbb{E}[\langle \xi_t, z-w_{t-1}\rangle ]}{\gamma_t}$, let $x_t=y_t-\nabla \mathcal{L}(w_{t-1})$
and notice that
\begin{equation*}
\begin{aligned}
\sum_{t=1}^T\frac{\mathbb{E}[\langle \xi_t, z-w_{t-1}\rangle ]}{\gamma_t}
=&\sum_{t=1}^T\frac{\mathbb{E}[\langle y_t - \nabla \mathcal{L}(w_{t-1}), z-w_{t-1}\rangle]}{\gamma_t}  \\
=& \sum_{t=1}^T\frac{\mathbb{E}[\langle x_t, z-w_{t-1}\rangle]}{\gamma_t}.
\end{aligned}
\end{equation*}
We will bound $\sum_{t=1}^T \langle x_t, z-w_{t-1}\rangle =\sum_{t=1}^T \psi_t$. First, we recall the following lemma proposed by \cite{nazin2019algorithms}. 

\begin{lemma}\label{lemma:a17}
When $\beta M \leq \lambda$, we have
\begin{equation*}
||x_t||_{*}\leq 2\beta M+\lambda \leq 3\lambda \Rightarrow |\langle x_t, z-w_{t-1}\rangle |\leq 3\lambda M,
\end{equation*}
\begin{equation*}
||\mathbb{E}[x_t]||_{*}\leq \beta \cdot M \cdot \left(\frac{\sigma}{\lambda}\right)^2 +\frac{\sigma^2}{\lambda}\leq \frac{2\sigma^2}{\lambda}\Rightarrow |\mathbb{E}[\langle x_t, z-w_{t-1}\rangle ]|\leq \frac{2\sigma^2 M}{\lambda },
\end{equation*}
\begin{equation*}
\left(\mathbb{E}[||x_t||_{*}^2]\right)^{1/2} \leq \sigma +\beta M \cdot \frac{\sigma}{\lambda}\leq 2\sigma\Rightarrow \left(\mathbb{E}[(\langle x_t, z-w_{t-1}\rangle )^2]\right)^{1/2} \leq 2\sigma M.
\end{equation*}

\end{lemma}
Next, we recall Bernstein's inequality for martingales \citep{freedman1975tail}.

\begin{lemma}
    Suppose $X_1, \cdots, X_n$ are a sequence of random variables such that $0\leq X_i\leq 1$. Define the martingale difference sequence $\{Y_n=\mathbb{E}[X_n|X_1, \cdots, X_{n-1}]-X_n\}$ and denote $K_n$ the sum of the conditional variances
\begin{equation*}
    K_n=\sum_{t=1}^n \text{Var}(X_t|X_1,\cdots, X_{t-1}).
\end{equation*}
Let $S_n=\sum_{i=1}^n X_i$, then for all $\epsilon, k\geq 0$ we have
\begin{equation}
  \text{Pr}[\sum_{i=1}^n \mathbb{E}[X_i|X_1, \cdots, X_{i-1}]-S_n\geq \epsilon, K_n\leq k]\leq \exp(-\frac{\epsilon^2}{2k+2\epsilon/3}). 
\end{equation}
\end{lemma}




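Applying Bernstein's inequality to the sequence $\{\psi_t\}_{t=1}^T$ together with the bounds in Lemma \ref{lemma:a17}, we have 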
\begin{equation*}
\begin{aligned}
\text{Pr}\left\{\sum_{t=1}^T \psi_t\geq \frac{2 TM\sigma^2}{\lambda}+3\cdot (2\sigma M)\sqrt{\tau T}\right\}&\leq \exp\left\{-\frac{9\cdot \tau}{2+\frac{2}{3}\cdot \frac{3\sqrt{\tau}\cdot(3\lambda M)}{2\sigma M \sqrt{T}}}\right\}\\
&\leq \exp\left\{-\frac{9\tau}{2+\frac{3\lambda \sqrt{\tau}}{\sigma \sqrt{T}}}\right\}\\
&
\leq e^{-\tau}
\end{aligned}
\end{equation*}
for all $\tau= O\left(\frac{\sigma^2 T}{\lambda^2}\right)$.


Thus, for all  $\tau= O\left(\frac{\sigma^2 T}{\lambda^2}\right)$ w. p. $1-e^{-\tau}$,
\begin{equation*}
\sum_{t=1}^T \psi_t \leq O\left(\frac{TM\sigma^2}{\lambda}+\sigma M\sqrt{T\tau}\right).
\end{equation*}
Next we bound the term $\sum_{t=1}^T \mathbb{E}[||\xi_t||_{*}^2]$. Note that \[\mathbb{E}[||\xi_t||_{*}^2]= \mathbb{E}[\|x_t+z_t\|^2_*]\leq 2\|x_t\|_*^2+2\mathbb{E}[\|z_t\|_*^2]=2\|x_t\|_*^2+2g^2,\] with \[g^2=O\left(\frac{1}{|B_t|} \frac{\log(\frac{n}{\delta})\cdot d\kappa (\beta M+\lambda )^2 \cdot \log(1/\delta)}{n\epsilon^2}\right)= O\left(\frac{\log(\frac{n}{\delta})\cdot dT\kappa (\beta M+\lambda )^2 \cdot \log(1/\delta)}{n^2\epsilon^2}\right).\]  Thus, it is sufficient for us to bound $\sum_{i=1}^T \|x_i\|_*^2=\sum_{i=1}^T \phi_i$. Similar to Lemma \ref{lemma:a17}, we have the following result. 
\begin{lemma}\citep{nazin2019algorithms} 
When $M \leq \lambda$, we have
\begin{align*}
    &\mathbb{E}[\phi_i]\leq (\sigma+\frac{M\sigma}{\lambda})^2\leq 4\sigma^2, \\ 
    & \phi_i\leq (2M+\lambda)^2\leq 9\lambda^2,\\
    & [\mathbb{E}(\phi_i^2)]^\frac{1}{2}\leq (\sigma+\frac{M\sigma}{\lambda})(2M+\lambda)\leq 6\lambda \sigma. 
\end{align*}

\end{lemma}
Thus, by Bernstein's inequality, if $\tau= O\left(\frac{\sigma^2 T}{\lambda^2}\right)$, we have 
\begin{align*}
    \text{Pr}[\sum_{t=1}^T ||x_t||_{*}^2\geq 4\sigma^2 T+18\lambda \sigma \sqrt{T\tau}  ]\leq \exp(-\frac{9\tau}{2+\frac{3\sqrt{\tau}\lambda}{\sigma\sqrt{T}}})\leq \exp(-\tau). 
\end{align*}

In total, let $\gamma_t=\bar{\gamma}$, we have with probability at least $1-2\exp(-\tau)$ 
\begin{equation}\label{aeq:28}
\mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq O\left(
\frac{D_{\Phi}(\theta^{*},w_0)\cdot \bar{\gamma}}{T}+\frac{M\sigma^2}{\lambda}+\frac{\sigma M\sqrt{\tau}}{\sqrt{T}} +\frac{\sigma^2}{\bar{\gamma}} +\frac{M\sigma \sqrt{\tau}}{\sqrt{T}\bar{\gamma}}
+\frac{\log(\frac{n}{\delta})\cdot dT\kappa (\beta M+\lambda )^2 \cdot \log(1/\delta)}{n^2\epsilon^2\bar{\gamma} }
\right).
\end{equation}
Let $\frac{\bar{\gamma}}{T} =O(\frac{(\beta M+\lambda) \sqrt{ d \log (1/\delta)}}{nM\epsilon} )$, and since $D_{\Phi}(\theta^{*},w_0)=\Phi(\theta^*)\leq \frac{\kappa M^2}{2}$ we have
\begin{equation*}
\mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*) \leq \tilde{O}\left(
\frac{M\sigma^2}{\lambda}+\frac{\sigma M\sqrt{\tau}}{\sqrt{T}}+\frac{M\sigma^2}{\bar{\gamma}}+\frac{(\beta M+\lambda) M \kappa \sqrt{ d\log(1/\delta)}}{n\epsilon}
\right).
\end{equation*}
Let $\lambda =\frac{\sigma \sqrt{n\epsilon}}{\sqrt[4]{\kappa^2 d \log (1/\delta)}}\geq \max\{\beta, 1\} M$, we have
\begin{equation*}
\mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq  O\left(
\frac{M\sigma \kappa \sqrt[4]{ d \log (1/\delta)}}{\sqrt{n\epsilon}} + \frac{\sigma M \sqrt{\tau }}{\sqrt{T}}+\frac{M\sigma^2}{\bar{\gamma}}
\right).
\end{equation*}
Let $\bar{\gamma}=\sqrt{T}$, then $\sqrt{T}=O(\frac{Mn\epsilon} {(\beta M+\lambda) \sqrt{d \log (1/\delta)}})$, and it holds that
\begin{equation*}
\mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq  O\left(
\frac{M\max\{\sigma^2, \sigma\} \sqrt[4]{\kappa^2 d\log(1/\delta)}\sqrt{\log(1/\delta^{'})}}{\sqrt{n\epsilon}}
\right)
\end{equation*}
w.p. at least $1-\delta^{'}$.
\end{proof}
\section{Additional Theorems and Proofs}\label{appe: additional theorems}
\begin{theorem}\label{thm:11}
      For the $\ell_p^d$ space with $1<p<2$, suppose Assumption \ref{ass:4} holds and assume $n$ is large enough such that $O((\frac{\sqrt{n\epsilon}M}{\kappa\sqrt[4]{d\log(1/\delta)}})^\frac{2}{3})\geq \max\{\beta, 1\}M$. For any $0<\epsilon, \delta< 1$, Algorithm \ref{alg:7} is $(\epsilon, \delta)$-DP. Moreover, if we set $\{\gamma_t\}=\gamma=\sqrt{T}$, $T=\frac{n\epsilon}{M\lambda \sqrt{d\log(1/\delta) } }$ and  $\lambda=O((\frac{\sqrt{n\epsilon}M}{\kappa\sqrt[4]{d\log(1/\delta)}})^\frac{2}{3})$, then for any failure probability $\delta'$, the output $\hat{w}$ satisfies 
      the following with probability at least $1-\delta'$:
\begin{equation*}
       \mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq O\left(M^\frac{4}{3}\frac{\kappa^\frac{2}{3}(d\log (1/\delta))^\frac{1}{6}\sqrt{\log(1/\delta')} }{(n\epsilon)^\frac{1}{3}} \right), 
\end{equation*}
      where the expectation is taken over  the randomness of noise, and the probability is w.r.t. the dataset $D$. 
\end{theorem}
\subsection{Proof of Theorem \ref{thm:11}}

\begin{algorithm}
	\caption{Truncated DP Batched Mirror Descent }
	\begin{algorithmic}[1]
		\State {\bfseries Input:} Dataset $D$, loss function $\ell$, initial point $w_0=0$, smoothness parameter $\beta$ and $\lambda$. 
 \State Divide the data $D$ into $T$ batches $\{B_i\}_{i=1}^T$ where $|B_i|=\frac{n}{T}$ for all $i=1,\cdots, T$
 \For{$t = 1,\cdots,T$}
\For{each $x\in B_t$}
\State \hspace{-3mm}\begin{small}$g_x= \begin{cases} \nabla \ell(w_{t-1},x)& \text{if} ~||\nabla \ell(w_{t-1},x)||_{*}\leq \beta M+\lambda\\ 0& {\text{otherwise}} \end{cases}$\end{small}
	\EndFor
 \State Let 
 \State\hspace{-2mm}\begin{small}$w_t=\underset{w\in\mathcal{\mathcal{C}}}{\arg\min} \left\{ \langle \frac{\underset{{x\in B_t}}{\sum} g_x }{|B_t|}+Z^t,w\rangle +\gamma_t \cdot D_{\Phi}(w,w_{t-1})\right\},$\end{small} where  $Z^{t}\sim \mathcal{GG}_{||\cdot||_{+}}(\sigma_1^2)$ with $\sigma_1^2 =O\left(\frac{\kappa (\beta M+\lambda )^2 \cdot \log(1/\delta)}{|B_t|^2\epsilon^2}   \right)$, $||\cdot||_{+}$ is the smooth norm for $(\mathbf{E},||\cdot||_{*})$. $\kappa = \min \{\frac{1}{p-1},\log d\}$ and  $\Phi(x) =\frac{\kappa}{2}||x||_{\kappa_{+}}^2$ with $\kappa_{+}= \frac{\kappa}{\kappa -1}$.  
	\EndFor\\
	 \Return $\hat{w} = (\sum_{t=1}^T \gamma_{t}^{-1})^{-1} \cdot \sum_{t=1}^T \gamma_{t}^{-1} w_t$
	\end{algorithmic}
	\label{alg:7}
\end{algorithm}
We propose our method in Algorithm \ref{alg:7}. Note that there are two key differences compared to Algorithm \ref{alg4}. First, since we do not need the privacy amplification via shuffling, there is no shuffling step. Secondly, instead of adding noise to each truncated gradient $g_x$, here we add a generalized Gaussian noise to the averages of the gradients for each batch. In the following we will prove our theoretical results in Theorem \ref{thm:11}. 

\begin{proof}
The DP guarantee follows directly from the Generalized Gaussian mechanism. For utility, 
the proof is almost the same as in the proof for Theorem \ref{thm:10}, while the only difference is the noise. Similar to (\ref{aeq:28}) we have the following result with probability at least $1-2\exp(-\tau)$
\begin{equation}\label{aeq:29}
\mathcal{L}(\hat{w})-\mathcal{L}(\theta^*)\leq O\left(
\frac{ \kappa M^2 \bar{\gamma}}{T}+\frac{M\sigma^2}{\lambda}+\frac{\sigma M\sqrt{\tau}}{\sqrt{T}} +\frac{\sigma^2}{\bar{\gamma}} +\frac{M\sigma \sqrt{\tau}}{\sqrt{T}\bar{\gamma}}
+\frac{ dT^2\kappa (\beta M+\lambda )^2 \cdot \log(1/\delta)}{n^2\epsilon^2\bar{\gamma} }
\right).
\end{equation}
Take $\bar{\gamma}=\sqrt{T}$ then we have 
\begin{equation*}
    \mathcal{L}(\hat{w})-\mathcal{L}(\theta^*)\leq O\left(
\frac{\kappa M^2\sqrt{\tau}}{\sqrt{T}}+\frac{M^2}{\lambda}+\frac{ dT^{3/2}\kappa \lambda^2 \cdot \log(1/\delta)}{n^2\epsilon^2 }
\right).
\end{equation*}
Take $T=\frac{n\epsilon}{M\lambda \sqrt{d\log(1/\delta) } }$ we have 
\begin{equation*}
        \mathcal{L}(\hat{w})-\mathcal{L}(\theta^*)\leq O\left(
\frac{\kappa M\sqrt{\lambda}\sqrt[4]{d\log(1/\delta)}\sqrt{\tau}}{\sqrt{n\epsilon}}+\frac{M^2}{\lambda}
\right).
\end{equation*}
    Take $\lambda=O((\frac{\sqrt{n\epsilon}M}{\kappa\sqrt[4]{d\log(1/\delta)}})^\frac{2}{3})\geq \max\{\beta, 1\}M$ we have w.p at least $1-\delta'$
\begin{equation*}
       \mathcal{L}(\hat{w})-\mathcal{L}(\theta^*)\leq O\left(M^\frac{4}{3}\frac{\kappa^\frac{2}{3}(d\log (1/\delta))^\frac{1}{6}\sqrt{\log(1/\delta')} }{(n\epsilon)^\frac{1}{3}} \right). 
\end{equation*}
\end{proof}
\begin{theorem}\label{thm:12}
      For the $\ell_p^d$ space with $2\leq p\leq \infty$, suppose Assumption \ref{ass:4} holds. Then the Algorithm 1 in  \cite{kamath2022improved} is $(\epsilon, \delta)$-DP for all $0<\epsilon, \delta<1$. Moreover, suppose the loss function is non-negative, there exists $R=O(1)$ such that $\|\nabla \mathcal{L}(w)\|_*\leq R$ for all $w\in\mathcal{C}$ and 3) in Assumption \ref{ass:5} holds. Then the output satisfies 
\begin{equation}
    \mathbb{E}[\mathcal{L}(w)]-\mathcal{L}(\theta^*)\leq O\left(\frac{d^{\frac{3}{2}-\frac{1}{p}} }{\sqrt{n}}+\frac{d^{\frac{3}{2}-\frac{1}{2p}} }{\sqrt{n\epsilon}}\right). 
\end{equation}
 \end{theorem}
\subsection{Proof of Theorem \ref{thm:12}}
\begin{proof}
\cite{kamath2022improved} study DP-SCO with heavy-tailed data in Euclidean space and propose an $(\epsilon,\delta)$-DP algorithm for any $0<\epsilon, \delta<1$ that achieves an expected excess population risk of $O(M\frac{d}{\sqrt{n}}+\frac{\sqrt{M}d^\frac{5}{4}}{\sqrt{n\epsilon}})$, where $M$ is the $\ell_2$-norm diameter of the constraint set $\mathcal{C}$, under the following assumptions 
\begin{assumption}\label{ass:5}
    1) The loss function $\ell(w, x)$ is non-negative, differentiable and convex for all $w\in \mathcal{C}$. 2) The loss function is $\beta$-smooth. 3) The gradient of $\mathcal{L}(w)$ at the optimum is zero. 4) There is a constant $\sigma$ such that for all $j\in [d]$ and $w\in\mathcal{C}$ we have  $\mathbb{E}[\langle \nabla \ell(w, x)-\nabla \mathcal{L}(w), e_j\rangle^2]\leq \sigma^2$, where $e_j$ is the $j$-th standard basis vector. 5) For any $w\in\mathcal{C}$, the distribution of the gradient has bounded mean, i.e., $\|\nabla \mathcal{L}(w)\|_2\leq R$. 
\end{assumption}
 For $\ell_p^d$ space, we know that $L$-Lipschitz w.r.t $\|\cdot\|$ implies $L$-Lipschitz w.r.t $\|\cdot\|_2$. Moreover, $\mathbb{E}[||\nabla \ell(w,x)-\nabla \mathcal{L}(w)||_{*}^2]\leq \sigma^2$ implies $\mathbb{E}[||\nabla \ell(w,x)-\nabla \mathcal{L}(w)||_{2}^2]\leq \sigma^2$ which implies condition 4) in Assumption \ref{ass:5}. For the diameter, $\mathcal{C}$ has diameter $d^{\frac{1}{2}-\frac{1}{p}}M$ w.r.t $\|\cdot\|_2$. Thus, substituting this diameter into the bound of \cite{kamath2022improved} yields the bound stated in Theorem \ref{thm:12}. 
 








\end{proof}

\bibliography{uai2023-template}

\end{document}
