\documentclass[accepted]{uai2023} 
\usepackage[american]{babel}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage[switch]{lineno}
\usepackage{adjustbox}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{blindtext}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{subcaption}
\usepackage{graphicx}
\theoremstyle{definition}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{claim}{Claim}
\newtheorem{fact}{Fact}
\newtheorem{property}{Property}
\newtheorem{proposition}{Proposition}
\usepackage{color}
\usepackage{xcolor} 
\usepackage{xr}




\urlstyle{same}

% the following package is optional:
%\usepackage{latexsym}

% See https://www.overleaf.com/learn/latex/theorems_and_proofs
% for a nice explanation of how to define new theorems, but keep
% in mind that the amsthm package is already included in this
% template and that you must *not* alter the styling.
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Differentially Private Stochastic Convex Optimization in \\ (Non)-Euclidean Space Revisited}



\author[1]{{Jinyan Su}{}}
\author[2]{Changhong Zhao}
\author[3,4,5]{Di Wang}
% Add affiliations after the authors
\affil[1]{%
   Mohamed bin Zayed University of Artificial Intelligence
}
\affil[2]{%
   Department of Information Engineering,\\
The Chinese University of Hong Kong
}
\affil[3]{%
   Provable Responsible AI and Data Analytics Lab 
  }
  \affil[4]{%
 Computational Bioscience Research Center
  }
  \affil[5]{%
  Division of CEMSE, 
    King Abdullah University of Science and Technology 
  }
\externaldocument{su_604-supp}
  \begin{document}
\maketitle
\begin{abstract}
    In this paper, we revisit the problem of Differentially Private Stochastic Convex Optimization (DP-SCO) in Euclidean and general $\ell_p^d$ spaces. Specifically, we focus on three settings that are still far from well understood:  (1) DP-SCO over a  constrained and bounded (convex) set in Euclidean space; (2) unconstrained DP-SCO in $\ell_p^d$ space; (3) DP-SCO with heavy-tailed data over a  constrained and bounded set in $\ell_p^d$ space. For problem (1), for both convex and strongly convex loss functions, we propose methods whose outputs could achieve (expected) excess population risks that are only dependent on the Gaussian width of the constraint set,  rather than the dimension of the space. Moreover, we also show the bound for strongly convex functions is optimal up to a logarithmic factor. For problems (2) and (3), we propose several novel  algorithms and provide the first theoretical results for both cases when $1<p<2$ and $2\leq p\leq \infty$. 
\end{abstract}


\section{Introduction}

Learning from data that contains sensitive information has become a critical consideration. It requires machine learning algorithms not only to learn effectively from the training data but also to provide a certain level of guarantee on privacy preservation. To address this privacy concern, differential privacy (DP) \citep{dwork2006calibrating}, a rigorous notion of statistical data privacy, has received much attention in the past few years and has become a de facto technique for private data analysis.  


As the two most fundamental models in machine learning, Stochastic Convex Optimization (SCO) \citep{vapnik1999nature} and its empirical form, Empirical Risk Minimization (ERM), 
find numerous applications in areas such as biomedicine and healthcare. However, as these applications always involve sensitive data, it is essential to design DP algorithms for SCO and ERM, which correspond to the problems of DP-SCO and DP-ERM, respectively. DP-SCO and DP-ERM have been
extensively studied for over a decade, starting from \cite{chaudhuri2008privacy}. For example, \cite{bassily2014private} present the optimal rates of general DP-ERM for both convex and strongly convex loss functions. \cite{bassily2019private,feldman2020private} later study the optimal rates of general DP-SCO, which is later extended by \cite{su2022faster,asi2021adapting} to loss functions that satisfy the growth condition. \cite{bassily2021non,asi2021private} provide the first study on DP-SCO over non-Euclidean space, i.e., the $\ell_p$ space with $1\leq p\leq \infty$. 


While there are a vast number of studies on DP-SCO/DP-ERM, there are still several open problems left, especially the constrained case in Euclidean space where the convex constraint set has some specific geometric structures, and the case where the space is non-Euclidean. In detail, while it has been shown that the optimal rate of DP-ERM over the $\ell_2$-norm ball scales as $O(\sqrt{d})$ and $O(d)$ for convex and strongly convex loss, respectively \citep{bassily2014private}, \cite{talwar2014private} show that for a general constraint set $\mathcal{C}$, the dependence on $d$ could be improved to $O(G_\mathcal{C})$ and $O(G^2_\mathcal{C})$ for these two classes of functions, where $G_\mathcal{C}$ is the Gaussian width of the set $\mathcal{C}$ (see Definition~\ref{def:12} for details), which could be far less than the dimension $d$. However, compared to DP-ERM with Gaussian width, DP-SCO with Gaussian width is far from well understood. The best-known result cannot even recover the optimal rate of the $\ell_2$-norm ball case \citep{amid2022public}.  For the non-Euclidean case, \cite{bassily2021non} only study the constrained case where the constraint set has a bounded diameter. Theoretical behaviors for the unconstrained case are still unknown. Moreover, in the Euclidean case, there has recently been a line of work focusing on DP-SCO where the distribution of loss gradients is heavy-tailed rather than uniformly bounded \citep{wang2020differentially,hu2022high,kamath2022improved}. However, non-Euclidean DP-SCO with heavy-tailed data has not been studied so far. 

In this paper, we study the theoretical behaviors of three problems: (1) DP-SCO (with Lipschitz loss) over  a convex constraint set $\mathcal{C}$ in Euclidean space; (2) unconstrained DP-SCO in $\ell_p^d$ space; (3) DP-SCO with heavy-tailed data over a convex constraint set $\mathcal{C}$ in $\ell_p^d$ space. Specifically, our contributions can be summarized as follows. 

 \noindent {\bf 1.} For problem (1), we consider both convex and strongly convex (smooth) loss functions. We show that for convex functions, there is an $(\epsilon, \delta)$-DP algorithm whose output could achieve an (expected) excess population risk of $O(\frac{ G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{\epsilon n}+\frac{1}{\sqrt{n}})$, where $n$ is the sample size. The rate could be improved to $O(\frac{ G^2_{\mathcal{C}}{\log(1/\delta)}}{ n^2\epsilon^2}+\frac{1}{n})$ for strongly convex functions. Moreover, we also show that the bound for strongly convex functions is optimal up to a factor of $\text{Poly}(\log d)$ if $\mathcal{C}$ is contained in the unit $\ell_2$-norm ball. To the best of our knowledge, this is the first lower bound of DP-SCO that depends on Gaussian width.
 
 \noindent {\bf 2.} We then study problem (2). Specifically, when $1<p<2$, we propose a novel method named Noisy Regularized Mirror Descent, which adds  regularization terms and Generalized Gaussian noise to Mirror Descent. By analyzing its stability, we show the output could achieve an excess population risk of $\tilde{O}(\kappa^\frac{4}{5}(\frac{\sqrt{d\log(1/\delta)}}{n\epsilon})^{\frac{2}{5}})$, where $\kappa = \min \{\frac{1}{p-1},2\log d\}$. We also discuss the case when $2\leq p\leq \infty$. 
 
 \noindent {\bf 3.} Finally, we consider problem (3), assuming that the second-order moment of $\|\cdot\|_*$-norm of the loss gradient is bounded. When $1<p<2$, through a noisy, shuffled, and truncated version of Mirror Descent, we show a bound of $\tilde{O}(
\frac{\sqrt[4]{\kappa^2 d\log(1/\delta)}}{\sqrt{n\epsilon}}
)$ in the high privacy regime $\epsilon=\tilde{O}(n^{-\frac{1}{2}})$, and a bound of $O(\frac{\kappa^\frac{2}{3}(d\log (1/\delta))^\frac{1}{6} }{(n\epsilon)^\frac{1}{3}})$ for general $0<\epsilon<1$. We also study the case when $2\leq p\leq \infty$. 


\section{Related Work}

As there is a long list of work on DP-SCO/DP-ERM, here we just mention the work close to the problems we study in this paper. See Tables~\ref{tab:1} and~\ref{tab:2} for detailed comparisons. 
 
\noindent {\bf DP-SCO/DP-ERM with Gaussian width.} For DP-ERM over $\ell_2$-norm ball, although \cite{bassily2014private} show the optimal rate of $O(\frac{\sqrt{d\log(1/\delta)}}{n\epsilon})$ and  $O(\frac{d\log (1/\delta)}{n^2\epsilon^2})$ for convex and strongly convex loss, respectively, \cite{talwar2014private} show that for general constraint set $\mathcal{C}$ it is possible to improve the factor $d$ to the Gaussian width of $\mathcal{C}$. After that, \cite{kasiviswanathan2016efficient} further improve the rate for generalized linear functions, \cite{wang2017differentially} provide an accelerated algorithm, and \cite{wang2019differentially} extend to non-convex loss functions. However, all of them only study the problem of DP-ERM, and their methods cannot be generalized to DP-SCO directly. For DP-SCO, the only known result is given by \cite{amid2022public}, which studies general convex loss under the setting where there is some public data. As we can see from Table \ref{tab:1}, our result significantly improves theirs. Moreover, we  show a nearly optimal rate for strongly convex functions, which is the first lower bound of DP-SCO/DP-ERM that depends on the Gaussian width. 

\noindent {\bf DP-SCO in $\ell_p^d$ space.} Compared to the Euclidean space case, there is little work on DP-SCO in non-Euclidean ($\ell_p^d$) space. \cite{bassily2021non} provide the first study of the problem for $1\leq p\leq \infty$ and propose several results for $p=1$, $1<p<2$ and $2\leq p\leq \infty$. Later \cite{han2022private} further extend to the online setting. However, all the previous algorithms and utility analyses highly rely on the assumption that the diameter of the constrained set is bounded and known, i.e., their results will not hold in the unconstrained case, which is more difficult than the constrained case. In this paper, we fill the gap by providing the first results for unconstrained DP-SCO in $\ell_p^d$ space by proposing several new methods. 


%\noindent {\bf DP-SCO with heavy-tailed data.} The problem of DP-SCO where the distribution of loss gradients is heavy-tailed has been intensively studied in recent years, such as \cite{hu2022high,wang2020differentially,kamath2022improved,wang2022differentially,tao2022private}. However, all of them only consider Euclidean space and their methods cannot be generalized to non-Euclidean ones with $1<p<2$. 
\begin{table*}[t]
\begin{center}
\resizebox{\textwidth}{!}{%	
\begin{tabular}{|l|l|l|l|l|}
\hline
Methods                                                                                                          & Problem          & Assumption                       & Convex Bound               & Strongly Convex Bound                  \\ [1ex] \hline

\citep{talwar2014private} &  ERM  & Lipschitz & $\tilde{O}(\frac{{G_{\mathcal{C}}}}{n\epsilon})$ & $\tilde{O}(\frac{{G^2_{\mathcal{C}}}}{n^2\epsilon^2})$  \\[1ex]
					\hline
\citep{kasiviswanathan2016efficient}   & ERM                                                                   & Lipschitz and GLM      &      $\tilde{O}(\frac{\sqrt{G_\mathcal{C}}}{\sqrt{n\epsilon}})$       &    ---          \\ [1ex]\hline
\citep{amid2022public}    &  SCO                                                                                                                     & Lipschitz      &        $\tilde{O}(\frac{\sqrt{G_\mathcal{C}}}{\sqrt{n}n^{1/4}_{public}}+\frac{1}{\sqrt{n}})$        & ---             \\ [1ex]\hline
      {\bf This paper}           & SCO   & Lipschitz       & $\tilde{O}(\frac{{G_{\mathcal{C}}}}{n\epsilon}+\frac{1}{\sqrt{n}})$ & $\tilde{O}(\frac{{G^2_{\mathcal{C}}}}{n^2\epsilon^2}+\frac{1}{n})$  (*)             \\[1ex] \hline
\end{tabular}}
\caption{Comparison of the results for $(\epsilon,\delta)$-DP-SCO/DP-ERM in Euclidean space with a bounded constraint set $\mathcal{C}$ (dependence on other parameters is omitted). Here $G_\mathcal{C}$ is the Gaussian width of $\mathcal{C}$, $n$ is the sample size, and $n_{public}$ is the size of public data. $\tilde{O}$ hides other logarithmic factors. (*): We also show such a bound is nearly optimal when $\mathcal{C}$ is contained in the unit $\ell_2$-norm ball. }
\label{tab:1}
\end{center}
\end{table*}

\begin{table*}[!th]
\begin{center}
\resizebox{\textwidth}{!}{%	
\begin{tabular}{|l|l|l|l|l|}
\hline
Methods                                                                                                          & Constrained         & Assumption                      & Bound for $\ell^d_p$ ($1<p<2$)              &  Bound for $\ell^d_p$ ($2\leq p\leq \infty$)                \\ [1ex] \hline

\citep{bassily2021non} &  Yes  & Lipschitz & $\tilde{O}(\sqrt{\frac{\kappa}{n}}+\frac{\kappa\sqrt{d}}{n\epsilon})$ & $\tilde{O}(\frac{d^{\frac{1}{2}-\frac{1}{p}}}{\sqrt{n}}+\frac{d^{1-\frac{1}{p}}}{n\epsilon})$  \\[1ex]
					\hline
      {\bf This paper}          & No   & Lipschitz       & $\tilde{O}(\kappa^\frac{4}{5}\cdot (\frac{\sqrt{d}}{n\epsilon})^{\frac{2}{5}}   )$ & $\tilde{O}(d^{1-\frac{2}{p}}(\frac{1}{\sqrt{n}}+\frac{\sqrt{d}}{\epsilon n})) $ \\[1ex] \hline
            {\bf This paper}           & Yes   & Heavy-tailed      & $\tilde{O}(
\frac{\sqrt[4]{\kappa^2 d}}{\sqrt{n\epsilon}}
)$/$\tilde{O}(\frac{\kappa^\frac{2}{3}(d)^\frac{1}{6} }{(n\epsilon)^\frac{1}{3}})$ (*)& $\tilde{O}(\frac{d^{\frac{3}{2}-\frac{1}{p}} }{\sqrt{n}}+\frac{d^{\frac{3}{2}-\frac{1}{2p}} }{\sqrt{n\epsilon}})$               \\[1ex] \hline
\end{tabular}}
\caption{Comparison of the results for $(\epsilon,\delta)$-DP-SCO in $\ell^d_p$ space with $1<p\leq \infty$ (dependence on other parameters is omitted). Here $d$ is the dimension, $n$ is the sample size, and $\kappa=\min\{\frac{1}{p-1}, 2\log d\}$. $\tilde{O}$ hides other logarithmic factors. (*): The first bound is for the case of $\epsilon=\tilde{O}(n^{-\frac{1}{2}})$ and the second one is for general $0<\epsilon<1$.} 
\label{tab:2}
\end{center}
\end{table*}




\section{Preliminaries}

In this section, we recall some definitions and lemmas that will be used throughout the paper. A summary of notation can be found in Table~\ref{tab:notation} in the appendix.
\begin{definition}[Differential Privacy \citep{dwork2006calibrating}]\label{def:1}
	Given a data universe $\mathcal{X}$, we say that two datasets $D, D'\subseteq \mathcal{X}$ are neighbors if they differ by only one data sample, which is denoted as $D \sim D'$. A randomized algorithm $\mathcal{A}$ is $(\epsilon,\delta)$-differentially private (DP) if for all neighboring datasets $D,D'$ and for all events $S$ in the output space of $\mathcal{A}$, we have $\text{Pr}(\mathcal{A}(D)\in S)\leq e^{\epsilon} \text{Pr}(\mathcal{A}(D')\in S)+\delta.$
\end{definition}
\begin{lemma}[Advanced Composition Theorem \cite{dwork2014algorithmic}]\label{lemma:adv}
Given target privacy parameters $0<\epsilon <1$ and $0<\delta<1$, to ensure $(\epsilon, T\delta'+\delta)$-DP over $T$ mechanisms, it suffices that each mechanism is $(\epsilon',\delta')$-DP, where $\epsilon'=\frac{\epsilon}{2\sqrt{2T\ln(2/\delta)}}$ and $\delta'=\frac{\delta}{T}$.  
\end{lemma}
	\begin{definition}[DP-SCO in General Normed Space \citep{bassily2021non}]\label{def:sco}
		Given a dataset $D=\{x_1,\cdots,x_n\}$ from a data universe $\mathcal{X}$ where  $\{x_i=(z_i, y_i)\}_{i}$ with a feature vector $z_i$ and a label/response $y_i$ are i.i.d. samples from some unknown distribution $\mathcal{D}$, a normed space $(\mathbf{E}, \|\cdot\|)$ of dimension $d$, 
  a convex constraint set  $\mathcal{C} \subseteq \mathbf{E}$, and a convex loss function $\ell: \mathcal{C}\times \mathcal{X}\mapsto \mathbb{R}$. Differentially Private Stochastic Convex Optimization (DP-SCO) is to find $\theta^{\text{priv}}$ to minimize the population risk, {\em i.e.,} $\mathcal{L} (\theta)=\mathbb{E}_{x\sim \mathcal{D}}[\ell(\theta, x)]$
		with the guarantee of being differentially private.\footnote{Note that in this paper we consider the proper learning case, that is $\theta^{\text{priv}}$ should be in $\mathcal{C}$.} 
		 The utility of the algorithm is measured by the (expected) excess population risk, that is  $\mathcal{L} (\theta^{\text{priv}})-\mathcal{L}(\theta^*),$ where  $\theta^*=\arg\min_{\theta \in \mathcal{C}}\mathcal{L}(\theta).$	 Besides the population risk, we can also measure the \textit{empirical risk} of dataset $D$: $\hat{\mathcal{L}}(\theta, D)=\frac{1}{n}\sum_{i=1}^n \ell(\theta, x_i).$  
	\end{definition}
In Definition~\ref{def:sco}, we consider DP-SCO in a general normed space with a convex set $\mathcal{C}\subseteq \mathbf{E}$. In this paper, we mainly focus on two cases: 1) the constrained Euclidean case, where $\mathbf{E}=\mathbb{R}^d$, $\|\cdot \|$ is the $\ell_2$-norm, and $\mathcal{C}$ is a bounded set whose diameter is denoted as $\|\mathcal{C}\|_2=\max_{\theta, \theta'\in \mathcal{C}}\|\theta-\theta'\|_2$; 2)
the $\ell_p^d$ case, where $\mathbf{E}=\mathbb{R}^d$ and $\|\cdot\|$ is the $\ell_p$-norm $\|\cdot\|_p$ with  $1<p\leq \infty$ (where $||x||_p = (\sum_{j=1}^d |x_j|^p)^{\frac{1}{p}}$), and $\mathcal{C}$ could be either bounded or unbounded. %As we mentioned in the previous section, here we only focus on the setting where $1\leq p\leq 2$. 
Since $\ell^d_p$ spaces are regular, we next introduce regular spaces to better illustrate our idea. 



Let $(\mathbf{E},||\cdot||)$ be a normed space of dimension $d$ and let $\langle \cdot,\cdot\rangle$ be an arbitrary inner product over $\mathbf{E}$ (not necessarily inducing the norm $\|\cdot\|$). The dual norm over $\mathbf{E}$ is defined as $||y||_{*}=\underset{\|x\|\leq 1}{\max}\langle y,x\rangle$. So $(\mathbf{E}, ||\cdot||_{*})$ is also a $d$-dimensional normed space. For example, for $\ell_p^d = (\mathbb{R}^d,||\cdot||_p)$ with $1\leq p\leq \infty$, the dual space of $\ell_p^d$ is $\ell_q^d$, where $\frac{1}{p}+\frac{1}{q}=1$.


We call a normed space regular if its dual norm is sufficiently smooth. In detail, we have the following definition. 

%We consider general spaces whose dual has sufficient smooth norm. And we use the notion of regular space to quantify the smoothness property of a space, Namely, we consider spaces whose dual space is $\kappa$-regular.

\begin{definition}[$\kappa$-regular Space \cite{juditsky2008large}]
Given $\kappa\geq 1$, we say a normed space $(\mathbf{E},||\cdot||)$ is $\kappa$-regular if there exists a $\kappa_{+}$, s.t., $ 1\leq \kappa_{+}\leq \kappa$ and there exists a norm $||\cdot||_{+}$ such that $(\mathbf{E},||\cdot||_{+})$ is $\kappa_{+}$-smooth, i.e., for all $x,y\in\mathbf{E}$, 
\begin{equation*}
    ||x+y||_{+}^2\leq ||x||_{+}^2+\langle \nabla (||\cdot||_{+}^2)(x),y\rangle +\kappa_{+}||y||_{+}^2.
\end{equation*}
And $||\cdot||$ and $||\cdot||_{+}$ are equivalent with the following constraint:
    $||x||^2\leq ||x||_{+}^2 \leq \frac{\kappa}{\kappa_{+}}||x||^2 ~(\forall x\in \mathbf{E}).$
\end{definition}
For  $\ell_p^d$ space with $2\leq p\leq \infty$, it is $\kappa$-regular with $\kappa = \min\{p-1,2e\log d\}$. In this case we have $\|x\|_{+}=\|x\|_r$ with $r=\min\{p, 2\log d+1\}$ and $\kappa_+=(r-1)$ \cite{dumbgen2010nemirovski}. So in the $\ell_p$ spaces with $1<p<2$ we focus on, their dual spaces are $\kappa$-regular with $\kappa = \min\{\frac{1}{p-1}, 2\ln d\}$.

In the following, we introduce the mechanisms that will be used in later sections. 

\begin{lemma}[Gaussian Mechanism]\label{le-gaussian}
	Given a dataset $D\in\mathcal{X}^n$ and a function $q : \mathcal{X}^n\rightarrow \mathbb{R}^d$, the Gaussian mechanism is defined as  $q(D)+\xi$ where $\xi\sim \mathcal{N}(0,\frac{2\Delta^2_2(q)\log(1.25/\delta)}{\epsilon^2}\mathbb{I}_d)$,  where $\Delta_2(q)$ is the $\ell_2$-sensitivity of the function $q$,
{\em i.e.,}
		$\Delta_2(q)=\sup_{D\sim D'}\|q(D)-q(D')\|_2.$	Gaussian mechanism preserves $(\epsilon, \delta)$-DP.
\end{lemma}

%To achieve Differential privacy in non-Euclidean setting, noise has to be added to the gradient based on the gradient sensitivity w.r.t. the dual norm. 
Note that the Gaussian mechanism is tailored for the case where the query has bounded $\ell_2$-norm sensitivity. \cite{bassily2021non} propose a Generalized  Gaussian mechanism that leverages the regularity of the dual space $(\mathbf{E}, \|\cdot\|_*)$. 

%Gaussian mechanism is not applicable in non-Euclidean space and we have to use the Generalized Gaussian mechanism (\ref{def3}) which is a noise addition mechanism according to the generalized Gaussian distribution (Definition \ref{def2}) and leverages the regularity of the dual space $(\mathbf{E},||\cdot||_{*})$.
\begin{definition}[Generalized Gaussian distribution \cite{bassily2021non}]
\label{def2}
Let $(\mathbf{E},||\cdot||_{*})$ be a $d$-dimensional $\kappa$-regular space with smooth norm $||\cdot||_{+}$. Define the generalized Gaussian distribution $\mathcal{GG}_{||\cdot||_{+}}(\mu,\sigma^2)$, as one with density $g(z) = C(\sigma,d)\cdot e^{-\frac{||z-\mu||_{+}^2}{2\sigma^2}}$, where $C(\sigma,d) = [\text{Area}(\{||x||_{+}=1\})\frac{(2\sigma^2)^{d/2}}{2}\Gamma(\frac{d}{2})]^{-1}$, and the Area is the $d-1$ dimensional surface measure on $\mathbb{R}^d$.
\end{definition}
\begin{lemma}[Generalized Gaussian mechanism \cite{bassily2021non}]
    Given a dataset $D\in\mathcal{X}^n$, and a query $q: \mathcal{X}^n\rightarrow \mathbf{E}$ with bounded $||\cdot||_{*}$-sensitivity: $s= {\sup}_{D\sim D^{'}}||q(D)-q(D^{'})||_{*}$, the Generalized Gaussian mechanism is defined as  $q(D)+\xi$ where $\xi\sim \mathcal{GG}_{||\cdot||_{+}}(0, \frac{2\kappa \log(1/\delta)s^2}{\epsilon^2})$. 	The Generalized Gaussian mechanism preserves $(\epsilon, \delta)$-DP.
\end{lemma}
\begin{lemma}[Prop 4.2 in \citep{bassily2021non}]
    For any $m\geq 1$, if $z\sim \mathcal{GG}_{||\cdot||_{+}}(0,\sigma^2)$, then $\mathbb{E}[\|z\|_+^m]\leq (2\sigma^2)^\frac{m}{2}\Gamma(\frac{m+d}{2})/\Gamma(\frac{d}{2})$. Specifically, $\mathbb{E}[\|z\|_*^2]\leq \mathbb{E}[\|z\|_+^2]\leq d\sigma^2$, where $\Gamma(\cdot)$ is the Gamma function. 
\end{lemma}
In the following, we recall some terminologies on the properties of the loss function and the constraint set $\mathcal{C}$. 
\begin{definition}
{($L$-Lipschitz)}
Given the loss function $\ell(\cdot, \cdot):\mathcal{C} \times \mathcal{X}\rightarrow \mathbb{R}$, it is $L$-Lipschitz w.r.t. the norm $||\cdot||$ if for all $x\in \mathcal{X}$ and $w_1,w_2\in \mathcal{C}$ we have 
\begin{equation}
    |\ell(w_1,x)-\ell(w_2,x)|\leq L\cdot ||w_1-w_2||. \notag 
\end{equation}
\end{definition}
\begin{definition}
{($\beta$-Smooth)}
Given the loss function $\ell(\cdot, \cdot):\mathcal{C} \times \mathcal{X}\rightarrow \mathbb{R}$, it is $\beta$-smooth w.r.t. the norm $||\cdot||$ if its gradient is $\beta$-Lipschitz w.r.t. $||\cdot||$, namely, 
for all $x\in \mathcal{X}$ and $w_1,w_2\in \mathcal{C}$ we have 
\begin{equation}
    ||\nabla\ell(w_1,x)-\nabla\ell(w_2,x)||_{*}\leq \beta\cdot ||w_1-w_2||. \notag 
\end{equation}
\end{definition}
\begin{definition}{(Strongly convex)}
Given the loss function $\ell(\cdot, \cdot):\mathcal{C} \times \mathcal{X}\rightarrow \mathbb{R}$, it is $\alpha$-strongly convex w.r.t. the norm $||\cdot||$ if for all $x\in \mathcal{X}$ and $w_1, w_2\in \mathcal{C}$,
\begin{equation*}
    \langle \nabla \ell(w_1, x)-\nabla \ell(w_2, x), w_1-w_2\rangle \geq \alpha \cdot||w_1-w_2||^2.
\end{equation*}
\end{definition}
\begin{definition}{(Bregman divergence)}
For a convex function $\Phi:\mathbf{E}\rightarrow \mathbb{R}$, the Bregman divergence is defined as 
\begin{equation}
    D_{\Phi}(y,x) = \Phi(y)-\Phi(x)-\langle \nabla \Phi(x),y-x\rangle. \notag 
\end{equation}
\end{definition}
Notice that the Bregman divergence is always nonnegative, and it is convex in the first argument.
\begin{definition}{(Relative strongly convex \citep{lu2018relatively})}
A function $f:\mathbf{E}\rightarrow \mathbb{R}$ is $\alpha$-strongly convex \textbf{relative}  to $\Phi:\mathbf{E}\rightarrow \mathbb{R}$ if for all $x, y\in \mathbf{E}$,
\begin{equation}
    f(x)+\langle \nabla f(x),y-x\rangle +\alpha D_{\Phi}(y,x)\leq f(y). \notag 
\end{equation}
\end{definition}

\begin{definition}{(Relative smooth \citep{lu2018relatively})}
A function $f:\mathbf{E}\rightarrow \mathbb{R}$ is $\beta$-smooth \textbf{relative}  to $\Phi:\mathbf{E}\rightarrow \mathbb{R}$ if $\forall x, y\in \mathbf{E}$,
    $f(x)+\langle \nabla f(x),y-x\rangle +\beta D_{\Phi}(y,x)\geq  f(y).$
\end{definition}
Next, we introduce some basic concepts on the Minkowski norm of a symmetric, closed, and convex set $\mathcal{C}$.

\begin{definition}[Minkowski norm]
For a centrally symmetric convex set $\mathcal{C}\subseteq \mathbb{R}^d$, the Minkowski norm (denoted by  $||\cdot||_{\mathcal{C}}$) is defined as follows. For any vector $v\in \mathbb{R}^d$, 
\begin{equation*}
    ||v||_{\mathcal{C}}=\min\{r\in \mathbb{R}^{+}: v\in r \mathcal{C}\}.
\end{equation*}
The dual norm of $||\cdot||_{\mathcal{C}}$ is denoted as $||\cdot||_{\mathcal{C}^{*}}$, and for any vector $v\in \mathbb{R}^d$, 
    $||v||_{\mathcal{C}^{*}} =\underset{w\in \mathcal{C}}{\max}|\langle w, v\rangle|$.
Note that by H\"older's inequality, for any pair of dual norms $||\cdot||$ and $||\cdot||_{*}$, and any $x,y\in \mathbb{R}^d$, $|\langle x, y\rangle |\leq ||x||\cdot||y||_{*}$. So we have $|\langle x,y\rangle |\leq ||x||_{\mathcal{C}}\cdot||y||_{\mathcal{C}^{*}}$.
\end{definition}
In the constrained Euclidean case, 
our work relies on appropriately quantifying the size of  a convex body, which leads to the following definition of Gaussian width.

\begin{definition}\label{def:12}(Gaussian width)
Let $\xi\sim \mathcal{N}(0,\mathbb{I}_d)$ be a Gaussian random vector in $\mathbb{R}^d$. For a set $\mathcal{C}$, the Gaussian width is defined as 
    $G_{\mathcal{C}}=\mathbb{E}_{\xi}[\underset{w\in\mathcal{C}}{\sup}\langle \xi,w \rangle ].$

\end{definition}
Compared to dimension $d$, the Gaussian width
of a convex set $\mathcal{C}\subset \mathbb{R}^d$ could be much smaller. For example, when $\mathcal{C}$ is the unit $\ell_1$-norm  ball, $G_\mathcal{C}=O(\sqrt{\log d})$; and when $\mathcal{C}$ is the set of all unit $s$-sparse vectors in $\mathbb{R}^d$, $G_\mathcal{C}=O(\sqrt{s\log \frac{d}{s}})$. We refer readers to  \citet{talwar2014private} for details. 

\section{DP-SCO in Euclidean Space}\label{sec:euclidean}

In this section, we focus on the Euclidean case with a closed, bounded, and convex constraint set $\mathcal{C}$, and the loss function could be either convex or strongly convex.  

\subsection{General Convex Case}\label{sec:convex}

Before showing our idea, we need to discuss the weakness of previous approaches. Note that since our goal is getting an upper bound that depends on the Gaussian width of the constrained set $\mathcal{C}$, we will not discuss the approaches that achieve upper bounds that are polynomial in $d$. 

In general, all methods can be categorized into two classes: gradient perturbation and objective function perturbation. In gradient perturbation methods \citep{talwar2014private}, the key idea is modifying the Mirror Descent by adding noise to gradients. While this approach could achieve satisfactory bounds for the empirical risk  \citep{wang2017differentially,wang2019differentially}, when considering the population risk we need to use batched gradients at each iteration, which will induce a sub-optimal rate \citep{amid2022public}. Instead of perturbing the gradient, \cite{talwar2014private} show that the objective function perturbation method in \cite{chaudhuri2011differentially} could also achieve an upper bound that only depends on the Gaussian width, instead of $d$. However, this approach has two weaknesses: first, \cite{talwar2014private} only show the bound for the empirical risk, and whether its excess population risk is satisfactory or not is unknown; second, it is well-known that the objective perturbation approach needs to exactly get the minimizer of the perturbed objective function, which is inefficient in practice. 

Motivated by the objective perturbation method in \cite{talwar2014private}, our algorithm is the approximate version proposed in \cite{bassily2019private}. See detailed procedures in Algorithm~\ref{alg1}. In detail, first, similar to the standard objective perturbation, we add a random and linear term $\frac{\langle \mathbf{G},\theta\rangle }{n}$ with Gaussian noise  $\mathbf{G}$ and an $\ell_2$ regularization to the original empirical risk function to obtain a new objective function $\mathcal{J}(\theta, D)$. Then we obtain an approximate minimizer $\theta_2$ of the perturbed empirical risk $\mathcal{J}(\theta, D)$ via any efficient optimization method (such as proximal SVRG \cite{xiao2014proximal} or projected SGD) to ensure that $\mathcal{J}(\theta_2,D)-\underset{\theta\in\mathcal{C}}{\min} \mathcal{J}(\theta,D)$ is at most $\alpha$. Formally, we can define such an optimization method  as an optimizer function $\mathcal{O}: \mathcal{F}\times [0, 1]\rightarrow  \mathcal{C}$, where $\mathcal{F}$ is the class of objectives and the other argument is the optimization accuracy.  Finally, we perturb $\theta_2$ with Gaussian noise to fuzz the difference 
between $\theta_2$ and the true minimizer, and then project the perturbed $\theta_2$ onto the set $\mathcal{C}$. 




Since the algorithm itself is not new, here we  highlight our contributions: First, with some specific parameters, we show such an algorithm could achieve an excess population risk of $O(\frac{G_\mathcal{C}}{n\epsilon}+\frac{1}{\sqrt{n}})$, while \cite{bassily2019private} only show an upper bound of $O(\frac{\sqrt{d}}{n\epsilon}+\frac{1}{\sqrt{n}})$; Second, we extend the algorithm to the strongly convex case (see Section \ref{sec:strongconv} for details). In the following, we will show the theoretical guarantees of our algorithm. First, we need the following assumption on the loss function $\ell$.


%In this subsection, we focus on a practical algorithm called approximate objective perturbation, where the empirical loss is first perturbed by adding two terms: a noisy linear term $\frac{\langle \mathbf{G},\theta\rangle }{n}$ and a regularization term $\lambda ||\theta||_{2}^2$. Then we obtain an approximate minimizer $\theta_2$ of the perturbed empirical risk $\mathcal{J}(\theta, D)$ via efficient optimization methods and ensures that $\mathcal{J}(\theta_2,D)-\underset{\theta\in\mathcal{C}}{\min} \mathcal{J}(\theta,D)$ is at most $\alpha$. After that, we perturb $\theta_2$ with Gaussian noise to fuzz the difference  between $\theta_2$ and the true minimizer, the detailed procedure is given in Algorithm \ref{alg1}. 

\begin{assumption}\label{ass1}
The loss function $\ell$ is  twice differentiable, $L$-Lipschitz and $\beta$-smooth w.r.t. the Euclidean norm $||\cdot||_2$ over $\mathcal{C}$. 
\end{assumption}
\begin{algorithm}
	\caption{$\mathcal{A}_{\text{App-ObjP}}$: Approximate Objective perturbation }
	\begin{algorithmic}[1]
		\State {\bfseries Input:} Dataset $D$, loss function $\ell$, regularization parameter $\lambda$, optimizer $\mathcal{O}: \mathcal{F}\times [0, 1]\rightarrow  \mathcal{C}$, where $\mathcal{F}$ is the class of objectives, and the other argument is the optimization accuracy. $\alpha \in [0,1]:$ optimization accuracy.
		
		
		
	\State  Sample $\mathbf{G}\sim\mathcal{N}(0, \sigma_1^2\mathbb{I}_d)$ where $\sigma_1^2  = \frac{128L^2\log(2.5/\delta)}{\epsilon^2}$. Set $\lambda\geq \frac{r\beta}{\epsilon n}$, where $r = \min\{d, 2\cdot \text{rank}(\nabla^2\ell(\theta, x))\}$ with $\text{rank}(\nabla^2\ell(\theta,x))$ being the maximal rank of the Hessian of $\ell$ for all $\theta \in \mathcal{C}$ and $x\sim \mathcal{P}$.
	
	\State Let $\mathcal{J}(\theta,D) = \hat{\mathcal{L}}(\theta, D)+\frac{\langle \mathbf{G},\theta\rangle}{n}+\lambda||\theta||_2^2$.\\
	 \Return $\hat{\theta} = {\text{Proj}}_{\mathcal{C}}[\mathcal{O}(\mathcal{J},\alpha)+\mathbf{H}]$ where $\mathbf{H}\sim\mathcal{N}
	 (0,\sigma_2^2\mathbb{I}_d)$ 
	 with $\sigma_2^2 = 
	 \frac{64\alpha
	 \log(2.5/\delta)}{\lambda\epsilon^2}$
	\end{algorithmic}
	\label{alg1}
\end{algorithm}
\begin{theorem}\label{thm:1}
Suppose that Assumption \ref{ass1} holds and that the smoothness parameter $\beta$ satisfies $\beta \leq \frac{\epsilon n \lambda}{r}$. Then for any $0< \epsilon, \delta< 1$, 
$\mathcal{A}_{\text{App-ObjP}}$ (Algorithm \ref{alg1}) is $(\epsilon, \delta)$-DP.
\end{theorem}
It is notable that although we need to assume that $\beta$ is not too large, as we will see in Theorem \ref{th4}, this assumption always holds when $n$ is sufficiently large. 
\begin{theorem}\label{th4}
Suppose that Assumption \ref{ass1} holds. 
When $n$ is large enough such that  $n\geq \frac{r^2\beta^2||\mathcal{C}||_2^2}{\epsilon^2L^2}$ and $n\geq O\left(\frac{\sqrt{d\log(1/\delta)}}{\epsilon}\right)$, take $\lambda=\frac{L}{\sqrt{n}||\mathcal{C}||_2}$ and $\alpha\leq \min\left\{
\frac{L||\mathcal{C}||_2}{n^{\frac{3}{2}}}, \frac{\epsilon^2 L||\mathcal{C}||_2^3}{G^2_{\mathcal{C}}\log(1/\delta) n^{\frac{5}{2}}}
\right\}$ in Algorithm \ref{alg1}, we have 
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{\theta})] - \mathcal{L}(\theta^{*})\leq O\left(\frac{L\cdot G_{\mathcal{C}}\sqrt{\log(1/\delta)}}{\epsilon n}+\frac{L||\mathcal{C}||_2}{\sqrt{n}}\right),
\end{equation*}
where  the expectation is taken over the internal randomness of the algorithm.
\end{theorem}
\begin{remark}
While we consider the same algorithm as in \cite{bassily2019private}, there are several crucial differences. First, to achieve the upper bound of $O(\frac{\sqrt{d}}{n\epsilon}+\frac{1}{\sqrt{n}})$, \cite{bassily2019private} only need to set $\alpha\leq O(\frac{1}{n^2}\max\{\frac{1}{\sqrt{n}}, \frac{d}{n\epsilon}\})$ while we need to be more aggressive by choosing $\alpha\leq O({\epsilon^2}{n^{-\frac{5}{2}}})$. This is reasonable as we aim to get an improved upper bound. Thus we have to get a more accurate estimation. Secondly, besides enforcing the perturbed approximation to lie in the set $\mathcal{C}$ as it does in \cite{bassily2019private}, the projection operator in Step 4 of Algorithm \ref{alg1} plays a more critical role in achieving a bound that depends on $G_\mathcal{C}$ in our analysis, i.e., the bound in \cite{bassily2019private} will still hold even if there is no projection step, while this is not true for our case. Specifically, although the noise $\mathbf{H}$ is a $d$-dimensional Gaussian noise, we can show that due to the projection operator, the error introduced by the noise  depends only on $G_\mathcal{C}$ rather than $\sqrt{d}$, i.e., $||\hat{\theta}-\theta_2||_2^2\leq O(\sqrt{\frac{\alpha \log(1/\delta)}{\lambda}}\cdot \frac{G_{\mathcal{C}}}{\epsilon})$. A similar idea has also been used in privately answering multiple linear queries \citep{nikolov2016geometry}. 
\end{remark}
\subsection{Strongly Convex Case}\label{sec:strongconv}
We aim to extend our above idea to the strongly convex case. First, we impose 
the following assumption. 
\begin{assumption}\label{ass2}
We assume the loss is twice differentiable, $L$-Lipschitz and $\beta$-smooth w.r.t. $||\cdot||_2$, and it is $\Delta$-strongly convex w.r.t. $||\cdot||_{\mathcal{C}}$ over the set $\mathcal{C}$.
\end{assumption}
Note that we can relax the assumption to strongly convex w.r.t $\|\cdot\|_2$ as $\|v\|_2\geq \mathcal{C}_{\min}\cdot \|v\|_\mathcal{C}$, where $\mathcal{C}_{\min}$ is in Theorem \ref{th1}. See the proof of Theorem \ref{th1} for details. 


Our method is shown in  Algorithm \ref{alg2}. Note that, compared with Algorithm \ref{alg1}, the main difference is the regularization parameter $\lambda$. This is because the loss function is already $\Delta$-strongly convex, thus a smaller $\lambda$ suffices to make $\mathcal{J}$ $\frac{r\beta}{\epsilon n}$-strongly convex. Moreover, when $n$ is large enough, we have $\lambda=0$, indicating that we can get an improved excess population risk  compared to the convex case. 
\begin{algorithm}
	\caption{$\mathcal{A}_{\text{App-ObjP-SC}}$: Approximate Objective perturbation for strongly convex function }
	\begin{algorithmic}[1]
		\State {\bfseries Input:} Dataset $D$, loss function $\ell$, regularization parameter $\lambda$, optimizer $\mathcal{O}: \mathcal{F}\times [0, 1]\rightarrow  \mathcal{C}$, where $\mathcal{F}$ is the class of objectives and the other argument is the optimization accuracy. $\alpha \in [0,1]:$ optimization accuracy.
		
		
		
	\State  Sample $\mathbf{G}\sim\mathcal{N}(0, \sigma_1^2\mathbb{I}_d)$ where $\sigma_1^2  = \frac{128L^2\log(2.5/\delta)}{\epsilon^2}$. Set $\lambda= \max\left\{
	\frac{r \beta}{\epsilon n}-\Delta,0
	\right\}$, where $r = \min\{d, 2\cdot \text{rank}(\nabla^2\ell(\theta, x))\}$ with $\text{rank}(\nabla^2\ell(\theta,x))$ being the maximal rank of the Hessian of $\ell$ for all $\theta \in \mathcal{C}$ and $x\sim \mathcal{P}$.
	
	\State Let $\mathcal{J}(\theta,D) = \hat{\mathcal{L}}(\theta, D)+\frac{\langle \mathbf{G},\theta\rangle}{n}+\lambda||\theta||_2^2$.\\
	 \Return $\hat{\theta} = {\text{Proj}}_{\mathcal{C}}[\mathcal{O}(\mathcal{J},\alpha)+\mathbf{H}]$ where $\mathbf{H}\sim\mathcal{N}
	 (0,\sigma_2^2\mathbb{I}_d)$ 
	 with $\sigma_2^2 = 
	 \frac{64\alpha
	 \log(2.5/\delta)\cdot ||\mathcal{C}||_2^2}{\Delta\epsilon^2}$
	\end{algorithmic}
	\label{alg2}
\end{algorithm}
\begin{theorem}\label{thm:3}
Suppose that the loss function satisfies Assumption \ref{ass2}. Then for any $0<\epsilon, \delta<1$, 
$\mathcal{A}_{\text{App-ObjP-SC}}$ (Algorithm \ref{alg2}) is $(\epsilon,\delta)$-DP.
\end{theorem}
\begin{theorem}\label{th5} Suppose that Assumption \ref{ass2} holds. If $n$ is large enough such that 
 $n\geq O(\max\{\frac{L^2 ||\mathcal{C}||_2^2}{\Delta^2}, \frac{||\mathcal{C}||_2^2 r^2\beta^2}{L^2\epsilon^2}
\})$ and $n\geq O\left(\frac{\sqrt{d\log(1/\delta)}}{\epsilon}\right)$, then  by setting $\alpha\leq O\left(\min\left\{
\frac{L^2 ||\mathcal{C}||_2^2}{\Delta n^2},\frac{L^4\cdot ||\mathcal{C}||_2^6\epsilon^2}{\Delta^3 n^4 G_{\mathcal{C}}^2\log(1/\delta)}
\right\}
\right)$, we have 
\begin{equation}
    \mathbb{E}[\mathcal{L}(\hat{\theta})]-\mathcal{L}(\theta^{*})\leq O\left(
    \frac{L^2 ||\mathcal{C}||_2^2}{\Delta n} + \frac{G_{\mathcal{C}}^2L^2\log(1/\delta)}{\Delta n^2\epsilon^2}
    \right), \notag 
    \end{equation}
    where  the expectation is taken over the internal randomness of the algorithm.
\end{theorem}
\begin{remark}\label{remark:2}
 First, it is notable that an objective perturbation method for strongly convex loss has also been presented by \cite{talwar2014private}. However, there are two major differences: (1) the method in \cite{talwar2014private} needs to solve the perturbed objective function exactly, indicating it is inefficient; (2) \cite{talwar2014private} only provide the excess empirical risk. It is unknown whether their method could achieve the same bound as ours for the excess population risk.  Secondly, when $\mathcal{C}$ is an $\ell_2$-norm ball, the bounds in  Theorem \ref{th4} and Theorem \ref{th5} will recover the optimal rate of DP-SCO over $\ell_2$-norm ball for convex and strongly convex loss functions, respectively \citep{bassily2019private}.   Thirdly, the terms of $O(\frac{G_\mathcal{C}}{n\epsilon})$ and $O(\frac{G^2_\mathcal{C}}{n^2\epsilon^2})$ match the best-known results of excess empirical risk for the convex  and strongly convex case, respectively \citep{talwar2014private}. 
\end{remark}
In Remark \ref{remark:2}, we showed that our results are optimal when $\mathcal{C}$ is an $\ell_2$-norm ball and are comparable to the best results of DP-ERM with Gaussian width. A natural question is whether we can further improve these two upper bounds. In the following, we partially answer the question by providing a lower bound for  strongly convex loss functions. 

\begin{theorem}\label{th1}
Let $\mathcal{C}$ be a symmetric body contained in the unit Euclidean ball $\mathcal{B}_2^d$ in $\mathbb{R}^d$ that satisfies $\|\mathcal{C}\|_2=1$. For any $ n=O(\frac{\sqrt{d\log(1/\delta)}}{\epsilon})$, $\epsilon=O(1)$ and  $2^{-\Omega(n)}\leq \delta\leq 1/n^{1+\Omega(1)}$, there exists a loss $\ell$ which is $1$-Lipschitz w.r.t. $\|\cdot\|_2$ and $\mathcal{C}^2_{\min}$-strongly convex w.r.t. $\|\cdot\|_\mathcal{C}$, and a dataset $D=\{x_1,\cdots,x_n\}\subseteq \mathcal{C}^n$ such that for any $(\epsilon,\delta)$-differentially private algorithm for minimizing the empirical risk function $\hat{\mathcal{L}}(\theta,D)$ over $\mathcal{C}$, its output 
 $\theta^{priv}\in \mathcal{C}$ satisfies 
	$$
	\mathbb{E}[\mathcal{L}(\theta^{priv})]-\mathcal{L}(\theta^*)=\Omega \left(\max\left\{\frac{G_{\mathcal{C}}^2\log(1/\delta)}{(\log(2d))^4\epsilon^2n^2},\frac{1}{n}\right\}\right),$$ where the expectation is taken over the internal randomness of the algorithm $\mathcal{A}$. Here $\mathcal{C}_{\min}= \min\{\|v\|_2: v\in \partial \mathcal{C}\}$ with $\partial \mathcal{C}$ as the boundary of the set $\mathcal{C}$, i.e., it is the distance from the origin to the boundary of $\mathcal{C}$. 
\end{theorem}
    Taking $\Delta=\mathcal{C}^2_{\min}$ and $L=1$ in Theorem \ref{th5}, we can see the rate of excess population risk in Theorem \ref{th5} for strongly convex loss functions is nearly optimal by a factor of $\tilde{O}({\mathcal{C}^{-2}_{\min}})$. It is unknown whether we can further close the gap, and we will leave it as an open problem.  

\section{DP-SCO in $\ell_p^d$ Space} \label{sec:regular}

In this section, we will focus on DP-SCO in $\ell_p^d$ space where $1<p\leq \infty$. As we mentioned in the Introduction section, we study two settings: (1) $\mathcal{C}$ is $\mathbb{R}^d$ and the gradient of the loss function is bounded (i.e., the loss is Lipschitz); (2) $\mathcal{C}$ is bounded, and the distribution of  gradient of the loss is heavy-tailed. Similar to the previous study in \cite{bassily2021non}, for each setting, there are two cases: $1<p< 2$ and $2\leq p\leq \infty$. Notice that, unlike the previous section, we only study the case where the loss functions are convex. The reason is that except  for the Euclidean space, for a strongly convex function, the ratio between its smoothness and strong convexity, i.e., the condition number, will depend on the dimension of $\mathbf{E}$. For example, in the $\ell_1^d$ space, it has been shown that there is no function whose condition number is less than $d$ \citep{juditsky2014deterministic}.

\subsection{Unconstrained Case}

In this part, we will study Lipschitz loss under the following assumption that is commonly used in the related work on general stochastic convex optimization. 
\begin{assumption}\label{as3}
We assume $\ell(\cdot,x)$ is convex, $\beta$-smooth and $L$-Lipschitz w.r.t. $||\cdot||$ over $\mathbb{R}^d$.
\end{assumption}
Due to its difficulty, we first consider the case where $1< p< 2$. See Algorithm \ref{alg3} for details. Note that Algorithm \ref{alg3} could be considered as a noisy and regularized version of the standard mirror descent, i.e., at each iteration, we first perform linearization of $\hat{\mathcal{L}}(w_t, D)$, then we add a generalized Gaussian noise to its gradient to privatize the algorithm, a Bregman divergence term and a regularized term $\alpha\Phi(\cdot)$ with some specific $\alpha$ to the  linearization term. Then we solve the perturbed and regularized optimization problem.  We output a linear combination of the intermediate parameters as the final output. 

It is notable that although our method is a noisy modification of Mirror Descent, it is completely different from the previous private Mirror Descent based methods in \cite{talwar2014private,wang2017differentially,bassily2021non,amid2022public}: First, instead of directly adding noise to the gradient in standard Mirror Descent, here we have an additional regularization term, which is crucial for us to make the algorithm stable, indicating that we can get an excess population risk. To be more specific, first,  by the definition of $\|\cdot\|_{+}$, and  the duality between strong convexity and smoothness, we can easily see $\Phi$  is 1-strongly convex w.r.t $\|\cdot\|$. This indicates that the function $\hat{\mathcal{L}}(w, D)+\alpha \Phi(w)$ is relatively strongly convex and smooth (note that it is not smooth as the regularization term is not smooth when $1<p<2$). And the update step is just a noisy version of Mirror Descent for $\hat{\mathcal{L}}(w, D)+\alpha \Phi(w)$. Recently, it has been shown that Mirror Descent is stable for relatively strongly convex and smooth functions. Thus, we can also show that Algorithm \ref{alg3} is stable, indicating that we can get an excess population risk. From the above intuition, we can also see the parameter $\alpha$ needs to be carefully tuned to balance the stability and the excess empirical risk. The second difference is that, instead of using the last iterate or the average of iterates, our output is a linear combination of intermediate iterates, which is due to the noise we added. In the following we show the main results. 

\begin{theorem}\label{thm:6}
For the $\ell_p^d$ space with $1<p<2$, suppose Assumption \ref{as3} holds, then for any $0<\epsilon, \delta<1$, Algorithm \ref{alg3} is $(\epsilon,\delta)$-DP.
\end{theorem}

\begin{theorem}\label{th:7}
For the $\ell_p^d$ space with $1<p<2$, suppose Assumption \ref{as3} holds. In Algorithm \ref{alg3}, take $\alpha=\frac{4\beta}{T}\log_2 \frac{n}{T}$ and $T=O((\frac{n\epsilon\kappa}{\sqrt{d\log(1/\delta)}})^{\frac{2}{5}})$, assume $n$ is sufficiently large such that $n\geq O\left( \frac{\epsilon^4}{(d \log (1/\delta))^2\kappa^{1/2}}\right)$, then we have 
\begin{equation*}
    \mathbb{E}[\mathcal{L}(\hat{w})]-\mathcal{L}(\theta^*)\leq \tilde{O}(\kappa^\frac{4}{5}\cdot (\frac{\sqrt{d\log(1/\delta)}}{n\epsilon})^{\frac{2}{5}}   ),
\end{equation*}
where $\tilde{O}$ hides $\beta, L$ and a factor of $\mathbb{E}_{D}[\tilde{C}_D^2]$ with $\tilde{C}_D^2 = \|\tilde{w}^*\|_{\kappa_+}^2\leq \|\tilde{w}^*\|^2$ and $\tilde{w}^{*}=\underset{w\in\mathbf{E}}{\arg\min} \hat{\mathcal{L}}(w,D)$.
\end{theorem}
 The key idea to prove Theorem \ref{th:7} is to show  that Algorithm \ref{alg3} is uniformly stable (w.r.t $\|\cdot\|$) by bounding the term $\mathbb{E}[\|w_{t+1}-w'_{t+1}\|]$, where $w'_{t+1}$ is the corresponding iterate of the algorithm when the input data is $D'$, which is a neighboring dataset of $D$. To show this, rather than analyzing the stability of $w_{t+1}$ directly via the approach in \cite{hardt2016train}, our strategy is bounding $\|w_{t+1}-w^*_\alpha\|$, where $w^*_\alpha=\arg\min \hat{\mathcal{L}}(w, D)+\alpha\Phi(w)$. As the regularized function $\hat{\mathcal{L}}(w, D)+\alpha\Phi(w)$ now is relatively strongly convex and smooth, the stability of $w^*_\alpha$ is $O(\frac{1}{n})$. Thus we can get the sensitivity of $w_{t+1}$. Then we can bound the sensitivity of $\hat{w}$. 
\begin{remark}
    In the constrained case, \cite{bassily2021non} show that it is possible to achieve an upper bound of $\tilde{O}((M+M^2)(\frac{\sqrt{\kappa}}{\sqrt{n}}+\frac{\kappa \sqrt{d\log 1/\delta}}{n\epsilon})),$ where  $
    M$ is the diameter of set $\mathcal{C}$. Thus, we can see there is still a gap between the unconstrained case and the constrained case.  
\end{remark}
\begin{algorithm}
	\caption{Noisy Regularized Mirror Descent for $\ell_p^d$ ($1<p<2$).}
	\begin{algorithmic}[1]
		\State {\bfseries Input:} Dataset $D$, loss function $\ell$, smoothness parameter $\beta$ and parameter $\alpha$.
	\State  Take $w_1 = 0$.
	\For{$t = 1,\cdots,T$}
\State Solve the following optimization problem 
\begin{align}
    &w_{t+1}= \underset{w\in\mathbf{E}}{\arg\min}\{
\langle \nabla \hat{\mathcal{L}}(w_t,D)+g_t, w-w_t\rangle \notag \\ &+\beta\cdot D_{\Phi }(w,w_t)+\alpha \Phi(w)
\}, 
\end{align}
 where $g_t\sim \mathcal{GG}_{||\cdot||_{+}}(0,\sigma^2)$ with $\sigma^2 = \frac{64L^2\kappa T\log(1/\delta)}{n^2\epsilon^2}$ and $||\cdot||_{+}$ is the smooth norm for $(\mathbf{E},||\cdot||_{*})$. $\kappa = \min \{\frac{1}{p-1},2\log d\}$ and  $\Phi(x) =\frac{\kappa}{2}||x||_{\kappa_{+}}^2$ with $\kappa_{+}= \frac{\kappa}{\kappa -1}$.  
	\EndFor\\
	 \Return $\hat{w}=\frac{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t  \cdot w_{t+1}}{\sum_{t=1}^T \left(\frac{2\beta+\alpha}{2\beta}\right)^t }.$
	\end{algorithmic}
	\label{alg3}
\end{algorithm}
Next, we study the case where $2\leq p\leq \infty$. The key idea is to reduce the $\ell_p^d$ space to the Euclidean space by leveraging the relationship between the $\ell_p$ norm and the Euclidean norm. Thus, here we adopt the Phased DP-SGD algorithm proposed by \cite{feldman2020private}. As the parameters in the original Phased DP-SGD depend on the diameter, we modify them to the unconstrained case. Specifically, we have the following result. 
\begin{theorem}\label{thm:8}
For the $\ell_p^d$ space with $2\leq p\leq \infty$, suppose Assumption \ref{as3} holds. Then for any $0<\epsilon, \delta<1$, there is an $(\epsilon,\delta)$-DP algorithm whose output $\theta$ satisfies 
\begin{equation}
   \mathbb{E}[\mathcal{L}(\theta)]-\mathcal{L}(\theta^*)\leq  O(d^{1-\frac{2}{p}}\|\theta^*\|^2(\frac{1}{\sqrt{n}}+\frac{\sqrt{d\log(1/\delta)}}{\epsilon n})). \notag 
\end{equation}
\end{theorem}
In the constrained case, \cite{bassily2021non} show the optimal rate of $O(M d^{\frac{1}{2}-\frac{1}{p}}(\frac{1}{\sqrt{n}}+\frac{\sqrt{d\log 1/\delta}}{n\epsilon})),$ where $M$ is the diameter of the set $\mathcal{C}$ w.r.t. $\|\cdot\|$. Thus, we can see there is a difference of $O(d^{\frac{1}{2}-\frac{1}{p}})$. This is because, rather than linear in $M$ in the constrained case, in the Euclidean and unconstrained case, we can show the excess population risk depends on $\|\theta^*\|_2^2$, which is less than $d^{1-\frac{2}{p}}\|\theta^*\|^2$. 

\subsection{Heavy-tailed and Constrained Case}

In the above section, we studied DP-SCO with Lipschitz loss functions, i.e., the $\|\cdot\|_*$ norm of  the loss gradient is  uniformly bounded by $L$. Next, we will relax this assumption to a heavy-tailed distribution, i.e., we only assume the  variance of the loss gradient w.r.t $\|\cdot\|_*$ is finite. As we have discussed the difficulty of the unconstrained case compared to the constrained case,  throughout the section, we focus on the constrained case with the $\|\cdot \|$-norm diameter $M$. 
\begin{assumption}\label{ass:4}
We assume $\ell(\cdot,x)$ is convex and $\beta$-smooth w.r.t. $||\cdot||$ over $\mathcal{C}$.  Moreover, there exists a known constant $\sigma>0$ such that for all $w\in\mathcal{C}$, 
$\mathbb{E}[||\nabla \ell(w,x)-\nabla \mathcal{L}(w)||_{*}^2]\leq \sigma^2$. 
\end{assumption}
It is noteworthy that the heavy-tailedness assumption is commonly used in previous related work, such as \cite{vural2022mirror}. Besides the norm of gradient, there is another line of work that only assumes the second-order moment of each coordinate of the gradient is bounded \citep{hu2022high,kamath2022improved,wang2020differentially,wang2022differentially,tao2022private}. We leave such a relaxed assumption as future work. 

 

Like the previous section, we first study the case where $1<p<2$.  We present our algorithm in Algorithm \ref{alg4}, which could be considered a shuffled, truncated, and noisy version of one-pass Mirror Descent.  Specifically, in the first step, we shuffle the dataset and divide it into several batches (we will use one batch for one iteration). Using the by-now standard method of privacy amplification by shuffling \citep{feldman2022hiding}, we can amplify the overall privacy guarantee  by a factor of $\tilde{O}(\frac{1}{\sqrt{n}})$ as compared to the analysis for the unshuffled dataset. Next, motivated by \cite{nazin2019algorithms}, at each iteration, we first conduct a truncation step to each sample gradient $\nabla \ell(w_{t-1}, x)$. Such an operator can not only remove outliers, but also upper bound the $\|\cdot\|_*$-sensitivity of the truncated gradients to $O(\beta M+\lambda)$. Then we perform the Mirror Descent update by these perturbed and truncated  sample gradients. In the following, we show the privacy and utility guarantees of our algorithm. 
\begin{algorithm}
	\caption{Shuffled Truncated DP Mirror Descent }
	\begin{algorithmic}[1]
		\State {\bfseries Input:} Dataset $D$, loss function $\ell$, initial point $w_0=0$, smoothness parameter $\beta$ and parameter $\lambda$. 
	\State  Randomly permute the data and denote the permuted data as $\{x_1,\cdots,x_n\}$.
 \State Divide the permuted data into $T$ batches $\{B_i\}_{i=1}^T$ where $|B_i|=\frac{n}{T}$ for all $i=1,\cdots, T$
 \For{$t = 1,\cdots,T$}
\For{each $x\in B_t$}
\State \hspace{-3mm}\begin{small}$g_x= \begin{cases} \nabla \ell(w_{t-1},x)& \text{if} ~||\nabla \ell(w_{t-1},x)||_{*}\leq \beta M+\lambda\\ 0& {\text{otherwise}} \end{cases}$\end{small}
	\EndFor
 \State 
Update 
\begin{small} $$w_t=\underset{w\in\mathcal{C}}{\arg\min} \left\{ \left\langle \frac{\underset{{x\in B_t}}{\sum} \left(g_x + Z_x^t\right)}{|B_t|},w \right\rangle +\gamma_t \cdot D_{\Phi}(w,w_{t-1})\right\},$$ \end{small} where  $Z_x^{t}\sim \mathcal{GG}_{||\cdot||_{+}}(\sigma_1^2)$ with $\sigma_1^2 =O\left(\frac{\log(\frac{n}{\delta})\cdot \kappa (\beta M+\lambda )^2 \cdot \log(1/\delta)}{n\epsilon^2}   \right)$, $||\cdot||_{+}$ is the smooth norm for $(\mathbf{E},||\cdot||_{*})$. $\kappa = \min \{\frac{1}{p-1},2\log d\}$ and  $\Phi(x) =\frac{\kappa}{2}||x||_{\kappa_{+}}^2$ with $\kappa_{+}= \frac{\kappa}{\kappa -1}$.  
	\EndFor\\
	 \Return $\hat{w} = (\sum_{t=1}^T \gamma_{t}^{-1})^{-1} \cdot \sum_{t=1}^T \gamma_{t}^{-1} w_t$
	\end{algorithmic}
	\label{alg4}
\end{algorithm}
\begin{theorem}\label{thm:9}
For the $\ell_p^d$ space with $1<p<2$, suppose Assumption \ref{ass:4} holds. Algorithm \ref{alg4} is $(\epsilon, \delta)$-DP if $\epsilon = O( \sqrt{\frac{\log(n/\delta)}{n}})$ and $0<\delta<1$.
\end{theorem}

\begin{theorem}\label{thm:10}
For the $\ell_p^d$ space with $1<p<2$, suppose Assumption \ref{ass:4} holds and assume $n$ is sufficiently large such that $n\geq O(\frac{\max\{\beta^2, 1\}M^2\sqrt{d\kappa^2 \log(1/\delta)} }{\epsilon})$. Given a failure probability $\delta'>0$, in Algorithm \ref{alg4}, take ${T}=O(\frac{M^2n^2\epsilon^2} {\lambda^2 {d \log (1/\delta)}})$, $\gamma_t=\bar{\gamma}=\sqrt{T}$ for all $t=1,\cdots,T$, and $\lambda =O(\frac{\sqrt{n\epsilon}}{\sqrt[4]{\kappa^2 d \log (1/\delta)}})$, then the output $\hat{w}$ satisfies the following with probability at least $1-\delta'$
   \begin{equation*}
\mathbb{E}[\mathcal{L}(\hat{w})]- \mathcal{L}(w^*)\leq  \tilde{O}(
\frac{M\sqrt[4]{\kappa^2 d\log(1/\delta)}\log(1/\delta^{'})}{\sqrt{n\epsilon}}
), 
\end{equation*}
where the expectation is taken over  the randomness of noise, and the probability is w.r.t. the dataset $D\sim \mathcal{D}^n$.
\end{theorem}
\begin{remark}
     First, note that due to the privacy amplification, here the noise added to each sample gradient is $\tilde{O}(\frac{\beta M+\lambda}{\sqrt{n}\epsilon})$ rather than the $\tilde{O}(\frac{\beta M+\lambda}{\epsilon})$ needed without shuffling. Secondly, note that the truncation step is quite different from the previous work on DP-SCO with heavy-tailed data \citep{wang2020differentially}, i.e., we enforce the sample gradient to become zero if its norm exceeds the threshold. Finally, compared to the best-known result $O(\sqrt{\frac{\kappa}{n}})$ in the non-private and heavy-tailed case \citep{nazin2019algorithms} and the bound $\tilde{O}(\sqrt{\frac{\kappa}{n}}+\frac{\kappa \sqrt{d}}{n\epsilon})$ for private and Lipschitz case \citep{bassily2021non}, we can see there may still be room to improve our bound further.
\end{remark}
There are two limitations in Theorem \ref{thm:10}. First, %due to the privacy amplification theorem, 
Algorithm \ref{alg4} is $(\epsilon, \delta)$-DP only for $\epsilon=\tilde{O}(n^{-\frac{1}{2}})$, which cannot be generalized to the mid or low privacy regime. Secondly, Theorem \ref{thm:10} only holds for the case $1<p< 2$. To address the first issue, we can slightly modify the algorithm by using batched Mirror Descent without shuffling, although this yields a worse upper bound. For the second one, similar to Theorem \ref{thm:8}, we can reduce the problem to the Euclidean case. The formal theorems (as well as proofs) are relegated to Appendix \ref{appe: additional theorems}. 
\section{Conclusion}
   In this paper, we revisited the problem of Differentially Private Stochastic Convex Optimization (DP-SCO) in Euclidean and general $\ell_p^d$ spaces. Specifically, we focused on three settings that are still far from well understood and provided several new results. In particular, for DP-SCO over a  constrained and bounded (convex) set in Euclidean space, for both convex and
strongly convex loss functions, we proposed methods whose outputs could achieve (expected)
excess population risks that are only dependent on the Gaussian width of the constraint set
rather than the dimension of the space. Moreover, we also showed the bound for strongly convex
functions is optimal up to a logarithmic factor. We also provided the first theoretical results for  unconstrained DP-SCO in $\ell_p^d$ space and DP-SCO with heavy-tailed data over a  constrained and bounded set in $\ell_p^d$ space.
\section*{Acknowledgements}
Di Wang was supported in part by the baseline funding BAS/1/1689-01-01, funding from the CRG grant URF/1/4663-01-01, FCC/1/1976-49-01 from CBRC.  He was also supported by the funding of the SDAIA-KAUST Center of Excellence in Data Science and Artificial Intelligence (SDAIA-KAUST AI). Changhong Zhao was supported in part by Hong Kong Research Grants Council through ECS Grant 24210220. 

%\section{Conclusion}
\bibliography{uai2023-template}
\onecolumn
\newpage 


\appendix 

\end{document}
