
\documentclass{article}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2021} with \usepackage[nohyperref]{icml2021} above.
\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% Use the following line for the initial blind version submitted for review:
\usepackage{icml2021}
\usepackage{enumitem}


% Upright differential "d" for use in math mode.
% \mathrm{d} is used instead of the obsolete declaration form \rm{d}: the
% braces after \rm do not limit its scope, so the font switch would leak into
% whatever follows \dd inside the same math group.
% NOTE(review): \dd is defined again further down in this preamble with the
% same \mathrm{d} body; that later definition is the one in effect in the
% document body.
\def \dd {\mathrm{d}}



\usepackage{mylatexstyle}







% ---------------------------------------------------------------------------
% Paper-specific macros.
%
% Each macro is defined exactly once in this block (the block previously
% repeated several of these definitions verbatim).
% \def is kept on purpose for the entries that used it: it overwrites silently,
% so the result is the same whether or not an earlier package already defines
% the name.
% NOTE(review): \text, \color/\textcolor, \cI, \btheta and the `theorem'
% counter are not defined in this file -- presumably they come from
% mylatexstyle; confirm.
% ---------------------------------------------------------------------------

% Operator-like names and upright symbols (math mode).
\def\supp{\mathop{\text{supp}}}
\def\card{\mathop{\text{card}}}
\def\rank{\mathrm{rank}}
\def\dd{\mathrm{d}}
\def\tr{\mathop{\text{tr}}}
\def \poly {\mathrm{poly}}

% Colour switches (declaration form: they act until the end of the current group).
\newcommand{\red}{\color{red}}
\newcommand{\blue}{\color{blue}}

% Inner-product delimiters: \la a, b \ra.
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}

\newcommand{\cIs}{\cI_{\hat{s}}}

% "Condition" environment, numbered on the same counter as theorems.
\newtheorem{condition}[theorem]{Condition}

% Revision markers. \CC currently expands to nothing, so text written as
% \CC{...} is typeset unchanged; switch to the commented definition to show
% it in red. \CCC colours its argument blue.
% \def \CC {\textcolor{red}}
\def \CC {}
\def \CCC {\textcolor{blue}}

% Notation shortcuts. \hbtheta is currently plain \btheta; the hatted variant
% is kept commented out for reference.
%\def \hbtheta{\hat \btheta}
\def \hbtheta{ \btheta}
\def \barell{\bar \ell}
\def \Breg {B_{\psi}}
\def \tbtheta {\tilde{\btheta}}


% \usepackage{xcolor}
% \newcommand{\bin}[1]{\textcolor{blue}{[Bin: #1]}}

% \ifdefined\final
% \usepackage[disable]{todonotes}
% \else
% \usepackage[textsize=tiny]{todonotes}
% \fi
% \setlength{\marginparwidth}{0.8in}
% \newcommand{\todoy}[2][]{\todo[size=\scriptsize,color=blue!20!white,#1]{Yuan: #2}}
% \newcommand{\todod}[2][]{\todo[size=\scriptsize,color=red!20!white,#1]{XXX: #2}}
% \newcommand{\todoq}[2][]{\todo[size=\scriptsize,color=orange!20!white,#1]{Quanquan: #2}}
% \newcommand{\mip}[1]{\langle #1 \rangle}





\icmltitlerunning{Over-parameterized Logistic Regression for Sub-Gaussian Mixtures: Risk Bounds and Benign Overfitting}

\begin{document}

\twocolumn[
% \icmltitle{Logistic Regression for Sub-Gaussian Mixtures: Overparameterization and Benign Overfitting}
\icmltitle{Over-parameterized Logistic Regression for Sub-Gaussian Mixtures: Risk Bounds and Benign Overfitting}

% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2021
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}

\begin{icmlauthorlist}
\icmlauthor{Aeiau Zzzz}{equal,to}
\icmlauthor{Bauiu C.~Yyyy}{equal,to,goo}
\icmlauthor{Cieua Vvvvv}{goo}
\icmlauthor{Iaesut Saoeu}{ed}
\icmlauthor{Fiuea Rrrr}{to}
\icmlauthor{Tateu H.~Yasehe}{ed,to,goo}
\icmlauthor{Aaoeu Iasoh}{goo}
\icmlauthor{Buiui Eueu}{ed}
\icmlauthor{Aeuia Zzzz}{ed}
\icmlauthor{Bieea C.~Yyyy}{to,goo}
\icmlauthor{Teoau Xxxx}{ed}
\icmlauthor{Eee Pppp}{ed}
\end{icmlauthorlist}

\icmlaffiliation{to}{Department of Computation, University of Torontoland, Torontoland, Canada}
\icmlaffiliation{goo}{Googol ShallowMind, New London, Michigan, USA}
\icmlaffiliation{ed}{School of Computation, University of Edenborrow, Edenborrow, United Kingdom}

\icmlcorrespondingauthor{Cieua Vvvvv}{c.vvvvv@googol.com}
\icmlcorrespondingauthor{Eee Pppp}{ep@eden.co.uk}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
\icmlkeywords{Machine Learning, ICML}

\vskip 0.3in
]

% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
\printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.




% \title{\huge Logistic Regression for Sub-Gaussian Mixtures: Overparameterization and Benign Overfitting}

% %Over-parameterized Logistic Regression for Sub-Gaussian Mixtures: Risk Bounds and Benign Overfitting

% \author
% {
    
% }


% \date{}


% \begin{document}

% \maketitle




% \section{Introduction}

\begin{abstract}
    Modern machine learning systems such as deep neural networks are often highly over-parameterized so that they can overfit the noisy training data, yet they can still achieve small test errors in practice. In order to understand this ``benign overfitting'' phenomenon, \citet{bartlett2020benign} established risk bounds of the minimum-norm interpolator for over-parameterized linear regression models. In this paper, we study benign overfitting of logistic regression without explicit regularization for linear classification problems. Specifically, we consider data generated from sub-Gaussian mixtures, and provide a tight risk bound for logistic regression in the over-parameterized setting. Our results precisely characterize the condition under which benign overfitting can occur for logistic regression, and are tighter than previous results.
    %Simulation results corroborate our theory.
    %When reducing to the isotropic case, our risk bound is tighter than existing results for Gaussian data and sub-Gaussian mixtures.
    % The phenomenon of benign overfitting is one of the key mysteries
    %(1) under certain conditions, we show that, for mixture of sub-Gaussian data, the maximum margin classifier on the training data is equivalent to the minimum norm interpolator; and (2) A tight risk bound for anisotropic data distributions along with a lower bound. %Our results are applicable to the anisotropic setting and are therefore more general than previous works. Moreover, 
\end{abstract}

% Some existing works have studied benign overfitting in sub-Gaussian mixtures. However, they are mainly focused on the isotropic setting. 

% Our contribution:
% \begin{enumerate}
% \item \todoq{consider logistic regression, implicit bias, finite-time analysis}
% \item Infinite-dimensional linear classification
% \item Anisotropic
% %\item Comparison with Phil
% %\item \todoq{can we do sub-Gaussian?}
% \item \todoq{Can we cover Massart noise?}
% \item Distribution-specific PAC learning, halfspace, linear classification
% \item do we require the component of the sub-Gaussian r.v. to be independent? See peter's benign overfitting in ridge regression paper.
% \end{enumerate}



\section{Introduction}

In modern machine learning, complicated models such as deep neural networks have gained increasing popularity. These complicated models are known to be able to overfit any noisy training data sets, while at the same time achieving small test errors. In fact, this \textit{benign overfitting} phenomenon is not a unique feature of deep learning. Even for kernel methods and linear models, \citet{belkin2018understand} have demonstrated that interpolators on the noisy training data can still perform well on the test data. A series of works \citep{belkin2019two,muthukumar2020harmless,hastie2019surprises} have since then theoretically studied how over-parameterization can achieve small population risk.

% It has been widely observed that t can achieve small test error, even when . 
% the benefit of over-parameterization in machine learning models.% under the interpolation regime .
% These complicated machine learning models can known to have the power to perfectly fit the training data and achieve interpolation, while at the same time also achieves good test error \citep{zhang2016understanding}. 




Among the recent studies of learning problems with over-parameterization, a remarkable work by \citet{bartlett2020benign} has studied the benign overfitting phenomenon in linear regression. The authors considered the setting where the data are generated from a ground-truth linear model with noises, 
and established a tight population risk bound for the minimum norm linear interpolator with a matching lower bound. More recently, \citet{tsigler2020benign} further studied benign overfitting in ridge regression, and established non-asymptotic generalization bounds for over-parametrized ridge regression. They showed that those bounds are tight for a range of regularization parameter values. Notably, these results cover arbitrary covariance structures of the data, and give a nice characterization of how the spectrum of the data covariance matrix affects the population risk in the over-parameterized regime.



% proposed an intriguing phenomenon called \textit{benign overfitting}, which states that over-parameterized models that interpolate the training data can still achieve good population risk, even when the training data consist of noises.  Benign overfitting is quite surprising as it, to certain extent, contradicts with the classic PAC learning theory on the overfitting issue. 

% To explain benign overfitting, \citet{bartlett2020benign} and a series of follow-up works \citep{tsigler2020benign} have studied over-parameterized linear regression, and demonstrated that under certain conditions 
% on the eigenvalues of the input covariance matrix, the population risk achieved by minimum norm interpolator can be still asymptotically optimal even with the presence of noises.



More recently, benign overfitting has also been studied in the setting of linear classification \citep{chatterji2020finite,muthukumar2020classification,wang2020benign}. Specifically, \citet{muthukumar2020classification} studied the setting where the data inputs are Gaussian and the labels are generated from a ground truth linear model with label flipping noises, and utilized an equivalence result between the hard-margin support vector machine (SVM) solution and the minimum norm interpolator to study benign overfitting. 
\citet{chatterji2020finite,wang2020benign} studied the benign overfitting phenomenon in sub-Gaussian/Gaussian mixture models and established population risk bounds for the maximum margin classifier. \citet{chatterji2020finite} leveraged the \textit{implicit bias} of gradient descent for logistic regression \citep{soudry2017implicit} to establish the risk bound. \citet{wang2020benign} established an equivalence result between classification and regression for isotropic Gaussian mixture models. While these results have offered valuable insights into the benign overfitting phenomenon for (sub-)Gaussian mixture classification, they still have certain limitations. Unlike the results in the regression setting where the eigenvalues of the data covariance matrix play a key role, the current results for Gaussian/sub-Gaussian mixture models are limited to the isotropic or close-to-isotropic settings, where the impact of the spectrum of the data covariance matrix on the risk remains elusive.



% Motivated by the observation that over-parameterized linear logistic regression produces the maximum margin classifier, i.e., hard-margin support vector machine (SVM) solution \citep{soudry2017implicit}, these works establish population risk bounds of the maximum margin classifier for two-class linear classification problems. 




% A particularly interesting observation studied by  \citet{muthukumar2020classification,wang2020benign} is the equivalence between classification and regression in the over-parameterized setting. 

% More recently, benign overfitting has also been studied in the setting of over-parameterized linear classification for Gaussian/sub-Gaussian mixtures \citep{chatterji2020finite,muthukumar2020classification,wang2020benign}. Motivated by the observation that over-parameterized linear logistic regression produces the maximum margin classifier, i.e., hard-margin support vector machine (SVM) solution \citep{soudry2017implicit}, these works establish population risk bounds of the maximum margin classifier for two-class linear classification. 
% A particularly interesting observation studied by  \citet{muthukumar2020classification,wang2020benign} is the equivalence between classification and regression in the over-parameterized setting. 

%  \citet{wang2020benign} specifically utilized an observation on the  equivalence between classification and regression in the over-parameterized setting \citet{muthukumar2020classification}, and proved that under certain conditions, the hard-margin SVM solution is exactly the minimum norm linear interpolator.  \citet{wang2020benign} showed that the hard-margin SVM solution for isotropic Gaussian mixtures. 



% Among the recent studies of learning under the interpolation regime, A remarkable work \citet{muthukumar2020classification} showed an interesting  phenomenon on the equivalence between classification and regression in the over-parameterized setting. The authors proved that for Gaussian design linear classification, under certain conditions the hard-margin SVM solution coincides with the minimum norm solution of least square regression. \citet{wang2020benign} also studied isotropic Gaussian mixture classification and showed that under certain conditions, the hard-margin SVM solution is exactly the minimum norm linear interpolator. 


% These works also utilized the equivalence results between classification and regression to establish population classification error bound for the study of benign overfitting.

% studied in \citet{muthukumar2020classification,wang2020benign} together with


% The equivalence between hard-margin SVM and minimum norm interpolator is closely related to the \textit{benign overfitting} phenomenon \citep{bartlett2020benign}. 

% A surprising phenomenon is that , while at the same time generalize well on test data.  



In this paper, we study the benign overfitting phenomenon in a general sub-Gaussian mixture model that covers both the isotropic and anisotropic settings, where the $d$-dimensional features from two classes have the same covariance matrix $\bSigma$ but have different means $\bmu$ and $-\bmu$ respectively. We consider over-parameterized logistic regression trained with gradient descent, and establish a population risk bound for the obtained classifier. Our proof of the risk bound has a key step to demonstrate that under certain conditions regarding the eigenvalues of $\bSigma$, the mean vector $\bmu$ and the sample size $n$, the maximum margin classifier for this problem is identical to the minimum norm interpolator. We then utilize this result to establish a tight population risk bound of the maximum margin classifier. Our result reveals how the eigenvalues of the  covariance matrix $\bSigma$ affect the benign property of the classification problem, and is tighter and more general than existing results on sub-Gaussian/Gaussian mixture models. The contributions of this paper are as follows: 
\begin{itemize}[leftmargin = *]
    \item We establish a tight population risk bound for the maximum margin classifier. Our bound works for both the isotropic and anisotropic settings, which is more general than existing results in \citet{chatterji2020finite,wang2020benign}. When reducing our bound to the setting studied in \citet{chatterji2020finite}, our result gives a bound $\exp(-\Omega( n \| \bmu \|_2^4 / d ))$, where $n$ is the training sample size. Our bound is tighter than the risk bound $\exp(-\Omega( \| \bmu \|_2^4 / d ))$ in \citet{chatterji2020finite} by a factor of $n$ in the exponent. Our result also gives a tighter risk bound than that in \citet{wang2020benign} in the so-called ``low SNR setting'': our result suggests that $ \| \bmu \|_2^4 = \omega(d / n)$ suffices to ensure an $o(1)$ population risk, while \citet{wang2020benign} requires $ \| \bmu \|_2^4 = \omega( (d / n)^{3/2})$.
    %For certain anisotropic settings which is not covered by existing results, our bound can be even tighter. 
    \item Our analysis reveals that for a class of high-dimensional anisotropic sub-Gaussian mixture models, logistic regression solved with gradient descent can achieve small population risk under mild assumptions on the sample size $n$ and mean vector $\bmu$. Specifically, suppose that the eigenvalues of $\bSigma$ are $\{\lambda_k = k^{-\alpha}\}_{k=1}^d$ for some parameter $\alpha \in [0,1)$. Then our result shows that to achieve $o(1)$ population risk, the following conditions on $\| \bmu \|_2$ suffice:
    \begin{align*}
        \| \bmu \|_2 = \left\{ 
        \begin{array}{ll}
             \omega( 1 + (d^{1 - 2\alpha} / n)^{1/4} ), & \text{ if }\alpha \in [0, 1/2),\\
             \omega( 1 + (\log(d) / n)^{1/4} ), & \text{ if }\alpha =  1/2, \\
             \omega( 1 ),  & \text{ if }\alpha \in ( 1/2,  1).
        \end{array}
        \right.
    \end{align*}
    % while we need $\| \bmu \|_2 = \omega( (d / n)^{1/4} )$ when $\alpha \in [0, 1/2)$, we only need to require $\| \bmu \|_2 = \omega( 1 + (\log(d) / n)^{1/4} )$ or $\| \bmu \|_2 = \omega( 1)$ for $\alpha = 1/2$ and $\alpha \in ( 1/2,  1)$ respectively. 
    Therefore, when $\alpha = 1/2$, the condition on the mean vector $\bmu$ only has a logarithmic dependency on the dimension $d$, and when $\alpha \in (1/2,1)$, the condition on $\bmu$ for benign overfitting is dimension free.
    \item Our proof of the population risk bound introduces some tight intermediate results, which may be of independent interest. Specifically, our proof utilizes the polarization identity to establish the equivalence between the maximum margin classifier and the minimum norm interpolator. This is, to the best of our knowledge, the first equivalence result between classification and regression for anisotropic sub-Gaussian mixture models.
    %With a novel proof technique utilizing the , we are able to demonstrate that under certain conditions, the maximum margin classifier is identical to the minimum-norm interpolator. 
    %\CC{Although we are considering a more complicated model, our condition on $\bSigma$ is better than the conditions given in \citet{muthukumar2020classification} by a $\log(n)$ factor.}
    %, and our condition on the relationships among $\bSigma$, $\bmu$ and $n$ is better than the result in \citet{wang2020benign} by a  $\log(n)$ factor as well}. Our analysis here may be of independent interest.
\end{itemize}




\subsection{Additional Related Work}

% The phenomenon of benign overfitting has been studied by a recent line of work \citep{bartlett2020benign,chatterji2020finite,muthukumar2020classification,wang2020benign}. 

% The phenomenon of benign overfitting was first raised up by \citet{bartlett2020benign}, where the authors studied over-parameterized linear regression and showed that under certain conditions, minimum norm interpolator on the noisy training data can still achieve asymptotically optimal population risk. A later work \citet{chatterji2020finite} studied the risk of interpolating linear classifiers under the setting where the two class-conditional distributions have different means but the same covariance
% matrix. \citet{muthukumar2020classification} studied benign overfitting for a different classification problem with Gaussian features, and highlighted an important observation that under certain conditions, the solutions of hard-margin support vector machine (SVM) are identical to the least-squares minimum-norm interpolators. Following the same intuition, a very recent work \citet{wang2020benign} further studied the equivalence between SVM and minimum norm interpolator and the benign overfitting phenomenon under an isotropic Gaussian mixture model. The results in this paper are closely related to \citet{muthukumar2020classification} and \citet{wang2020benign}. Compared with these existing results, our work applies to infinite-dimensional, anisotropic sub-Gaussian mixtures, and gives a tighter population risk bound. 

Benign overfitting is closely related to the phenomenon of double descent studied in recent works. \citet{belkin2019reconciling,belkin2019two} showed experimental results and provided theoretical analyses on some specific models to  demonstrate that the risk curve versus over-parameterization has a double descent shape. These results can therefore indicate that over-parameterization can be beneficial to achieve small test risk. \citet{hastie2019surprises,wu2020optimal} studied the double descent phenomenon in linear regression under the setting where the dimension $d$ and sample size $n$ can grow simultaneously but have a fixed ratio, and showed that the population risk exhibits a double descent curve with respect to the ratio. More recently, \citet{mei2019generalization,liao2020random,montanari2020interpolation} further extended the setting to random feature models and studied double descent when the sample size, data dimension and the number of random features have fixed ratios.

Our work is also related to the studies of implicit bias, which analyze the impact of training algorithms when the over-parameterized models have multiple global minima. Specifically, \citet{soudry2017implicit} showed that if the training data are linearly separable, then gradient descent on unregularized logistic regression converges directionally to the maximum margin linear classifier on the training data set. \citet{ji2019implicit} further studied the implicit bias of gradient descent for logistic regression on non-separable data. \citet{gunasekar2018characterizing} studied the implicit bias of various optimization methods for generic objective functions. \citet{gunasekar2017implicit,arora2019implicit} established implicit bias results for matrix factorization problems. More recently, \citet{lyu2019gradient} showed that gradient flow for learning homogeneous neural networks with logistic loss maximizes the normalized margin on the training data set. These studies of implicit bias offer a handle for us to connect the over-parameterized logistic regression  with the maximum margin classifiers for linear models.
%  with the minimum norm interpolators

Our work is also motivated by the recent study of the generalization error bounds of over-parameterized neural networks. \citet{arora2019fine} gave a generalization error bound of over-parameterized two-layer networks interpolating the training data. \citet{allen2018learning} showed that two- and three-layer networks can have small test error when learning a class of smooth networks. \citet{cao2019generalizationsgd} gave generalization bounds for over-parameterized deep ReLU networks trained by stochastic gradient descent. \citet{ji2019polylogarithmic,chen2019much} showed that polylogarithmically over-parameterized neural networks can achieve small test errors. While some of these results can handle noises when using early stopping or online gradient descent, none of them can give meaningful generalization bounds when the neural networks are trained to perfectly fit the noisy training data. Therefore these results cannot explain the phenomenon of benign overfitting. 


% \citet{gunasekar2017implicit}

% A series of papers \citep{gunasekar2017implicit,soudry2017implicit,gunasekar2018characterizing,gunasekar2018implicit,nacson2018stochastic,li2018algorithmic,jacot2020implicit} studied implicit bias problem, aiming to figure out when there are multiple optimal solutions of a training objective function, what kind of nice properties the optimal found by a certain training algorithm would have. Implicit bias results of gradient descent, stochastic gradient descent, or mirror descent for various problem settings including matrix factorization, logistic regression, deep linear networks as well as homogeneous models. The major difference between these results and our work is that implicit bias results usually focus on the parameter space, while we study the functions a neural network prefer to learn in the function space. 




% If $a_n\geq C b_n$ for all large enough $n$, then we denote $a_n = \Omega(b_n)$. 


% we use $a_n = O(b_n)$ to denote that $a_n\le C_1 b_n$ for some absolute constant $C_1> 0$, and use $a_n = \Omega (b_n)$ to denote that $a_n\ge C_2 b_n$ for some absolute constant $C_2>0$. In addition, we also use $\tilde O(\cdot)$ and $\tilde \Omega(\cdot)$ to hide some logarithmic terms in Big-O and Big-Omega notations.


\section{Problem Setting}\label{section:problemsetting}
In this section, we introduce the notations and detailed problem setup. We begin with the notations. %describing the details of the sub-Gaussian mixture model. 

\subsection{Notations}
We use lower case letters to denote scalars, and use lower/upper case bold face letters to denote vectors/matrices respectively. For a vector $\vb$, we denote by $\| \vb \|_2$ the $\ell_2$-norm of $\vb$. For a matrix $\Ab$, we use $\| \Ab \|_2$, $\| \Ab \|_F$ to denote its spectral norm and Frobenius norm respectively, and use $\tr(\Ab)$ to denote its trace. For a vector $\vb \in \RR^d$ and a positive definite matrix $\Ab$, we define $\| \vb \|_{\Ab} = \sqrt{ \vb^\top  \Ab  \vb}$. For an integer $n$, we denote $[n] = \{ 1,2, \ldots, n \}$.

We also use standard asymptotic notations $O(\cdot )$, $\Omega(\cdot )$,  $o(\cdot )$, and $\omega(\cdot )$. Let $\{a_n\}$ and $\{b_n\}$ be two sequences. If there exists a constant $C> 0$ such that $|a_n|\leq C |b_n|$ for all large enough $n$, then we denote $a_n = O(b_n)$. We denote $a_n = \Omega(b_n)$ if $b_n = O(a_n)$. Moreover, we write  $a_n = o(b_n)$ if $\lim | a_n / b_n | = 0$ and $a_n = \omega(b_n)$ if $\lim | a_n / b_n | = \infty$. We also use $\tilde O(\cdot)$ and $\tilde \Omega(\cdot)$ to hide some logarithmic terms in Big-O and Big-Omega notations.

Finally, for a random variable $Z$, we denote by $\| Z \|_{\psi_2}$ and $\| Z \|_{\psi_1}$ the sub-Gaussian and sub-exponential norms of $Z$ respectively. 

\subsection{Sub-Gaussian Mixture Model}
We consider a model where the feature vectors are generated from a mixture of two sub-Gaussian distributions with means $\bmu$ and $-\bmu$ and the same covariance matrix $\bSigma$.
We assume that each data pair $(\xb,y)$ is generated independently by the following procedure:

\begin{enumerate}[leftmargin = *]
    \item The label $y\in\{+1,-1\}$ is generated as a Rademacher random variable.
    \item A random vector $\ub \in \RR^d$ is generated from a distribution such that the entries of $\ub $ are independent sub-Gaussian random variables with $\EE [u_j] = 0$, $\EE [u_j^2] = 1$ and $\| u_j \|_{\psi_2} \leq \sigma_u$ for all $j\in[d]$. %\CC{In our analysis, we treat $\sigma_u$ as an absolute constant.}
    \item Let $\bSigma$ be a positive definite matrix with eigenvalue decomposition $\bSigma = \Vb \bLambda \Vb^\top$, where $\bLambda = \diag\{ \lambda_1,\ldots,\lambda_d \}$ and $\Vb$ is an orthogonal matrix whose columns are the eigenvectors of $\bSigma$. We calculate the random vector $\qb$ based on $\ub$ as $\qb = \Vb \bLambda^{1/2} \ub$. This ensures that $\qb$ has mean zero and covariance matrix $\bSigma$.
    \item The feature is given as $\xb = y\cdot \bmu + \qb$, where $\bmu \in \RR^d$ is a vector. Clearly, the mean of $\xb$ is $\bmu$ when $y = 1$ and is $-\bmu$ when  $y = -1$.
\end{enumerate}
% Suppose that $y_i$, $i\in [n]$ are generated as i.i.d. Rademacher variables. Suppose that $\qb\in\RR^d$ is generated from a distribution with covariance matrix $\bSigma$ such that $\qb = \Vb \bLambda^{1/2} \ub $, where $\bSigma = \Vb \bLambda \Vb^\top$ is the spectral decomposition of $\bSigma$, and $\ub$ is a random vector with independent zero-mean sub-Gaussian entries. Without loss of generality, we assume that $\EE u_j^2 = 1$ and $\sup_j \| u_j \|_{\psi_2} \leq \sigma_u$ (This is because we can absorb the scaling into $\bSigma$). The features are given as
% \begin{align*}
%     \xb = y\cdot \bmu + \qb,
% \end{align*}
% where $\bmu \in \RR^d$ is a fixed vector. 
We consider $n$ training data points $(\xb_i,y_i)$ generated independently from the above procedure, and denote
\begin{align*}
    \Xb = \yb \bmu^\top + \Qb,
\end{align*}
where $\Xb = [ \xb_1,\ldots, \xb_n ]^\top, \Qb = [ \qb_1,\ldots, \qb_n ]^\top \in \RR^{n \times d}$, and $\yb = [y_1,\ldots, y_n]^\top \in \{\pm1\}^n$. %, and each row of $\Qb$ are generated independently from $N(\mathbf{0}, \bSigma)$. 

A recent work \citep{chatterji2020finite} has studied a similar sub-Gaussian mixture model in a close-to-isotropic setting and considered additional label flipping noises. In this paper, we do not introduce the label flipping noises for simplicity, but we consider a general covariance matrix $\bSigma$ to cover the general anisotropic setting\footnote{Note that \citet{chatterji2020finite} do not explicitly assume the covariance matrix to be a scalar multiple of the identity matrix. However, their assumption requires $\tr(\bSigma) = \Omega(d)$, which implies that their result is in a close-to-isotropic setting.}. %\CC{It is worth noting that even though we do not consider label flipping noises, establishing risk bounds for over-parameterized linear classification is still a challenging problem. Intuitively, the two Gaussian clusters have `overlap', and the optimal classifier should not perfectly classify all the training data. Therefore, the optimal classifier and the overfitting classifier could be very different, and we can treat the training data points that are not correctly classified by the optimal classifier, but are fitted by the overfitting classifier as `noisy data'. This means that benign overfitting in this setting is still a challenging problem to study.}
%\CC{It is worth noting that even though we do not consider label flipping noises, establishing risk bounds for over-parameterized linear classification is still a challenging problem. Intuitively, the two Gaussian clusters have `overlap', and the optimal classifier should not perfectly classify all the training data. Therefore, the optimal classifier and the overfitting classifier could be very different, and benign overfitting in this setting is still a challenging problem to study.}


%  because the two Gaussian clusters have overlaps, and the overfitting linear classifier can be very different from the optimal one.
% Note that our sub-Gaussian mixture model is

Our model is rather general and covers the following examples. 

\begin{example}[Gaussian mixture model]
The most straightforward example is when the data are generated from Gaussian mixtures $N(\bmu,\bSigma)$ and $N(-\bmu,\bSigma)$. This is covered by our model when the sub-Gaussian vector $\ub$ is a standard Gaussian random vector.
\end{example}


\begin{example}[Rare/weak feature model]\label{example:rare-weak}
The rare-weak model is a special case of the Gaussian mixture model where $\bSigma = \Ib$ and $\bmu$ is a sparse vector with $s$ non-zero entries equaling $\gamma$.
\end{example}

% \begin{example}[Boolean Rare-Weak Model]
% In the Boolean rere-weak model, each data point $(\xb,y)$ is generated from the following procedure:
% \begin{enumerate}[leftmargin = *]
%     \item The label $y\in\{+1,-1\}$ is generated as a Rademacher random variable.
%     \item The entries of $\xb$ are conditionally independent Boolean random variables given $y$. For $\gamma \in (0,1/2)$, $x_i = y$ with probability $1/2 + \gamma$ and  $x_i = -y$ with probability $1/2 - \gamma$ for $i\in [s]$. $x_i$ for $i\in\{ s+1,\ldots, d \}$ are independent Rademacher random variables.
%     % \item Let $\bSigma$ be a positive definite matrix with eigenvalue decomposition $\bSigma = \Vb \bLambda \Vb^\top$, where $\bLambda = \diag\{ \lambda_1,\ldots,\lambda_d \}$ and $\Vb$ is an orthornomal matrix. We calculate the random vector $\qb$ based on $\ub$ as $\qb = \Vb \bLambda^{1/2} \ub$. This ensures that $\qb$ has zero mean and covariance matrix $\bSigma$.
%     % \item The feature is given as $\xb = y\cdot \bmu + \qb$, where $\bmu \in \RR^d$ is a vector. Clearly, the mean of $\xb$ is $\bmu$ when $y = 1$ and is $-\bmu$ when  $y = -1$.
% \end{enumerate}
% \end{example}
The rare/weak feature model was originally investigated by \citet{donoho2008higher,jin2009impossibility}, and has been recently studied by \citet{chatterji2020finite}. %with additional label flipping noises. 
% Similar examples have been studied in \citet{chatterji2020finite}. 


\subsection{Over-parameterized Logistic Regression}
We consider learning a linear classifier with logistic regression and study the population risk bound. For any $\btheta\in\RR^d$, we define the population risk of the linear classifier defined by $\btheta$ as
\begin{align*}
    R(\btheta) = \PP\big( y\cdot \la \btheta, \xb \ra < 0 \big).
\end{align*}

To perform logistic regression, we consider the following empirical loss minimization problem:
\begin{align*}
    \min_{\btheta\in\RR^d} L(\btheta ) := \frac{1}{n} \sum_{i=1}^n \log [ 1 + \exp(- y_i \cdot \la \btheta, \xb_i \ra ) ].
\end{align*}
We solve the above optimization problem  with gradient descent
\begin{align}\label{eq:GDupdate}
    \btheta^{(t+1)} = \btheta^{(t)} - \eta\cdot \nabla  L(\btheta^{(t)} ),
\end{align}
where $\eta>0$ is the learning rate.

In the over-parameterized setting where $d \gg n$, it is evident that the training data points are linearly separable with high probability \CC{(for example, $\Xb\Xb^\top$ is invertible with high probability and the minimum norm interpolator $\hat\btheta_{\text{LS}} = \Xb^\top (\Xb\Xb^\top)^{-1} \yb$ separates the training data)}. For linearly separable data, a series of recent works have studied the \textit{implicit bias} of (stochastic) gradient descent for logistic regression \citep{soudry2017implicit,ji2019implicit,nacson2019stochastic}. These results demonstrate that among all linear classifiers that can classify the training data correctly, gradient descent will converge to the one that maximizes the $\ell_2$ margin. Such an implicit bias result is summarized in the following lemma.
\begin{lemma}[Theorem~3 in \citet{soudry2017implicit}]\label{lemma:implicitbias}
    Suppose that the training data set $\{(\xb_i,y_i)\}$ is linearly separable. Then as long as $\eta>0$ is small enough, the gradient descent iterates $\btheta^{(t)}$ for logistic regression defined in \eqref{eq:GDupdate} have the following direction limit:
    \begin{align*}
    \lim_{t\rightarrow \infty}\frac{\btheta^{(t)}}{\| \btheta^{(t)} \|_2} =  \frac{\hat\btheta_{\text{SVM}}}{\| \hat\btheta_{\text{SVM}} \|_2},
\end{align*}
where $\hat\btheta_{\text{SVM}}$ is the maximum margin classifier, i.e., the solution to the hard-margin SVM:
\begin{align*} %\label{eq:thetasvm_definition}
    \hat\btheta_{\text{SVM}} = \argmin \| \btheta \|_2^2, ~~\text{subject to } y_i \cdot \la\btheta , \xb_i\ra \geq 1, i\in [n].
\end{align*}
\end{lemma}


% Therefore, by the implicit bias results for logistic regression , we have
% , i.e. the hard-margin SVM solution defined as
From Lemma~\ref{lemma:implicitbias}, we can see that, to establish risk bounds for over-parameterized logistic regression trained by gradient descent, it suffices to study the risk of the maximum margin classifier $\hat\btheta_{\text{SVM}}$. 


\section{Main Results}\label{section:mainresults}
% \CC{Denote $\|\bmu\|_{\bSigma} = \sqrt{ \bmu^\top \bSigma \bmu } $}. 

In this section, we present our main result on the population risk bound of over-parameterized logistic regression, and then apply the result to isotropic and anisotropic sub-Gaussian mixture models to study the conditions under which benign overfitting occurs.

The main result of this paper is given in the following theorem, where we establish the population risk bound for the maximum margin classifier $R(\hat\btheta_{\text{SVM}})$.

\begin{theorem}\label{thm:BOnew}
Suppose that 
% $\tr( \bSigma ) \geq C n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)})$ and $\tr( \bSigma ) \geq C n\cdot \| \bmu \|_{\bSigma} )$
$$\tr( \bSigma ) \geq C \max\big\{ n^{3/2} \|\bSigma \|_2, n\| \bSigma \|_F , n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} \big\}$$ 
and
$\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ 
% and $\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ 
for some large enough absolute constant $C$. Then when $n$ is large enough, with probability at least $1 - n^{-1}$, the maximum margin classifier $\hat\btheta_{\text{SVM}}$ has the following risk bound
\begin{align*}
    R(\hat\btheta_{\text{SVM}}) \leq \exp\Bigg(  \frac{ - C' n \| \bmu \|_2^4  }{  n  \| \bmu \|_{\bSigma}^2+ \| \bSigma\|_F^2 +  n\| \bSigma\|_2^2 } \Bigg),
\end{align*}
where $C'$ is an absolute constant.
\end{theorem}

% \CC{XXXXXXXXX}
Theorem~\ref{thm:BOnew} gives the population risk bound of the maximum margin classifier $\hat\btheta_{\text{SVM}}$. Based on the implicit bias of gradient descent for over-parameterized logistic regression (Lemma~\ref{lemma:implicitbias}), we have that the gradient descent iterates $\btheta^{(t)}$ satisfy that
\begin{align*}
    \lim_{t\rightarrow \infty} R(\btheta^{(t)}) &= \lim_{t\rightarrow \infty} R(\btheta^{(t)}/ \| \btheta^{(t)} \|_2) \\
    &=R(\hat\btheta_{\text{SVM}}/\| \hat\btheta_{\text{SVM}} \|_2)\\
    &= R(\hat\btheta_{\text{SVM}}).
\end{align*}
Therefore, the same risk bound in Theorem~\ref{thm:BOnew} also applies to the over-parameterized logistic regression trained by gradient descent. 

Note that Theorem~\ref{thm:BOnew} holds for general covariance matrices $\bSigma$, and illustrates how the spectrum of $\bSigma$ affects the population risk of logistic regression. This makes our result more general than the recent results for the isotropic setting in \citet{chatterji2020finite,wang2020benign}, where the population risk bounds are given only in terms of the sample size $n$, dimension $d$ and the norm of the mean vector $\|\bmu\|_2$. In fact, when we specialize our general result to the isotropic setting, our result also provides a tighter risk bound than these existing results. Specifically, our population risk bound for the isotropic setting is given in the following corollary. 







% it is typical to consider the setting where $\| \bSigma\|_2 = O(1)$. 

\begin{corollary}\label{col:isotropic}
Consider the setting where $\bSigma = \Ib$. 
Suppose that
% $\tr( \bSigma ) \geq C n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)})$ and $\tr( \bSigma ) \geq C n\cdot \| \bmu \|_{\bSigma} )$
$$d \geq C \max\big\{ n^{2} , n\sqrt{\log(n)}\cdot \| \bmu \|_{2} \big\}$$
and
$\| \bmu \|_2 \geq C $ 
% and $\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ 
for some large enough absolute constant $C$. Then when $n$ is large enough, with probability at least $1 - n^{-1}$, the maximum margin classifier $\hat\btheta_{\text{SVM}}$ has the following risk bound
\begin{align*}
    R(\hat\btheta_{\text{SVM}} ) \leq  \exp\bigg( - \frac{  C' n \| \bmu \|_2^4  }{  n  \| \bmu \|_{2}^2+ d } \bigg),
\end{align*}
where $C'$ is an absolute constant.
\end{corollary}
\begin{remark}
 \citet{chatterji2020finite} recently gave a risk bound of order $\exp( - \Omega( \| \bmu \|_2^4 / d ))$ for sub-Gaussian mixture models under the condition that $d = \Omega( \max\{ n^2\log(n), n\| \bmu \|_2^2 \} )$. Clearly, our result in Corollary~\ref{col:isotropic} only requires the condition $d = \Omega(n\| \bmu \|_2 )$ rather than $d = \Omega(n\| \bmu \|_2^2 )$. Therefore, our condition on $d$ is milder. Moreover, when the stronger condition $d = \Omega(n\| \bmu \|_2^2 )$ holds, our risk bound becomes $\exp( - \Omega( n \| \bmu \|_2^4 / d ))$, which is better than the result of \citet{chatterji2020finite} by a factor of $n$ in the exponent.
\end{remark}

\begin{remark}
\citet{wang2020benign} studied the risk bound in two settings, namely the  low-SNR setting ($\| \bmu \|_2^2 \leq d / n $) and the high-SNR setting ($\| \bmu \|_2^2 > d / n $). \CC{The denominator $ n  \| \bmu \|_{2}^2+ d $ in the exponent in Corollary~\ref{col:isotropic} provides further insights into the difference between the low- and high-SNR settings.} Moreover, for the low-SNR setting, Corollary~2.2 in \citet{wang2020benign} requires $ \| \bmu \|_2^4 = \omega( (d / n)^{3/2})$ to make the risk converge to zero. In comparison,  Corollary~\ref{col:isotropic} only requires  $ \| \bmu \|_2^4 = \omega(d / n)$ to ensure $R(\hat\btheta_{\text{SVM}}) = o(1)$. \CC{Therefore, our risk bound is tighter than theirs.}
\end{remark}


% \subsection{Examples for Anisotropic Sub-Gaussian Mixtures}

% In this subsection,


% Corollary~\ref{col:isotropic} and the discussion above demonstrate that our risk bound in

Besides being tighter than previous results  when reduced to the isotropic setting, %the strength of 
Theorem~\ref{thm:BOnew} %also %lies in the \CC{generality} of the result -- it 
covers both the isotropic and anisotropic settings. In the following, we provide some case studies under the anisotropic setting and show how the decay rate of the eigenvalues of the covariance matrix $\bSigma$ affects the population risk.

It is worth noting that the assumption of Theorem~\ref{thm:BOnew} requires that $\tr(\bSigma)$ is large enough, while the risk bound in Theorem~\ref{thm:BOnew} only depends on $\| \bSigma \|_F$ and $\| \bSigma \|_2$. In the over-parameterized setting where the dimension $d$ is large, it is possible that for certain covariance matrices $\bSigma$ with appropriate eigenvalue decay rates, $\tr(\bSigma) \gg 1$ while $\| \bSigma \|_F, \| \bSigma \|_2 = O(1) $. This implies that for many anisotropic sub-Gaussian mixture models,  the assumptions in Theorem~\ref{thm:BOnew} can be easily satisfied, while the risk bound can be small at the same time. 
% give tight risk bounds for a ide range of anisotropic Gaussian mixture models. 
Following this intuition, we study the conditions under which the maximum margin interpolator $\hat\btheta_{\text{SVM}}$ achieves $o(1)$ population risk. We denote by $\lambda_k$ the $k$-th largest eigenvalue of $\bSigma$, and consider a polynomial decay 
spectrum $\{ \lambda_k = k^{-\alpha} \}_{k=1}^d$, where we introduce a parameter $\alpha$ to control the eigenvalue decay rate.
We have the following corollary.

% We consider different decay rates of the spectrum of $\bSigma$, and 

% Note that the assumption of Theorem~\ref{thm:BOnew} requires that $\tr(\bSigma)$ is large enough, while the risk bound only depends on $\| \bSigma \|_F$ and $\| \bSigma \|_F$ instead of $\tr(\bSigma)$.


% Here we discuss 




% \noindent\textbf{Polynomial decay.} %Suppose that $\lambda_k = k^{-\alpha}$. Then we have the following results:
% \begin{enumerate}[leftmargin = *]
%     \item If $\alpha\in (1/2, 1 )$, then $ \tr( \bSigma ) = \sum_{k=1}^d \lambda_k \geq d^{1 - \alpha} / (1 - \alpha) $, and $ \| \bSigma \|_F = (\sum_{k=1}^d \lambda_k^2 )^{1/2}\leq (  2\alpha - 1)^{-1/2} $. Therefore the risk bound in Theorem~\ref{thm:BOnew} reduces to 
% \end{enumerate}

\begin{corollary}\label{col:polynomialdecay}
    Suppose that $\lambda_k = k^{-\alpha}$, and one of the following conditions holds:
\begin{enumerate}[leftmargin = *]
    \item $\alpha\in [0, 1/2)$, $ d = \tilde\Omega( n^{\frac{3}{2(1 - \alpha)}} + n^2 + ( n \| \bmu \|_{\bSigma})^{\frac{1}{1 - \alpha}}  )$, and $\| \bmu \|_2 = \omega( 1 + n^{-1/4}d^{1/4 - \alpha/2}  )$.
    \item $\alpha = 1/2$, $ d = \tilde\Omega( n^{3} + n^2 \| \bmu \|_{\bSigma}^2  )$, and $\| \bmu \|_2 = \omega( 1 + n^{-1/4}(\log(d))^{1/4} )$.
    \item $\alpha\in (1/2, 1 )$, $ d = \tilde\Omega( n^{\frac{3}{2(1 - \alpha)}} + ( n \| \bmu \|_{\bSigma})^{\frac{1}{1 - \alpha}}  )$, and $\| \bmu \|_2 = \omega(1)$.
    % \item $\alpha =  1$, $ d = \exp[ \Omega( n^{3/2} + n \| \bmu \|_{\bSigma}) ]$, and $\| \bmu \|_2 = \omega(1)$.
\end{enumerate}
% $$\tr( \bSigma ) \geq C \max\{ n\sqrt{\log(n)}, n^{3/2} \|\bSigma \|_2, n\| \bSigma \|_F , n \| \bmu \|_{\bSigma} \}.$$ 
Then with probability at least $1 - n^{-1}$, the population risk of the maximum margin classifier satisfies $R(\hat\btheta_{\text{SVM}}) = o(1)$.
\end{corollary}



Corollary~\ref{col:polynomialdecay} follows by calculating the orders of $\tr(\bSigma) = \sum_{k=1}^d \lambda_k$ and $\| \bSigma \|_F^2 =  \sum_{k=1}^d \lambda_k^2$. A full proof is provided in Appendix~\ref{section:corollaryproof}. 
Intuitively,  when $\| \bmu \|_2$ is large, the two classes are far away from each other and therefore linear classifiers can achieve small population risk. From Corollary~\ref{col:polynomialdecay}, we can see that the decay rate of the eigenvalues of the covariance matrix $\bSigma$ determines how large $\| \bmu \|_2 $ needs to be to ensure small population risk: when the $\{\lambda_k\}$ decays faster (i.e., when $\alpha$ is larger), logistic regression can achieve $o(1)$ population risk with a smaller $\| \bmu \|_2 $. 

Corollary~\ref{col:polynomialdecay} also exhibits a certain ``phase transition'' regarding the eigenvalue decay rate and the conditions on $\| \bmu \|_2$. \CC{For simplicity we can consider the case when $n$ is fixed.} We can see that the eigenvalue decay rate can be divided into three regimes $\alpha\in [0, 1/2)$, $\alpha =  1/2$ and $\alpha\in (1/2, 1)$. Under the condition that $d = \Omega(\poly(n))$, achieving $o(1)$ risk in each of these regimes requires $\| \bmu \|_2 = \omega( d^{1/4 - \alpha/2} ) $, $\| \bmu \|_2 = \omega( [\log(d)]^{1/4} ) $, and $\| \bmu \|_2 = \omega( 1) $ respectively. Specifically, when $\alpha\in (1/2, 1)$, the condition on $\bmu$ is independent of the dimension $d$. This means that when $\alpha\in (1/2, 1)$, for any $\epsilon \in (0,1)$, as long as $\| \bmu \|_2 = \Omega( \sqrt{\log(1/\epsilon)})$, we have
\begin{align*}
    \lim_{d\rightarrow \infty} R(\hat\btheta_{\text{SVM}}) \leq \epsilon.
\end{align*}
Therefore, our result covers the infinite dimensional setting when the eigenvalues of the covariance matrix $\bSigma$ have an appropriate decay rate, i.e., $\alpha\in (1/2, 1)$. 

At last, we can also apply our risk bound to the rare/weak feature model defined in Example~\ref{example:rare-weak}. We have the following corollary. 
% Therefore, Corollary~\ref{col:polynomialdecay} gives a comprehensive characterization of benign overfitting for anisotropic sub-Gaussian mixtures.

\begin{corollary}\label{col:rare-weak}
Consider the rare/weak feature model (Example~\ref{example:rare-weak}). Suppose that 
% $\tr( \bSigma ) \geq C n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)})$ and $\tr( \bSigma ) \geq C n\cdot \| \bmu \|_{\bSigma} )$
$$d \geq C \max\{ n^{2} , \gamma n\sqrt{ s \log(n)} \}$$ 
and
$\gamma\sqrt{s} \geq C $ 
% and $\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ 
for some large enough absolute constant $C$. Then when $n$ is large enough, with probability at least $1 - n^{-1}$, the maximum margin classifier $\hat\btheta_{\text{SVM}}$ has the following risk bound
\begin{align*}
    R( \hat\btheta_{\text{SVM}} ) \leq  \exp\bigg( - \frac{  C' n \gamma^4s^2  }{  n  \gamma^2 s + d } \bigg),
\end{align*}
where $C'$ is an absolute constant.
\end{corollary}
By Corollary~\ref{col:rare-weak}, we can see that our bound is tighter by a factor of $n$ in the exponent compared with the risk bound in \citet{chatterji2020finite} for the rare/weak feature model. Under the setting where $n$ and $\gamma$ are fixed constants, our bound can also be compared with the negative result in \citet{jin2009impossibility}, which showed that achieving a small population risk is impossible when $s = O(\sqrt{d})$. Our result, on the other hand, demonstrates that when $s = \omega(\sqrt{d})$, $o(1)$ population risk is achievable. 


% A natural setting on the spectrum of $\bSigma$ is that $\| \bSigma\|_{2} = O(1)$, under which the assumptions in Theorem~\ref{thm:BOnew} reduces to 


% \begin{remark}
% Theorem~\ref{thm:BOnew} is stronger than Theorem~\ref{thm:benignoverfitting} where the bound is 
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg(- \frac{1 }{256  \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
% \end{align*}
% By $\tr(\bSigma^2) \leq \| \bSigma \|_2\cdot \tr(\bSigma)$, $\| \bSigma^2\|_F \leq  \| \bSigma \|_2\cdot \| \bSigma\|_F$, $\| \bmu \|_{\bSigma}^2 \leq \| \bSigma \|_2  \| \bmu \|_2^2$ and the assumption $ \tr(\bSigma) > C\max\{ n\cdot  \| \bSigma \|_2, \sqrt{n}\cdot  \|\bSigma \|_F\}$, we obtain from Theorem~\ref{thm:BOnew} that
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) &\leq  \exp\bigg[ - \frac{  C'' n\cdot \| \bmu \|_2^4  }{  n \cdot\| \bSigma \|_2  \| \bmu \|_2^2+  \| \bSigma \|_2\cdot \tr(\bSigma)} \bigg]\\
%     &\leq \exp\bigg(- \frac{C''' }{ \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
% \end{align*}
% Therefore the bound in Theorem~\ref{thm:BOnew} can be much tighter, e.g., when $d$ is large, $ \tr(\bSigma)$ can be close to infinity while $ \tr(\bSigma^2)$ can be bounded by a constant.
% \end{remark}

\section{Proof of the Main Results}\label{section:proof_main}
In this section, we explain how we establish the population risk bound of the maximum margin classifier, and give the proof of Theorem~\ref{thm:BOnew}. %The proof is based on \CC{XXX}.


For classification problems, one of the key challenges is that the maximum margin classifier usually does not have an explicit form solution. \CC{To overcome this difficulty,  \citet{chatterji2020finite} 
% introduced a proof technique utilizing
utilized the implicit bias results (Lemma~\ref{lemma:implicitbias}) to get a handle on the relationship between the maximum margin classifier and the training data.} 
% establish risk bound of $\hat\btheta_{\text{SVM}}$.
More recently, \citet{wang2020benign} showed that for isotropic Gaussian mixture models, an explicit form of $\hat\btheta_{\text{SVM}}$ can be calculated by the equivalence between hard-margin support vector machine and minimum norm least squares regression. Notably, it was shown that such an equivalence result holds under the assumptions of \citet{chatterji2020finite} and no additional assumptions are needed. In this paper, we also study the equivalence between classification and regression as a first step. However, our proof works for a more general setting that covers both isotropic and anisotropic sub-Gaussian mixtures, and introduces a novel proof technique based on the polarization identity that leads to a tighter bound. %\CC{improving the condition $d \geq n \sqrt{\log(n)}\|\bmu \|_{2}$ in \citet{wang2020benign}  to  $\tr(\bSigma) \geq n \|\bmu \|_{\bSigma}$}. 
We present this result in Section~\ref{section:equivalence}.



% this technique is not necessary, as in the over-parameterized setting it is possible to obtain the explicit form solution of $\hat\btheta_{\text{SVM}}$ by establishing an equivalence between hard-margin SVM and least square regression. Remarkably, \citet{wang2020benign} showed that this equivalence result does not introduce any additional assumption compared with the assumptions made in \citet{chatterji2020finite}. 



\subsection{Equivalence Between Classification and Regression}\label{section:equivalence}
Here we establish an equivalence guarantee for the 
maximum margin classifier
and the minimum norm interpolator. We first define the minimum norm interpolator $\hat\btheta_{\text{LS}} $ as follows:
\begin{align*}
    \hat\btheta_{\text{LS}} := \argmin \| \btheta \|_2^2, ~~\text{subject to } y_i \cdot \la\btheta , \xb_i\ra = 1, i\in [n].
\end{align*}
In comparison, we recall that the maximum margin classifier $\hat\btheta_{\text{SVM}}$ is defined in Lemma~\ref{lemma:implicitbias} as
\begin{align*}
    \hat\btheta_{\text{SVM}} = \argmin \| \btheta \|_2^2, ~~\text{subject to } y_i \cdot \la\btheta , \xb_i\ra \geq 1, i\in [n].
\end{align*}
We can see that the two optimization problems have the same solution when all the training data are support vectors, i.e., all the inequalities become equalities in the constraints. Based on this observation, \citet{muthukumar2020classification,hsu2020proliferation} have studied the conditions under which the maximum margin classifier $\hat\btheta_{\text{SVM}}$ is identical to the minimum norm interpolator $\hat\btheta_{\text{LS}} $. The result is given in the following lemma. 




% Consider
% \begin{align*}
%     \hat\btheta_{\text{LS}} = \argmin \| \btheta \|_2^2 \qquad \text{subject to } y_i = \btheta^\top \xb_i, i\in [n]
% \end{align*}
% and 
% \begin{align*}
%     \hat\btheta_{\text{SVM}} = \argmin \| \btheta \|_2^2 \qquad \text{subject to } y_i \cdot \btheta^\top \xb_i \geq 1, i\in [n]
% \end{align*}

\begin{lemma}[\citet{hsu2020proliferation}]\label{lemma:equivalence}
$\hat\btheta_{\text{SVM}} = \hat\btheta_{\text{LS}}$ if and only if 
$$\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i >0$$ 
for all $i\in[n]$. 
\end{lemma}

According to Lemma~\ref{lemma:equivalence}, to study the equivalence between the maximum margin classifier and the minimum norm interpolator, it suffices to derive sufficient conditions such that $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i$, $i\in[n]$ are strictly positive with high probability. We have the following lemma which summarizes some calculations regarding the quantity $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i$. 

\begin{lemma}\label{lemma:condition_calculation}
    Suppose that 
    $$\tr(\bSigma) > C\max\{ n^{3/2}  \| \bSigma\|_2 , n \| \bSigma\|_F, n \| \bmu \|_{\bSigma}\}$$ 
    for some large enough absolute constant $C$. Then with probability at least $ 1 - O(n^{-2}) $, 
    \begin{align*}
        \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i\geq G \Big[ 1 - C' n\big|\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1}\eb_i \big|   \Big]
    \end{align*}
    for all $i\in [n]$, 
    where $G = G(\bmu, \Qb, \yb, \bSigma)>0 $ is a strictly positive factor and $C'>0$ is an absolute constant. 
\end{lemma}
By Lemma~\ref{lemma:condition_calculation}, we can see that in order to ensure $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i > 0$, it suffices to establish an upper bound for
$|\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1}\eb_i | $. 
However, deriving tight upper bounds for this term turns out to be challenging, as a simple application of the Cauchy-Schwarz inequality can lead to a loose bound with an additional $\sqrt{n}$ factor. In the following, we establish a refined bound on the term $|\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1}\eb_i | $.%, which we consider as one of our key technical contributions. 


\begin{lemma}\label{lemma:anisotropicbound1}
% With probability at least $1 - 4n^{-1}$, 
Suppose that 
    $$\tr(\bSigma) > C\max\{ n^{3/2}  \| \bSigma\|_2 , n \| \bSigma\|_F \}$$ 
for some large enough absolute constant $C$. Then with probability at least $ 1 - O(n^{-2}) $, 
\begin{align*}
    \big| \bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \eb_i \big| \leq   \frac{C' \| \bmu \|_{\bSigma}  \cdot  \sqrt{\log(n)}}{ \tr(\bSigma)  }
\end{align*}
for all $i\in [n]$, where $C'>0$ is an absolute constant.
\end{lemma}

The proof of Lemma~\ref{lemma:anisotropicbound1} is motivated by the observation that the matrix $\Qb\Qb^\top\in\RR^{n\times n}$ is close enough to a scalar multiple of the identity matrix. By concentration arguments, we have  with high probability that, 
$$\| \Qb\Qb^\top - \tr(\bSigma) \Ib \|_2 \leq \epsilon_{\lambda},$$
where 
$\epsilon_{\lambda} : = C\cdot \big( n\cdot \| \bSigma\|_2  + \sqrt{n}\cdot \| \bSigma\|_F \big)$ (See Lemma~\ref{lemma:eigenvalue_concentration} and its proof for more details). We can then apply the polarization identity $\ab^\top \Mb \bbb = (1/4)\cdot  (\ab+\bbb)^\top \Mb (\ab+\bbb) - (1/4)\cdot(\ab-\bbb)^\top \Mb (\ab-\bbb)$ to obtain
\begin{align}
    &\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \eb_i \nonumber\\
    & = \frac{ (\Qb\bmu + \| \Qb\bmu \|_2 \eb_i )^\top (\Qb\Qb^\top)^{-1} (\Qb\bmu + \| \Qb\bmu \|_2 \eb_i)}{4\| \Qb\bmu \|_2} \nonumber \\
    &\quad - \frac{ (\Qb\bmu - \| \Qb\bmu \|_2 \eb_i )^\top (\Qb\Qb^\top)^{-1} (\Qb\bmu - \| \Qb\bmu \|_2 \eb_i) }{4\| \Qb\bmu \|_2} \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2} \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2 \eb_i \|_2^2}{ \tr(\bSigma)- \epsilon_{\lambda}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2 \eb_i \|_2^2}{ \tr(\bSigma)+ \epsilon_{\lambda}  }     \bigg] \nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{\lambda} +  \eb_i^\top \Qb\bmu \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{\lambda}^2 }\nonumber\\
    &\leq \| \Qb\bmu \|_2 \cdot O\bigg(\frac{\epsilon_{\lambda}}{ \tr(\bSigma)^2}\bigg) + |\eb_i^\top \Qb\bmu| \cdot O\bigg(\frac{1}{ \tr(\bSigma)}\bigg),\label{eq:main_lemmaproof_eq1}
\end{align}
where the first inequality follows by the eigenvalue bound $\| \Qb\Qb^\top - \tr(\bSigma) \Ib \|_2 \leq \epsilon_{\lambda}$, and the second inequality follows because the lemma assumption implies that  $\tr(\bSigma) \geq C \sqrt{n} \epsilon_{\lambda}$ for some large enough absolute constant $C$. By concentration arguments, with high probability we have $\| \Qb\bmu \|_2 = O(\sqrt{n} \| \bmu \|_{\bSigma}) $
and $|\eb_i^\top \Qb\bmu| = O(\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma}) $ (see the proof of Lemma~\ref{lemma:anisotropicbound1} in Appendix~\ref{section:proof_appendix1} for more details). Note that the same upper bound also holds for $-\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \eb_i$ with exactly the same proof. Therefore, plugging these calculations into \eqref{eq:main_lemmaproof_eq1} and applying the assumption $\tr(\bSigma) \geq C \sqrt{n} \epsilon_{\lambda}$ completes the proof. 


% A more detailed proof of Lemma~\ref{lemma:anisotropicbound1} is given in Appendix~\ref{section:proof_appendix1}. 
From the above proof sketch of Lemma~\ref{lemma:anisotropicbound1}, we can see that our proof technique enables us to show that $|\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \eb_i|$ has an upper bound \eqref{eq:main_lemmaproof_eq1} whose leading term only depends on $\eb_i^\top \Qb\bmu$. Note that the bound of $\eb_i^\top \Qb\bmu$ is (almost) independent of $n$ because it is only related to the $i$-th training example. This is a key feature of Lemma~\ref{lemma:anisotropicbound1} which leads to a tight bound.

% with the summation of a $\| \Qb\bmu \|_2 \cdot O( \epsilon_{\lambda} / \tr(\bSigma)^2 )$ term and a $\eb_i^\top \Qb\bmu \cdot O(1 /  \tr(\bSigma))$ term, and the leading term $\eb_i^\top \Qb\bmu \cdot O(1 /  \tr(\bSigma))$ only depends on $\eb_i^\top \Qb\bmu$, which is defined based on the $i$-th training example alone. 
%  \CC{Appendix XXXX}. \CC{Note that based on our model, $\EE( \Qb\Qb^\top) = \tr(\bSigma)\Ib$, and $\EE( \| \Qb\bmu \|_2^2 ) = n \|\bmu\|_{\bSigma}^2 $. Therefore a trivial upper bound of $\big| \bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \eb_i \big|$ would be of order $ \sqrt{n} \cdot \| \bmu \|_{\bSigma} / \tr(\bSigma) $. However, our bound in Lemma~\ref{lemma:anisotropicbound1} does not have the $\sqrt{n}$ factor, and therefore is tighter.}


We are now ready to present our main result on the equivalence between the maximum margin classifier and the minimum norm interpolator. %The proposition is given as follows. 


\begin{proposition}\label{prop:interpolationregression}
Suppose that 
$$\tr( \bSigma ) \geq C \max\{ n^{3/2} \| \bSigma \|_2, n \| \bSigma \|_F, n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} \} $$
for some large enough absolute constant $C$. Then with probability at least $ 1 - O(n^{-2}) $, 
$\hat\btheta_{\text{SVM}} =  \hat\btheta_{\text{LS}}$.
\end{proposition}



\begin{proof}[Proof of Proposition~\ref{prop:interpolationregression}]
By the union bound, we have that with probability at least $1 - O(n^{-2})$, the results in Lemma~\ref{lemma:condition_calculation} and Lemma~\ref{lemma:anisotropicbound1} both hold. Therefore, for any $i\in[n]$, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i&\geq G \Big[ 1 - c_1 n\big|\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1}\eb_i \big|   \Big] \\
    &\geq G \Bigg[ 1 - \frac{ c_2 n  \sqrt{\log(n)} \cdot \| \bmu \|_{\bSigma} }{ \tr(\bSigma)  } \Bigg]\\
    &\propto  \tr(\bSigma) - c_2 n \sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma}.
\end{align*}
By the assumption $\tr( \bSigma ) \geq C n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} $ for some large enough absolute constant $C$, we have $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i > 0$. Finally, applying Lemma~\ref{lemma:equivalence}, we conclude that $\hat\btheta_{\text{SVM}} = \hat\btheta_{\text{LS}}$.
\end{proof}



% This motivates us to consider giving a lower bound for $\yb^\top (\Qb\Qb^\top)^{-1} \eb_i y_i$ and a upper bound for $|\bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1}\eb_i y_i|$. However, we note that 


% the following lemma, which is one of our key technical contributions.





% This result is given in the following lemma.

% \begin{lemma}\label{lemma:interpolationregression}
% Suppose that there exists a large enough absolute constant $C$ such that
% $$\tr( \bSigma ) \geq C \max\{ n\sqrt{\log(n)}), n^{3/2} \| \bSigma \|_2, n \| \bSigma \|_F,  n \| \bmu \|_{\bSigma} \}. $$
% Then with probability at least $1 - n^{-1}$, $y_i \cdot \eb_i^\top (\Xb\Xb^\top)^{-1}\yb >0$ for all $i\in[n]$. 
% % and $\tr( \bSigma ) \geq C n\cdot \| \bmu \|_{\bSigma} $, 
% \end{lemma}

\subsection{Population Risk Bound for the Maximum Margin Classifier}
In this subsection, we derive the population risk bound for logistic regression and provide the proof of Theorem~\ref{thm:BOnew}. We first present the following lemma on the risk bound of linear classifiers for sub-Gaussian mixture models.


\begin{lemma}\label{lemma:subGaussian_riskbound}
There exists an absolute constant $C$, such that for any $\btheta\in \RR^d$, the following risk bound holds:
\begin{align*}
    R(\btheta) \leq \exp\bigg( - \frac{ C( \btheta^\top \bmu )^2 }{ \|\btheta\|_{\bSigma}^2} \bigg).
\end{align*}
A similar result is given in \citet{chatterji2020finite} where $\bSigma$ is replaced by $\Ib$. Our result here depends on the full spectrum of the covariance matrix and is sharper than \citet{chatterji2020finite} when $\bSigma$ has decaying eigenvalues.

% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg]
% \end{align*}
%     \begin{align*}
%         \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg]
%     \end{align*}
\end{lemma}
The proof of Lemma~\ref{lemma:subGaussian_riskbound} is given in Appendix~\ref{section:proof_appendix1}. 
In addition to this risk bound for general vector $\btheta$, we also have the following explicit calculation for $\hat\btheta_{\text{SVM}}$ thanks to our analysis in Section~\ref{section:equivalence}. This is because the minimum norm interpolator $\hat\btheta_{\text{LS}}$ has the explicit form $\hat\btheta_{\text{LS}} = \Xb^\top (\Xb\Xb^\top)^{-1} \yb$.
Therefore by Proposition~\ref{prop:interpolationregression}, we also have
\begin{align*}
    \hat\btheta_{\text{SVM}} = \Xb^\top (\Xb\Xb^\top)^{-1} \yb.
\end{align*}
Plugging the above calculation into the risk bound in Lemma~\ref{lemma:subGaussian_riskbound} and utilizing the model definition $\Xb = \yb \bmu^\top + \Qb$, we are able to show the following risk bound for $\hat\btheta_{\text{SVM}}$.
\begin{lemma}\label{lemma:thetaSVM_riskbound} 
    Suppose that 
$$\tr( \bSigma ) \geq C \max\{n\sqrt{\log(n)}, n^{3/2} \| \bSigma \|_2, n \| \bSigma \|_F, n \| \bmu \|_{\bSigma} \} $$
for some large enough absolute constant $C$. Then with probability at least $ 1 - O( n^{-2} )$, 
\begin{align*}
    &R(\hat\btheta_{\text{SVM}} )\\
    &\leq \exp\bigg\{ \frac{ - C'\cdot [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2 }{(\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \| \bmu \|_{\bSigma}^2 + \| \Qb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2 } \bigg\},
\end{align*}
where $C'$ is an absolute constant.
\end{lemma}
The proof of Lemma~\ref{lemma:thetaSVM_riskbound} is given in Appendix~\ref{section:proof_appendix1}. Lemma~\ref{lemma:thetaSVM_riskbound} 
utilizes the structure of the model to divide the denominator in the exponent into two terms. 
% proposes 
% a key step that 
% divides . 
Motivated by this result, we define
\begin{align*}
    &I_1 = [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2, \\
    &I_2 = (\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \cdot \| \bmu \|_{\bSigma}^2,\\
    &I_3 =  \| \Qb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2.
\end{align*}
In the following, we develop a lower bound for $I_1$ and upper bounds for $I_2$ and $I_3$ respectively. The following lemma summarizes the bounds.

% Therefore in the following, we focus on deriving a lower bound for the numerator in the exponent $[ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2$, and an upper bound for the denominator $\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb$. 


\begin{lemma}\label{lemma:I1I2I3bounds}
    Suppose that
% $$\tr( \bSigma ) \geq C \max\{ n\sqrt{\log(n)}, n^{3/2} \|\bSigma \|_2, n\| \bSigma \|_F , n \| \bmu \|_{\bSigma} \}$$ 
$$\tr( \bSigma ) \geq C \max\{ n, n \|\bSigma \|_2, \sqrt{n} \|\bSigma \|_F , n \| \bmu \|_{\bSigma} \}$$
and
$\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ 
% and $\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ 
for some large enough absolute constant $C$. Then when $n$ is large enough, with probability at least $1 - O(n^{-2})$, 
\begin{align*}
    &I_1 \geq C'^{-1} H(\bmu, \Qb, \yb, \bSigma)\cdot n^2 \cdot   \| \bmu \|_2^4,\\
    &I_2 \leq C' H(\bmu, \Qb, \yb, \bSigma)\cdot n^2 \cdot \| \bmu \|_{\bSigma}^2,\\
    &I_3 \leq C' H(\bmu, \Qb, \yb, \bSigma)\cdot (  n\cdot \|\bSigma\|_F^2 +  n^2\cdot \| \bSigma\|_2^2),
\end{align*}
where $H(\bmu, \Qb, \yb, \bSigma) > 0$ is a strictly positive coefficient, and $C' > 0$ is an absolute constant.  
\end{lemma}

The proof of Lemma~\ref{lemma:I1I2I3bounds} is given in Appendix~\ref{section:proof_appendix1}. 
% Here we would like to highlight our bound of $I_3$.
To illustrate the key idea in the proof of Lemma~\ref{lemma:I1I2I3bounds}, we take $I_3$ as an example. 
Based on our model in Section~\ref{section:problemsetting}, we have $\Qb = \Zb \bLambda^{1/2} \Vb^\top$, where $\Zb\in\RR^{n\times d}$ is a random matrix with independent sub-Gaussian entries, and $\bLambda$, $\Vb$ are defined based on the eigenvalue decomposition $\bSigma = \Vb \bLambda \Vb^\top$. By some linear algebra calculation (see the proof of Lemma~\ref{lemma:I1I2I3bounds} in Appendix~\ref{section:proof_appendix1} for more details), we have
\begin{align}\label{eq:main_lemmaproof_eq2}
    I_3 = \ab^\top (\Zb\bLambda \Zb^\top)^{-1} \Zb\bLambda^2 \Zb^\top (\Zb\bLambda \Zb^\top)^{-1} \ab,
\end{align}
where  $\| \ab \|_2^2 = O(D^{-2} n) $ with
\begin{align*}
    D &=  \yb^\top (\Qb \Qb^\top)^{-1} \yb \cdot  (\| \bmu\|_2^2 - \bmu^\top \Qb^\top (\Qb \Qb^\top)^{-1} \Qb\bmu) \\
&\quad + (1 + \yb^\top(\Qb \Qb^\top)^{-1} \Qb\bmu)^2.
\end{align*}
The key observation here is that while the term $D$ above has a very complicated form, it is not necessary to bound it. This is because $D^{-2}$ is a common term that appears in all $I_1$, $I_2$ and $I_3$ and therefore can be canceled out when calculating the ratio $I_1 / (I_2 + I_3)$. 
% , as the factor $D^{-2}$ appears in the bounds of $I_1$ and $I_2$ as well. Moreover, 
With the calculation in \eqref{eq:main_lemmaproof_eq2}, we are able to invoke the following eigenvalue concentration inequalities (see Lemma~\ref{lemma:eigenvalue_concentration} and Lemma~\ref{lemma:eigenvalue_concentration2} for more details) to give upper and lower bounds regarding the matrices $ \Zb\bLambda^2 \Zb^\top $ and $ \Zb\bLambda \Zb^\top $ respectively:
\begin{align*}
    & \big\| \Zb\bLambda \Zb^\top - \tr(\bSigma)\cdot \Ib \big\|_2 \leq c_1\cdot \big( n \| \bSigma\|_2  + \sqrt{n} \| \bSigma\|_F \big), \\
    & \big\| \Zb\bLambda^2 \Zb^\top - \|\bSigma\|_F^2\cdot \Ib \big\|_2 \leq c_1\cdot \big( n \| \bSigma\|_2^2  + \sqrt{n} \| \bSigma^2 \|_F \big),
\end{align*}
where $c_1$ is an absolute constant. Plugging the above inequalities and the bound $\| \ab \|_2^2 = O(D^{-2} n)$ into \eqref{eq:main_lemmaproof_eq2}, we obtain \CC{with some calculation that}
\begin{align*}
    I_3 \leq c_2 H(\bmu, \Qb, \yb, \bSigma)\cdot (  n\cdot \|\bSigma\|_F^2 +  n^2\cdot \| \bSigma\|_2^2)
\end{align*}
with $ H(\bmu, \Qb, \yb, \bSigma) = [D\cdot \tr(\bSigma) ]^{-2} $, where $c_2$ is an absolute constant. This gives the bound of $I_3$ in Lemma~\ref{lemma:I1I2I3bounds}. 
% $\| \Qb\Qb^\top - \tr(\bSigma) \Ib \| \leq \epsilon_{\lambda}$, where 
% $\epsilon_{\lambda} : = C\cdot \big( n\cdot \| \bSigma\|_2  + \sqrt{n}\cdot \| \bSigma\|_F \big)$ 


% $s = \yb^\top \Ab^{-1} \yb$, $t = \bnu^\top \Ab^{-1} \bnu$, $h = \yb^\top \Ab^{-1} \bnu$, $D = s (\| \bmu\|_2^2 - t) + (h+1)^2$


% We would like to highlight that 
The significance of Lemma~\ref{lemma:I1I2I3bounds} is three-fold. First of all, the result does not have an explicit dependency on $d$, which makes it applicable to infinite-dimensional data. 
% covers general covaraince matrices $\bSigma$, and the assumptions and bounds only depend on the trace and norms of $\bSigma$ instead of the dimension $d$. Therefore Lemma~\ref{lemma:I1I2I3bounds} is more general and more refined compared with existing works
% This is in stark contrast with \citet{chatterji2020finite,wang2020benign} which depends on. 
Second, Lemma~\ref{lemma:I1I2I3bounds} gives bounds with great simplicity, and shows that the three bounds share the same strictly positive factor $ H(\bmu, \Qb, \yb, \bSigma)$, which can be canceled out since our final goal is to bound the ratio $I_1 / (I_2 + I_3)$. Lastly, Lemma~\ref{lemma:I1I2I3bounds} reveals the fact that the risk bound only depends on $\|\bSigma\|_F$ and $ \| \bSigma\|_2$, which can be small even though the assumption requires $\tr(\bSigma)$ to be large.

We are now ready to present the proof of Theorem~\ref{thm:BOnew}.

\begin{proof}[Proof of Theorem~\ref{thm:BOnew}]
Clearly, under the assumptions of Theorem~\ref{thm:BOnew}, the conditions in Lemma~\ref{lemma:thetaSVM_riskbound} and Lemma~\ref{lemma:I1I2I3bounds} are both satisfied. 
By Lemma~\ref{lemma:I1I2I3bounds}, we have
\begin{align*}
    &\frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2 }{(\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \| \bmu \|_{\bSigma}^2 + \| \Qb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2} \\
    &\qquad \geq c_1 \cdot \frac{ n^2 \| \bmu \|_2^4 }{  n^2 \| \bmu \|_{\bSigma}^2 + n\cdot \|\bSigma\|_F^2 +  n^2\cdot \| \bSigma\|_2^2 },
\end{align*}
where $c_1$ is an absolute constant. 
Therefore by Lemma~\ref{lemma:thetaSVM_riskbound} we have
\begin{align*}
    R(\hat\btheta_{\text{SVM}}) \leq \exp\Bigg(  \frac{ - c_2 n \| \bmu \|_2^4  }{  n  \| \bmu \|_{\bSigma}^2+ \| \bSigma\|_F^2 +  n\| \bSigma\|_2^2 } \Bigg)
\end{align*}
for some absolute constant $c_2$. Note that by the union bound, the above inequality holds with probability at least $ 1 - O(n^{-2}) \geq 1 - n^{-1}$ when $n$ is large enough. This completes the proof. 
\end{proof}


% \section{Simulation}

\section{Conclusion and Future Work}
We have studied the benign overfitting phenomenon for sub-Gaussian mixture models, and established a population risk bound for over-parameterized logistic regression. Our population risk bound is general and covers both the isotropic and anisotropic settings. When reduced to the isotropic setting, our bound is tighter than existing results. We have also studied a class of non-isotropic models which can be benign even for infinite-dimensional data.

An immediate future work direction is to study the population risk lower bound for the maximum margin classifier $\hat\btheta_{\text{SVM}}$ in the over-parameterized setting. It is also interesting to study the relation between the dimension and the population risk and verify the double descent phenomenon. Studying  benign overfitting for more complicated learning models such as neural networks is another important future work direction. 

\newpage

\bibliography{deeplearningreference}
% \bibliographystyle{ims}
\bibliographystyle{icml2021}


\appendix

\onecolumn


\section{Proof of Lemmas in Section~\ref{section:proof_main}}\label{section:proof_appendix1}
% We first introduce several notations to simplify our proof.
We denote $\bnu = \Qb \bmu$ and $\Ab = \Qb\Qb^\top$. Based on these notations, in the following we present several basic lemmas that are used in our proof. We have the following lemma which gives concentration inequalities for the eigenvalues of $\Ab$.


% is summarized from the proof of Lemma~26 in \citet{bartlett2020benign}

\begin{lemma}\label{lemma:eigenvalue_concentration}
    With probability at least $1 - n^{-2}$,
    \begin{align*}
    \big\| \Ab  - \tr(\bSigma)\cdot \Ib \big\|_2 \leq \epsilon_{\lambda} := C \sigma_u^2  \big( n\cdot \| \bSigma\|_2  + \sqrt{n}\cdot \| \bSigma\|_F \big),
\end{align*}
where $C$ is an absolute constant.
\end{lemma}






The following lemma presents some calculations on the quantity $\yb^\top (\Xb\Xb^\top)^{-1}$. It utilizes a result introduced in \citet{wang2020benign}, which is based on the application of the Sherman--Morrison--Woodbury formula.


% is given as Lemma 3 in \citet{wang2020benign}.

\begin{lemma}\label{lemma:matrixcalculation}
The following calculation of $\yb^\top (\Xb\Xb^\top)^{-1}$ holds: 
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu )\cdot \yb^\top \Ab^{-1} - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}],
\end{align*}
where 
$D = \yb^\top\Ab^{-1} \yb \cdot  (\| \bmu\|_2^2 - \bnu^\top \Ab^{-1} \bnu)  + (1 + \yb^\top\Ab^{-1} \bnu)^2> 0$.
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} &= \yb^\top \Ab^{-1} - D^{-1}\cdot [ \| \bmu \|_2^2 s + h^2 + h - st ] \cdot \yb^\top \Ab^{-1} - D^{-1}\bnu^\top \Ab^{-1}\\
%     & =\bigg[ 1 - \frac{\| \bmu \|_2^2 s + h^2 + h - st }{ \| \bmu\|_2^2 s - st + (h+1)^2} \bigg] \cdot \yb^\top \Ab^{-1} - D^{-1}\bnu^\top \Ab^{-1} \\
%     & = \frac{ h + 1 }{ \| \bmu\|_2^2 s - st + (h+1)^2} \cdot \yb^\top \Ab^{-1} - D^{-1}\bnu^\top \Ab^{-1}\\
%     & = D^{-1} [ (h + 1) \yb^\top \Ab^{-1} - \bnu^\top \Ab^{-1}].
% \end{align*}
\end{lemma}







Motivated by Lemma~\ref{lemma:matrixcalculation}, we estimate the orders of the terms $ \yb^\top \Ab^{-1} \yb$, $\bnu^\top \Ab^{-1} \bnu$, and $\yb^\top \Ab^{-1} \bnu$. The results are given in the following lemma.


\begin{lemma}\label{lemma:concentrationbounds}
Let $\epsilon_{\lambda}$ be defined in Lemma~\ref{lemma:eigenvalue_concentration}, and suppose that $\tr( \bSigma )  > \epsilon_{\lambda}$. Then with probability at least $1 - O(n^{-2})$, the following inequalities hold: %\todoq{What is $s$ and $t$?}, \todoy{simplify notation}
\begin{align*}
    &\frac{n}{ \tr( \bSigma )  + \epsilon_{\lambda} } \leq  \yb^\top \Ab^{-1} \yb \leq \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} },\\
    &\frac{n  - C \sqrt{n\log( n)}}{ \tr( \bSigma )  + \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \bnu^\top \Ab^{-1} \bnu \leq \frac{ n  + C \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma}^2,\\
    &|\yb^\top \Ab^{-1} \bnu| \leq \frac{C n   }{  \tr( \bSigma )  - \epsilon_{\lambda}  } \| \bmu \|_{\bSigma},
\end{align*}
where $C$ is an absolute constant.
% \begin{align*}
%     \frac{n  - c \sqrt{n\log( n)}}{ \tr( \bSigma )  + \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \bnu^\top \Ab^{-1} \bnu \leq \frac{ n  + c \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma}^2
% \end{align*}

% \begin{align*}
%     |\yb^\top \Ab^{-1} \bnu| \leq \frac{n  + c \sqrt{n\log( n)} }{  \tr( \bSigma )  - \epsilon_{\lambda}  } \| \bmu \|_{\bSigma}
% \end{align*}

\end{lemma}





% \eqref{eq:isotropic_eigenvalueconcentration} 




























\subsection{Proof of Lemma~\ref{lemma:condition_calculation}}

Here we present the proof of Lemma~\ref{lemma:condition_calculation}. We first introduce the following lemma, which gives a lower bound on the quantity $\yb^\top \Ab^{-1} \eb_i y_i$.



\begin{lemma}\label{lemma:anisotropicbound2}
Let $\epsilon_{\lambda}$ be defined in Lemma~\ref{lemma:eigenvalue_concentration}, and suppose that $\tr( \bSigma )  > \epsilon_{\lambda}$. Then with probability at least $1 - O(n^{-2})$, 
\begin{align*}
    \yb^\top \Ab^{-1} \eb_i y_i  \geq \frac{ \tr(\bSigma) - \sqrt{n} \epsilon_{\lambda} }{\tr(\bSigma)^2  - \epsilon_{\lambda}^2}
\end{align*}
for all $i\in [n]$. 
\end{lemma}




We are now ready to give our proof of Lemma~\ref{lemma:condition_calculation}.

\begin{proof}[Proof of Lemma~\ref{lemma:condition_calculation}] By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i = D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \eb_i y_i - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1} \eb_i y_i].
\end{align*}
Plugging in the inequalities in Lemmas~\ref{lemma:concentrationbounds} and~\ref{lemma:anisotropicbound2}, we have that as long as $\tr(\bSigma) > c_1\max\{ n \| \bmu \|_{\bSigma}, \epsilon_{\lambda}\}$ for some large enough constant $c_1$, $|\yb^\top \Ab^{-1} \bnu| \leq 1/2$ and therefore
\begin{align}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &= D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \eb_i y_i  - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}\eb_i y_i ] \nonumber \\
    &\geq D^{-1}\cdot \bigg[ \frac{1}{2}\cdot \yb^\top \Ab^{-1} \eb_i y_i  - \frac{ c_2 n }{ \tr(\bSigma)}\cdot |\bnu^\top \Ab^{-1}\eb_i y_i| \bigg],\label{eq:equivalenceproof_eq0}
\end{align}
where $c_2$ is an absolute constant. 
By Lemma~\ref{lemma:anisotropicbound2}, we can see that as long as $ \tr(\bSigma) \geq c_3\sqrt{n} \epsilon_{\lambda}$ for some large enough absolute constant $c_3$, we have
\begin{align*}
    \yb^\top \Ab^{-1} \eb_i y_i \geq \frac{ \tr(\bSigma) - \sqrt{n} \epsilon_{\lambda} }{\tr(\bSigma)^2  - \epsilon_{\lambda}^2} \geq \frac{1}{2\tr(\bSigma)}.
\end{align*}
Plugging the bound above into \eqref{eq:equivalenceproof_eq0}, we obtain
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i \geq \frac{1}{4D \tr(\bSigma)}\cdot [ 1 - c_4 n \cdot |\bnu^\top \Ab^{-1}\eb_i y_i| ].
\end{align*}
where $c_4$ is an absolute constant. Since $D > 0$, we see that $G(\bmu, \Qb, \yb, \bSigma):= [4D \tr(\bSigma)]^{-1} > 0$. This completes the proof. 
\end{proof}


\subsection{Proof of Lemma~\ref{lemma:anisotropicbound1}}
Here we give the detailed proof of Lemma~\ref{lemma:anisotropicbound1} to back up the proof sketch presented in Section~\ref{section:equivalence}. The proof is based on the polarization identity.


\begin{proof}[Proof of Lemma~\ref{lemma:anisotropicbound1}] We have the following calculation,
\begin{align}
    \bmu^\top\Qb^\top\Ab^{-1} \eb_i y_i &= \frac{1}{\| \Qb\bmu \|_2}\cdot (\Qb\bmu)^\top \Ab^{-1} (\| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & = \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ab^{-1} (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    &\quad - \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ab^{-1} (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ \tr(\bSigma)- \epsilon_{\lambda}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ \tr(\bSigma)+ \epsilon_{\lambda}  }     \bigg] \nonumber\\
    &= \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{2 \| \Qb\bmu \|_2^2 + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ \tr(\bSigma)- \epsilon_{\lambda}  } - \frac{2\| \Qb\bmu \|_2^2  - 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ \tr(\bSigma)+ \epsilon_{\lambda}  } \bigg] \nonumber \\
    &= \frac{1}{2\| \Qb\bmu \|_2}\cdot \frac{2\| \Qb\bmu \|_2^2 \cdot \epsilon_{\lambda} + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{\lambda}^2  }\nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{\lambda} +  y_i \eb_i^\top \Qb\bmu \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{\lambda}^2  },\label{eq:fgbounds_eq1}
\end{align}
where the second equality holds due to the polarization identity $\ab^\top \Mb \bbb = 1/4(\ab+\bbb)^\top \Mb (\ab+\bbb) - 1/4(\ab-\bbb)^\top \Mb (\ab-\bbb)$, and the first inequality follows by Lemma~\ref{lemma:eigenvalue_concentration}. Based on our model assumption, we can denote $\Qb = \Zb \bLambda^{1/2} \Vb^\top$, where the entries of $\Zb$ are independent sub-Gaussian random variables with $ \| \Zb_{ij} \|_{\psi_2} \leq \sigma_u$ for all $i\in [n]$ and $j\in[p]$. Denote $\tilde\bmu = \bLambda^{1/2} \Vb^\top \bmu$. Then with the same proof as in Lemma~\ref{lemma:concentrationbounds}, we have 
\begin{align*}
      \| \Qb \bmu \|_2^2= \| \Zb \tilde\bmu \|_2^2 \leq  2n \| \tilde\bmu \|_2^2  = 2n \| \bmu \|_{\bSigma}^2
\end{align*}
when $n$ is large enough. Moreover, we also have
\begin{align*}
    \| y_i \eb_i^\top \Qb \bmu \|_{\psi_2} = \Bigg\| \sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg\|_{\psi_2} \leq \| \tilde\bmu \|_2 \cdot \sigma_u. 
\end{align*}
Therefore by Hoeffding's inequality, with probability at least $1 - n^{-1}$, we have
\begin{align*}
    | y_i \eb_i^\top \Qb \bmu | \leq c_1 \| \tilde\bmu \|_2 \cdot \sqrt{\log(n)} = c_1 \| \bmu \|_{\bSigma} \cdot \sqrt{\log(n)},%\label{eq:fgbounds_eq2}
\end{align*}
where $c_1$ is an absolute constant. Therefore we have
\begin{align*}
    \bnu^\top \Ab^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{\lambda} +  c_1 \| \bmu \|_{\bSigma}\sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{\lambda}^2  }.
\end{align*}
With the exact same proof, we also have
\begin{align*}
    -\bnu^\top \Ab^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{\lambda} +  c_1 \| \bmu \|_{\bSigma} \sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{\lambda}^2  }.
\end{align*}
Therefore by the assumption that $\tr(\bSigma) > C \sqrt{n} \epsilon_{\lambda}$ for some large enough absolute constant $C$, we have 
\begin{align*}
    |\bnu^\top \Ab^{-1} \eb_i | &\leq \frac{c_3 \| \bmu \|_{\bSigma}  \cdot  \sqrt{\log(n)}}{ \tr(\bSigma)  }
\end{align*}
for some absolute constant $c_3$. This completes the proof.
\end{proof}







\subsection{Proof of Lemma~\ref{lemma:subGaussian_riskbound}}\label{section:proof_subGaussian_riskbound}
Here we give the detailed proof of Lemma~\ref{lemma:subGaussian_riskbound}, which is based on the one-sided sub-Gaussian tail bound.
\begin{proof}[Proof of Lemma~\ref{lemma:subGaussian_riskbound}]
By definition, we have
\begin{align*}
    R(\btheta) &= \PP( y\cdot \btheta^\top \xb < 0 )  = \PP[ y\cdot \btheta^\top ( y\cdot \bmu + \qb) < 0 ] = \PP[ \btheta^\top \bmu  < - y\cdot \btheta^\top \qb ] = \PP[ \btheta^\top \bmu  < - y\cdot \btheta^\top  \Vb \bLambda^{1/2} \ub ],
\end{align*}
where in the second and last equations we plug in the definitions of $\xb$ and $\qb$ according to our data generation procedure described in Section~\ref{section:problemsetting}. Note that $\ub$ has independent, $\sigma_u$-sub-Gaussian entries. Therefore we have
\begin{align*}
    \|  \btheta^\top  \Vb \bLambda^{1/2} \ub  \|_{\psi_2} \leq c_1  \|  \btheta^\top  \Vb \bLambda^{1/2} \|_{2} = c_1  \sqrt{ \btheta^\top  \Vb \bLambda \Vb^\top \btheta } = c_1  \sqrt{ \btheta^\top  \bSigma \btheta }.
\end{align*}
Applying the one-sided sub-Gaussian tail bound (e.g., Theorem~A.2 in \citet{chatterji2020finite}) completes the proof. 
\end{proof}



\subsection{Proof of Lemma~\ref{lemma:thetaSVM_riskbound}}
The proof of Lemma~\ref{lemma:thetaSVM_riskbound} is given as follows, where we utilize Proposition~\ref{prop:interpolationregression} and Lemma~\ref{lemma:subGaussian_riskbound} to derive the desired bound.



\begin{proof}[Proof of Lemma~\ref{lemma:thetaSVM_riskbound}]
By Proposition~\ref{prop:interpolationregression}, we have
\begin{align*}
    \hat\btheta_{\text{SVM}} = \hat\btheta_{\text{LS}}  = \Xb^\top (\Xb\Xb^\top)^{-1} \yb.
\end{align*}
Plugging it into the risk bound in Lemma~\ref{lemma:subGaussian_riskbound}, we obtain
\begin{align*}
    R(\hat\btheta_{\text{SVM}} ) \leq \exp\bigg\{ - \frac{ C[ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{ \| \Xb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2 } \bigg\}.
\end{align*}
Note that based on our model, we have $\Xb = \yb \bmu^\top + \Qb$, and 
\begin{align}
    \| \Xb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2 
    &\quad= \| (\yb \bmu^\top + \Qb)^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2  \nonumber\\
    &\quad\leq 2 \| \bmu \yb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2 + 2\| \Qb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2 \nonumber\\
    &\quad= 2 (\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \cdot \| \bmu \|_{\bSigma}^2 + 2\| \Qb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2.\nonumber%\label{eq:denominatorboundsketch}
\end{align}
Therefore we have
\begin{align*}
    R(\hat\btheta_{\text{SVM}} ) \leq \exp\bigg\{ \frac{ - (C/2)\cdot [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2 }{(\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \cdot \| \bmu \|_{\bSigma}^2 + \| \Qb^\top (\Xb\Xb^\top)^{-1} \yb \|_{\bSigma}^2 } \bigg\}.
\end{align*}
This completes the proof.
\end{proof}




\subsection{Proof of Lemma~\ref{lemma:I1I2I3bounds}}
In this subsection we present the proof of Lemma~\ref{lemma:I1I2I3bounds}. We first give the following lemma, which follows by exactly the same proof as Lemma~\ref{lemma:eigenvalue_concentration}.
% With , we have the following lemma.



\begin{lemma}\label{lemma:eigenvalue_concentration2}
    Suppose that $\Zb \in \RR^{n\times d}$ is a random matrix with i.i.d. sub-Gaussian entries with sub-Gaussian norm $\sigma_u$. Then with probability at least $1 - O(n^{-2})$,
    \begin{align*}
    \big\| \Zb\bLambda^2 \Zb^\top   - \|\bSigma\|_F^2\cdot \Ib \big\|_2 \leq \epsilon_{\lambda}' := C \sigma_u^2  \big( n\cdot \| \bSigma\|_2^2  + \sqrt{n}\cdot \| \bSigma^2\|_F \big),
\end{align*}
where $C$ is an absolute constant.
\end{lemma}




Based on Lemma~\ref{lemma:eigenvalue_concentration2}, we can give the proof of Lemma~\ref{lemma:I1I2I3bounds} as follows.



\begin{proof}[Proof of Lemma~\ref{lemma:I1I2I3bounds}]
We first derive the lower bound for $I_1$. By Lemma~\ref{lemma:matrixcalculation} and the model definition $\Xb = \yb \bmu^\top + \Qb$, we have
\begin{align}
    \yb^\top (\Xb\Xb^\top)^{-1}  \Xb \bmu &= D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}] (\yb \bmu^\top + \Qb)\bmu \nonumber\\
    &= D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}] (\yb \cdot \| \bmu \|_2^2 + \Qb\bmu) \nonumber\\
    & = D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \yb - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1} \yb] \cdot \| \bmu \|_2^2 \nonumber \\
    & \qquad + D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1}\bnu - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}\bnu] \nonumber\\
    & = D^{-1}\cdot [ (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1}\bnu) \yb^\top \Ab^{-1} \yb  + ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1}\bnu  ],\label{eq:refinedBO_numerator_eq0}
\end{align}
where the third equality follows by the notation $\bnu = \Qb\bmu$.
By Lemma~\ref{lemma:concentrationbounds} and the assumption that $\tr( \bSigma ) \geq C \max\{ \epsilon_{\lambda} , n \|\bSigma \|_2 , n \| \bmu \|_{\bSigma} \}$ for some large enough constant $C$, when $n$ is large enough we have
\begin{align*}
     &| \yb^\top \Ab^{-1} \bnu | \leq \frac{ c_1 n   }{  \tr( \bSigma )  - \epsilon_{\lambda}  } \| \bmu \|_{\bSigma} \leq \frac{ 2c_1 n   }{  \tr( \bSigma ) } \| \bmu \|_{\bSigma} \leq 1,\\
     &0\leq \bnu^\top \Ab^{-1}\bnu \leq \frac{ n  + c_2 \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma}^2 \leq \frac{ 2n }{ \tr( \bSigma ) } \cdot \| \bmu \|_{\bSigma}^2 \leq \frac{ 2n \| \bSigma \|_2 }{ \tr( \bSigma ) } \cdot \| \bmu \|_2^2 \leq \frac{1}{2}\cdot  \| \bmu \|_2^2 ,\\
     & \yb^\top \Ab^{-1} \yb \geq \frac{n}{ \tr( \bSigma )  + \epsilon_{\lambda} } \geq \frac{n}{ 2\tr( \bSigma ) },
\end{align*}
where $c_1,c_2$ are absolute constants. 
Plugging the bounds above into \eqref{eq:refinedBO_numerator_eq0}, we obtain
\begin{align*}
    | \yb^\top (\Xb\Xb^\top)^{-1}  \Xb \bmu | &\geq D^{-1}\cdot  \bigg(\frac{1}{2}\cdot\| \bmu \|_2^2\cdot \yb^\top \Ab^{-1} \yb  - 2\cdot   |\yb^\top \Ab^{-1}\bnu | \bigg)\\
    &\geq D^{-1}\cdot \bigg[ \frac{n}{ 4\tr( \bSigma ) } \cdot \| \bmu \|_2^2 - \frac{ 4n   }{  \tr( \bSigma ) } \| \bmu \|_{\bSigma} \bigg]\\
    &\geq D^{-1}\cdot \frac{n}{ 4\tr( \bSigma ) } \cdot  ( \| \bmu \|_2^2 -16 \| \bmu \|_{\bSigma} )\\
    &\geq D^{-1}\cdot \frac{n}{ 4\tr( \bSigma ) } \cdot  ( \| \bmu \|_2^2 -16 \|\bSigma \|_2 \| \bmu \|_2 )\\
    &\geq D^{-1}\cdot \frac{n}{ 8\tr( \bSigma ) } \cdot  \| \bmu \|_2^2,
\end{align*}
where the last inequality follows by the assumption that $\| \bmu \|_2 \geq C \| \bSigma\|_{2} $ for some large enough absolute constant $C$. 
Therefore we have
\begin{align*}%\label{eq:refinedBO_numerator_bound}
    [\yb^\top (\Xb\Xb^\top)^{-1}  \Xb \bmu]^2 \geq D^{-2}\cdot \frac{n^2}{ 64 [\tr( \bSigma )]^2 } \cdot   \| \bmu \|_2^4 = \frac{H(\bmu, \Qb, \yb, \bSigma) }{64} \cdot n^2 \| \bmu \|_2^4,
\end{align*}
where we define 
$$ H(\bmu, \Qb, \yb, \bSigma) := [D\cdot \tr(\bSigma) ]^{-2} > 0. $$
This completes the proof of the lower bound of $I_1$. 

% This gives a lower bound on the numerator in the exponent \eqref{eq:refinedBOderivation_eq0}. In the following we derive an upper bound for the denominator $\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb$. By definition, $\Xb = \yb \bmu^\top + \Qb$, Therefore
% \begin{align}
%     &\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb \nonumber \\
%     &\qquad\leq 2\cdot \underbrace{(\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \bmu^\top \bSigma \bmu}_{I_1} + 2\cdot \underbrace{ \yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb}_{I_2}\label{eq:refinedBO_denominator_eq0}
% \end{align}

% % The key is then to give upper bound of the term $\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb$. 



For $I_2$, by Lemma~\ref{lemma:matrixcalculation} we have 
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \yb &=  D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \yb- \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1} \yb] \\
    &= D^{-1} \cdot \yb^\top \Ab^{-1} \yb\\
    &\leq D^{-1}  \cdot \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} }\\
    &\leq  2D^{-1}  \cdot \frac{n}{ \tr( \bSigma )},
\end{align*}
where the first inequality follows by Lemma~\ref{lemma:concentrationbounds}, and the second inequality follows by the assumption that $\tr( \bSigma ) \geq C \epsilon_{\lambda}$ for some large enough constant $C$. Therefore we have
\begin{align*}
    I_2 =  (\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \cdot  \| \bmu \|_{\bSigma}^2 \leq  4D^{-2}  \cdot \frac{n^2 \cdot \| \bmu \|_{\bSigma}^2}{ [\tr( \bSigma )]^2} = 4 H(\bmu, \Qb, \yb, \bSigma) \cdot n^2 \cdot \| \bmu \|_{\bSigma}^2, %\label{eq:refinedBOderivation_I1bound} 
\end{align*}
where we use the definition $H(\bmu, \Qb, \yb, \bSigma) =  [D\cdot \tr(\bSigma) ]^{-2}$.
This proves the upper bound of $I_2$. 


For $I_3$, by our calculation in Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb - \yb^\top \Ab^{-1} \yb \cdot \bnu]^\top \Ab^{-1}.
\end{align*}
Denote $\ab = D^{-1}[( 1 + \yb^\top \Ab^{-1} \bnu )\cdot \yb - \yb^\top \Ab^{-1} \yb \cdot \bnu]$. Then
\begin{align}
    I_3 &= \yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb \nonumber \\
    &= \ab^\top (\Qb\Qb^\top)^{-1}\Qb\bSigma\Qb^\top (\Qb\Qb^\top)^{-1} \ab \nonumber \\
    & = \ab^\top (\Zb\bLambda \Zb^\top)^{-1} \Zb\bLambda^2 \Zb^\top (\Zb\bLambda \Zb^\top)^{-1} \ab, \label{eq:refinedBOderivation_eq1}
\end{align}
where we plug in $\bSigma = \Vb \bLambda \Vb^\top$ and $\Qb = \Zb \bLambda^{1/2} \Vb^\top$ for $\Zb$ with independent sub-Gaussian entries. By Lemma~\ref{lemma:eigenvalue_concentration}, Lemma~\ref{lemma:eigenvalue_concentration2} and \eqref{eq:refinedBOderivation_eq1}, when $\tr(\bSigma) \geq \epsilon_{\lambda}$ we have
\begin{align}
    I_3&= \ab^\top (\Zb\bLambda \Zb^\top)^{-1} \Zb\bLambda^2 \Zb^\top (\Zb\bLambda \Zb^\top)^{-1} \ab \nonumber\\
    & \leq \ab^\top (\Zb\bLambda \Zb^\top)^{-2} \ab \cdot \big[\|\bSigma\|_F^2 + \epsilon_{\lambda}'\big] \nonumber\\
    & \leq \| \ab \|_2^2\cdot \frac{\|\bSigma\|_F^2 + \epsilon_{\lambda}'}{ [\tr(\bSigma) - \epsilon_{\lambda}]^2}.\label{eq:refinedBOderivation_eq1.5}
\end{align}
Here the first inequality follows by Lemma~\ref{lemma:eigenvalue_concentration2}, and the second inequality follows by Lemma~\ref{lemma:eigenvalue_concentration}. 
By definition, we have
\begin{align}
    \| \ab \|_2^2 & = \| D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb - \yb^\top \Ab^{-1} \yb \cdot \bnu ] \|_2^2 \nonumber \\
    & \leq 2 D^{-2} ( 1 + \yb^\top \Ab^{-1} \bnu )^2 \| \yb \|_2^2 + 2  D^{-2} ( \yb^\top \Ab^{-1} \yb  )^2\cdot  \|  \Qb \bmu\|_2^2. \nonumber
    % \label{eq:refinedBOderivation_eq2}
\end{align}

Then with the same proof as in Lemma~\ref{lemma:concentrationbounds}, when $n$ is sufficiently large, with probability at least $1- O(n^{-2})$ we have 
\begin{align*}
      \| \Qb \bmu \|_2^2 \leq  2n \| \bmu \|_{\bSigma}^2.
\end{align*}
Therefore we have
\begin{align}
    \| \ab \|_2^2 & \leq 2 D^{-2} ( 1 + \yb^\top \Ab^{-1} \bnu )^2 \| \yb \|_2^2 + 2  D^{-2} ( \yb^\top \Ab^{-1} \yb  )^2\cdot  \|  \Qb \bmu\|_2^2 \nonumber\\
    & \leq 2 D^{-2} ( 1 + \yb^\top \Ab^{-1} \bnu )^2 \cdot n + 4  D^{-2} ( \yb^\top \Ab^{-1} \yb  )^2\cdot  n \cdot \| \bmu \|_{\bSigma}^2. \label{eq:refinedBOderivation_eq2}
\end{align}


Moreover, by Lemma~\ref{lemma:concentrationbounds} and the assumption that $\tr( \bSigma ) \geq C \max\{ \epsilon_{\lambda} , n, n \| \bmu \|_{\bSigma} \}$ for some large enough constant $C$, we have
\begin{align*}
    &|\yb^\top \Ab^{-1} \bnu| \leq \frac{c_3 n   }{  \tr( \bSigma )  - \epsilon_{\lambda}  } \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1, \\
    &  \yb^\top \Ab^{-1} \yb \leq \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} } \leq \frac{2n}{\tr( \bSigma )},
\end{align*}
where $c_3$ is an absolute constant. Plugging the above bounds into \eqref{eq:refinedBOderivation_eq2}, we obtain
\begin{align*}
    \| \ab \|_2^2 & \leq 2 D^{-2} ( 1 + \yb^\top \Ab^{-1} \bnu )^2 \cdot n + 4  D^{-2} ( \yb^\top \Ab^{-1} \yb  )^2\cdot  n \cdot \| \bmu \|_{\bSigma}^2\\
    &\leq 4 D^{-2} \cdot n +  8 D^{-2} \cdot n  \cdot \bigg[\frac{n}{\tr( \bSigma )} \cdot \| \bmu \|_{\bSigma}\bigg]^2\\
    &\leq 5 D^{-2} \cdot n,
\end{align*}
where the last inequality utilizes the assumption  $\tr( \bSigma ) \geq C n \| \bmu \|_{\bSigma} $ for some large enough constant $C$ again. Further plugging this bound into \eqref{eq:refinedBOderivation_eq1.5}, we obtain
\begin{align}
    I_3 &\leq \| \ab \|_2^2\cdot \frac{\|\bSigma\|_F^2 + \epsilon_{\lambda}'}{ [\tr(\bSigma) - \epsilon_{\lambda}]^2} \leq 5 D^{-2} n \cdot \frac{\|\bSigma\|_F^2 + \epsilon_{\lambda}'}{ [\tr(\bSigma) - \epsilon_{\lambda}]^2} \nonumber\\
    &\leq c_4 D^{-2}\cdot  \frac{ n\cdot \|\bSigma\|_F^2 +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2},\label{eq:refinedBOderivation_I3bound_eq1}
\end{align}
where $c_4$ is an absolute constant. 
Note that we have
\begin{align*}
     n^{3/2}\cdot \| \bSigma^2\|_F \leq n\cdot \| \bSigma\|_F\cdot (\sqrt{n}\cdot \| \bSigma\|_2)\leq n\cdot ( \| \bSigma\|_F^2 + n\cdot \| \bSigma\|_2^2)/ 2.
\end{align*}
Plugging this bound into \eqref{eq:refinedBOderivation_I3bound_eq1}, we have
\begin{align*}
    I_3 \leq c_5 D^{-2}\cdot\frac{ n\cdot \|\bSigma\|_F^2 +  n^2\cdot \| \bSigma\|_2^2 }{ [\tr(\bSigma) ]^2} = c_5 H(\bmu, \Qb, \yb, \bSigma)\cdot ( n\cdot \|\bSigma\|_F^2 +  n^2\cdot \| \bSigma\|_2^2 ),
\end{align*}
where we use the definition $H(\bmu, \Qb, \yb, \bSigma) =  [D\cdot \tr(\bSigma) ]^{-2}$, and $c_5$ is an absolute constant. This finishes the proof of the upper bound of $I_3$. 
\end{proof}




\section{Proof of Lemmas in Appendix~\ref{section:proof_appendix1}}\label{section:proof_appendix2}
Here we present the proofs of lemmas we used in Appendix~\ref{section:proof_appendix1}.
\subsection{Proof of Lemma~\ref{lemma:eigenvalue_concentration}}
The proof of Lemma~\ref{lemma:eigenvalue_concentration} is motivated by the analysis given in \citet{bartlett2020benign}. However here in Lemma~\ref{lemma:eigenvalue_concentration} we give a slightly tighter bound. The proof is as follows. 

\begin{proof}[Proof of Lemma~\ref{lemma:eigenvalue_concentration}]
Let $\cN$ be a $1/4$-net on the unit sphere $S^{n-1}$. Then by Lemma~5.2 in \citet{vershynin2010introduction}, we have $|\cN| \leq 9^n$. Denote $\zb_j =  \lambda_j^{-1/2} \Qb \vb_j \in \RR^{n}$. Then by definition, for any fixed unit vector $\hat\ab\in \cN$ we have $ \hat\ab^\top \Ab \hat\ab = \hat\ab^\top \Qb\Qb^\top \hat\ab = \hat\ab^\top \sum_{j=1}^p \lambda_j \zb_j \zb_j^\top \hat\ab = \sum_{j=1}^p \lambda_j (\hat\ab^\top\zb_j)^2$. By Lemma~5.9 in \citet{vershynin2010introduction}, there exists an absolute constant $c_1$ such that $\| \hat\ab^\top\zb_j \|_{\psi_2} \leq c_1 \sigma_u$. Therefore by Lemma~21 and Corollary~23 in \citet{bartlett2020benign}, for any $t>0$, with probability at least $1 - 2\exp(-t)$ we have
\begin{align*}
    \big| \hat\ab^\top \Ab \hat\ab - \tr(\bSigma) \big| \leq c_2 \sigma_u^2 \max \big(  t\cdot \| \bSigma\|_{2}  , \sqrt{t}\cdot  \| \bSigma\|_{F}  \big).
\end{align*}
Applying a union bound over all $\hat\ab\in \cN$, we have that with probability at least $1 - 2\cdot 9^n \exp(-t )$,
\begin{align*}
    \big| \hat\ab^\top \Ab \hat\ab - \tr(\bSigma) \big| \leq c_2 \sigma_u^2 \max \big(  t\cdot \| \bSigma\|_{2} , \sqrt{t}\cdot  \| \bSigma\|_{F}  \big) 
\end{align*}
for all $\hat\ab\in \cN$. Therefore by Lemma~25 in \citet{bartlett2020benign}, with probability at least $1 - 2\cdot 9^n \exp(-t )$, we have
\begin{align*}
    \big\| \Ab  - \tr(\bSigma) \Ib \big\|_2 \leq c_3 \sigma_u^2  \big(  t \cdot \| \bSigma\|_{2}  + \sqrt{t}\cdot  \| \bSigma\|_{F}  \big),
\end{align*}
where $c_3$ is an absolute constant. 
Setting $t = c_4 n$ for some large enough constant $c_4$, we have that with probability at least $1 - n^{-2}$,
\begin{align*}
    \big\| \Ab  - \tr(\bSigma) \Ib \big\|_2 \leq c_5  \sigma_u^2 \big(  n \cdot \| \bSigma\|_{2} + \sqrt{n}\cdot  \| \bSigma\|_{F}  \big),
\end{align*}
where $c_5$ is an absolute constant. This completes the proof.
\end{proof}






\subsection{Proof of Lemma~\ref{lemma:matrixcalculation}}
Here we present the proof of Lemma~\ref{lemma:matrixcalculation}. Our proof utilizes a key lemma by \citet{wang2020benign}, and gives further simplifications of the result. 
\begin{proof}[Proof of Lemma~\ref{lemma:matrixcalculation}]

Denote $s = \yb^\top \Ab^{-1} \yb$, $t = \bnu^\top \Ab^{-1} \bnu$, $h = \yb^\top \Ab^{-1} \bnu$. Then we have $D =  \| \bmu\|_2^2 s - st + (h+1)^2$. 
By Lemma 3 in \citet{wang2020benign}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} &= \yb^\top \Ab^{-1} - D^{-1}\cdot [ \| \bmu \|_2^2 s + h^2 + h - st ] \cdot \yb^\top \Ab^{-1} - D^{-1} s \cdot \bnu^\top \Ab^{-1}.
\end{align*}
Rearranging terms, we obtain
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} 
    & =\bigg[ 1 - \frac{\| \bmu \|_2^2 s + h^2 + h - st }{ \| \bmu\|_2^2 s - st + (h+1)^2} \bigg] \cdot \yb^\top \Ab^{-1} - D^{-1} s \cdot \bnu^\top \Ab^{-1} \\
    & = \frac{ h + 1 }{ \| \bmu\|_2^2 s - st + (h+1)^2} \cdot \yb^\top \Ab^{-1} - D^{-1} s \cdot \bnu^\top \Ab^{-1}\\
    & = D^{-1} [ (h + 1) \yb^\top \Ab^{-1} - s \cdot \bnu^\top \Ab^{-1}].
\end{align*}
At last, by the definition of $D$, we have
\begin{align*}
     D &= \yb^\top\Ab^{-1} \yb \cdot  (\| \bmu\|_2^2 - \bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \Qb\bmu)  + (1 + \yb^\top\Ab^{-1} \bnu)^2\\
     &\geq (1 + \yb^\top\Ab^{-1} \bnu)^2,
\end{align*}
where we utilize the fact that $ \yb^\top\Ab^{-1} \yb \geq 0$ and $\| \bmu\|_2^2 \geq \bmu^\top\Qb^\top (\Qb\Qb^\top)^{-1} \Qb\bmu$. Since $\yb^\top\Ab^{-1} \bnu \neq -1$ with probability $1$, we see that $D > 0$ almost surely. This completes the proof.
\end{proof}






\subsection{Proof of Lemma~\ref{lemma:concentrationbounds}}
The proof of Lemma~\ref{lemma:concentrationbounds} is based on the application of eigenvalue concentration results in Lemma~\ref{lemma:eigenvalue_concentration}. We present the details as follows.
\begin{proof}[Proof of Lemma~\ref{lemma:concentrationbounds}]
The bounds on $\yb^\top \Ab^{-1} \yb$ are directly derived from Lemma~\ref{lemma:eigenvalue_concentration} and the fact that $\| \yb \|_2^2 = n$. To derive the bounds for $\bnu^\top\Ab^{-1}\bnu$, we note that by definition, $\bnu = \Qb\bmu$ and 
\begin{align*}
    \bnu^\top \Ab^{-1} \bnu = \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu. %= \| \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu \|_2^2
\end{align*}

Denote $\zb_i =  \lambda_i^{-1/2} \Qb \vb_i \in \RR^{n}$,  $\Zb = [\zb_1,\ldots, \zb_p] \in \RR^{n \times p}$, and $\tilde\bmu = \bLambda^{1/2} \Vb^\top \bmu$. Then $\Qb = \Zb \bLambda^{1/2} \Vb^\top$, $\Qb \bmu = \Zb \tilde\bmu$, and
\begin{align*}
    \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu &= \bmu^\top \Vb  \bLambda^{1/2} \Zb^\top ( \Zb \bLambda \Zb^\top )^{-1} \Zb \bLambda^{1/2} \Vb^\top \bmu\\
    &= \tilde\bmu^\top  \Zb^\top ( \Zb \bLambda \Zb^\top )^{-1} \Zb  \tilde\bmu\\
    &\leq \frac{ \| \Zb \tilde\bmu \|_2^2 }{ \tr(\bSigma) - \epsilon_{\lambda} }.
\end{align*}
Similarly, we have
\begin{align*}
    \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu \geq \frac{ \| \Zb \tilde\bmu \|_2^2 }{  \tr(\bSigma) + \epsilon_{\lambda} }.
\end{align*}
We now proceed to give upper and lower bounds for the term $\| \Zb \tilde\bmu \|_2^2 =  \sum_{i=1}^n ( \sum_{j=1}^p \Zb_{ij} \tilde\mu_j )^2$. Note that by definition,  $\Zb_{ij}$ for $i\in [n]$ and $j\in [p]$ are independent sub-Gaussian random variables with $\| \Zb_{ij} \|_{\psi_2} \leq \sigma_u$. By Lemma~5.9 in \citet{vershynin2010introduction}, we have
\begin{align*}
    \Bigg\| \sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg\|_{\psi_2} \leq c_1 \| \tilde\bmu \|_2 \cdot \sigma_u, 
\end{align*}
where $c_1$ is an absolute constant. Therefore by Lemma~5.14 in \citet{vershynin2010introduction}, we have
\begin{align*}
     \Bigg\| \Bigg(\sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg)^2 - \| \tilde\bmu \|_2^2 \Bigg\|_{\psi_1} \leq  c_2 \| \tilde\bmu \|_2^2,
\end{align*}
where we merge $\sigma_u$ into the absolute constant $c_2$. 
By Bernstein's inequality, with probability at least $1 - n^{-2}$, 
\begin{align*}
    \big| \| \Zb \tilde\bmu \|_2^2 - \EE \| \Zb \tilde\bmu \|_2^2 \big| \leq c_3 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)},
\end{align*}
where $c_3$ is an absolute constant. Therefore we have%Merging $\sigma_u^2$ into the constant coefficient, we have
% with probability at least $1 - \exp(-n/c_1)$,
% \begin{align*}
%     \big| \| \Zb \tilde\bmu \|_2^2 - \EE \| \Zb \tilde\bmu \|_2^2 \big|\leq n\cdot \| \tilde\bmu \|_2 /c_1.
% \end{align*}
% Therefore we have
\begin{align}\label{eq:concentrationbounds_eq1}
     n  \| \tilde\bmu \|_2^2 - c_3 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)} \leq  \| \Qb \bmu \|_2^2 = \| \Zb \tilde\bmu \|_2^2 \leq  n \| \tilde\bmu \|_2^2 + c_3 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)},
\end{align}
and
\begin{align*}
    \frac{n - c_3 \sqrt{n\log( n)}}{ \tr(\bSigma) + \epsilon_{\lambda} } \cdot \| \tilde\bmu \|_2^2 \leq  \bnu^\top \Ab^{-1} \bnu \leq \frac{n + c_3 \sqrt{n\log( n)}}{ \tr(\bSigma) - \epsilon_{\lambda} } \cdot \| \tilde\bmu \|_2^2.
\end{align*}
Similarly for $\yb^\top \Ab^{-1} \bnu$, by Cauchy-Schwarz inequality, for large enough $n$ we have
\begin{align*}
    |\yb^\top \Ab^{-1} \bnu| = |\yb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu| \leq \| \yb \|_2 \cdot \| (\Qb\Qb^\top)^{-1} \Qb \bmu \|_2 = \sqrt{n} \cdot \sqrt{ \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-2} \Qb \bmu }.
\end{align*}
Applying Lemma~\ref{lemma:eigenvalue_concentration} and the inequality \eqref{eq:concentrationbounds_eq1}, we have
\begin{align*}
    |\yb^\top \Ab^{-1} \bnu|
    &\leq\frac{\sqrt{n} }{  \tr(\bSigma) - \epsilon_{\lambda}  } \| \Qb\bmu \|_2 \leq\frac{\sqrt{n} \cdot \sqrt{n + c_3 \sqrt{n\log( n)}} }{  \tr(\bSigma) - \epsilon_{\lambda}  } \| \tilde\bmu \|_2 \leq \frac{c_4 n }{  \tr(\bSigma) - \epsilon_{\lambda}  } \| \tilde\bmu \|_2,
\end{align*}
where $c_4$ is an absolute constant. Note that $\| \tilde\bmu \|_2 = \| \bmu \|_{\bSigma}$. This completes the proof.
% Denote $\Zb = \Qb \Sigma^{-1/2}$. Then each row of $\Zb$ follows standard Gaussian distribution.
\end{proof}












\subsection{Proof of Lemma~\ref{lemma:anisotropicbound2}}
The proof of Lemma~\ref{lemma:anisotropicbound2} utilizes the polarization identity and is similar to the proof of Lemma~\ref{lemma:anisotropicbound1}.
\begin{proof}[Proof of Lemma~\ref{lemma:anisotropicbound2}]
The proof is very similar to the proof of Lemma~\ref{lemma:anisotropicbound1}. We have
\begin{align*}
    \yb^\top \Ab^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ab^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ab^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{\tr(\bSigma)  + \epsilon_{\lambda}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{\tr(\bSigma)  - \epsilon_{\lambda}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{\tr(\bSigma)  + \epsilon_{\lambda}} - \frac{2n - 2\sqrt{n}}{\tr(\bSigma)  - \epsilon_{\lambda}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (\tr(\bSigma)  - \epsilon_{\lambda}) - (n - \sqrt{n}) (\tr(\bSigma)  + \epsilon_{\lambda}) }{\tr(\bSigma)^2  - \epsilon_{\lambda}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} \tr(\bSigma) - 2n \epsilon_{\lambda} }{\tr(\bSigma)^2  - \epsilon_{\lambda}^2}\\
     & = \frac{ \tr(\bSigma) - \sqrt{n} \epsilon_{\lambda} }{\tr(\bSigma)^2  - \epsilon_{\lambda}^2},
\end{align*}
where we use the polarization identity $\ab^\top \Mb \bbb = 1/4(\ab+\bbb)^\top \Mb (\ab+\bbb) - 1/4(\ab-\bbb)^\top \Mb (\ab-\bbb)$ in the first equality and use  Lemma~\ref{lemma:eigenvalue_concentration} 
to derive the inequality. This completes the proof.
\end{proof}




\section{Proof of Corollaries}\label{section:corollaryproof}
Here we provide the proofs of Corollaries~\ref{col:isotropic}, \ref{col:polynomialdecay} and \ref{col:rare-weak} in Section~\ref{section:mainresults}.  


\subsection{Proof of Corollary~\ref{col:isotropic}}
The proof of Corollary~\ref{col:isotropic} is a direct application of Theorem~\ref{thm:BOnew}. The detailed proof is as follows.

\begin{proof}[Proof of Corollary~\ref{col:isotropic}]
When $\bSigma = \Ib$, we have $\tr( \bSigma ) = d$, $\|\bSigma \|_2 = 1$, $\| \bSigma \|_F = \sqrt{d}$ and $\| \bmu \|_{\bSigma} = \| \bmu \|_2$. Under the condition in  Corollary~\ref{col:isotropic} that $d \geq C \max\big\{ n^{2} , n\sqrt{\log(n)}\cdot \| \bmu \|_{2} \big\}$ and $\| \bmu \|_2 \geq C $ for some large enough absolute constant $C$, it is easy to check that the conditions of Theorem~\ref{thm:BOnew}
$$\tr( \bSigma ) = \Omega\big( \max\big\{ n^{3/2} \|\bSigma \|_2, n\| \bSigma \|_F , n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} \big\} \big),~\| \bmu \|_2 \geq C \| \bSigma\|_{2}$$ 
hold. Therefore by Theorem~\ref{thm:BOnew}, we have
\begin{align*}
    R(\hat\btheta_{\text{SVM}}) \leq \exp\Bigg(  \frac{ - c_1 n \| \bmu \|_2^4  }{  n  \| \bmu \|_{\bSigma}^2+ \| \bSigma\|_F^2 +  n\| \bSigma\|_2^2 } \Bigg) \leq \exp\Bigg(  \frac{ - c_2 n \| \bmu \|_2^4  }{  n  \| \bmu \|_2^2+ d} \Bigg),
\end{align*}
where $c_1$, $c_2$ are absolute constants. This completes the proof. 
\end{proof}


\subsection{Proof of Corollary~\ref{col:polynomialdecay}}
Here we present the proof of Corollary~\ref{col:polynomialdecay}, which is mostly based on the estimation of the order of the summations $\sum_{k=1}^d k^{-\alpha}$ and $\sum_{k=1}^d k^{-2\alpha}$. 
\begin{proof}[Proof of Corollary~\ref{col:polynomialdecay}]
We first consider the case when $\alpha\in [0, 1/2)$. We have
\begin{align*}
    \tr(\bSigma) = \sum_{k=1}^d \lambda_k = \sum_{k=1}^d k^{-\alpha} \geq \int_{t=1}^{d} t^{-\alpha} \dd t = \frac{d^{1 - \alpha}}{1 - \alpha} - \frac{1}{1 - \alpha} > \frac{d^{1 - \alpha}}{2(1 - \alpha)}
\end{align*}
when $d$ is sufficiently large. Similarly, we have
    % &\| \bSigma \|_F^2 = \sum_{k=1}^d \lambda_k^2 = \sum_{k=1}^d k^{-2\alpha} \geq \int_{t=1}^{d} t^{-2\alpha} \dd t = \frac{d^{1 - 2\alpha}}{1 - 2\alpha} - \frac{1}{1 - 2\alpha},\\
\begin{align*}
    \| \bSigma \|_F^2 = \sum_{k=1}^d \lambda_k^2 = 1 + \sum_{k=2}^d k^{-2\alpha} \leq 1 + \int_{t=1}^{d} t^{-2\alpha} \dd t = 1 + \frac{d^{1 - 2\alpha}}{1 - 2\alpha} - \frac{1}{1 - 2\alpha} \leq 1 + \frac{d^{1 - 2\alpha}}{1 - 2\alpha}.
\end{align*}
% when $d^{1 - 2\alpha}$ is sufficiently large.
Therefore, a sufficient condition for the assumptions in Theorem~\ref{thm:BOnew} to hold is that $\| \bmu \|_2 = \omega(1)$ and
% $$\tr( \bSigma ) \geq C \max\{ n^{3/2} \|\bSigma \|_2, n\| \bSigma \|_F , n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} \}$$ 
\begin{align*}
    &\frac{d^{1 - \alpha}}{2(1 - \alpha)}  \geq C n^{3/2},\\
    &\frac{d^{1 - \alpha}}{2(1 - \alpha)}  \geq C n \cdot \sqrt{1 + \frac{d^{1 - 2\alpha}}{1 - 2\alpha} },\\
    &\frac{d^{1 - \alpha}}{2(1 - \alpha)}  \geq C n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} .
\end{align*}
After simplifying the result, we derive the condition that  $ d = \tilde\Omega( n^{\frac{3}{2(1 - \alpha)}} + n^2 + ( n \| \bmu \|_{\bSigma})^{\frac{1}{1 - \alpha}}  )$. We further check the conditions on $\| \bmu \|_2$ that lead to $o(1)$ population risk. Note that when $\| \bmu \|_2 = \omega(1)$, $\| \bmu \|_2^4 / \| \bmu \|_{\bSigma}^2 = \omega(1)$. We also check the condition that $n \| \bmu \|_2^4 / \| \bSigma \|_F^2 = \omega(1)$. A sufficient condition is that
\begin{align*}
    n \| \bmu \|_2^4 = \omega\bigg( 1 + \frac{d^{1 - 2\alpha}}{1 - 2\alpha} \bigg).
\end{align*}
Simplifying the condition completes the proof for the case $\alpha\in [0, 1/2)$.

For the case $\alpha = 1/2$, we have
\begin{align*}
    \tr(\bSigma) = \sum_{k=1}^d \lambda_k = \sum_{k=1}^d k^{-1/2} \geq \int_{t=1}^{d} t^{-1/2} \dd t = \frac{d^{1 - 1/2}}{1 - 1/2} - \frac{1}{1 - 1/2} > \sqrt{d}
\end{align*}
when $d$ is sufficiently large. Moreover,
\begin{align*}
    \| \bSigma \|_F^2 = \sum_{k=1}^d \lambda_k^2 = 1 + \sum_{k=2}^d k^{-1} \leq 1 + \int_{t=1}^{d} t^{-1} \dd t = 1 + \log(d).
\end{align*}
Verifying the conditions
\begin{align*}
    &\sqrt{d}  \geq C n^{3/2},\\
    &\sqrt{d} \geq C n \cdot \sqrt{1 + \log(d) },\\
    &\sqrt{d} \geq C n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma}
\end{align*}
then gives a sufficient condition $ d = \tilde\Omega( n^{3} + n^2 \| \bmu \|_{\bSigma}^2  )$, $\| \bmu \|_2 = \omega(1)$ for the assumptions in Theorem~\ref{thm:BOnew} to hold. It is also easy to verify that when $\| \bmu \|_2 = \omega( 1 + n^{-1/4}(\log(d))^{1/4} )$ we have $R(\hat\btheta_{\text{SVM}}) = o(1)$. 


Finally for the case $\alpha\in (1/2, 1 )$, we have
\begin{align*}
    \tr(\bSigma) = \sum_{k=1}^d \lambda_k = \sum_{k=1}^d k^{-\alpha} \geq \int_{t=1}^{d} t^{-\alpha} \dd t = \frac{d^{1 - \alpha}}{1 - \alpha} - \frac{1}{1 - \alpha}.
\end{align*}
Moreover, in this setting we have $\| \bSigma \|_F^2 \leq c_1$ for some absolute constant $c_1$. It is therefore easy to check that $\| \bmu \|_2 = \omega(1)$ and
\begin{align*}
    d = \tilde\Omega( n^{\frac{3}{2(1 - \alpha)}} + ( n \| \bmu \|_{\bSigma})^{\frac{1}{1 - \alpha}}  )
\end{align*}
are sufficient for the assumptions in Theorem~\ref{thm:BOnew} to hold, and we also have $R(\hat\btheta_{\text{SVM}}) = o(1)$.
\end{proof}


\subsection{Proof of Corollary~\ref{col:rare-weak}}
The proof of Corollary~\ref{col:rare-weak} for the rare/weak feature model is rather straightforward. 
\begin{proof}[Proof of Corollary~\ref{col:rare-weak}]
Note that in the rare/weak feature model we have $\| \bmu \|_2 = \gamma \sqrt{s}$. Therefore the conditions of Corollary~\ref{col:isotropic} are satisfied and we have 
\begin{align*}
    R(\hat\btheta_{\text{SVM}} ) \leq  \exp\bigg( - \frac{  c_1 n \| \bmu \|_2^4  }{  n  \| \bmu \|_{2}^2+ d } \bigg) = \exp\bigg( - \frac{  c_1 n \gamma^4 s^2  }{  n  \gamma^2 s + d } \bigg),
\end{align*}
where $c_1$ is an absolute constant. This completes the proof. 
\end{proof}



\section{Experiments}
In this section we present simulation results to back up our population risk bound in Theorem~\ref{thm:BOnew}. We generate $\ub$ as a standard Gaussian vector, and set $\bSigma = \diag\{ \lambda_1,\ldots, \lambda_d\}$ with $\lambda_k = k^{-\alpha}$ for some parameter $\alpha \in [0,1)$, which matches the setting studied in Section~\ref{section:mainresults}. The mean vector $\bmu$ is generated uniformly from the sphere centered at the origin with radius $r$. All population risks are calculated by taking the average of $100$ independent experiments. Note that since we are considering Gaussian mixtures in our experiments, the population risk can be directly calculated with the Gaussian cumulative distribution function:
\begin{align*}
    R(\hat\btheta_{\text{SVM}}) &= \PP[ \hat\btheta_{\text{SVM}}^\top \bmu  < y\cdot \hat\btheta_{\text{SVM}}^\top  \bLambda^{1/2} \ub ].
\end{align*}
The derivation of the above result is in the proof of Lemma~\ref{lemma:subGaussian_riskbound} in Appendix~\ref{section:proof_subGaussian_riskbound}.

\noindent\textbf{Population risk versus the norm of the mean vector $\| \bmu\|_2$.} We first present experimental results on the relation between the population risk and the norm of the mean vector $\| \bmu\|_2$. Note that in our setting, the risk bound in Theorem~\ref{thm:BOnew} reduces to the following bound:
\begin{align*}
    R(\hat\btheta_{\text{SVM}}) \leq \exp\Bigg(  \frac{ - C' n \| \bmu \|_2^4  }{  n  \| \bmu \|_{\bSigma}^2+ \sum_{k=1}^d k^{-2\alpha}} \Bigg).
\end{align*}
Based on this bound, we can first see that the population risk should be smaller when $\alpha$ is larger. Moreover, how $R(\hat\btheta_{\text{SVM}})$ depends on $\| \bmu \|_2$ is determined by the comparison between the scaling of the two terms in the denominator. When
\begin{align}\label{eq:experimentdiscussion_eq1}
    \sum_{k=1}^d k^{-2\alpha} \geq  n  \| \bmu \|_{\bSigma}^2,
\end{align}
we can expect that $ -\log (R(\hat{\btheta}_{\text{SVM}})) $ should be roughly of order $\| \bmu \|_2^4$. On the other hand, if \eqref{eq:experimentdiscussion_eq1} does not hold, then $ -\log (R(\hat{\btheta}_{\text{SVM}})) $ should be roughly of order $\| \bmu \|_2^2$. It is also clear that whether \eqref{eq:experimentdiscussion_eq1} holds heavily depends on the values of the sample size $n$ and $\alpha$: when $n$ is large, then \eqref{eq:experimentdiscussion_eq1} is less likely to be satisfied. Moreover, when $\alpha > 1/2$, \eqref{eq:experimentdiscussion_eq1} cannot hold because in this case $\sum_{k=1}^d k^{-2\alpha}$ is upper bounded by a constant. 

\begin{figure}[ht!]
	\begin{center}
% 	\vspace{-0.25in}
		\begin{tabular}{cc}
 		\hspace{-0.2in}
			\subfigure[$R(\hat{\btheta}_{\text{SVM}})$ versus $\| \bmu \|_2$, $n = 10$, $d = 2000$]{\includegraphics[width=0.4\linewidth,angle=0]{figures/errorplot_risk_mu_d2000n10.pdf}\label{subfig:1}}
			& 
			\subfigure[$ -\log (R(\hat{\btheta}_{\text{SVM}})) $
 versus $\| \bmu \|_2^2$, $n = 10$, $d = 2000$]{\includegraphics[width=0.4\linewidth,angle=0]{ICML21/figures/errorplot_risk_mu_log_d2000n10.pdf}\label{subfig:2}}
			\\
			\subfigure[$R(\hat{\btheta}_{\text{SVM}})$ versus $\| \bmu \|_2$, $n = 100$, $d = 2000$]{\includegraphics[width=0.4\linewidth,angle=0]{figures/errorplot_risk_mu_d2000n100.pdf}\label{subfig:3}}
			& 
			\subfigure[$-\log(R(\hat{\btheta}_{\text{SVM}}))$ versus $\| \bmu \|_2^2$, $n = 100$, $d = 2000$]{\includegraphics[width=0.4\linewidth,angle=0]{figures/errorplot_risk_mu_log_d2000n100.pdf}\label{subfig:4}}
		    \end{tabular}
	\end{center}
	\vskip -12pt
	\caption{Experiments on the dependency of the population risk $R(\hat\btheta_{\text{SVM}})$ on the norm of the mean vector $\| \bmu\|_2$ with different values of $\alpha$ and sample size $n$. (a) and (b) give the curves with $n = 10$, while (c) and (d) are for the case $n = 100$. Moreover, (a) and (c) give the curves of $R(\hat\btheta_{\text{SVM}})$ versus $\| \bmu\|_2$, and to further test the tightness of our risk bound, in (b) and (d) we also study the relation between $-\log(R(\hat{\btheta}_{\text{SVM}}))$ and $\| \bmu \|_2^2$. The dimension $d$ is set to $2000$ in all these figures. In (d) we omit the last point $\|\bmu \|_2 = 16$ in the curve for $\alpha = 0.8$ because the population risk in this case is too small and is dominated by the numerical accuracy.
	} 
	\label{fig:errorcurve}
%	\vspace{-.15in}
\end{figure}

In Figure~\ref{fig:errorcurve}, we verify the above argument by examining the dependency of the population risk $R(\hat\btheta_{\text{SVM}})$ on the norm of the mean vector $\| \bmu\|_2$ with different values of $\alpha$ and sample size $n$. From Figures~\ref{subfig:1} and \ref{subfig:3}, we can see that $R(\hat{\btheta}_{\text{SVM}})$ decreases with $\| \bmu \|_2$ and $\alpha$. From Figure~\ref{subfig:2}, we verify that when $n = 10$ (which is rather small) and when $\alpha = 0, 0.2, 0.4$, $ -\log (R(\hat{\btheta}_{\text{SVM}})) $ is linear in $\| \bmu \|_2^2$. This verifies our discussion for the setting when \eqref{eq:experimentdiscussion_eq1} holds. On the other hand, when $\alpha = 0.6,0.8$, $ -\log (R(\hat{\btheta}_{\text{SVM}})) $ has a higher order dependency in $\| \bmu \|_2^2$, which is because $\sum_{k=1}^d k^{-2\alpha}$ is upper bounded by a constant and \eqref{eq:experimentdiscussion_eq1} cannot hold. In Figure~\ref{subfig:4}, we further verify that when $n = 100$, \eqref{eq:experimentdiscussion_eq1} never holds and $ -\log (R(\hat{\btheta}_{\text{SVM}})) $ is of order $\| \bmu \|_2^2$ for all choices of $\alpha$. This set of experiments verifies our risk bound in Theorem~\ref{thm:BOnew}. 



\noindent\textbf{Verification of the dimension-dependent and dimension-free settings.} In Corollary~\ref{col:polynomialdecay}, we have discussed that when $\alpha < 1/2$, achieving a small population risk requires a larger $\| \bmu \|_2$ when $d$ is larger. On the other hand, when $\alpha > 1/2$, the requirement on $\| \bmu \|_2$ to achieve small population error is dimension-free. Here we present experimental results to verify our claim. The results are given in Figure~\ref{fig:phasetransition}. We can see very clearly that when $\alpha = 0.2$, the risk curves for different $d$ are different, and larger $d$ results in worse population risk. However, when $\alpha = 0.8$, all the risk curves are almost exactly the same, which indicates that the population risk is dimension-free. This verifies our claim in Corollary~\ref{col:polynomialdecay}.


\begin{figure}[h]
	\begin{center}
% 	\vspace{-0.25in}
		\begin{tabular}{cc}
 		\hspace{-0.2in}
			\subfigure[$R(\hat{\btheta}_{\text{SVM}})$ versus $\| \bmu \|_2$, $\alpha = 0.2$, $n = 10$]{\includegraphics[width=0.4\linewidth,angle=0]{figures/errorplot_risk_mu_n10_alpha02.pdf}\label{subfig:5}}
			& 
			\subfigure[$R(\hat{\btheta}_{\text{SVM}})$ versus $\| \bmu \|_2$, $\alpha = 0.8$, $n = 10$]{\includegraphics[width=0.4\linewidth,angle=0]{ICML21/figures/errorplot_risk_mu_n10_alpha08.pdf}\label{subfig:6}}
		    \end{tabular}
	\end{center}
	\vskip -12pt
	\caption{The population risk curve with respect to $\| \bmu\|_2$ with different values of $\alpha$ and dimension $d$. (a) shows the result for $\alpha = 0.2$, while (b) is for the case $\alpha = 0.8$. The sample size $n$ is set to $10$ in both experiments.} 
	\label{fig:phasetransition}
%	\vspace{-.15in}
\end{figure}

\end{document}
