\documentclass[11pt]{article}
\pdfoutput=1

\usepackage{mathrsfs}
\usepackage{amsmath,amssymb}
\usepackage{bm}
\usepackage{natbib}
\usepackage[usenames]{color}
\usepackage{amsthm}

\usepackage{multirow} 
\usepackage{enumitem}

\newcommand\dd{\mathrm{d}}
\DeclareMathOperator{\sech}{sech}
\DeclareMathOperator{\csch}{csch}
\DeclareMathOperator{\arcsec}{arcsec}
\DeclareMathOperator{\arccot}{arcCot}
\DeclareMathOperator{\arccsc}{arcCsc}
\DeclareMathOperator{\arccosh}{arcCosh}
\DeclareMathOperator{\arcsinh}{arcsinh}
\DeclareMathOperator{\arctanh}{arctanh}
\DeclareMathOperator{\arcsech}{arcsech}
\DeclareMathOperator{\arccsch}{arcCsch}
\DeclareMathOperator{\arccoth}{arcCoth} 


\usepackage[colorlinks,
linkcolor=red,
anchorcolor=blue,
citecolor=blue
]{hyperref}

\renewcommand{\baselinestretch}{1.05}

\def \dd {\rm{d}}




\usepackage{setspace}
%\setstretch{1.5}
\usepackage[left=1in, right=1in, top=1in, bottom=1in]{geometry}

\usepackage{xcolor}
\newcommand{\bin}[1]{\textcolor{blue}{[Bin: #1]}}

\ifdefined\final
\usepackage[disable]{todonotes}
\else
\usepackage[textsize=tiny]{todonotes}
\fi
\setlength{\marginparwidth}{0.8in}
\newcommand{\todoy}[2][]{\todo[size=\scriptsize,color=blue!20!white,#1]{Yuan: #2}}
\newcommand{\todod}[2][]{\todo[size=\scriptsize,color=red!20!white,#1]{XXX: #2}}
\newcommand{\todoq}[2][]{\todo[size=\scriptsize,color=orange!20!white,#1]{Quanquan: #2}}
\newcommand{\mip}[1]{\langle #1 \rangle}


\title{\huge Benign Overfitting in Linear Classification}


\author
{
    
}


\date{}

\usepackage{mylatexstyle}







\def\supp{\mathop{\text{supp}}}
\def\card{\mathop{\text{card}}}
\def\rank{\mathrm{rank}}
\def\dd{\mathrm{d}}
\def\tr{\mathop{\text{Tr}}}
\newcommand{\red}{\color{red}}
\newcommand{\blue}{\color{blue}}
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
\newcommand{\cIs}{\cI_{\hat{s}}}
\def \CC {\textcolor{red}}
\newtheorem{condition}[theorem]{Condition}

%\def \hbtheta{\hat \btheta}
\def \hbtheta{ \btheta}
\def \barell{\bar \ell}
\def \Breg {B_{\psi}}
\def \tbtheta {\tilde{\btheta}}
\def \poly {\mathrm{poly}}



\def\supp{\mathop{\text{supp}}}
\def\card{\mathop{\text{card}}}
\def\rank{\mathrm{rank}}
\def\tr{\mathop{\text{Tr}}}
\def \CC {\textcolor{red}}
\def \CCC {\textcolor{blue}}

%\def \hbtheta{\hat \btheta}
\def \hbtheta{ \btheta}
\def \barell{\bar \ell}
\def \Breg {B_{\psi}}
\def \tbtheta {\tilde{\btheta}}
\def \poly {\mathrm{poly}}

\begin{document}

\maketitle




% \section{Introduction}




Our contribution:
\begin{enumerate}
\item \todoq{consider logistic regression, implicit bias, finite-time analysis}
\item Infinite-dimensional linear classification
\item Non-isotropic
%\item Comparison with Phil
%\item \todoq{can we do sub-Gaussian?}
\item \todoq{Can we cover Massart noise?}
\item Distribution-specific PAC learning, halfspace, linear classification
\item do we require the component of the sub-Gaussian r.v. to be independent? See peter's benign overfitting in ridge regression paper.
\end{enumerate}

\section{Related Work}

% The phenomenon of benign overfitting has been studied by a recent line of work \citep{bartlett2020benign,chatterji2020finite,muthukumar2020classification,wang2020benign}. 

The phenomenon of benign overfitting was first raised by \citet{bartlett2020benign}, who studied over-parameterized linear regression and showed that, under certain conditions, the minimum-norm interpolator of the noisy training data can still achieve asymptotically optimal population risk. A later work by \citet{chatterji2020finite} studied the risk of interpolating linear classifiers in the setting where the two class-conditional distributions have different means but the same covariance
matrix. \citet{muthukumar2020classification} studied benign overfitting for a different classification problem with Gaussian features, and highlighted an important observation that under certain conditions, the solutions of hard-margin support vector machine (SVM) are identical to the least-squares minimum-norm interpolators. Following the same intuition, a very recent work \citet{wang2020benign} further studied the equivalence between SVM and minimum norm interpolator and the benign overfitting phenomenon under an isotropic Gaussian mixture model. The results in this paper are closely related to \citet{muthukumar2020classification} and \citet{wang2020benign}. Compared with these existing results, our work applies to infinite-dimensional, non-isotropic sub-Gaussian mixtures, and gives a tighter population risk bound. 

Our work is also closely related to the phenomenon of double descent studied in recent works \citep{belkin2019reconciling,belkin2019two,hastie2019surprises,mei2019generalization}. Specifically, these results show that the curve of risk versus over-parameterization has a double descent shape, and therefore indicate that over-parameterization can be beneficial to achieve small risk.


\section{Problem Setting}
Suppose that $y_i$, $i\in [n]$ are generated as i.i.d. Rademacher variables. Suppose that $\qb$ is generated from a distribution with covariance matrix $\bSigma$ such that $\qb = \Vb \bLambda^{1/2} \ub $, where $\bSigma = \Vb \bLambda \Vb^\top$ is the spectral decomposition of $\bSigma$, and $\ub$ is a random vector with independent zero-mean sub-Gaussian entries. Without loss of generality, we assume that $\EE u_j^2 = 1$ and $\sup_j \| u_j \|_{\psi_2} \leq \sigma_q$ (This is because we can absorb the scaling into $\bSigma$). The features are given as
\begin{align*}
    \xb_i = y_i\cdot \bmu + \qb_i, \qquad i\in [n],
\end{align*}
where $\bmu \in \RR^p$ is a fixed vector and $\qb_1,\ldots,\qb_n$ are independent copies of $\qb$. We denote
\begin{align*}
    \Xb = \yb \bmu^\top + \Qb,
\end{align*}
where $\yb = [y_1,\ldots,y_n]^\top \in \RR^n$ and $\Xb = [ \xb_1,\ldots, \xb_n ]^\top, \Qb = [ \qb_1,\ldots, \qb_n ]^\top \in \RR^{n \times p}$.%, and each row of $\Qb$ are generated independently from $N(\mathbf{0}, \bSigma)$. 




\section{Main Results}



Consider
\begin{align*}
    \hat\btheta_{\text{LS}} = \argmin \| \btheta \|_2^2 \qquad \text{subject to } y_i = \btheta^\top \xb_i, i\in [n]
\end{align*}
and 
\begin{align*}
    \hat\btheta_{\text{SVM}} = \argmin \| \btheta \|_2^2 \qquad \text{subject to } y_i \cdot \btheta^\top \xb_i \geq 1, i\in [n]
\end{align*}

% We study the classification risk
% \begin{align*}
%     R(\btheta) = \PP( y\cdot \btheta^\top \xb < 0 ).
% \end{align*}


\begin{lemma}\label{lemma:equivalence}
Suppose that $y_i \cdot \eb_i^\top (\Xb\Xb^\top)^{-1}\yb >0$ for $i\in[n]$. Then $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
\end{lemma}

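Denote $\Ub = \Qb\Qb^\top$, $\db = \Qb\bmu$, and $D = \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) + (\yb^\top \Ub^{-1} \db + 1)^2$. The following lemma is a direct consequence of Lemma 3 in \citet{wang2020benign}.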

\begin{lemma}\label{lemma:matrixcalculation}
The following calculation of $\yb^\top (\Xb\Xb^\top)^{-1}$ holds: 
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}].
\end{align*}
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} &= \yb^\top \Ub^{-1} - D^{-1}\cdot [ \| \bmu \|_2^2 s + h^2 + h - st ] \cdot \yb^\top \Ub^{-1} - D^{-1}\db^\top \Ub^{-1}\\
%     & =\bigg[ 1 - \frac{\| \bmu \|_2^2 s + h^2 + h - st }{ \| \bmu\|_2^2 s - st + (h+1)^2} \bigg] \cdot \yb^\top \Ub^{-1} - D^{-1}\db^\top \Ub^{-1} \\
%     & = \frac{ h + 1 }{ \| \bmu\|_2^2 s - st + (h+1)^2} \cdot \yb^\top \Ub^{-1} - D^{-1}\db^\top \Ub^{-1}\\
%     & = D^{-1} [ (h + 1) \yb^\top \Ub^{-1} - \db^\top \Ub^{-1}].
% \end{align*}
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:matrixcalculation}]

Denote $s = \yb^\top \Ub^{-1} \yb$, $t = \db^\top \Ub^{-1} \db$, $h = \yb^\top \Ub^{-1} \db$, $D = s (\| \bmu\|_2^2 - t) + (h+1)^2$. Then by Lemma 3 in \citet{wang2020benign}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} &= \yb^\top \Ub^{-1} - D^{-1}\cdot [ \| \bmu \|_2^2 s + h^2 + h - st ] \cdot \yb^\top \Ub^{-1} - D^{-1} s \cdot \db^\top \Ub^{-1}.
\end{align*}
Rearranging terms, we obtain
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} 
    & =\bigg[ 1 - \frac{\| \bmu \|_2^2 s + h^2 + h - st }{ \| \bmu\|_2^2 s - st + (h+1)^2} \bigg] \cdot \yb^\top \Ub^{-1} - D^{-1} s \cdot \db^\top \Ub^{-1} \\
    & = \frac{ h + 1 }{ \| \bmu\|_2^2 s - st + (h+1)^2} \cdot \yb^\top \Ub^{-1} - D^{-1} s \cdot \db^\top \Ub^{-1}\\
    & = D^{-1} [ (h + 1) \yb^\top \Ub^{-1} - s \cdot \db^\top \Ub^{-1}].
\end{align*}
\end{proof}



Denote by $\lambda_1 \geq \lambda_2 \geq \cdots \geq \lambda_p$ the eigenvalues of $\bSigma$ in descending order, with corresponding eigenvectors $\vb_1,\ldots,\vb_p$, and let $\blambda = (\lambda_1,\ldots,\lambda_p)^\top$, so that $\| \blambda \|_1 = \tr(\bSigma)$. Denote $\bLambda = \diag\{ \lambda_1,\lambda_2,\ldots, \lambda_p \}$ and $\Vb = [ \vb_1,\ldots,\vb_p ]$, so that $\bSigma = \Vb \bLambda \Vb^\top$. We also denote by $\mu_1(\Ub) \geq \cdots \geq \mu_n(\Ub)$ the eigenvalues of $\Ub$.
 

% The following lemma is given by Lemma 26 in \citet{bartlett2020benign}.

% \begin{lemma}\label{lemma:eigenvalue_concentration}
% There exists an absolute constant $c$ such that, with probability at least $1 - 2\exp(-n/c)$,
% \begin{align*}
%     c^{-1} \sum_{i} \lambda_i - c\lambda_1 n \leq \mu_n(\Ub) \leq \mu_1(\Ub) \leq c \sum_{i} \lambda_i + c\lambda_1 n.
% \end{align*}

% \end{lemma}


% The following lemma (Lemma 1 in \citet{muthukumar2020classification}) gives another estimate on the eigenvalues of $\Ub$.


% \begin{lemma}\label{lemma:eigenvalue_concentration2}
%     With probability at least $1 - 2/n$, 
%     \begin{align*}
%         \Big\| \Ub - d'(n) \Ib \Big\|_2 \leq \epsilon_{u} := \frac{1}{12\sqrt{n}} \| \blambda \|_1 + 4\sqrt{n\log(n)}\cdot \| \blambda \|_2 + 8n\log(n)\cdot \lambda_1.
%     \end{align*}
% \end{lemma}
% \todoq{It seems that the $\|\lambda\|_1$ term on the right hand size does not appear in Peter's paper}
% \todoq{Is Peter's bound tighter than this one?}





The following lemma, summarized from the proof of Lemma~26 in \citet{bartlett2020benign}, estimates the eigenvalues of $\Ub$.


\begin{lemma}\label{lemma:eigenvalue_concentration}
    With probability at least $1 - 1/n$, \todoy{Todo: change notation, trace, 2-norm, F-norm, etc}
    \begin{align*}
    \big\| \Ub  - \tr{\bSigma} \Ib \big\|_2 \leq \epsilon_{u} := C \sigma_q^2  \big( \lambda_1 n + \sqrt{n}\cdot \| \blambda \|_2 \big),
\end{align*}
where $C$ is an absolute constant.
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:eigenvalue_concentration}]
Let $\cN$ be a $1/4$-net on the unit sphere $S^{n-1}$. Then by Lemma~5.2 in \citet{vershynin2010introduction}, we have $|\cN| \leq 9^n$. Denote $\zb_j =  \lambda_j^{-1/2} \Qb \vb_j \in \RR^{n}$ for $j\in [p]$. Then by definition, for any fixed unit vector $\hat\ab\in \cN$ we have $ \hat\ab^\top \Ub \hat\ab = \hat\ab^\top \Qb\Qb^\top \hat\ab = \hat\ab^\top \big( \sum_{j=1}^p \lambda_j \zb_j \zb_j^\top \big) \hat\ab = \sum_{j=1}^p \lambda_j (\hat\ab^\top\zb_j)^2$. By Lemma~5.9 in \citet{vershynin2010introduction}, there exists an absolute constant $c_1$ such that $\| \hat\ab^\top\zb_j \|_{\psi_2} \leq c_1 \sigma_q$ for all $j\in [p]$. Therefore by Lemma~21 and Corollary~23 in \citet{bartlett2020benign}, for any $t>0$, with probability at least $1 - 2\exp(-t)$ we have
\begin{align*}
    \big| \hat\ab^\top \Ub \hat\ab - \| \blambda\|_1 \big| \leq c_2 \sigma_q^2 \max \big( \lambda_1 t , \sqrt{t}\cdot \| \blambda \|_2 \big).
\end{align*}
Applying a union bound over all $\hat\ab\in \cN$, we have that with probability at least $1 - 2\cdot 9^n \exp(-t )$,
\begin{align*}
    \big| \hat\ab^\top \Ub \hat\ab - \| \blambda\|_1 \big| \leq c_2 \sigma_q^2 \max \big( \lambda_1 t , \sqrt{t}\cdot \| \blambda \|_2 \big)
\end{align*}
for all $\hat\ab\in \cN$. Therefore by Lemma~25 in \citet{bartlett2020benign}, with probability at least $1 - 2\cdot 9^n \exp(-t )$, we have
\begin{align*}
    \big\| \Ub  - \| \blambda\|_1 \Ib \big\|_2 \leq c_3 \sigma_q^2  \big( \lambda_1 t + \sqrt{t}\cdot \| \blambda \|_2 \big),
\end{align*}
where $c_3$ is an absolute constant. 
Setting $t = c_4 n$ for some large enough constant $c_4$, we have that with probability at least $1 - 1/n$,
\begin{align*}
    \big\| \Ub  - \| \blambda\|_1 \Ib \big\|_2 \leq c_5  \sigma_q^2 \big( \lambda_1 n + \sqrt{n}\cdot \| \blambda \|_2 \big),
\end{align*}
where $c_5$ is an absolute constant. This completes the proof.
\end{proof}










% \begin{lemma}\label{lemma:concentrationbounds}
% With probability at least $1 - 2/n$,
% \begin{align*}
%     \frac{n}{ c \sum_{i} \lambda_i + c\lambda_1 n } \leq  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c\lambda_1 n }
% \end{align*}

% \begin{align*}
%     \frac{n}{ c \sum_{i} \lambda_i + c\lambda_1 n } \cdot \| \bmu \|_{\bSigma}^2 \leq  \db^\top \Ub^{-1} \db \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c\lambda_1 n } \cdot \| \bmu \|_{\bSigma}^2
% \end{align*}


% \begin{align*}
%     |\yb^\top \Ub^{-1} \db| \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c\lambda_1 n }  \| \bmu \|_{\bSigma}
% \end{align*}

% % \begin{align*}
% %     \frac{n}{ \|\blambda \|_1  - \epsilon_{u} } \leq  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ \|\blambda \|_1  + \epsilon_{u} }
% % \end{align*}

% % \begin{align*}
% %     \frac{n}{ \|\blambda \|_1  - \epsilon_{u} } \cdot \| \tilde\bmu \|_2 \leq  \db^\top \Ub^{-1} \db \leq \frac{n}{ \|\blambda \|_1  + \epsilon_{u} } \cdot \| \tilde\bmu \|_2
% % \end{align*}



% \end{lemma}



% The following lemma gives bounds on $f_i = \db^\top \Ub^{-1} \eb_i y_i$, $g_i = \yb^\top \Ub^{-1} \eb_i y_i$. \todoq{what is the intuition of this lemma?}


% The following lemma is summarized form Lemma 1 in \citet{muthukumar2020classification}.


\begin{lemma}\label{lemma:concentrationbounds}
With probability at least $1 - 2/n$, the following inequalities hold. %\todoq{What is $s$ and $t$?}, \todoy{simplify notation}
\begin{align*}
    &\frac{n}{ \|\blambda \|_1  + \epsilon_{u} } \leq  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ \|\blambda \|_1  - \epsilon_{u} },\\
    &\frac{n  - C \sqrt{n\log( n)}}{ \|\blambda \|_1  + \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \db^\top \Ub^{-1} \db \leq \frac{ n  + C \sqrt{n\log( n)} }{ \|\blambda \|_1  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2,\\
    &|\yb^\top \Ub^{-1} \db| \leq \frac{C n   }{  \|\blambda \|_1  - \epsilon_{u}  } \| \bmu \|_{\bSigma},
\end{align*}
where $C$ is an absolute constant.
% \begin{align*}
%     \frac{n  - c \sqrt{n\log( n)}}{ \|\blambda \|_1  + \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \db^\top \Ub^{-1} \db \leq \frac{ n  + c \sqrt{n\log( n)} }{ \|\blambda \|_1  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2
% \end{align*}

% \begin{align*}
%     |\yb^\top \Ub^{-1} \db| \leq \frac{n  + c \sqrt{n\log( n)} }{  \|\blambda \|_1  - \epsilon_{u}  } \| \bmu \|_{\bSigma}
% \end{align*}

\end{lemma}



\begin{proof}[Proof of Lemma~\ref{lemma:concentrationbounds}]
The bounds on $\yb^\top \Ub^{-1} \yb$ are directly derived from Lemma~\ref{lemma:eigenvalue_concentration} and the fact that $\| \yb \|_2^2 = n$. To derive the bounds for $\db^\top\Ub^{-1}\db$, we note that by definition, $\db = \Qb\bmu$ and 
\begin{align*}
    \db^\top \Ub^{-1} \db = \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu. %= \| \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu \|_2^2
\end{align*}

Denote $\zb_i =  \lambda_i^{-1/2} \Qb \vb_i \in \RR^{n}$,  $\Zb = [\zb_1,\ldots, \zb_p] \in \RR^{n \times p}$, and $\tilde\bmu = \bLambda^{1/2} \Vb^\top \bmu$. Then $\Qb = \Zb \bLambda^{1/2} \Vb^\top$, and
\begin{align*}
    \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu &= \bmu^\top \Vb  \bLambda^{1/2} \Zb^\top ( \Zb \bLambda \Zb^\top )^{-1} \Zb \bLambda^{1/2} \Vb^\top \bmu\\
    &= \tilde\bmu^\top  \Zb^\top ( \Zb \bLambda \Zb^\top )^{-1} \Zb  \tilde\bmu\\
    &\leq \frac{ \| \Zb \tilde\bmu \|_2^2 }{ \| \blambda \|_1 - \epsilon_{u} }.
\end{align*}
Similarly, we have
\begin{align*}
    \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu \geq \frac{ \| \Zb \tilde\bmu \|_2^2 }{  \| \blambda \|_1 + \epsilon_{u} }.
\end{align*}
We now proceed to give upper and lower bounds for the term $\| \Zb \tilde\bmu \|_2^2 =  \sum_{i=1}^n ( \sum_{j=1}^p \Zb_{ij} \tilde\mu_j )^2$. Note that by definition,  $\Zb_{ij}$ for $i\in [n]$ and $j\in [p]$ are independent zero-mean, unit-variance sub-Gaussian random variables with $\| \Zb_{ij} \|_{\psi_2} \leq \sigma_q$. By Lemma~5.9 in \citet{vershynin2010introduction}, we have
\begin{align*}
    \Bigg\| \sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg\|_{\psi_2} \leq \| \tilde\bmu \|_2 \cdot \sigma_q, 
\end{align*}
and therefore by Lemma~5.14 in \citet{vershynin2010introduction}, we have
\begin{align*}
     \Bigg\| \Bigg(\sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg)^2 - \| \tilde\bmu \|_2^2 \Bigg\|_{\psi_1} \leq  2\| \tilde\bmu \|_2^2 \cdot (1 + \sigma_q^2).
\end{align*}
By Bernstein's inequality, with probability at least $1 - n^{-1}$, 
\begin{align*}
    \big| \| \Zb \tilde\bmu \|_2^2 - \EE \| \Zb \tilde\bmu \|_2^2 \big| \leq c_1 (1 + \sigma_q^2)\cdot \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)},
\end{align*}
where $c_1$ is an absolute constant. Merging $\sigma_q^2$ into the constant coefficient, we have
% with probability at least $1 - \exp(-n/c_1)$,
% \begin{align*}
%     \big| \| \Zb \tilde\bmu \|_2^2 - \EE \| \Zb \tilde\bmu \|_2^2 \big|\leq n\cdot \| \tilde\bmu \|_2 /c_1.
% \end{align*}
% Therefore we have
\begin{align*}
     n  \| \tilde\bmu \|_2^2 - c_2 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)} \leq \| \Zb \tilde\bmu \|_2^2 \leq  n \| \tilde\bmu \|_2^2 + c_2 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)},
\end{align*}
and
\begin{align*}
    \frac{n - c_2 \sqrt{n\log( n)}}{ \| \blambda \|_1 + \epsilon_{u} } \cdot \| \tilde\bmu \|_2^2 \leq  \db^\top \Ub^{-1} \db \leq \frac{n + c_2 \sqrt{n\log( n)}}{ \| \blambda \|_1 - \epsilon_{u} } \cdot \| \tilde\bmu \|_2^2.
\end{align*}
Similarly for $\yb^\top \Ub^{-1} \db$, by Cauchy-Schwarz inequality, for large enough $n$ we have
\begin{align*}
    |\yb^\top \Ub^{-1} \db| = |\yb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu| &\leq \| \yb \|_2 \cdot \| (\Qb\Qb^\top)^{-1} \Qb \bmu \|_2 = \sqrt{n} \cdot \sqrt{ \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-2} \Qb \bmu }\\
    &\leq\frac{\sqrt{n} \cdot \sqrt{n + c_2 \sqrt{n\log( n)}} }{  \| \blambda \|_1 - \epsilon_{u}  } \| \tilde\bmu \|_2\\
    &\leq \frac{c_3 n }{  \| \blambda \|_1 - \epsilon_{u}  } \| \tilde\bmu \|_2,
\end{align*}
where $c_3$ is an absolute constant. Note that $\| \tilde\bmu \|_2 = \| \bmu \|_{\bSigma}$. This completes the proof.
% Denote $\Zb = \Qb \Sigma^{-1/2}$. Then each row of $\Zb$ follows standard Gaussian distribution.
\end{proof}






% \begin{proof}[Proof of Lemma]
% \begin{align*}
%      n \| \tilde\bmu \|_2 + c \| \tilde\bmu \|_2\cdot \sqrt{n\log( n)} 
% \end{align*}
% \end{proof}


% \begin{lemma}\label{lemma:asymmetricbounds}
% For any vector $\ub \in S^{n-1}$, 
% \end{lemma}

% \begin{proof}[Proof of Lemma~\ref{lemma:asymmetricbounds}]
% Denote $\ab_i = \sqrt{n} \eb_i y_i$. Then we have
% \begin{align*}
%     \yb^\top \Ub^{-1} \eb_i y_i = n^{-1/2} \yb^\top \Ub^{-1} \ab_i = \frac{1}{4\sqrt{n}} (\yb + \ab_i)^\top \Ub^{-1} (\yb + \ab_i) - \frac{1}{4\sqrt{n}} (\yb - \ab_i)^\top \Ub^{-1} (\yb - \ab_i).
% \end{align*}
% Note that by definition, $\| \yb + \ab_i \|_2^2 = $
% \end{proof}





\begin{lemma}\label{lemma:fgbounds}
\begin{align*}
    |\db^\top \Ub^{-1} \eb_i y_i|  \leq \frac{\sqrt{n}}{ \|\blambda \|_1  - \epsilon_{u} } \| \tilde\bmu \|_2. 
\end{align*}
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i \geq \frac{ \|\blambda \|_1 - \sqrt{n} \epsilon_{u} }{\|\blambda \|_1^2  - \epsilon_{u}^2}
\end{align*}
\end{lemma}


\begin{proof}[Proof of Lemma~\ref{lemma:fgbounds}]
With the same proof as in Lemma~\ref{lemma:concentrationbounds}, we have
\begin{align*}
    \| \Qb\bmu \|_2 = \sqrt{n} \cdot \| \bmu \|_{\bSigma}. 
\end{align*}

Replacing $\yb$ with $\eb_i$, we have
\begin{align*}
   |\db^\top \Ub^{-1} \eb_i y_i| = | \db^\top \Ub^{-1} \eb_i | \leq  \frac{\sqrt{n}}{ \|\blambda \|_1  - \epsilon_{u} } \| \tilde\bmu \|_2. 
\end{align*}


\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{\|\blambda \|_1  + \epsilon_{u}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{\|\blambda \|_1  - \epsilon_{u}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{\|\blambda \|_1  + \epsilon_{u}} - \frac{2n - 2\sqrt{n}}{\|\blambda \|_1  - \epsilon_{u}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (\|\blambda \|_1  - \epsilon_{u}) - (n - \sqrt{n}) (\|\blambda \|_1  + \epsilon_{u}) }{\|\blambda \|_1^2  - \epsilon_{u}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} \|\blambda \|_1 - 2n \epsilon_{u} }{\|\blambda \|_1^2  - \epsilon_{u}^2}\\
    & = \frac{ \|\blambda \|_1 - \sqrt{n} \epsilon_{u} }{\|\blambda \|_1^2  - \epsilon_{u}^2}.
\end{align*}

\end{proof}




\begin{theorem}\label{thm:interpolationregression}
There exists a large enough absolute constant $C$ such that when $ \|\blambda \|_1 \geq C n^{3/2} \| \tilde\bmu \|_2$ and $\|\blambda \|_1 \geq C  \sqrt{n} \epsilon_{u}$, $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
\end{theorem}
\begin{proof}[Proof of Theorem~\ref{thm:interpolationregression}] By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \eb_i y_i - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1} \eb_i y_i].
\end{align*}
Plugging in the inequalities in Lemma~\ref{lemma:concentrationbounds} and Lemma~\ref{lemma:fgbounds}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &\geq D^{-1} \bigg[ \bigg( 1 - \frac{n  + c_1 \sqrt{n\log( n)} }{  \|\blambda \|_1  - \epsilon_{u}  } \| \tilde\bmu \|_2 \bigg) \cdot \frac{ \|\blambda \|_1 - \sqrt{n} \epsilon_{u} }{\|\blambda \|_1^2  - \epsilon_{u}^2} - \frac{n}{ \|\blambda \|_1  - \epsilon_{u} } \cdot \frac{\sqrt{n}}{ \|\blambda \|_1  - \epsilon_{u} } \| \tilde\bmu \|_2\bigg] \\
    &\geq D^{-1} \bigg[ c_2 \cdot \bigg( 1 - \frac{c_3 n  }{  \|\blambda \|_1    } \| \tilde\bmu \|_2 \bigg) \cdot \frac{ \|\blambda \|_1 - \sqrt{n} \epsilon_{u} }{\|\blambda \|_1^2} - \frac{n^{3/2}}{ \|\blambda \|_1^2  } \cdot  \| \tilde\bmu \|_2\bigg],
\end{align*}
where $c_2,c_3 > 0$ are absolute constants.
Therefore when $ \|\blambda \|_1 \geq C n^{3/2} \| \tilde\bmu \|_2$ and $\|\blambda \|_1 \geq C  \sqrt{n} \epsilon_{u}$ for some large enough constant $C$, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i 
    &\geq D^{-1} \bigg[ c_4 \cdot  \frac{ \|\blambda \|_1 - \sqrt{n} \epsilon_{u} }{\|\blambda \|_1^2} - \frac{n^{3/2}}{ \|\blambda \|_1^2  } \cdot  \| \tilde\bmu \|_2\bigg] > 0.
\end{align*}
Applying Lemma~\ref{lemma:equivalence} completes the proof.
\end{proof}




\begin{remark}
Compared with the results of \citet{wang2020benign}, our result is applicable to a more general setting that allows non-isotropic, non-Gaussian data and infinite-dimensional features. In the setting where $\lambda_i = 1$ for $i\in[p]$, Theorem~\ref{thm:interpolationregression} recovers \CC{(the corrected version of)} the results in \citet{wang2020benign}. 
\end{remark}










\subsection{Benign Overfitting}

\begin{lemma}\label{lemma:riskbound}
\todoy{This part may be improvable with a more careful calculation} Under the conditions of Theorem~\ref{thm:interpolationregression}, 
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{ \lambda_1\cdot \yb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg].
\end{align*}
\end{lemma}

The following lemma is summarized from \citet{wang2020benign}. 


% $s = \yb^\top \Ub^{-1} \yb$, $t = \db^\top \Ub^{-1} \db$, $h = \yb^\top \Ub^{-1} \db$

\begin{lemma}\label{lemma:riskboundcalculation}
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) +  (\yb^\top \Ub^{-1} \db)^2 +  \yb^\top \Ub^{-1} \db \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) +  (\yb^\top \Ub^{-1} \db + 1)^2 \big]}.
\end{align*}
\end{lemma}


\begin{lemma}\label{lemma:riskboundssimplify}
Suppose that  $ \|\blambda \|_1 \geq C n^{3/2} \| \tilde\bmu \|_2$ and $\|\blambda \|_1 \geq C  \sqrt{n} \epsilon_{u}$
for some large enough absolute constant $C$. Then
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} \geq  \frac{ n\cdot \big(  \| \bmu \|_2^2  - 8 \| \bmu \|_{\bSigma} \big)^2 }{ 64 \big(  n \cdot \| \bmu \|_2^2 + \|\blambda \|_1  \big) }.
\end{align*}
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:riskboundssimplify}]
By Lemma~\ref{lemma:riskboundcalculation} we have
\begin{align}\label{eq:riskboundssimplify_eq1}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) +  (\yb^\top \Ub^{-1} \db)^2 +  \yb^\top \Ub^{-1} \db \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) +  (\yb^\top \Ub^{-1} \db + 1)^2 \big]}.
\end{align}
% By definition, we have
By Lemma~\ref{lemma:concentrationbounds}, with probability at least $1 - 2/n$, we have
\begin{align*}
    \| \bmu \|_2^2 - \db^\top \Ub^{-1} \db \geq  \| \bmu \|_2^2 -  \frac{2n}{ \| \blambda \|_1}  \| \bmu \|_{\bSigma}^2 \geq \| \bmu \|_2^2 - \frac{ 2\lambda_1 n}{ \| \blambda \|_1}  \| \bmu \|_2^2 =  \frac{ \| \blambda \|_1 -  2\lambda_1 n}{ \| \blambda \|_1}  \| \bmu \|_2^2 \geq \| \bmu \|_2^2 / 2.
\end{align*}
Plugging the above inequality into \eqref{eq:riskboundssimplify_eq1}, we have
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) +  (\yb^\top \Ub^{-1} \db)^2 +  \yb^\top \Ub^{-1} \db \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \db^\top \Ub^{-1} \db) +  (\yb^\top \Ub^{-1} \db + 1)^2 \big]}\\
    &\geq  \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 / 2 - | \yb^\top \Ub^{-1} \db | \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 + 2 \big]}\\
    &\geq \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 / 2 -  2n  \|\blambda \|_1^{-1}  \cdot \| \bmu \|_{\bSigma} \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 + 2 \big]}
\end{align*}
where we utilize Lemma~\ref{lemma:concentrationbounds} to derive the condition 
\begin{align*}
    \yb^\top \Ub^{-1} \db \leq \frac{ n  + c \sqrt{n\log( n)} }{ \|\blambda \|_1  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma} \leq  \frac{ 2n  }{ \|\blambda \|_1 } \cdot \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1.
\end{align*}
% $\yb^\top \Ub^{-1} \db \leq \frac{ n  + c \sqrt{n\log( n)} }{ \|\blambda \|_1  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma} \leq  \frac{ 2n  }{ \|\blambda \|_1 } \cdot \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1$.
Applying the bound on $\yb^\top \Ub^{-1} \yb$ in Lemma~\ref{lemma:concentrationbounds}, we obtain %have $\yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 \leq \frac{n}{ \|\blambda \|_1  - \epsilon_{u} } \cdot \| \bmu \|_2^2 \leq 1$. Therefore

\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &\geq \frac{ \big( n \|\blambda \|_1^{-1} \cdot \| \bmu \|_2^2 / 4 -  2n  \|\blambda \|_1^{-1}  \cdot \| \bmu \|_{\bSigma} \big)^2 }{ 2n \|\blambda \|_1^{-1} \cdot \big( 2 n \|\blambda \|_1^{-1} \cdot \| \bmu \|_2^2 + 2 \big) } \\
    &=  \frac{ n\cdot \big(  \| \bmu \|_2^2 / 4 -    2 \| \bmu \|_{\bSigma} \big)^2 }{ 2 \|\blambda \|_1  \cdot \big( 2 n \|\blambda \|_1^{-1} \cdot \| \bmu \|_2^2 + 2 \big) } \\
    &=  \frac{ n\cdot \big(  \| \bmu \|_2^2  - 8 \| \bmu \|_{\bSigma} \big)^2 }{ 64 \big(  n \cdot \| \bmu \|_2^2 + \|\blambda \|_1  \big) }
\end{align*}
This completes the proof. 

% \begin{align*}
%     \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} \geq  \frac{ (\yb^\top \Ub^{-1} \yb)^2 \cdot \| \bmu \|_2^4 / 4  }{ \yb^\top \Ub^{-1} \yb \cdot 3} = \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^4 / 12 \geq \frac{ n \| \bmu \|_2^4  }{24 \| \blambda \|_1},
% \end{align*}


% Denote $\overline\bmu = \Vb^\top \bmu$, then%$\tilde\bmu = \Lambda^{1/2} \Vb^\top \bmu$
% \begin{align*}
%     \| \bmu \|_2^2 = \| \overline\bmu \|_2^2  = \sum_{j=1}^p \lambda_j \overline\mu_j^2 / \lambda_j \geq 
% \end{align*}


\end{proof}






\begin{theorem}\label{thm:benignoverfitting}
Suppose that  $ \|\blambda \|_1 \geq C n^{3/2} \| \bmu \|_{\bSigma}$, $\|\blambda \|_1 \geq C  \sqrt{n} \epsilon_{u}$, and $\| \bmu \|_2 \geq C\lambda_1$ for some large enough absolute constant $C$. Then
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg(- \frac{1 }{256 \lambda_1} \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\| \blambda \|_1} \bigg\}\bigg).
\end{align*}
\end{theorem}

\begin{proof}[Proof of Theorem~\ref{thm:benignoverfitting}]
The proof follows by a direct combination of Lemma~\ref{lemma:riskbound} and Lemma~\ref{lemma:riskboundssimplify}. 
\end{proof}

\begin{remark}
Theorem~\ref{thm:benignoverfitting} establishes a risk bound for interpolating classifiers. It implies the result of \citet{wang2020benign} when setting $\lambda_j = 1$, $j\in [p]$. \citet{chatterji2020finite} gives an $\exp(-\Omega( \| \bmu \|_2^4 / p ))$ risk bound under the conditions $p = \Omega( n \| \bmu \|_2^2 ) $ and $\EE( \| \qb \|_2^2) \geq \Omega(p)$. Under the same conditions, the bound in Theorem~\ref{thm:benignoverfitting} reduces to $\exp(-\Omega( n \| \bmu \|_2^4 / p ))$, which is clearly sharper than the bound in \citet{chatterji2020finite}. Moreover, our result also holds for more general settings without these conditions, and is applicable to the infinite-dimensional setting.




% Compared with \citet{chatterji2020finite} which requires $\EE( \| \qb \|_2^2) \geq \Omega(p)$, our result does not rely on this assumption and therefore works for the infinite-dimensional setting. Moreover, 


% Compared with the results in \citet{chatterji2020finite} and \citet{wang2020benign},
% Compared with the results of \citet{wang2020benign}, our result is applicable to a more general setting that allows non-isotropic, non-Gaussian data and allow infinite-dimensional features. Under the setting when $\lambda_i = 1$ for $i\in[p]$, Theorem~\ref{thm:interpolationregression} recovers  \CC{( the corrected version of)} the results in \citet{wang2020benign}. 

\end{remark}







\section{Improving the Condition for Classification-Regression Equivalence}

We consider the isotropic Gaussian mixture setting.

\begin{lemma}[Lemma 2 in \citet{muthukumar2020classification}]\label{lemma:eigenvalueconcentration_isotropic}
For any vector $\ub\in S^{n-1}$, with probability at least $1 - 2n^{-2}$, 
\begin{align*}
    & \ub^\top \Ub^{-1} \ub \geq \frac{1}{d'(n) + \sqrt{4\ln(n) d'(n)} + 4\ln(n)}, \\
    & \ub^\top \Ub^{-1} \ub \leq \frac{1}{d'(n) - \sqrt{4\ln(n) d'(n)}} \leq \frac{1}{d'(n) - \sqrt{4\ln(n) d'(n)} - 4\ln(n)},
\end{align*}
where $d'(n) = d - n + 1$.
\end{lemma}

Denote $\epsilon_{u} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$. Then by Lemma \ref{lemma:eigenvalueconcentration_isotropic}, we have
\begin{align}\label{eq:isotropic_eigenvalueconcentration}
    \frac{1}{d'(n) - \epsilon_{u}} \geq \ub^\top \Ub^{-1} \ub \geq \frac{1}{d'(n) + \epsilon_{u}}.
\end{align}



\begin{lemma}\label{lemma:isotropicbound1}
With probability at least $1 - 4n^{-1}$, 
\begin{align*}
    \big| \bmu^\top\Qb^\top \Ub^{-1} \eb_i y_i \big| \leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }
\end{align*}
for all $i\in [n]$, where $d'(n) = d - n + 1$ and $c_2$ is an absolute constant. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropicbound1}] 
\begin{align}
    \bmu^\top\Qb^\top\Ub^{-1} \eb_i y_i &= \frac{1}{\| \Qb\bmu \|_2}\cdot (\Qb\bmu)^\top \Ub^{-1} (\| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & = \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    &\quad - \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)- \epsilon_{u}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)+ \epsilon_{u}  }     \bigg] \nonumber\\
    &= \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{2 \| \Qb\bmu \|_2^2 + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)- \epsilon_{u}  } - \frac{2\| \Qb\bmu \|_2^2  - 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)+ \epsilon_{u}  } \bigg] \nonumber \\
    &= \frac{1}{2\| \Qb\bmu \|_2}\cdot \frac{2\| \Qb\bmu \|_2^2 \cdot \epsilon_{u} + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }\nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{u} +  y_i \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  },\label{eq:fgbounds_eq1}
\end{align}
where the second equality holds due to the polarization identity $\ab^\top \Mb \bbb = \frac{1}{4}(\ab+\bbb)^\top \Mb (\ab+\bbb) - \frac{1}{4}(\ab-\bbb)^\top \Mb (\ab-\bbb)$ for symmetric $\Mb$, and the first inequality follows by \eqref{eq:isotropic_eigenvalueconcentration}.
Under the isotropic Gaussian mixture setting, $\eb_j^\top \Qb\bmu $, $j=1,\ldots, n$, are i.i.d.\ Gaussian random variables with variance $\| \bmu \|_2^2$. Therefore, by Bernstein's inequality, with probability at least $1 - n^{-1}$ we have
\begin{align*}
    \big| \| \Qb\bmu \|_2^2 - n \| \bmu \|_2^2 \big| \leq  n\cdot c_1 \| \bmu \|_2^2 \sqrt{\frac{\log(n)}{n}} = c_1 \| \bmu \|_2^2 \sqrt{n\log(n)}, 
\end{align*}
where $c_1$ is an absolute constant. 
Therefore when $n$ is large enough, $\| \Qb\bmu \|_2^2 \leq 2n \| \bmu \|_2^2$. Similarly, by Gaussian tail bound and union bound, with probability at least $1 - n^{-1}$, 
\begin{align*}
    | \eb_i^\top \Qb\bmu | \leq c_2\sqrt{\log(n)}
\end{align*}
for all $i\in[n]$, where $c_2$ is an absolute constant. Therefore we have
\begin{align*}
    \db^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }.
\end{align*}
With the exact same proof, we also have
\begin{align*}
    -\db^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }.
\end{align*}
This completes the proof.
\end{proof}








\begin{lemma}\label{lemma:isotropicbound2}
With probability at least $1 - 2n^{-1}$, 
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i  \geq \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2}
\end{align*}
for all $i\in [n]$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropicbound2}]
The proof is very similar to the proof of Lemma~\ref{lemma:isotropicbound1}. We have
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  + \epsilon_{u}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  - \epsilon_{u}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{d'(n)  + \epsilon_{u}} - \frac{2n - 2\sqrt{n}}{d'(n)  - \epsilon_{u}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (d'(n)  - \epsilon_{u}) - (n - \sqrt{n}) (d'(n)  + \epsilon_{u}) }{d'(n)^2  - \epsilon_{u}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} d'(n) - 2n \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2}\\
     & = \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2},
\end{align*}
where we use the polarization identity $\ab^\top \Mb \bbb = \frac{1}{4}(\ab+\bbb)^\top \Mb (\ab+\bbb) - \frac{1}{4}(\ab-\bbb)^\top \Mb (\ab-\bbb)$ (valid for symmetric $\Mb$) in the first equality and \eqref{eq:isotropic_eigenvalueconcentration} in the first inequality. This completes the proof.
\end{proof}


\begin{lemma}\label{lemma:isotropic_equivalencecondition}
Suppose that $d > C n \log(n)$ and $d >  C n \| \bmu \|_2$ for some large enough absolute constant $C$. Then with probability at least $1 - 4n^{-1}$, $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i> 0$ for all $i \in [n]$, and therefore the hard-margin SVM solution equals the minimum-norm interpolator. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropic_equivalencecondition}]
By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}].
\end{align*}
Moreover, by Lemma~4 in \citet{wang2020benign}, if $d = \Omega(n \| \bmu \|_2) $ and $d = \Omega(n) $, then with probability at least $1 - n^{-1}$, $\yb^\top \Ub^{-1} \db \geq -1/2$ and $\yb^\top \Ub^{-1} \yb \leq c_1 n/d$ for some absolute constant $c_1$.
Therefore, as long as $d'(n) > \epsilon_{u}$, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &= D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}\eb_i y_i ] \\
    &\propto  ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}\eb_i y_i\\
    &\geq (1/2)\cdot \yb^\top \Ub^{-1} \eb_i y_i  - c_1 (n/d)\cdot |\db^\top \Ub^{-1}\eb_i y_i|\\
    &\geq  \frac{1}{2} \cdot \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2} -  \frac{c_1n}{d}\cdot \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  },
\end{align*}
where $c_1,c_2$ are absolute constants. 
Therefore $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i >0$ if there exists a large enough constant $c_3$ such that
\begin{align*}
    &d'(n) > c_3 \sqrt{n} \epsilon_{u},\\
    &d'(n) > c_3 (n^{3/2} / d) \cdot  \| \bmu \|_2 \cdot \epsilon_{u},\\
    &d'(n) > c_3 (n/d)\cdot \sqrt{\log(n)} \cdot d'(n).
\end{align*}
Plugging in the definitions $d'(n) = d - n + 1$ and 
$\epsilon_{u} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$, we see that the above conditions are satisfied when $d > c_4 n \log(n)$ and $d >  c_4 n \| \bmu \|_2$ for some large enough absolute constant $c_4$. This completes the proof. 
\end{proof}


\section{New Anisotropic Sub-Gaussian Equivalence Result}



\begin{lemma}\label{lemma:anisotropicbound1}
With probability at least $1 - 4n^{-1}$, 
\begin{align*}
    \big| \bmu^\top\Qb^\top \Ub^{-1} \eb_i y_i \big| \leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }
\end{align*}
for all $i\in [n]$, where $d'(n) = d - n + 1$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:anisotropicbound1}] 
\begin{align}
    \bmu^\top\Qb^\top\Ub^{-1} \eb_i y_i &= \frac{1}{\| \Qb\bmu \|_2}\cdot (\Qb\bmu)^\top \Ub^{-1} (\| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & = \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    &\quad - \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)- \epsilon_{u}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)+ \epsilon_{u}  }     \bigg] \nonumber\\
    &= \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{2 \| \Qb\bmu \|_2^2 + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)- \epsilon_{u}  } - \frac{2\| \Qb\bmu \|_2^2  - 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)+ \epsilon_{u}  } \bigg] \nonumber \\
    &= \frac{1}{2\| \Qb\bmu \|_2}\cdot \frac{2\| \Qb\bmu \|_2^2 \cdot \epsilon_{u} + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }\nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{u} +  y_i \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  },\label{eq:anisotropic_fgbounds_eq1}
\end{align}
where the second equality holds due to the polarization identity $\ab^\top \Mb \bbb = \frac{1}{4}(\ab+\bbb)^\top \Mb (\ab+\bbb) - \frac{1}{4}(\ab-\bbb)^\top \Mb (\ab-\bbb)$ for symmetric $\Mb$, and the first inequality follows by \eqref{eq:isotropic_eigenvalueconcentration}.
Under the isotropic Gaussian mixture setting, $\eb_j^\top \Qb\bmu $, $j=1,\ldots, n$, are i.i.d.\ Gaussian random variables with variance $\| \bmu \|_2^2$. Therefore, by Bernstein's inequality, with probability at least $1 - n^{-1}$ we have
\begin{align*}
    \big| \| \Qb\bmu \|_2^2 - n \| \bmu \|_2^2 \big| \leq  n\cdot c_1 \| \bmu \|_2^2 \sqrt{\frac{\log(n)}{n}} = c_1 \| \bmu \|_2^2 \sqrt{n\log(n)}, 
\end{align*}
where $c_1$ is an absolute constant. 
Therefore when $n$ is large enough, $\| \Qb\bmu \|_2^2 \leq 2n \| \bmu \|_2^2$. Similarly, by Gaussian tail bound and union bound, with probability at least $1 - n^{-1}$, 
\begin{align*}
    | \eb_i^\top \Qb\bmu | \leq c_2\sqrt{\log(n)}
\end{align*}
for all $i\in[n]$, where $c_2$ is an absolute constant. Therefore we have
\begin{align*}
    \db^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }.
\end{align*}
With the exact same proof, we also have
\begin{align*}
    -\db^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }.
\end{align*}
This completes the proof.
\end{proof}








\begin{lemma}\label{lemma:anisotropicbound2}
With probability at least $1 - 2n^{-1}$, 
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i  \geq \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2}
\end{align*}
for all $i\in [n]$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:anisotropicbound2}]
The proof is very similar to the proof of Lemma~\ref{lemma:isotropicbound1}. We have
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  + \epsilon_{u}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  - \epsilon_{u}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{d'(n)  + \epsilon_{u}} - \frac{2n - 2\sqrt{n}}{d'(n)  - \epsilon_{u}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (d'(n)  - \epsilon_{u}) - (n - \sqrt{n}) (d'(n)  + \epsilon_{u}) }{d'(n)^2  - \epsilon_{u}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} d'(n) - 2n \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2}\\
     & = \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2},
\end{align*}
where we use the polarization identity $\ab^\top \Mb \bbb = \frac{1}{4}(\ab+\bbb)^\top \Mb (\ab+\bbb) - \frac{1}{4}(\ab-\bbb)^\top \Mb (\ab-\bbb)$ (valid for symmetric $\Mb$) in the first equality and \eqref{eq:isotropic_eigenvalueconcentration} in the first inequality. This completes the proof.
\end{proof}


\begin{lemma}\label{lemma:anisotropic_equivalencecondition}
Suppose that $d > C n \log(n)$ and $d >  C n \| \bmu \|_2$ for some large enough absolute constant $C$. Then with probability at least $1 - 4n^{-1}$, $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i> 0$ for all $i \in [n]$, and therefore the hard-margin SVM solution equals the minimum-norm interpolator. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:anisotropic_equivalencecondition}]
By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}].
\end{align*}
Moreover, by Lemma~4 in \citet{wang2020benign}, if $d = \Omega(n \| \bmu \|_2) $ and $d = \Omega(n) $, then with probability at least $1 - n^{-1}$, $\yb^\top \Ub^{-1} \db \geq -1/2$ and $\yb^\top \Ub^{-1} \yb \leq c_1 n/d$ for some absolute constant $c_1$.
Therefore, as long as $d'(n) > \epsilon_{u}$, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &= D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}\eb_i y_i ] \\
    &\propto  ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}\eb_i y_i\\
    &\geq (1/2)\cdot \yb^\top \Ub^{-1} \eb_i y_i  - c_1 (n/d)\cdot |\db^\top \Ub^{-1}\eb_i y_i|\\
    &\geq  \frac{1}{2} \cdot \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2} -  \frac{c_1n}{d}\cdot \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  },
\end{align*}
where $c_1,c_2$ are absolute constants. 
Therefore $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i >0$ if there exists a large enough constant $c_3$ such that
\begin{align*}
    &d'(n) > c_3 \sqrt{n} \epsilon_{u},\\
    &d'(n) > c_3 (n^{3/2} / d) \cdot  \| \bmu \|_2 \cdot \epsilon_{u},\\
    &d'(n) > c_3 (n/d)\cdot \sqrt{\log(n)} \cdot d'(n).
\end{align*}
Plugging in the definitions $d'(n) = d - n + 1$ and 
$\epsilon_{u} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$, we see that the above conditions are satisfied when $d > c_4 n \log(n)$ and $d >  c_4 n \| \bmu \|_2$ for some large enough absolute constant $c_4$. This completes the proof. 
\end{proof}









\section{Some Calculations Regarding Lemma~\ref{lemma:riskbound}}

We have
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg].
\end{align*}
By definition, $\Xb = \yb \bmu^\top + \Qb$. Therefore,
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb \leq 2(\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \bmu^\top \bSigma \bmu + 2 \yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb.
\end{align*}

The key is then to give an upper bound on the term $\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb$. By our calculation in Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb - \yb^\top \Ub^{-1} \yb \cdot \db]^\top \Ub^{-1}.
\end{align*}
Denote $\ab = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb - \yb^\top \Ub^{-1} \yb \cdot \db ]$. Then
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb &= \ab^\top (\Qb\Qb^\top)^{-1}\Qb\bSigma\Qb^\top (\Qb\Qb^\top)^{-1} \ab\\
    & = \ab^\top (\Zb\bLambda \Zb^\top)^{-1} \Zb\bLambda^2 \Zb^\top (\Zb\bLambda \Zb^\top)^{-1} \ab,
\end{align*}
where we plug in $\Qb = \Zb \bLambda^{1/2} \Vb^\top$ for $\Zb$ with independent sub-Gaussian entries.


% \begin{proof}[Proof of Lemma~\ref{lemma:riskboundcalculation}]
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}
% \end{align*}

% By definition, we have $\Xb = \yb \bmu^\top + \Qb$, and therefore
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu = \yb^\top (\Xb\Xb^\top)^{-1} (\yb \bmu^\top + \Qb) \bmu = \| \bmu \|_2^2 \cdot \yb^\top (\Xb\Xb^\top)^{-1} \yb 
%     + \yb^\top (\Xb\Xb^\top)^{-1}  \Qb \bmu.
% \end{align*}
% Hence we have
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} =  \| \bmu \|_2^2 + \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}.
% \end{align*}
% By Lemma~\ref{lemma:matrixcalculation}, we have
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1}].
% \end{align*}
% Therefore
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &=  \| \bmu \|_2^2 + \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}\\
%     &=  \| \bmu \|_2^2 + \frac{( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \db - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1} \db }{( 1 + \yb^\top \Ub^{-1} \db ) \yb^\top \Ub^{-1} \yb - \yb^\top \Ub^{-1} \yb \cdot \db^\top \Ub^{-1} \yb}
% \end{align*}
% \end{proof}






\bibliography{deeplearningreference}
\bibliographystyle{ims}

\end{document}
