\documentclass[11pt]{article}
\pdfoutput=1

\usepackage{mathrsfs}
\usepackage{amsmath,amssymb}
\usepackage{bm}
\usepackage{natbib}
\usepackage[usenames]{color}
\usepackage{amsthm}

\usepackage{multirow} 
\usepackage{enumitem}

\newcommand\dd{\mathrm{d}}
\DeclareMathOperator{\sech}{sech}
\DeclareMathOperator{\csch}{csch}
\DeclareMathOperator{\arcsec}{arcsec}
\DeclareMathOperator{\arccot}{arccot}
\DeclareMathOperator{\arccsc}{arccsc}
\DeclareMathOperator{\arccosh}{arccosh}
\DeclareMathOperator{\arcsinh}{arcsinh}
\DeclareMathOperator{\arctanh}{arctanh}
\DeclareMathOperator{\arcsech}{arcsech}
\DeclareMathOperator{\arccsch}{arccsch}
\DeclareMathOperator{\arccoth}{arccoth}


\usepackage[colorlinks,
linkcolor=red,
anchorcolor=blue,
citecolor=blue
]{hyperref}

\renewcommand{\baselinestretch}{1.05}

\def \dd {\mathrm{d}}




\usepackage{setspace}
%\setstretch{1.5}
\usepackage[left=1in, right=1in, top=1in, bottom=1in]{geometry}

\usepackage{xcolor}
\newcommand{\bin}[1]{\textcolor{blue}{[Bin: #1]}}

\ifdefined\final
\usepackage[disable]{todonotes}
\else
\usepackage[textsize=tiny]{todonotes}
\fi
\setlength{\marginparwidth}{0.8in}
\newcommand{\todoy}[2][]{\todo[size=\scriptsize,color=blue!20!white,#1]{Yuan: #2}}
\newcommand{\todod}[2][]{\todo[size=\scriptsize,color=red!20!white,#1]{XXX: #2}}
\newcommand{\todoq}[2][]{\todo[size=\scriptsize,color=orange!20!white,#1]{Quanquan: #2}}
\newcommand{\mip}[1]{\langle #1 \rangle}


\title{\huge Logistic Regression for Sub-Gaussian Mixtures: Overparameterization and Benign Overfitting}

%Over-parameterized Logistic Regression for Sub-Gaussian Mixtures: Risk Bounds and Benign Overfitting

\author
{
    
}


\date{}

\usepackage{mylatexstyle}







\def\supp{\mathop{\text{supp}}}
\def\card{\mathop{\text{card}}}
\def\rank{\mathrm{rank}}
\def\dd{\mathrm{d}}
\def\tr{\mathop{\text{tr}}}
\newcommand{\red}{\color{red}}
\newcommand{\blue}{\color{blue}}
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
\newcommand{\cIs}{\cI_{\hat{s}}}
\def \CC {\textcolor{red}}
\newtheorem{condition}[theorem]{Condition}

%\def \hbtheta{\hat \btheta}
\def \hbtheta{ \btheta}
\def \barell{\bar \ell}
\def \Breg {B_{\psi}}
\def \tbtheta {\tilde{\btheta}}
\def \poly {\mathrm{poly}}



\def\supp{\mathop{\text{supp}}}
\def\card{\mathop{\text{card}}}
\def\rank{\mathrm{rank}}
\def \CC {\textcolor{red}}
\def \CCC {\textcolor{blue}}

%\def \hbtheta{\hat \btheta}
\def \hbtheta{ \btheta}
\def \barell{\bar \ell}
\def \Breg {B_{\psi}}
\def \tbtheta {\tilde{\btheta}}
\def \poly {\mathrm{poly}}

\begin{document}

\maketitle




% \section{Introduction}

\begin{abstract}
    Modern machine learning systems such as deep neural networks are often highly over-parameterized so that they can overfit the noisy training data, yet they can still achieve small test errors in practice. In order to understand this ``benign overfitting'' phenomenon, \citet{bartlett2020benign} established risk bounds for the minimum-norm interpolator in over-parameterized linear regression models. In this paper, we study benign overfitting of logistic regression without explicit regularization for linear classification problems. Specifically, we consider data generated from sub-Gaussian mixtures, and provide a tight risk bound for logistic regression in the over-parameterized setting. Our results precisely characterize the condition under which benign overfitting can occur for logistic regression, and are tighter than those in previous works.  %(1) under certain conditions, we show that, for mixture of sub-Gaussian data, the maximum margin classifier on the training data is equivalent to the minimum norm interpolator; and (2) A tight risk bound for anisotropic data distributions along with a lower bound. %Our results are applicable to the anisotropic setting and are therefore more general than previous works. Moreover, 
    Simulation results corroborate our theory.
    %When reducing to the isotropic case, our risk bound is tighter than existing results for Gaussian data and sub-Gaussian mixtures.
    % The phenomenon of benign overfitting is one of the key mysteries
\end{abstract}

% Some existing works have studied benign overfitting in sub-Gaussian mixtures. However, they are mainly focused on the isotropic setting. 

% Our contribution:
% \begin{enumerate}
% \item \todoq{consider logistic regression, implicit bias, finite-time analysis}
% \item Infinite-dimensional linear classification
% \item Anisotropic
% %\item Comparison with Phil
% %\item \todoq{can we do sub-Gaussian?}
% \item \todoq{Can we cover Massart noise?}
% \item Distribution-specific PAC learning, halfspace, linear classification
% \item do we require the component of the sub-Gaussian r.v. to be independent? See peter's benign overfitting in ridge regression paper.
% \end{enumerate}



\section{Introduction}

In modern machine learning, complicated models such as deep neural networks have gained increasing popularity. These complicated machine learning models are known to have the power to perfectly fit the training data and achieve interpolation. This gives rise to the study of over-parameterized machine learning models under the interpolation regime.

Among the recent studies of learning under the interpolation regime, a remarkable work
\citep{bartlett2020benign} proposed an intriguing phenomenon called \textit{benign overfitting}, which states that over-parameterized models that interpolate noisy training data can still achieve good population risk.  Benign overfitting is quite surprising as it, to a certain extent, contradicts the classic PAC learning theory on the overfitting issue. To explain benign overfitting, \citet{bartlett2020benign} and a series of follow-up works \citep{muthukumar2020harmless,tsigler2020benign} have studied over-parameterized linear regression, and demonstrated that under certain conditions 
on the eigenvalues of the input covariance matrix, the population risk of minimum norm interpolator can be asymptotically optimal.



More recently, benign overfitting has also been studied in the setting of linear classification \citep{chatterji2020finite,muthukumar2020classification,wang2020benign}. \citet{muthukumar2020classification} studied the setting where the data inputs are Gaussian and the labels are generated from a ground truth linear model with noise, and utilized an equivalence result between the hard-margin SVM solution and the minimum norm interpolator to study benign overfitting. 
\citet{chatterji2020finite,wang2020benign} studied the benign overfitting phenomenon in Gaussian mixture models and established population error bounds for the maximum margin classifier. However, unlike the results in the regression setting, the current results for Gaussian/sub-Gaussian mixture models are focused on isotropic or close-to-isotropic settings, where the risk bounds do not reflect the impact of the spectrum of the covariance matrix.



% Motivated by the observation that over-parameterized linear logistic regression produces the maximum margin classifier, i.e., hard-margin support vector machine (SVM) solution \citep{soudry2017implicit}, these works establish population risk bounds of the maximum margin classifier for two-class linear classification problems. 




% A particularly interesting observation studied by  \citet{muthukumar2020classification,wang2020benign} is the equivalence between classification and regression in the over-parameterized setting. 

% More recently, benign overfitting has also been studied in the setting of over-parameterized linear classification for Gaussian/sub-Gaussian mixtures \citep{chatterji2020finite,muthukumar2020classification,wang2020benign}. Motivated by the observation that over-parameterized linear logistic regression produces the maximum margin classifier, i.e., hard-margin support vector machine (SVM) solution \citep{soudry2017implicit}, these works establish population risk bounds of the maximum margin classifier for two-class linear classification. 
% A particularly interesting observation studied by  \citet{muthukumar2020classification,wang2020benign} is the equivalence between classification and regression in the over-parameterized setting. 

%  \citet{wang2020benign} specifically utilized an observation on the  equivalence between classification and regression in the over-parameterized setting \citet{muthukumar2020classification}, and proved that under certain conditions, the hard-margin SVM solution is exactly the minimum norm linear interpolator.  \citet{wang2020benign} showed that the hard-margin SVM solution for isotropic Gaussian mixtures. 



% Among the recent studies of learning under the interpolation regime, A remarkable work \citet{muthukumar2020classification} showed an interesting  phenomenon on the equivalence between classification and regression in the over-parameterized setting. The authors proved that for Gaussian design linear classification, under certain conditions the hard-margin SVM solution coincides with the minimum norm solution of least square regression. \citet{wang2020benign} also studied isotropic Gaussian mixture classification and showed that under certain conditions, the hard-margin SVM solution is exactly the minimum norm linear interpolator. 


% These works also utilized the equivalence results between classification and regression to establish population classification error bound for the study of benign overfitting.

% studied in \citet{muthukumar2020classification,wang2020benign} together with


% The equivalence between hard-margin SVM and minimum norm interpolator is closely related to the \textit{benign overfitting} phenomenon \citep{bartlett2020benign}. 

% A surprising phenomenon is that , while at the same time generalize well on test data.  



In this work, we study the benign overfitting phenomenon in anisotropic sub-Gaussian mixture models, where the $d$-dimensional data from two classes have the same covariance matrix $\bSigma$ but have different means $\bmu$ and $-\bmu$ respectively. We first demonstrate that under certain conditions regarding the eigenvalues of $\bSigma$, the mean vector $\bmu$ and the sample size $n$, the maximum margin classifier for this problem is identical to the minimum norm interpolator. We then utilize this result to establish a tight population error bound of the maximum margin classifier. Our result reveals how the eigenvalues of the covariance matrix $\bSigma$ affect the benign property of the classification problem, and is tighter and more general compared with existing results on Gaussian mixture models. The contributions of this paper are as follows: 

\begin{itemize}[leftmargin = *]
    \item We establish a condition for anisotropic sub-Gaussian mixture models under which the maximum margin classifier coincides with the minimum norm interpolator of the training data. \CC{Although we are considering a more complicated model, our condition on $\bSigma$ is better than the conditions given in \citet{muthukumar2020classification} by a $\log(n)$ factor, and our condition on the relation between $\bSigma$, $\bmu$ and $n$ is better than the result in \citet{wang2020benign} by a  $\log(n)$ factor as well}.  
    \item We also establish a tight population error bound for the maximum margin classifier. Our bound works for anisotropic settings, which is more general than the settings studied in \citet{chatterji2020finite,wang2020benign} and covers the infinite-dimensional setting. When reducing our bound to the setting studied in \citet{chatterji2020finite}, our result gives a bound $\exp(-\Omega( n \| \bmu \|_2^4 / d ))$, which is tighter than the risk bound $\exp(-\Omega( \| \bmu \|_2^4 / d ))$ in \CC{\citet{chatterji2020finite} by a factor of $n$ in the exponent.} For certain anisotropic settings that are not covered by existing results, our bound can be even tighter.  
\end{itemize}




\subsection{Additional Related Work}

% The phenomenon of benign overfitting has been studied by a recent line of work \citep{bartlett2020benign,chatterji2020finite,muthukumar2020classification,wang2020benign}. 

% The phenomenon of benign overfitting was first raised up by \citet{bartlett2020benign}, where the authors studied over-parameterized linear regression and showed that under certain conditions, minimum norm interpolator on the noisy training data can still achieve asymptotically optimal population risk. A later work \citet{chatterji2020finite} studied the risk of interpolating linear classifiers under the setting where the two class-conditional distributions have different means but the same covariance
% matrix. \citet{muthukumar2020classification} studied benign overfitting for a different classification problem with Gaussian features, and highlighted an important observation that under certain conditions, the solutions of hard-margin support vector machine (SVM) are identical to the least-squares minimum-norm interpolators. Following the same intuition, a very recent work \citet{wang2020benign} further studied the equivalence between SVM and minimum norm interpolator and the benign overfitting phenomenon under an isotropic Gaussian mixture model. The results in this paper are closely related to \citet{muthukumar2020classification} and \citet{wang2020benign}. Compared with these existing results, our work applies to infinite-dimensional, anisotropic sub-Gaussian mixtures, and gives a tighter population risk bound. 

Benign overfitting is closely related to the phenomenon of double descent studied in recent works. \citet{belkin2019reconciling,belkin2019two} presented experimental results and theoretical analysis on specific models demonstrating that the curve of risk versus over-parameterization has a double descent shape, thereby indicating that over-parameterization can be beneficial to achieve small risk. \citet{hastie2019surprises,wu2020optimal} studied the double descent phenomenon in linear regression under the setting where the dimension $d$ and sample size $n$ have a fixed ratio. \citet{mei2019generalization,liao2020random,montanari2020interpolation} further extended the setting to random feature models and studied double descent when the sample size, data dimension and the number of random features have fixed ratios.

Our work is also related to the study of implicit bias, which analyzes the impact of training algorithms when the over-parameterized models have multiple global minima. Specifically, \citet{soudry2017implicit} showed that if the training data are linearly separable, then gradient descent on unregularized logistic regression gives the maximum margin solution. \citet{li2018algorithmic} further studied the implicit bias of gradient descent for logistic regression on non-separable data. \citet{gunasekar2018characterizing} studied the implicit bias of various optimization methods for generic objective functions. \citet{gunasekar2017implicit,arora2019implicit,lyu2019gradient} studied the implicit bias of gradient flow for learning linear networks or homogeneous networks. \CC{These studies of implicit bias offer a justification for our study of maximum margin classifiers and minimum norm interpolators for linear models.}


Our work is also motivated by the recent study of the generalization error bounds of over-parameterized neural networks. \citet{arora2019fine} gave a generalization error bound of over-parameterized two-layer networks interpolating the training data. \citet{allen2018learning} showed that two- and three-layer networks can have small test error when learning a class of smooth networks. \citet{cao2019generalizationsgd,chen2019much} gave generalization bounds for over-parameterized deep ReLU networks trained by (stochastic) gradient descent. While these results can handle noises when using early stopping or online gradient descent, they cannot give meaningful generalization bounds when the neural networks interpolate noisy training data, and therefore cannot explain benign overfitting. 


% \citet{gunasekar2017implicit}

% A series of papers \citep{gunasekar2017implicit,soudry2017implicit,gunasekar2018characterizing,gunasekar2018implicit,nacson2018stochastic,li2018algorithmic,jacot2020implicit} studied implicit bias problem, aiming to figure out when there are multiple optimal solutions of a training objective function, what kind of nice properties the optimal found by a certain training algorithm would have. Implicit bias results of gradient descent, stochastic gradient descent, or mirror descent for various problem settings including matrix factorization, logistic regression, deep linear networks as well as homogeneous models. The major difference between these results and our work is that implicit bias results usually focus on the parameter space, while we study the functions a neural network prefer to learn in the function space. 

\section{Problem Setting}
Suppose that $y_i$, $i\in [n]$ are generated as i.i.d. Rademacher variables. Suppose that $\qb$ is generated from a distribution with covariance matrix $\bSigma$ such that $\qb = \Vb \bLambda^{1/2} \ub $, where $\bSigma = \Vb \bLambda \Vb^\top$ is the spectral decomposition of $\bSigma$, and $\ub$ is a random vector with independent zero-mean sub-Gaussian entries. Without loss of generality, we assume that $\EE u_j^2 = 1$ and $\sup_j \| u_j \|_{\psi_2} \leq \sigma_q$ (This is because we can absorb the scaling into $\bSigma$). The features are given as
\begin{align*}
    \xb_i = y_i\cdot \bmu + \qb_i, \qquad i\in [n],
\end{align*}
where $\bmu \in \RR^p$ is a fixed vector and $\qb_1,\ldots,\qb_n$ are independent copies of $\qb$. We denote
\begin{align*}
    \Xb = \yb \bmu^\top + \Qb,
\end{align*}
where $\Xb = [ \xb_1,\ldots, \xb_n ]^\top, \Qb = [ \qb_1,\ldots, \qb_n ]^\top \in \RR^{n \times p}$.%, and each row of $\Qb$ are generated independently from $N(\mathbf{0}, \bSigma)$. 




\section{Main Results}



Consider
\begin{align*}
    \hat\btheta_{\text{LS}} = \argmin \| \btheta \|_2^2 \qquad \text{subject to } y_i = \btheta^\top \xb_i, i\in [n]
\end{align*}
and 
\begin{align*}
    \hat\btheta_{\text{SVM}} = \argmin \| \btheta \|_2^2 \qquad \text{subject to } y_i \cdot \btheta^\top \xb_i \geq 1, i\in [n]
\end{align*}

% We study the classification risk
% \begin{align*}
%     R(\btheta) = \PP( y\cdot \btheta^\top \xb < 0 ).
% \end{align*}


\begin{lemma}\label{lemma:equivalence}
Suppose that $y_i \cdot \eb_i^\top (\Xb\Xb^\top)^{-1}\yb >0$ for $i\in[n]$. Then $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
\end{lemma}

Denote $\Ub = \Qb\Qb^\top$, $\bnu = \Qb\bmu$. Then the following lemma is given as Lemma 3 in \citet{wang2020benign}.

\begin{lemma}\label{lemma:matrixcalculation}
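Let $s = \yb^\top \Ub^{-1} \yb$, $t = \bnu^\top \Ub^{-1} \bnu$, $h = \yb^\top \Ub^{-1} \bnu$, and $D = s (\| \bmu\|_2^2 - t) + (h+1)^2$. Then the following calculation of $\yb^\top (\Xb\Xb^\top)^{-1}$ holds: 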
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}].
\end{align*}
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} &= \yb^\top \Ub^{-1} - D^{-1}\cdot [ \| \bmu \|_2^2 s + h^2 + h - st ] \cdot \yb^\top \Ub^{-1} - D^{-1}\bnu^\top \Ub^{-1}\\
%     & =\bigg[ 1 - \frac{\| \bmu \|_2^2 s + h^2 + h - st }{ \| \bmu\|_2^2 s - st + (h+1)^2} \bigg] \cdot \yb^\top \Ub^{-1} - D^{-1}\bnu^\top \Ub^{-1} \\
%     & = \frac{ h + 1 }{ \| \bmu\|_2^2 s - st + (h+1)^2} \cdot \yb^\top \Ub^{-1} - D^{-1}\bnu^\top \Ub^{-1}\\
%     & = D^{-1} [ (h + 1) \yb^\top \Ub^{-1} - \bnu^\top \Ub^{-1}].
% \end{align*}
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:matrixcalculation}]

Denote $s = \yb^\top \Ub^{-1} \yb$, $t = \bnu^\top \Ub^{-1} \bnu$, $h = \yb^\top \Ub^{-1} \bnu$, $D = s (\| \bmu\|_2^2 - t) + (h+1)^2$. Then by Lemma 3 in \citet{wang2020benign}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} &= \yb^\top \Ub^{-1} - D^{-1}\cdot [ \| \bmu \|_2^2 s + h^2 + h - st ] \cdot \yb^\top \Ub^{-1} - D^{-1} s \cdot \bnu^\top \Ub^{-1}.
\end{align*}
Rearranging terms, we obtain
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} 
    & =\bigg[ 1 - \frac{\| \bmu \|_2^2 s + h^2 + h - st }{ \| \bmu\|_2^2 s - st + (h+1)^2} \bigg] \cdot \yb^\top \Ub^{-1} - D^{-1} s \cdot \bnu^\top \Ub^{-1} \\
    & = \frac{ h + 1 }{ \| \bmu\|_2^2 s - st + (h+1)^2} \cdot \yb^\top \Ub^{-1} - D^{-1} s \cdot \bnu^\top \Ub^{-1}\\
    & = D^{-1} [ (h + 1) \yb^\top \Ub^{-1} - s \cdot \bnu^\top \Ub^{-1}].
\end{align*}
\end{proof}



Denote by $\lambda_1 \geq \lambda_2 \geq \cdots \geq \lambda_p$ the eigenvalues of $\bSigma$ in descending order, so that $\lambda_1 = \| \bSigma\|_{2}$, with corresponding eigenvectors $\vb_1,\ldots,\vb_p$. Denote $\bLambda = \diag\{ \lambda_1 ,\lambda_2,\ldots, \lambda_p \}$  and  $\Vb = [ \vb_1,\ldots,\vb_p ]$, then $\bSigma = \Vb \bLambda \Vb^\top$. We also denote by $\mu_1(\Ub) \geq \cdots \geq \mu_n(\Ub)$ the eigenvalues of $\Ub$.
 

% The following lemma is given by Lemma 26 in \citet{bartlett2020benign}.

% \begin{lemma}\label{lemma:eigenvalue_concentration}
% There exists an absolute constant $c$ such that, with probability at least $1 - 2\exp(-n/c)$,
% \begin{align*}
%     c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} \leq \mu_n(\Ub) \leq \mu_1(\Ub) \leq c \sum_{i} \lambda_i + c \| \bSigma\|_{2}  n.
% \end{align*}

% \end{lemma}


% The following lemma (Lemma 1 in \citet{muthukumar2020classification}) gives another estimate on the eigenvalues of $\Ub$.


% \begin{lemma}\label{lemma:eigenvalue_concentration2}
%     With probability at least $1 - 2/n$, 
%     \begin{align*}
%         \Big\| \Ub - d'(n) \Ib \Big\|_2 \leq \epsilon_{u} := \frac{1}{12\sqrt{n}} \tr(\bSigma) + 4\sqrt{n\log(n)}\cdot  \| \bSigma\|_{F}  + 8n\log(n)\cdot  \| \bSigma\|_{2} .
%     \end{align*}
% \end{lemma}
% \todoq{It seems that the $\|\lambda\|_1$ term on the right hand size does not appear in Peter's paper}
% \todoq{Is Peter's bound tighter than this one?}





The following lemma summarized from the proof of Lemma~26 in \citet{bartlett2020benign} estimates the eigenvalues of $\Ub$.


\begin{lemma}\label{lemma:eigenvalue_concentration}
    With probability at least $1 - 1/n$, \todoy{Todo: change notation, trace, 2-norm, F-norm, etc}
    \begin{align*}
    \big\| \Ub  - \tr(\bSigma) \Ib \big\|_2 \leq \epsilon_{u} := C \sigma_q^2  \big( n\cdot \| \bSigma\|_2  + \sqrt{n}\cdot \| \bSigma\|_F \big),
\end{align*}
where $C$ is an absolute constant.
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:eigenvalue_concentration}]
Let $\cN$ be a $1/4$-net on the unit sphere $S^{n-1}$. Then by Lemma~5.2 in \citet{vershynin2010introduction}, we have $|\cN| \leq 9^n$. Denote $\zb_j =  \lambda_j^{-1/2} \Qb \vb_j \in \RR^{n}$ for $j\in [p]$. Then by definition, for any fixed unit vector $\hat\ab\in \cN$ we have $ \hat\ab^\top \Ub \hat\ab = \hat\ab^\top \Qb\Qb^\top \hat\ab = \hat\ab^\top \big( \sum_{j=1}^p \lambda_j \zb_j \zb_j^\top \big) \hat\ab = \sum_{j=1}^p \lambda_j (\hat\ab^\top\zb_j)^2$. By Lemma~5.9 in \citet{vershynin2010introduction}, there exists an absolute constant $c_1$ such that $\| \hat\ab^\top\zb_j \|_{\psi_2} \leq c_1 \sigma_q$ for all $j\in [p]$. Therefore by Lemma~21 and Corollary~23 in \citet{bartlett2020benign}, for any $t>0$, with probability at least $1 - 2\exp(-t)$ we have
\begin{align*}
    \big| \hat\ab^\top \Ub \hat\ab - \tr(\bSigma) \big| \leq c_2 \sigma_q^2 \max \big(  t\cdot \| \bSigma\|_{2}  , \sqrt{t}\cdot  \| \bSigma\|_{F}  \big).
\end{align*}
Applying a union bound over all $\hat\ab\in \cN$, we have that with probability at least $1 - 2\cdot 9^n \exp(-t )$,
\begin{align*}
    \big| \hat\ab^\top \Ub \hat\ab - \tr(\bSigma) \big| \leq c_2 \sigma_q^2 \max \big(  t\cdot \| \bSigma\|_{2} , \sqrt{t}\cdot  \| \bSigma\|_{F}  \big) 
\end{align*}
for all $\hat\ab\in \cN$. Therefore by Lemma~25 in \citet{bartlett2020benign}, with probability at least $1 - 2\cdot 9^n \exp(-t )$, we have
\begin{align*}
    \big\| \Ub  - \tr(\bSigma) \Ib \big\|_2 \leq c_3 \sigma_q^2  \big(  t \cdot \| \bSigma\|_{2}  + \sqrt{t}\cdot  \| \bSigma\|_{F}  \big),
\end{align*}
where $c_3$ is an absolute constant. 
Setting $t = c_4 n$ for some large enough constant $c_4$, we have that with probability at least $1 - 1/n$,
\begin{align*}
    \big\| \Ub  - \tr(\bSigma) \Ib \big\|_2 \leq c_5  \sigma_q^2 \big(  n \cdot \| \bSigma\|_{2} + \sqrt{n}\cdot  \| \bSigma\|_{F}  \big),
\end{align*}
where $c_5$ is an absolute constant. This completes the proof.
\end{proof}










% \begin{lemma}\label{lemma:concentrationbounds}
% With probability at least $1 - 2/n$,
% \begin{align*}
%     \frac{n}{ c \sum_{i} \lambda_i + c n \cdot \| \bSigma\|_{2} } \leq  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} }
% \end{align*}

% \begin{align*}
%     \frac{n}{ c \sum_{i} \lambda_i + c n \cdot \| \bSigma\|_{2} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \bnu^\top \Ub^{-1} \bnu \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} } \cdot \| \bmu \|_{\bSigma}^2
% \end{align*}


% \begin{align*}
%     |\yb^\top \Ub^{-1} \bnu| \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} }  \| \bmu \|_{\bSigma}
% \end{align*}

% % \begin{align*}
% %     \frac{n}{ \tr( \bSigma )  - \epsilon_{u} } \leq  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ \tr( \bSigma )  + \epsilon_{u} }
% % \end{align*}

% % \begin{align*}
% %     \frac{n}{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \tilde\bmu \|_2 \leq  \bnu^\top \Ub^{-1} \bnu \leq \frac{n}{ \tr( \bSigma )  + \epsilon_{u} } \cdot \| \tilde\bmu \|_2
% % \end{align*}



% \end{lemma}



% The following lemma gives bounds on $f_i = \bnu^\top \Ub^{-1} \eb_i y_i$, $g_i = \yb^\top \Ub^{-1} \eb_i y_i$. \todoq{what is the intuition of this lemma?}


% The following lemma is summarized form Lemma 1 in \citet{muthukumar2020classification}.


\begin{lemma}\label{lemma:concentrationbounds}
With probability at least $1 - 2/n$, the following inequalities hold. %\todoq{What is $s$ and $t$?}, \todoy{simplify notation}
\begin{align*}
    &\frac{n}{ \tr( \bSigma )  + \epsilon_{u} } \leq  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ \tr( \bSigma )  - \epsilon_{u} },\\
    &\frac{n  - C \sqrt{n\log( n)}}{ \tr( \bSigma )  + \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \bnu^\top \Ub^{-1} \bnu \leq \frac{ n  + C \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2,\\
    &|\yb^\top \Ub^{-1} \bnu| \leq \frac{C n   }{  \tr( \bSigma )  - \epsilon_{u}  } \| \bmu \|_{\bSigma},
\end{align*}
where $C$ is an absolute constant.
% \begin{align*}
%     \frac{n  - c \sqrt{n\log( n)}}{ \tr( \bSigma )  + \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \bnu^\top \Ub^{-1} \bnu \leq \frac{ n  + c \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2
% \end{align*}

% \begin{align*}
%     |\yb^\top \Ub^{-1} \bnu| \leq \frac{n  + c \sqrt{n\log( n)} }{  \tr( \bSigma )  - \epsilon_{u}  } \| \bmu \|_{\bSigma}
% \end{align*}

\end{lemma}



\begin{proof}[Proof of Lemma~\ref{lemma:concentrationbounds}]
The bounds on $\yb^\top \Ub^{-1} \yb$ are directly derived from Lemma~\ref{lemma:eigenvalue_concentration} and the fact that $\| \yb \|_2^2 = n$. To derive the bounds for $\bnu^\top\Ub^{-1}\bnu$, we note that by definition, $\bnu = \Qb\bmu$ and 
\begin{align*}
    \bnu^\top \Ub^{-1} \bnu = \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu. %= \| \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu \|_2^2
\end{align*}

Denote $\zb_i =  \lambda_i^{-1/2} \Qb \vb_i \in \RR^{n}$,  $\Zb = [\zb_1,\ldots, \zb_p] \in \RR^{n \times p}$, and $\tilde\bmu = \bLambda^{1/2} \Vb^\top \bmu$. Then $\Qb = \Zb \bLambda^{1/2} \Vb^\top$, and
\begin{align*}
    \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu &= \bmu^\top \Vb  \bLambda^{1/2} \Zb^\top ( \Zb \bLambda \Zb^\top )^{-1} \Zb \bLambda^{1/2} \Vb^\top \bmu\\
    &= \tilde\bmu^\top  \Zb^\top ( \Zb \bLambda \Zb^\top )^{-1} \Zb  \tilde\bmu\\
    &\leq \frac{ \| \Zb \tilde\bmu \|_2^2 }{ \tr(\bSigma) - \epsilon_{u} }.
\end{align*}
Similarly, we have
\begin{align*}
    \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu \geq \frac{ \| \Zb \tilde\bmu \|_2^2 }{  \tr(\bSigma) + \epsilon_{u} }.
\end{align*}
We now proceed to give upper and lower bounds for the term $\| \Zb \tilde\bmu \|_2^2 =  \sum_{i=1}^n ( \sum_{j=1}^p \Zb_{ij} \tilde\mu_j )^2$. Note that by definition,  $\Zb_{ij}$ for $i\in [n]$ and $j\in [p]$ are independent sub-Gaussian random variables with $\| \Zb_{ij} \|_{\psi_2} \leq \sigma_q$. By Lemma~5.9 in \citet{vershynin2010introduction}, we have
\begin{align*}
    \Bigg\| \sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg\|_{\psi_2} \leq \| \tilde\bmu \|_2 \cdot \sigma_q, 
\end{align*}
and therefore by Lemma~5.14 in \citet{vershynin2010introduction}, we have
\begin{align*}
     \Bigg\| \Bigg(\sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg)^2 - \| \tilde\bmu \|_2^2 \Bigg\|_{\psi_1} \leq  2\| \tilde\bmu \|_2^2 \cdot (1 + \sigma_q^2).
\end{align*}
By Bernstein's inequality, with probability at least $1 - n^{-1}$, 
\begin{align*}
    \big| \| \Zb \tilde\bmu \|_2^2 - \EE \| \Zb \tilde\bmu \|_2^2 \big| \leq c_1 (1 + \sigma_q^2)\cdot \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)},
\end{align*}
where $c_1$ is an absolute constant. Merging $\sigma_q^2$ into the constant coefficient, we have
% with probability at least $1 - \exp(-n/c_1)$,
% \begin{align*}
%     \big| \| \Zb \tilde\bmu \|_2^2 - \EE \| \Zb \tilde\bmu \|_2^2 \big|\leq n\cdot \| \tilde\bmu \|_2 /c_1.
% \end{align*}
% Therefore we have
\begin{align*}
     n  \| \tilde\bmu \|_2^2 - c_2 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)} \leq \| \Zb \tilde\bmu \|_2^2 \leq  n \| \tilde\bmu \|_2^2 + c_2 \| \tilde\bmu \|_2^2\cdot \sqrt{n\log( n)},
\end{align*}
and
\begin{align*}
    \frac{n - c_2 \sqrt{n\log( n)}}{ \tr(\bSigma) + \epsilon_{u} } \cdot \| \tilde\bmu \|_2^2 \leq  \bnu^\top \Ub^{-1} \bnu \leq \frac{n + c_2 \sqrt{n\log( n)}}{ \tr(\bSigma) - \epsilon_{u} } \cdot \| \tilde\bmu \|_2^2.
\end{align*}
Similarly for $\yb^\top \Ub^{-1} \bnu$, by Cauchy-Schwarz inequality, for large enough $n$ we have
\begin{align*}
    |\yb^\top \Ub^{-1} \bnu| = |\yb^\top (\Qb\Qb^\top)^{-1} \Qb \bmu| &\leq \| \yb \|_2 \cdot \| (\Qb\Qb^\top)^{-1} \Qb \bmu \|_2 = \sqrt{n} \cdot \sqrt{ \bmu^\top \Qb^\top (\Qb\Qb^\top)^{-2} \Qb \bmu }\\
    &\leq\frac{\sqrt{n} \cdot \sqrt{n + c_2 \sqrt{n\log( n)}} }{  \tr(\bSigma) - \epsilon_{u}  } \| \tilde\bmu \|_2\\
    &\leq \frac{c_3 n }{  \tr(\bSigma) - \epsilon_{u}  } \| \tilde\bmu \|_2,
\end{align*}
where $c_3$ is an absolute constant. Note that $\| \tilde\bmu \|_2 = \| \bmu \|_{\bSigma}$. This completes the proof.
% Denote $\Zb = \Qb \Sigma^{-1/2}$. Then each row of $\Zb$ follows standard Gaussian distribution.
\end{proof}






% \begin{proof}[Proof of Lemma]
% \begin{align*}
%      n \| \tilde\bmu \|_2 + c \| \tilde\bmu \|_2\cdot \sqrt{n\log( n)} 
% \end{align*}
% \end{proof}


% \begin{lemma}\label{lemma:asymmetricbounds}
% For any vector $\ub \in S^{n-1}$, 
% \end{lemma}

% \begin{proof}[Proof of Lemma~\ref{lemma:asymmetricbounds}]
% Denote $\ab_i = \sqrt{n} \eb_i y_i$. Then we have
% \begin{align*}
%     \yb^\top \Ub^{-1} \eb_i y_i = n^{-1/2} \yb^\top \Ub^{-1} \ab_i = \frac{1}{4\sqrt{n}} (\yb + \ab_i)^\top \Ub^{-1} (\yb + \ab_i) - \frac{1}{4\sqrt{n}} (\yb - \ab_i)^\top \Ub^{-1} (\yb - \ab_i).
% \end{align*}
% Note that by definition, $\| \yb + \ab_i \|_2^2 = $
% \end{proof}






\begin{lemma}\label{lemma:anisotropicbound1}
With probability at least $1 - 4n^{-1}$, 
\begin{align*}
    \big| \bmu^\top\Qb^\top \Ub^{-1} \eb_i y_i \big| \leq \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{u} +  C\sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{u}^2  }
\end{align*}
for all $i\in [n]$, where $C$ is an absolute constant. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:anisotropicbound1}] 
\begin{align}
    \bmu^\top\Qb^\top\Ub^{-1} \eb_i y_i &= \frac{1}{\| \Qb\bmu \|_2}\cdot (\Qb\bmu)^\top \Ub^{-1} (\| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & = \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    &\quad - \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ \tr(\bSigma)- \epsilon_{u}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ \tr(\bSigma)+ \epsilon_{u}  }     \bigg] \nonumber\\
    &= \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{2 \| \Qb\bmu \|_2^2 + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ \tr(\bSigma)- \epsilon_{u}  } - \frac{2\| \Qb\bmu \|_2^2  - 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ \tr(\bSigma)+ \epsilon_{u}  } \bigg] \nonumber \\
    &= \frac{1}{2\| \Qb\bmu \|_2}\cdot \frac{2\| \Qb\bmu \|_2^2 \cdot \epsilon_{u} + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{u}^2  }\nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{u} +  y_i \eb_i^\top \Qb\bmu \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{u}^2  },\label{eq:fgbounds_eq1}
\end{align}
where the second equality holds due to the parallelogram law $\ab^\top \Mb \bbb = 1/4(\ab+\bbb)^\top \Mb (\ab+\bbb) - 1/4(\ab-\bbb)^\top \Mb (\ab-\bbb)$, and the first inequality follows by Lemma~\ref{lemma:eigenvalue_concentration}. Based on our model assumption, we can denote $\Qb = \Zb \bLambda^{1/2} \Vb^\top$, where the entries of $\Zb$ are independent sub-Gaussian random variables with $ \| \Zb_{ij} \|_{\psi_2} \leq \sigma_q$ for all $i\in [n]$ and $j\in[p]$. Denote $\tilde\bmu = \bLambda^{1/2} \Vb^\top \bmu$. Then with the same proof as in Lemma~\ref{lemma:concentrationbounds}, we have 
\begin{align*}
      \| \Qb \bmu \|_2^2= \| \Zb \tilde\bmu \|_2^2 \leq  2n \| \tilde\bmu \|_2^2  = 2n \| \bmu \|_{\bSigma}^2
\end{align*}
when $n$ is large enough. Moreover, we also have
\begin{align*}
    \| y_i \eb_i^\top \Qb \bmu \|_{\psi_2} = \Bigg\| \sum_{j=1}^p \Zb_{ij} \tilde\mu_j \Bigg\|_{\psi_2} \leq \| \tilde\bmu \|_2 \cdot \sigma_q. 
\end{align*}
Therefore by Hoeffding's inequality, with probability at least $1 - n^{-1}$, we have
\begin{align}
    | y_i \eb_i^\top \Qb \bmu | \leq c_1 \| \tilde\bmu \|_2 \cdot \sqrt{\log(n)} = c_1 \| \bmu \|_{\bSigma} \cdot \sqrt{\log(n)},\label{eq:fgbounds_eq2}
\end{align}
where $c_1$ is an absolute constant. Therefore we have
\begin{align*}
    \bnu^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{u}^2  }.
\end{align*}
With the exact same proof, we also have
\begin{align*}
    -\bnu^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{u}^2  }.
\end{align*}
This completes the proof.
\end{proof}







\begin{lemma}\label{lemma:anisotropicbound2}
With probability at least $1 - 2n^{-1}$, 
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i  \geq \frac{ \tr(\bSigma) - \sqrt{n} \epsilon_{u} }{\tr(\bSigma)^2  - \epsilon_{u}^2}
\end{align*}
for all $i\in [n]$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:anisotropicbound2}]
The proof is very similar to the proof of Lemma~\ref{lemma:anisotropicbound1}. We have
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{\tr(\bSigma)  + \epsilon_{u}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{\tr(\bSigma)  - \epsilon_{u}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{\tr(\bSigma)  + \epsilon_{u}} - \frac{2n - 2\sqrt{n}}{\tr(\bSigma)  - \epsilon_{u}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (\tr(\bSigma)  - \epsilon_{u}) - (n - \sqrt{n}) (\tr(\bSigma)  + \epsilon_{u}) }{\tr(\bSigma)^2  - \epsilon_{u}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} \tr(\bSigma) - 2n \epsilon_{u} }{\tr(\bSigma)^2  - \epsilon_{u}^2}\\
     & = \frac{ \tr(\bSigma) - \sqrt{n} \epsilon_{u} }{\tr(\bSigma)^2  - \epsilon_{u}^2},
\end{align*}
where we use the parallelogram law $\ab^\top \Mb \bbb = 1/4(\ab+\bbb)^\top \Mb (\ab+\bbb) - 1/4(\ab-\bbb)^\top \Mb (\ab-\bbb)$ in the first equality and use Lemma~\ref{lemma:eigenvalue_concentration} in the first inequality. This completes the proof.
\end{proof}








\begin{theorem}[Old]
There exists a large enough absolute constant $C$ such that when $ \tr( \bSigma ) \geq C n^{3/2} \|\bmu \|_{\bSigma}$ and $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{u}$, $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
\end{theorem}


\begin{theorem}[old2]
There exists a large enough absolute constant $C$ such that when 
\begin{align*}
    &\tr( \bSigma ) \geq C n \| \bmu \|_{\bSigma},\\
    &\tr(\bSigma)^2 \geq Cn^{5/2}  \| \bmu \|_{\bSigma} \|\bSigma \|_2, \\
    &\tr(\bSigma)^2 \geq C n^2 \| \bmu \|_{\bSigma} \|\bSigma \|_F, \\
    &\tr( \bSigma ) \geq C  n\sqrt{\log(n)},
\end{align*}
$\hat\btheta_{\text{SVM}}$ also solves the least square problem.
\end{theorem}



\begin{theorem}[New]\label{thm:interpolationregression}
There exists a large enough absolute constant $C$ such that when $\tr( \bSigma ) \geq C \big( n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)} \big)$ and $\tr( \bSigma ) \geq C n\cdot \| \bmu \|_{\bSigma}$, 
% \begin{align*}
%     &\tr( \bSigma ) \geq C n \| \bmu \|_{\bSigma},\\
%     &\tr(\bSigma)^2 \geq Cn^{5/2}  \| \bmu \|_{\bSigma} \|\bSigma \|_2, \\
%     &\tr(\bSigma)^2 \geq C n^2 \| \bmu \|_{\bSigma} \|\bSigma \|_F, \\
%     &\tr( \bSigma ) \geq C  n\sqrt{\log(n)},
% \end{align*}
$\hat\btheta_{\text{SVM}}$ also solves the least square problem.
\end{theorem}


\begin{remark}
The condition $\tr( \bSigma ) = \Omega( n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)})$ in  Theorem~\ref{thm:interpolationregression} coincides with the condition given by \citet{muthukumar2020classification} for the anisotropic setting. 
\end{remark}




\begin{proof}[Proof of Theorem~\ref{thm:interpolationregression}] By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \eb_i y_i - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1} \eb_i y_i].
\end{align*}
Plugging in the inequalities in Lemmas~\ref{lemma:concentrationbounds}, \ref{lemma:anisotropicbound1} and \ref{lemma:anisotropicbound2}, we have that as long as $\tr(\bSigma) > c_1\max\{ n \| \bmu \|_{\bSigma}, \epsilon_{u}\}$ for some large enough constant $c_1$, 
\begin{align}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &= D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}\eb_i y_i ] \nonumber \\
    &\propto  ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}\eb_i y_i  \nonumber\\
    &\geq (1/2)\cdot \yb^\top \Ub^{-1} \eb_i y_i  - \frac{ c_2 n }{ \tr(\bSigma)}\cdot |\bnu^\top \Ub^{-1}\eb_i y_i| \nonumber\\
    &\geq  \frac{1}{2} \cdot \frac{ \tr(\bSigma) - \sqrt{n} \epsilon_{u} }{\tr(\bSigma)^2  - \epsilon_{u}^2} -  \frac{c_2n}{\tr(\bSigma)}\cdot \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{u} +  c_3\sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{u}^2  } \nonumber\\
    &\propto  \tr(\bSigma) - \sqrt{n} \epsilon_{u}  -  c_4 (n^{3/2}/\tr(\bSigma))\cdot \| \bmu \|_{\bSigma} \cdot \epsilon_{u} - c_5 n \sqrt{\log(n)}, \label{eq:equivalenceproof_eq1}
\end{align}
where $c_2,c_3,c_4,c_5> 0$ are absolute constants. Recall that in Lemma~\ref{lemma:eigenvalue_concentration}, 
$$\epsilon_{u} = c_6 \sigma_q^2  \big( n\cdot \| \bSigma\|_2  + \sqrt{n}\cdot \| \bSigma\|_F \big)$$
for some absolute constant $c_6$. \CC{The difference in concentration inequalities, i.e. difference between Lemma \ref{lemma:eigenvalue_concentration} and Lemma \ref{lemma:eigenvalueconcentration_isotropic} makes the proof for the anisotropic case more complicated. For the isotropic case, $\tr(\bSigma) \rightarrow d-n+1$, $\|\bSigma \|_F \rightarrow \sqrt{d-n+1}$}.

By our assumption that $\tr( \bSigma ) \geq C \big( n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)} \big)$ and $\tr( \bSigma ) \geq C n\cdot \| \bmu \|_{\bSigma}$ for some large enough absolute constant $C$, we see that
\begin{align*}
     &\tr(\bSigma) \geq C\sqrt{n} \epsilon_{u}, \\
     &\tr(\bSigma)^2 \geq C^2 n^{3/2} \cdot \| \bmu \|_{\bSigma} \cdot \epsilon_{u}, \\
     & \tr(\bSigma) \geq C n \sqrt{\log(n)}, 
\end{align*}
and therefore by \eqref{eq:equivalenceproof_eq1} we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i > 0.
\end{align*}
This completes the proof.
\end{proof}


% For our setting here, requiring $\tr(\bSigma) \geq  c_4 (n^{3/2}/\tr(\bSigma))\cdot \| \bmu \|_{\bSigma} \cdot \epsilon_{u}$ needs
% \begin{align*}
%     &\tr(\bSigma)^2 \geq c n^{5/2}  \| \bmu \|_{\bSigma} \|\bSigma \|_2,\\
%     &\tr(\bSigma)^2 \geq c n^2 \| \bmu \|_{\bSigma} \|\bSigma \|_F.
% \end{align*}
% We also require $\tr(\bSigma) \geq  c_5 n \sqrt{\log(n)} $. Summarizing the conditions above, we obtain the theorem result.


% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &\geq D^{-1} \bigg[ \bigg( 1 - \frac{n  + c_1 \sqrt{n\log( n)} }{  \tr( \bSigma )  - \epsilon_{u}  } \| \tilde\bmu \|_2 \bigg) \cdot \frac{ \tr( \bSigma ) - \sqrt{n} \epsilon_{u} }{\tr( \bSigma )^2  - \epsilon_{u}^2} - \frac{n}{ \tr( \bSigma )  - \epsilon_{u} } \cdot \frac{\sqrt{n}}{ \tr( \bSigma )  - \epsilon_{u} } \| \tilde\bmu \|_2\bigg] \\
%     &\geq D^{-1} \bigg[ c_2 \cdot \bigg( 1 - \frac{c_3 n  }{  \tr( \bSigma )    } \| \tilde\bmu \|_2 \bigg) \cdot \frac{ \tr( \bSigma ) - \sqrt{n} \epsilon_{u} }{\tr( \bSigma )^2} - \frac{n^{3/2}}{ \tr( \bSigma )^2  } \cdot  \| \tilde\bmu \|_2\bigg],
% \end{align*}
% where $c_2,c_3 > 0$ are absolute constants.
% Therefore when $ \tr( \bSigma ) \geq C n^{3/2} \| \tilde\bmu \|_2$ and $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{u}$ for some large enough constant $C$, we have
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i 
%     &\geq D^{-1} \bigg[ c_4 \cdot  \frac{ \tr( \bSigma ) - \sqrt{n} \epsilon_{u} }{\tr( \bSigma )^2} - \frac{n^{3/2}}{ \tr( \bSigma )^2  } \cdot  \| \tilde\bmu \|_2\bigg] > 0.
% \end{align*}
% Applying Lemma~\ref{lemma:equivalence} completes the proof.





\begin{remark}
Compared with the results of \citet{wang2020benign}, our result is applicable to a more general setting that allows anisotropic, non-Gaussian data and allows infinite-dimensional features. Under the setting when $\lambda_i = 1$ for $i\in[p]$, Theorem~\ref{thm:interpolationregression} recovers  \CC{(the corrected version of)} the results in \citet{wang2020benign}. 
\end{remark}










\subsection{Benign Overfitting}

\begin{lemma}\label{lemma:riskbound}
\todoy{This part may be improvable with a more careful calculation} Under the conditions of Theorem~\ref{thm:interpolationregression}, 
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  \| \bSigma\|_{2} \cdot \yb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg].
\end{align*}
\end{lemma}

The following lemma is summarized from \citet{wang2020benign}. 


% $s = \yb^\top \Ub^{-1} \yb$, $t = \bnu^\top \Ub^{-1} \bnu$, $h = \yb^\top \Ub^{-1} \bnu$

\begin{lemma}\label{lemma:riskboundcalculation}
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu) +  (\yb^\top \Ub^{-1} \bnu)^2 +  \yb^\top \Ub^{-1} \bnu \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu) +  (\yb^\top \Ub^{-1} \bnu + 1)^2 \big]}.
\end{align*}
\end{lemma}


\begin{lemma}\label{lemma:riskboundssimplify}
Suppose that  $ \tr( \bSigma ) \geq C n^{3/2} \| \bmu \|_{\bSigma}$ and $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{u}$
for some large enough absolute constant $C$. Then
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} \geq  \frac{ n\cdot \big(  \| \bmu \|_2^2  - 8 \| \bmu \|_{\bSigma} \big)^2 }{ 64 \big(  n \cdot \| \bmu \|_2^2 + \tr( \bSigma )  \big) }.
\end{align*}
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:riskboundssimplify}]
By Lemma~\ref{lemma:riskboundcalculation} we have
\begin{align}\label{eq:riskboundssimplify_eq1}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu) +  (\yb^\top \Ub^{-1} \bnu)^2 +  \yb^\top \Ub^{-1} \bnu \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu) +  (\yb^\top \Ub^{-1} \bnu + 1)^2 \big]}.
\end{align}
% By definition, we have
By Lemma~\ref{lemma:concentrationbounds}, with probability at least $1 - n^{-1}$, we have
\begin{align*}
    \| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu \geq  \| \bmu \|_2^2 -  \frac{n}{ 2 \tr(\bSigma)}  \| \bmu \|_{\bSigma}^2 \geq \| \bmu \|_2^2 - \frac{  \| \bSigma\|_{2}  n}{ 2\tr(\bSigma)}  \| \bmu \|_2^2 =  \frac{ 2 \tr(\bSigma) -   \| \bSigma\|_{2}  n}{2  \tr(\bSigma)}  \| \bmu \|_2^2 \geq \| \bmu \|_2^2 / 2.
\end{align*}
Plugging the above inequality into \eqref{eq:riskboundssimplify_eq1}, we have
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu) +  (\yb^\top \Ub^{-1} \bnu)^2 +  \yb^\top \Ub^{-1} \bnu \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1} \bnu) +  (\yb^\top \Ub^{-1} \bnu + 1)^2 \big]}\\
    &\geq  \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 / 2 - | \yb^\top \Ub^{-1} \bnu | \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 + 2 \big]}\\
    &\geq \frac{ \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 / 2 -  2n  \tr( \bSigma )^{-1}  \cdot \| \bmu \|_{\bSigma} \big]^2 }{ \yb^\top \Ub^{-1} \yb \cdot \big[ \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 + 2 \big]},
\end{align*}
where we utilize Lemma~\ref{lemma:concentrationbounds} to derive the condition 
\begin{align*}
    \yb^\top \Ub^{-1} \bnu \leq \frac{ n  + c \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma} \leq  \frac{ 2n  }{ \tr( \bSigma ) } \cdot \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1.
\end{align*}
% $\yb^\top \Ub^{-1} \bnu \leq \frac{ n  + c \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma} \leq  \frac{ 2n  }{ \tr( \bSigma ) } \cdot \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1$.
Applying the bound on $\yb^\top \Ub^{-1} \yb$ in Lemma~\ref{lemma:concentrationbounds}, we obtain %have $\yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^2 \leq \frac{n}{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \bmu \|_2^2 \leq 1$. Therefore

\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &\geq \frac{ \big( n \tr( \bSigma )^{-1} \cdot \| \bmu \|_2^2 / 4 -  2n  \tr( \bSigma )^{-1}  \cdot \| \bmu \|_{\bSigma} \big)^2 }{ 2n \tr( \bSigma )^{-1} \cdot \big( 2 n \tr( \bSigma )^{-1} \cdot \| \bmu \|_2^2 + 2 \big) } \\
    &=  \frac{ n\cdot \big(  \| \bmu \|_2^2 / 4 -    2 \| \bmu \|_{\bSigma} \big)^2 }{ 2 \tr( \bSigma )  \cdot \big( 2 n \tr( \bSigma )^{-1} \cdot \| \bmu \|_2^2 + 2 \big) } \\
    &=  \frac{ n\cdot \big(  \| \bmu \|_2^2  - 8 \| \bmu \|_{\bSigma} \big)^2 }{ 64 \big(  n \cdot \| \bmu \|_2^2 + \tr( \bSigma )  \big) }.
\end{align*}
This completes the proof. 

% \begin{align*}
%     \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} \geq  \frac{ (\yb^\top \Ub^{-1} \yb)^2 \cdot \| \bmu \|_2^4 / 4  }{ \yb^\top \Ub^{-1} \yb \cdot 3} = \yb^\top \Ub^{-1} \yb \cdot \| \bmu \|_2^4 / 12 \geq \frac{ n \| \bmu \|_2^4  }{24 \tr(\bSigma)},
% \end{align*}


% Denote $\overline\bmu = \Vb^\top \bmu$, then%$\tilde\bmu = \Lambda^{1/2} \Vb^\top \bmu$
% \begin{align*}
%     \| \bmu \|_2^2 = \| \overline\bmu \|_2^2  = \sum_{j=1}^p \lambda_j \overline\mu_j^2 / \lambda_j \geq 
% \end{align*}


\end{proof}






\begin{theorem}\label{thm:benignoverfitting}
Suppose that  $ \tr( \bSigma ) \geq C n^{3/2} \| \bmu \|_{\bSigma}$, $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{u}$, and $\| \bmu \|_2 \geq C \| \bSigma\|_{2} $ for some large enough absolute constant $C$. Then
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg(- \frac{1 }{256  \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
\end{align*}
\end{theorem}

\begin{proof}[Proof of Theorem~\ref{thm:benignoverfitting}]
The proof follows by a direct combination of Lemma~\ref{lemma:riskbound} and Lemma~\ref{lemma:riskboundssimplify}. 
\end{proof}

\begin{remark}
Theorem~\ref{thm:benignoverfitting} establishes a risk bound for interpolating classifiers. It implies the result of \citet{wang2020benign} when setting $\lambda_j = 1$, $j\in [p]$. \citet{chatterji2020finite} gives an $\exp(-\Omega( \| \bmu \|_2^4 / p ))$ risk bound under the conditions $p = \Omega( n \| \bmu \|_2^2 ) $ and $\EE( \| \qb \|_2^2) \geq \Omega(p)$. Under the same conditions, the bound in Theorem~\ref{thm:benignoverfitting} reduces to $\exp(-\Omega( n \| \bmu \|_2^4 / p ))$, which is clearly sharper than the bound in \citet{chatterji2020finite}. Moreover, our result also holds for more general settings without these conditions, and is applicable to the infinite-dimensional setting.




% Compared with \citet{chatterji2020finite} which requires $\EE( \| \qb \|_2^2) \geq \Omega(p)$, our result does not rely on this assumption and therefore works for the infinite-dimensional setting. Moreover, 


% Compared with the results in \citet{chatterji2020finite} and \citet{wang2020benign},
% Compared with the results of \citet{wang2020benign}, our result is applicable to a more general setting that allows anisotropic, non-Gaussian data and allow infinite-dimensional features. Under the setting when $\lambda_i = 1$ for $i\in[p]$, Theorem~\ref{thm:interpolationregression} recovers  \CC{( the corrected version of)} the results in \citet{wang2020benign}. 

\end{remark}







\section{Improving the Condition for Classification-Regression Equivalence}

We consider the isotropic Gaussian mixture setting.

\begin{lemma}[Lemma 2 in \citet{muthukumar2020classification}]\label{lemma:eigenvalueconcentration_isotropic}
For any vector $\ub\in S^{n-1}$, with probability at least $1 - 2n^{-2}$, 
\begin{align*}
    & \ub^\top \Ub^{-1} \ub \geq \frac{1}{d'(n) + \sqrt{4\ln(n) d'(n)} + 4\ln(n)}, \\
    & \ub^\top \Ub^{-1} \ub \leq \frac{1}{d'(n) - \sqrt{4\ln(n) d'(n)}} \leq \frac{1}{d'(n) - \sqrt{4\ln(n) d'(n)} - 4\ln(n)},
\end{align*}
where $d'(n) = d - n + 1$.
\end{lemma}

Denote $\epsilon_{u} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$. Then by Lemma \ref{lemma:eigenvalueconcentration_isotropic}, we have
\begin{align}\label{eq:isotropic_eigenvalueconcentration}
    \frac{1}{d'(n) - \epsilon_{u}} \geq \ub^\top \Ub^{-1} \ub \geq \frac{1}{d'(n) + \epsilon_{u}}.
\end{align}



\begin{lemma}\label{lemma:isotropicbound1}
With probability at least $1 - 4n^{-1}$, 
\begin{align*}
    \big| \bmu^\top\Qb^\top \Ub^{-1} \eb_i y_i \big| \leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }
\end{align*}
for all $i\in [n]$, where $d'(n) = d - n + 1$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropicbound1}] 
\begin{align}
    \bmu^\top\Qb^\top\Ub^{-1} \eb_i y_i &= \frac{1}{\| \Qb\bmu \|_2}\cdot (\Qb\bmu)^\top \Ub^{-1} (\| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & = \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    &\quad - \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ub^{-1} (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)- \epsilon_{u}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)+ \epsilon_{u}  }     \bigg] \nonumber\\
    &= \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{2 \| \Qb\bmu \|_2^2 + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)- \epsilon_{u}  } - \frac{2\| \Qb\bmu \|_2^2  - 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)+ \epsilon_{u}  } \bigg] \nonumber \\
    &= \frac{1}{2\| \Qb\bmu \|_2}\cdot \frac{2\| \Qb\bmu \|_2^2 \cdot \epsilon_{u} + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }\nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{u} +  y_i \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  },\label{eq:fgbounds_anisotropic_eq1}
\end{align}
where the second equality holds due to the parallelogram law $\ab^\top \Mb \bbb = 1/4(\ab+\bbb)^\top \Mb (\ab+\bbb) - 1/4(\ab-\bbb)^\top \Mb (\ab-\bbb)$, and the first inequality follows by \eqref{eq:isotropic_eigenvalueconcentration}.
Under the isotropic Gaussian mixture setting, $\eb_j^\top \Qb\bmu $, $j=1,\ldots, n$ are i.i.d. Gaussian random variables with variance $\| \bmu \|_2^2$. Therefore by Bernstein's inequality, with probability at least $1 - n^{-1}$ we have
\begin{align*}
    \big| \| \Qb\bmu \|_2^2 - n \| \bmu \|_2^2 \big| \leq  n\cdot c_1 \| \bmu \|_2^2 \sqrt{\frac{\log(n)}{n}} = c_1 \| \bmu \|_2^2 \sqrt{n\log(n)}, 
\end{align*}
where $c_1$ is an absolute constant. 
Therefore when $n$ is large enough, $\| \Qb\bmu \|_2^2 \leq 2n \| \bmu \|_2^2$. Similarly, by Gaussian tail bound and union bound, with probability at least $1 - n^{-1}$, 
\begin{align*}
    | \eb_i^\top \Qb\bmu | \leq c_2\sqrt{\log(n)}
\end{align*}
for all $i\in[n]$, where $c_2$ is an absolute constant. Therefore we have
\begin{align*}
    \bnu^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }.
\end{align*}
With the exact same proof, we also have
\begin{align*}
    -\bnu^\top \Ub^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  }.
\end{align*}
This completes the proof.
\end{proof}








\begin{lemma}\label{lemma:isotropicbound2}
With probability at least $1 - 2n^{-1}$, 
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i  \geq \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2}
\end{align*}
for all $i\in [n]$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropicbound2}]
The proof is very similar to the proof of Lemma~\ref{lemma:isotropicbound1}. We have
\begin{align*}
    \yb^\top \Ub^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ub^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  + \epsilon_{u}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  - \epsilon_{u}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{d'(n)  + \epsilon_{u}} - \frac{2n - 2\sqrt{n}}{d'(n)  - \epsilon_{u}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (d'(n)  - \epsilon_{u}) - (n - \sqrt{n}) (d'(n)  + \epsilon_{u}) }{d'(n)^2  - \epsilon_{u}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} d'(n) - 2n \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2}\\
     & = \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2},
\end{align*}
where we use the parallelogram law $\ab^\top \Mb \bbb = 1/4(\ab+\bbb)^\top \Mb (\ab+\bbb) - 1/4(\ab-\bbb)^\top \Mb (\ab-\bbb)$ in the first equality and use \eqref{eq:isotropic_eigenvalueconcentration} in the first inequality. This completes the proof.
\end{proof}


\begin{lemma}\label{lemma:isotropic_equivalencecondition}
Suppose that $d > C n \log(n)$ and $d >  C n \| \bmu \|_2$ for some large enough absolute constant $C$. Then with probability at least $1 - 4n^{-1}$, $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i> 0$ for all $i \in [n]$, and therefore hard margin SVM equals the minimum norm interpolator. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropic_equivalencecondition}]
By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}].
\end{align*}
Moreover, by Lemma~4 in \citet{wang2020benign}, if $d = \Omega(n \| \bmu \|_2) $ and $d = \Omega(n) $, then with probability at least $1 - n^{-1}$, $\yb^\top \Ub^{-1} \bnu \geq -1/2$, $\yb^\top \Ub^{-1} \yb \leq c_1 n/d$ for some absolute constant $c_1$.
Therefore, as long as $d'(n) > \epsilon_{u}$,
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &= D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}\eb_i y_i ] \\
    &\propto  ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \eb_i y_i  - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}\eb_i y_i\\
    &\geq (1/2)\cdot \yb^\top \Ub^{-1} \eb_i y_i  - c_1 (n/d)\cdot |\bnu^\top \Ub^{-1}\eb_i y_i|\\
    &\geq  \frac{1}{2} \cdot \frac{ d'(n) - \sqrt{n} \epsilon_{u} }{d'(n)^2  - \epsilon_{u}^2} -  \frac{c_1n}{d}\cdot \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{u} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{u}^2  },
\end{align*}
where $c_1,c_2$ are absolute constants. 
Therefore $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i >0$ if there exists a large enough constant $c_3$ such that
\begin{align*}
    &d'(n) > c_3 \sqrt{n} \epsilon_{u},\\
    &d'(n) > c_3 (n^{3/2} / d) \cdot  \| \bmu \|_2 \cdot \epsilon_{u},\\
    &d'(n) > c_3 (n/d)\cdot \sqrt{\log(n)} \cdot d'(n).
\end{align*}
Plugging in the definitions $d'(n) = d - n + 1$ and 
$\epsilon_{u} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$, we see that the above conditions are satisfied when $d > c_4 n \log(n)$ and $d >  c_4 n \| \bmu \|_2$ for some large enough absolute constant $c_4$. This completes the proof. 
\end{proof}



\section{Some Calculations Regarding Lemma~\ref{lemma:riskbound}}



\begin{theorem}\label{thm:BOnew}
Suppose that $\tr( \bSigma ) \geq C \max\{ \epsilon_{u} , n, n \|\bSigma \|_2 , n \| \bmu \|_{\bSigma} \}$ and $\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ for some large enough constant $C$. Then when $n$ is large enough, with probability at least $1 - n^{-1}$, 
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq  \exp\bigg[ - \frac{  C' n\cdot \| \bmu \|_2^4  }{  n \cdot \| \bmu \|_{\bSigma}^2+ \tr(\bSigma^2) +  n\cdot \| \bSigma\|_2^2  + n^{1/2}\cdot \| \bSigma^2\|_F} \bigg].
\end{align*}
\end{theorem}

\begin{remark}
Theorem~\ref{thm:BOnew} is stronger than Theorem~\ref{thm:benignoverfitting} where the bound is 
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg(- \frac{1 }{256  \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
\end{align*}
By $\tr(\bSigma^2) \leq \| \bSigma \|_2\cdot \tr(\bSigma)$, $\| \bSigma^2\|_F \leq  \| \bSigma \|_2\cdot \| \bSigma\|_F$, $\| \bmu \|_{\bSigma}^2 \leq \| \bSigma \|_2  \| \bmu \|_2^2$ and the assumption $ \tr(\bSigma) > C\max\{ n\cdot  \| \bSigma \|_2, \sqrt{n}\cdot  \|\bSigma \|_F\}$, we obtain from Theorem~\ref{thm:BOnew} that
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) &\leq  \exp\bigg[ - \frac{  C'' n\cdot \| \bmu \|_2^4  }{  n \cdot\| \bSigma \|_2  \| \bmu \|_2^2+  \| \bSigma \|_2\cdot \tr(\bSigma)} \bigg]\\
    &\leq \exp\bigg(- \frac{C''' }{ \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
\end{align*}
Therefore the bound in Theorem~\ref{thm:BOnew} can be much tighter, e.g., when $d$ is large, $ \tr(\bSigma)$ can be close to infinity while $ \tr(\bSigma^2)$ can be bounded by a constant.
\end{remark}

% \begin{remark}
% The Bayes optimal 
% \end{remark}


\begin{proof}[Proof of Theorem~\ref{thm:BOnew}]


We have
\begin{align}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg].\label{eq:refinedBOderivation_eq0}
\end{align}


In the following, we will give a lower bound of the numerator and an upper bound of the denominator in the exponential term of \eqref{eq:refinedBOderivation_eq0} respectively. We first give a lower bound for $[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2$. By Lemma~\ref{lemma:matrixcalculation} and the notation $\Xb = \yb \bmu^\top + \Qb$, we have
\begin{align}
    \yb^\top (\Xb\Xb^\top)^{-1}  \Xb \bmu &= D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}] (\yb \bmu^\top + \Qb)\bmu \nonumber\\
    &= D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}] (\yb \cdot \| \bmu \|_2^2 + \Qb\bmu) \nonumber\\
    & = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \yb - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1} \yb] \cdot \| \bmu \|_2^2 \nonumber \\
    & \qquad + D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1}\bnu - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}\bnu] \nonumber\\
    & = D^{-1}\cdot [ (\| \bmu \|_2^2 - \bnu^\top \Ub^{-1}\bnu) \yb^\top \Ub^{-1} \yb  + ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1}\bnu  ],\label{eq:refinedBO_numerator_eq0}
\end{align}
where the third equality follows by the notation $\bnu = \Qb\bmu$.
By Lemma~\ref{lemma:concentrationbounds} and the assumption that \CC{$\tr( \bSigma ) \geq C \max\{ \epsilon_{u} , n, n \|\bSigma \|_2 , n \| \bmu \|_{\bSigma} \}$} for some large enough constant $C$, for large enough $n$ we have
\begin{align*}
     &| \yb^\top \Ub^{-1} \bnu | \leq \frac{ n   }{  \tr( \bSigma )  - \epsilon_{u}  } \| \bmu \|_{\bSigma} \leq \frac{ 2n   }{  \tr( \bSigma ) } \| \bmu \|_{\bSigma} \leq 1,\\
     &0\leq \bnu^\top \Ub^{-1}\bnu \leq \frac{ n  + c_1 \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{u} } \cdot \| \bmu \|_{\bSigma}^2 \leq \frac{ 2n }{ \tr( \bSigma ) } \cdot \| \bmu \|_{\bSigma}^2 \leq \frac{ 2n \| \bSigma \|_2 }{ \tr( \bSigma ) } \cdot \| \bmu \|_2^2 \leq \frac{1}{2}\cdot  \| \bmu \|_2^2 ,\\
     & \yb^\top \Ub^{-1} \yb \geq \frac{n}{ \tr( \bSigma )  + \epsilon_{u} } \geq \frac{n}{ 2\tr( \bSigma ) }.
\end{align*}
Plugging the bounds above into \eqref{eq:refinedBO_numerator_eq0}, we obtain
\begin{align*}
    | \yb^\top (\Xb\Xb^\top)^{-1}  \Xb \bmu | &\geq D^{-1}\cdot  \bigg(\frac{1}{2}\cdot\| \bmu \|_2^2\cdot \yb^\top \Ub^{-1} \yb  - 2\cdot   |\yb^\top \Ub^{-1}\bnu | \bigg)\\
    &\geq D^{-1}\cdot \bigg[ \frac{n}{ 4\tr( \bSigma ) } \cdot \| \bmu \|_2^2 - \frac{ 4n   }{  \tr( \bSigma ) } \| \bmu \|_{\bSigma} \bigg]\\
    &\geq D^{-1}\cdot \frac{n}{ 4\tr( \bSigma ) } \cdot  ( \| \bmu \|_2^2 -16 \| \bmu \|_{\bSigma} )\\
    &\geq D^{-1}\cdot \frac{n}{ 4\tr( \bSigma ) } \cdot  ( \| \bmu \|_2^2 -16 \|\bSigma \|_2 \| \bmu \|_2 )\\
    &\geq D^{-1}\cdot \frac{n}{ 8\tr( \bSigma ) } \cdot  \| \bmu \|_2^2,
\end{align*}
where the last inequality follows by the assumption that \CC{$\| \bmu \|_2 \geq C \| \bSigma\|_{2} $ for some large enough absolute constant $C$}.
Therefore we have
\begin{align}\label{eq:refinedBO_numerator_bound}
    [\yb^\top (\Xb\Xb^\top)^{-1}  \Xb \bmu]^2 \geq D^{-2}\cdot \frac{n^2}{ 64 [\tr( \bSigma )]^2 } \cdot   \| \bmu \|_2^4 .
\end{align}

This gives a lower bound on the numerator. In the following we derive an upper bound for the denominator $\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb$. By definition, $\Xb = \yb \bmu^\top + \Qb$. Therefore
\begin{align}
    &\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb \nonumber \\
    &\qquad\leq 2\cdot \underbrace{(\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \bmu^\top \bSigma \bmu}_{I_1} + 2\cdot \underbrace{ \yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb}_{I_2}\label{eq:refinedBO_denominator_eq0}
\end{align}

% The key is then to give upper bound of the term $\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb$. 



For $I_1$, by Lemma~\ref{lemma:matrixcalculation} we have \todoy{change notation $D$}
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \yb &=  D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \yb- \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1} \yb] \\
    & = D^{-1} [ \yb^\top \Ub^{-1} \yb + \yb^\top \Ub^{-1} \bnu \cdot \yb^\top \Ub^{-1} \yb- \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1} \yb] \\
    &= D^{-1} \cdot \yb^\top \Ub^{-1} \yb\\
    &\leq D^{-1}  \cdot \frac{n}{ \tr( \bSigma )  - \epsilon_{u} }\\
    &\leq  2D^{-1}  \cdot \frac{n}{ \tr( \bSigma )},
\end{align*}
where the first inequality follows by Lemma~\ref{lemma:concentrationbounds}, and the second inequality follows by the assumption that $\tr( \bSigma ) \geq C \epsilon_{u}$ for some large enough constant $C$. Therefore we have
\begin{align}
    I_1 =  (\yb^\top (\Xb\Xb^\top)^{-1} \yb)^2 \bmu^\top \bSigma \bmu \leq  4D^{-2}  \cdot \frac{n^2 \cdot \| \bmu \|_{\bSigma}^2}{ [\tr( \bSigma )]^2}. \label{eq:refinedBOderivation_I1bound} 
\end{align}



For $I_2$, by our calculation in Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb - \yb^\top \Ub^{-1} \yb \cdot \bnu]^\top \Ub^{-1}.
\end{align*}
Denote $\ab = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb - \yb^\top \Ub^{-1} \yb \cdot \bnu ]$. Then
\begin{align}
    I_2 &= \yb^\top (\Xb\Xb^\top)^{-1} \Qb \bSigma \Qb^\top (\Xb\Xb^\top)^{-1} \yb \nonumber \\
    &= \ab^\top (\Qb\Qb^\top)^{-1}\Qb\bSigma\Qb^\top (\Qb\Qb^\top)^{-1} \ab \nonumber \\
    & = \ab^\top (\Zb\bLambda \Zb^\top)^{-1} \Zb\bLambda^2 \Zb^\top (\Zb\bLambda \Zb^\top)^{-1} \ab, \label{eq:refinedBOderivation_eq1}
\end{align}
where we plug in $\Qb = \Zb \bLambda^{1/2} \Vb^\top$ for $\Zb$ with independent sub-Gaussian entries.


With exactly the same proof as Lemma~\ref{lemma:eigenvalue_concentration}, we have the following lemma.



\begin{lemma}\label{lemma:eigenvalue_concentration2}
    With probability at least $1 - 1/n$,
    \begin{align*}
    \big\| \Zb\bLambda^2 \Zb^\top   - \tr(\bSigma^2) \Ib \big\|_2 \leq \epsilon_{u}' := C \sigma_q^2  \big( n\cdot \| \bSigma\|_2^2  + \sqrt{n}\cdot \| \bSigma^2\|_F \big),
\end{align*}
where $C$ is an absolute constant.
\end{lemma}

By Lemma~\ref{lemma:eigenvalue_concentration}, Lemma~\ref{lemma:eigenvalue_concentration2} and \eqref{eq:refinedBOderivation_eq1}, when $\tr(\bSigma) \geq \epsilon_{u}$ we have
\begin{align}
    I_2&= \ab^\top (\Zb\bLambda \Zb^\top)^{-1} \Zb\bLambda^2 \Zb^\top (\Zb\bLambda \Zb^\top)^{-1} \ab \nonumber\\
    & \leq \ab^\top (\Zb\bLambda \Zb^\top)^{-2} \ab \cdot \big[\tr(\bSigma^2) + \epsilon_{u}'\big] \nonumber\\
    & \leq \| \ab \|_2^2\cdot \frac{\tr(\bSigma^2) + \epsilon_{u}'}{ [\tr(\bSigma) - \epsilon_{u}]^2}.\label{eq:refinedBOderivation_eq1.5}
\end{align}
Here the first inequality follows by Lemma~\ref{lemma:eigenvalue_concentration2}, and the second inequality follows by Lemma~\ref{lemma:eigenvalue_concentration}. 
By definition, we have
\begin{align}
    \| \ab \|_2^2 & = \| D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb - \yb^\top \Ub^{-1} \yb \cdot \bnu ] \|_2^2 \nonumber \\
    & \leq 2 D^{-2} ( 1 + \yb^\top \Ub^{-1} \bnu )^2 \| \yb \|_2^2 + 2  D^{-2} ( \yb^\top \Ub^{-1} \yb  )^2\cdot  \|  \Qb \bmu\|_2^2. \nonumber
    % \label{eq:refinedBOderivation_eq2}
\end{align}

Then with the same proof as in Lemma~\ref{lemma:concentrationbounds}, when $n$ is sufficiently large, with probability at least $1- n^{-1}$ we have 
\begin{align*}
      \| \Qb \bmu \|_2^2 \leq  2n \| \bmu \|_{\bSigma}^2.
\end{align*}
Therefore we have
\begin{align}
    \| \ab \|_2^2 & \leq 2 D^{-2} ( 1 + \yb^\top \Ub^{-1} \bnu )^2 \| \yb \|_2^2 + 2  D^{-2} ( \yb^\top \Ub^{-1} \yb  )^2\cdot  \|  \Qb \bmu\|_2^2 \nonumber\\
    & \leq 2 D^{-2} ( 1 + \yb^\top \Ub^{-1} \bnu )^2 \cdot n + 4  D^{-2} ( \yb^\top \Ub^{-1} \yb  )^2\cdot  n \cdot \| \bmu \|_{\bSigma}^2. \label{eq:refinedBOderivation_eq2}
\end{align}


Moreover, by Lemma~\ref{lemma:concentrationbounds} and the assumption that \CC{$\tr( \bSigma ) \geq C \max\{ \epsilon_{u} , n, n \| \bmu \|_{\bSigma} \}$} for some large enough constant $C$, we have
\begin{align*}
    &|\yb^\top \Ub^{-1} \bnu| \leq \frac{c_2 n   }{  \tr( \bSigma )  - \epsilon_{u}  } \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1, \\
    &  \yb^\top \Ub^{-1} \yb \leq \frac{n}{ \tr( \bSigma )  - \epsilon_{u} } \leq \frac{2n}{\tr( \bSigma )},
\end{align*}
where $c_2$ is an absolute constant. Plugging the above bounds into \eqref{eq:refinedBOderivation_eq2}, we obtain
\begin{align*}
    \| \ab \|_2^2 & \leq 2 D^{-2} ( 1 + \yb^\top \Ub^{-1} \bnu )^2 \cdot n + 4  D^{-2} ( \yb^\top \Ub^{-1} \yb  )^2\cdot  n \cdot \| \bmu \|_{\bSigma}^2\\
    &\leq 4 D^{-2} \cdot n +  16 D^{-2} \cdot n  \cdot \bigg[\frac{n}{\tr( \bSigma )} \cdot \| \bmu \|_{\bSigma}\bigg]^2\\
    &\leq 5 D^{-2} \cdot n,
\end{align*}
where the last inequality utilizes the assumption  \CC{$\tr( \bSigma ) \geq C n \| \bmu \|_{\bSigma} $} for some large enough constant $C$ again. Further plugging this bound into \eqref{eq:refinedBOderivation_eq1.5}, we obtain
\begin{align}
    I_2 &\leq \| \ab \|_2^2\cdot \frac{\tr(\bSigma^2) + \epsilon_{u}'}{ [\tr(\bSigma) - \epsilon_{u}]^2} \leq 5 D^{-2} n \cdot \frac{\tr(\bSigma^2) + \epsilon_{u}'}{ [\tr(\bSigma) - \epsilon_{u}]^2} \nonumber\\
    &\leq c_3 D^{-2}\cdot  \frac{ n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2},\label{eq:refinedBOderivation_I2bound}
\end{align}
where $c_3$ is an absolute constant. Here the last inequality follows by the definition of $\epsilon_{u}'$ in Lemma~\ref{lemma:eigenvalue_concentration2} and the assumption that $\tr(\bSigma) \geq C \epsilon_{u}$ for some large enough constant $C$. 

Plugging the bounds \eqref{eq:refinedBOderivation_I1bound} and \eqref{eq:refinedBOderivation_I2bound} into \eqref{eq:refinedBO_denominator_eq0}, we obtain
\begin{align}
    &\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb \nonumber  \\
    &\qquad \leq 8D^{-2}  \cdot \frac{n^2 \cdot \| \bmu \|_{\bSigma}^2}{ [\tr( \bSigma )]^2} + c_4\cdot D^{-2}\cdot  \frac{ n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2} \nonumber\\
    &\qquad \leq c_5 D^{-2}  \cdot  \frac{ n^2 \cdot \| \bmu \|_{\bSigma}^2+  n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2} \label{eq:refinedBO_denominator_bound}
\end{align}
Finally, plugging the bounds \eqref{eq:refinedBO_numerator_bound} and \eqref{eq:refinedBO_denominator_bound} into \eqref{eq:refinedBOderivation_eq0}, we obtain
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) &\leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg]\\
    &\leq \exp\bigg[ - \frac{ D^{-2}\cdot \frac{n^2}{ 64 [\tr( \bSigma )]^2 } \cdot   \| \bmu \|_2^4  }{  c_6 D^{-2}  \cdot  \frac{ n^2 \cdot \| \bmu \|_{\bSigma}^2+  n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2}} \bigg]\\
    &\leq  \exp\bigg[ - \frac{  c_7 n\cdot \| \bmu \|_2^4  }{  n \cdot \| \bmu \|_{\bSigma}^2+ \tr(\bSigma^2) +  n\cdot \| \bSigma\|_2^2  + n^{1/2}\cdot \| \bSigma^2\|_F} \bigg].
    % \\
    % &\leq \exp\bigg[ - c_8 \max\bigg\{\frac{\| \bmu \|_2^2}{\| \bSigma\|_2} , \frac{  n\cdot \| \bmu \|_2^4  }{ \tr(\bSigma^2) +  n\cdot \| \bSigma\|_2^2  + n^{1/2}\cdot \| \bSigma^2\|_F} \bigg\} \bigg]
\end{align*}
This completes the proof.

\end{proof}
% \begin{proof}[Proof of Lemma~\ref{lemma:riskboundcalculation}]
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}
% \end{align*}

% By definition, we have $\Xb = \yb \bmu^\top + \Qb$, and therefore
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu = \yb^\top (\Xb\Xb^\top)^{-1} (\yb \bmu^\top + \Qb) \bmu = \| \bmu \|_2^2 \cdot \yb^\top (\Xb\Xb^\top)^{-1} \yb 
%     + \yb^\top (\Xb\Xb^\top)^{-1}  \Qb \bmu.
% \end{align*}
% Hence we have
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} =  \| \bmu \|_2^2 + \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}.
% \end{align*}
% By Lemma~\ref{lemma:matrixcalculation}, we have
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1}].
% \end{align*}
% Therefore
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &=  \| \bmu \|_2^2 + \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}\\
%     &=  \| \bmu \|_2^2 + \frac{( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \bnu - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1} \bnu }{( 1 + \yb^\top \Ub^{-1} \bnu ) \yb^\top \Ub^{-1} \yb - \yb^\top \Ub^{-1} \yb \cdot \bnu^\top \Ub^{-1} \yb}
% \end{align*}
% \end{proof}






\bibliography{deeplearningreference}
\bibliographystyle{ims}

\end{document}
