


\section{Main Results}




% Denote $\Ab = \Qb\Qb^\top$, $\bnu = \Qb\bmu$. 





Denote by $ \| \bSigma\|_{2} ,\lambda_2,\ldots, \lambda_p$ the eigenvalues of $\bSigma$ in descending order, with corresponding eigenvectors $\vb_1,\ldots,\vb_p$. Denote $\bLambda = \diag\{  \| \bSigma\|_{2} ,\lambda_2,\ldots, \lambda_p \}$  and  $\Vb = [ \vb_1,\ldots,\vb_p ]$; then $\bSigma = \Vb \bLambda \Vb^\top$. We also denote by $\mu_1(\Ab) \geq \cdots \geq \mu_n(\Ab)$ the eigenvalues of $\Ab$ in descending order.
 

% The following lemma is given by Lemma 26 in \citet{bartlett2020benign}.

% \begin{lemma}\label{lemma:eigenvalue_concentration}
% There exists an absolute constant $c$ such that, with probability at least $1 - 2\exp(-n/c)$,
% \begin{align*}
%     c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} \leq \mu_n(\Ab) \leq \mu_1(\Ab) \leq c \sum_{i} \lambda_i + c \| \bSigma\|_{2}  n.
% \end{align*}

% \end{lemma}


% The following lemma (Lemma 1 in \citet{muthukumar2020classification}) gives another estimate on the eigenvalues of $\Ab$.


% \begin{lemma}\label{lemma:eigenvalue_concentration2}
%     With probability at least $1 - 2/n$, 
%     \begin{align*}
%         \Big\| \Ab - d'(n) \Ib \Big\|_2 \leq \epsilon_{\lambda} := \frac{1}{12\sqrt{n}} \tr(\bSigma) + 4\sqrt{n\log(n)}\cdot  \| \bSigma\|_{F}  + 8n\log(n)\cdot  \| \bSigma\|_{2} .
%     \end{align*}
% \end{lemma}
% \todoq{It seems that the $\|\lambda\|_1$ term on the right hand size does not appear in Peter's paper}
% \todoq{Is Peter's bound tighter than this one?}


















% \begin{lemma}\label{lemma:concentrationbounds}
% With probability at least $1 - 2/n$,
% \begin{align*}
%     \frac{n}{ c \sum_{i} \lambda_i + c n \cdot \| \bSigma\|_{2} } \leq  \yb^\top \Ab^{-1} \yb \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} }
% \end{align*}

% \begin{align*}
%     \frac{n}{ c \sum_{i} \lambda_i + c n \cdot \| \bSigma\|_{2} } \cdot \| \bmu \|_{\bSigma}^2 \leq  \bnu^\top \Ab^{-1} \bnu \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} } \cdot \| \bmu \|_{\bSigma}^2
% \end{align*}


% \begin{align*}
%     |\yb^\top \Ab^{-1} \bnu| \leq \frac{n}{ c^{-1} \sum_{i} \lambda_i - c n \cdot \| \bSigma\|_{2} }  \| \bmu \|_{\bSigma}
% \end{align*}

% % \begin{align*}
% %     \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} } \leq  \yb^\top \Ab^{-1} \yb \leq \frac{n}{ \tr( \bSigma )  + \epsilon_{\lambda} }
% % \end{align*}

% % \begin{align*}
% %     \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \tilde\bmu \|_2 \leq  \bnu^\top \Ab^{-1} \bnu \leq \frac{n}{ \tr( \bSigma )  + \epsilon_{\lambda} } \cdot \| \tilde\bmu \|_2
% % \end{align*}



% \end{lemma}



% The following lemma gives bounds on $f_i = \bnu^\top \Ab^{-1} \eb_i y_i$, $g_i = \yb^\top \Ab^{-1} \eb_i y_i$. \todoq{what is the intuition of this lemma?}


% The following lemma is summarized form Lemma 1 in \citet{muthukumar2020classification}.








% \begin{proof}[Proof of Lemma]
% \begin{align*}
%      n \| \tilde\bmu \|_2 + c \| \tilde\bmu \|_2\cdot \sqrt{n\log( n)} 
% \end{align*}
% \end{proof}


% \begin{lemma}\label{lemma:asymmetricbounds}
% For any vector $\ub \in S^{n-1}$, 
% \end{lemma}

% \begin{proof}[Proof of Lemma~\ref{lemma:asymmetricbounds}]
% Denote $\ab_i = \sqrt{n} \eb_i y_i$. Then we have
% \begin{align*}
%     \yb^\top \Ab^{-1} \eb_i y_i = n^{-1/2} \yb^\top \Ab^{-1} \ab_i = \frac{1}{4\sqrt{n}} (\yb + \ab_i)^\top \Ab^{-1} (\yb + \ab_i) - \frac{1}{4\sqrt{n}} (\yb - \ab_i)^\top \Ab^{-1} (\yb - \ab_i).
% \end{align*}
% Note that by definition, $\| \yb + \ab_i \|_2^2 = $
% \end{proof}






% \begin{lemma}\label{lemma:anisotropicbound1}
% With probability at least $1 - 4n^{-1}$, 
% \begin{align*}
%     \big| \bmu^\top\Qb^\top \Ab^{-1} \eb_i y_i \big| \leq \frac{\sqrt{2n} \| \bmu \|_{\bSigma} \cdot \epsilon_{\lambda} +  C\| \bmu \|_{\bSigma} \sqrt{\log(n)} \cdot \tr(\bSigma)}{ \tr(\bSigma)^2 - \epsilon_{\lambda}^2  }.
% \end{align*}
% for all $i\in [n]$, where $C$ is an absolute constant. 
% \end{lemma}















% \begin{theorem}[Old]
% There exists a large enough absolute constant $C$ such that when $ \tr( \bSigma ) \geq C n^{3/2} \|\bmu \|_{\bSigma}$ and $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{\lambda}$, $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
% \end{theorem}


% \begin{theorem}[old2]
% There exists a large enough absolute constant $C$ such that when 
% \begin{align*}
%     &\tr( \bSigma ) \geq C n \| \bmu \|_{\bSigma},\\
%     &\tr(\bSigma)^2 \geq Cn^{5/2}  \| \bmu \|_{\bSigma} \|\bSigma \|_2, \\
%     &\tr(\bSigma)^2 \geq C n^2 \| \bmu \|_{\bSigma} \|\bSigma \|_F, \\
%     &\tr( \bSigma ) \geq C  n\sqrt{\log(n)},
% \end{align*}
% $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
% \end{theorem}



% \begin{theorem}[New]\label{thm:interpolationregression}
% There exists a large enough absolute constant $C$ such that when $\tr( \bSigma ) \geq C n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F )$ and $\tr( \bSigma ) \geq C n\sqrt{\log(n)}\cdot \| \bmu \|_{\bSigma} )$, 
% $\hat\btheta_{\text{SVM}}$ also solves the least square problem.
% \end{theorem}
% \begin{align*}
%     &\tr( \bSigma ) \geq C n \| \bmu \|_{\bSigma},\\
%     &\tr(\bSigma)^2 \geq Cn^{5/2}  \| \bmu \|_{\bSigma} \|\bSigma \|_2, \\
%     &\tr(\bSigma)^2 \geq C n^2 \| \bmu \|_{\bSigma} \|\bSigma \|_F, \\
%     &\tr( \bSigma ) \geq C  n\sqrt{\log(n)},
% \end{align*}

% \begin{remark}
% The condition $\tr( \bSigma ) = \Omega( n \sqrt{n} \cdot \| \bSigma \|_2 + n\cdot \| \bSigma \|_F  +  n\sqrt{\log(n)})$ in  Theorem~\ref{thm:interpolationregression} coincides with the condition given by \citet{muthukumar2020classification} for the anisotropic setting. 
% \end{remark}





% For our setting here, requiring $\tr(\bSigma) \geq  c_4 (n^{3/2}/\tr(\bSigma))\cdot \| \bmu \|_{\bSigma} \cdot \epsilon_{\lambda}$ needs
% \begin{align*}
%     &\tr(\bSigma)^2 \geq c n^{5/2}  \| \bmu \|_{\bSigma} \|\bSigma \|_2,\\
%     &\tr(\bSigma)^2 \geq c n^2 \| \bmu \|_{\bSigma} \|\bSigma \|_F.
% \end{align*}
% We also require $\tr(\bSigma) \geq  c_5 n \sqrt{\log(n)} $. Summarizing the conditions above, we obtain the theorem result.


% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &\geq D^{-1} \bigg[ \bigg( 1 - \frac{n  + c_1 \sqrt{n\log( n)} }{  \tr( \bSigma )  - \epsilon_{\lambda}  } \| \tilde\bmu \|_2 \bigg) \cdot \frac{ \tr( \bSigma ) - \sqrt{n} \epsilon_{\lambda} }{\tr( \bSigma )^2  - \epsilon_{\lambda}^2} - \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \frac{\sqrt{n}}{ \tr( \bSigma )  - \epsilon_{\lambda} } \| \tilde\bmu \|_2\bigg] \\
%     &\geq D^{-1} \bigg[ c_2 \cdot \bigg( 1 - \frac{c_3 n  }{  \tr( \bSigma )    } \| \tilde\bmu \|_2 \bigg) \cdot \frac{ \tr( \bSigma ) - \sqrt{n} \epsilon_{\lambda} }{\tr( \bSigma )^2} - \frac{n^{3/2}}{ \tr( \bSigma )^2  } \cdot  \| \tilde\bmu \|_2\bigg],
% \end{align*}
% where $c_2,c_3 > 0$ are absolute constants.
% Therefore when $ \tr( \bSigma ) \geq C n^{3/2} \| \tilde\bmu \|_2$ and $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{\lambda}$ for some large enough constant $C$, we have
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i 
%     &\geq D^{-1} \bigg[ c_4 \cdot  \frac{ \tr( \bSigma ) - \sqrt{n} \epsilon_{\lambda} }{\tr( \bSigma )^2} - \frac{n^{3/2}}{ \tr( \bSigma )^2  } \cdot  \| \tilde\bmu \|_2\bigg] > 0.
% \end{align*}
% Applying Lemma~\ref{lemma:equivalence} completes the proof.





\begin{remark}
Compared with the results of \citet{wang2020benign}, our result is applicable to a more general setting that allows anisotropic, non-Gaussian data and infinite-dimensional features. In the setting where $\lambda_i = 1$ for all $i\in[p]$, Theorem~\ref{thm:interpolationregression} recovers \CC{(the corrected version of)} the results in \citet{wang2020benign}. 
\end{remark}










\subsection{Benign Overfitting}

\begin{lemma}\label{lemma:riskbound}
Under the conditions of Theorem~\ref{thm:interpolationregression}, 
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  \| \bSigma\|_{2} \cdot \yb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg].
\end{align*}
\end{lemma}

The following lemma is summarized from \citet{wang2020benign}. 


% $s = \yb^\top \Ab^{-1} \yb$, $t = \bnu^\top \Ab^{-1} \bnu$, $h = \yb^\top \Ab^{-1} \bnu$

\begin{lemma}\label{lemma:riskboundcalculation}
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ab^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu) +  (\yb^\top \Ab^{-1} \bnu)^2 +  \yb^\top \Ab^{-1} \bnu \big]^2 }{ \yb^\top \Ab^{-1} \yb \cdot \big[ \yb^\top \Ab^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu) +  (\yb^\top \Ab^{-1} \bnu + 1)^2 \big]}.
\end{align*}
\end{lemma}


\begin{lemma}\label{lemma:riskboundssimplify}
Suppose that $\tr( \bSigma ) \geq C n^{3/2} \| \bmu \|_{\bSigma}$ and $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{\lambda}$
for some large enough absolute constant $C$. Then
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} \geq  \frac{ n\cdot \big(  \| \bmu \|_2^2  - 8 \| \bmu \|_{\bSigma} \big)^2 }{ 64 \big(  n \cdot \| \bmu \|_2^2 + \tr( \bSigma )  \big) }.
\end{align*}
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:riskboundssimplify}]
By Lemma~\ref{lemma:riskboundcalculation} we have
\begin{align}\label{eq:riskboundssimplify_eq1}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ab^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu) +  (\yb^\top \Ab^{-1} \bnu)^2 +  \yb^\top \Ab^{-1} \bnu \big]^2 }{ \yb^\top \Ab^{-1} \yb \cdot \big[ \yb^\top \Ab^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu) +  (\yb^\top \Ab^{-1} \bnu + 1)^2 \big]}.
\end{align}
% By definition, we have
By Lemma~\ref{lemma:concentrationbounds}, with probability at least $1 - n^{-1}$, we have
\begin{align*}
    \| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu \geq  \| \bmu \|_2^2 -  \frac{n}{ 2 \tr(\bSigma)}  \| \bmu \|_{\bSigma}^2 \geq \| \bmu \|_2^2 - \frac{  \| \bSigma\|_{2}  n}{ 2\tr(\bSigma)}  \| \bmu \|_2^2 =  \frac{ 2 \tr(\bSigma) -   \| \bSigma\|_{2}  n}{2  \tr(\bSigma)}  \| \bmu \|_2^2 \geq \| \bmu \|_2^2 / 2.
\end{align*}
Plugging the above inequality into \eqref{eq:riskboundssimplify_eq1}, we have
\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &= \frac{ \big[ \yb^\top \Ab^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu) +  (\yb^\top \Ab^{-1} \bnu)^2 +  \yb^\top \Ab^{-1} \bnu \big]^2 }{ \yb^\top \Ab^{-1} \yb \cdot \big[ \yb^\top \Ab^{-1} \yb \cdot (\| \bmu \|_2^2 - \bnu^\top \Ab^{-1} \bnu) +  (\yb^\top \Ab^{-1} \bnu + 1)^2 \big]}\\
    &\geq  \frac{ \big[ \yb^\top \Ab^{-1} \yb \cdot \| \bmu \|_2^2 / 2 - | \yb^\top \Ab^{-1} \bnu | \big]^2 }{ \yb^\top \Ab^{-1} \yb \cdot \big[ \yb^\top \Ab^{-1} \yb \cdot \| \bmu \|_2^2 + 2 \big]}\\
    &\geq \frac{ \big[ \yb^\top \Ab^{-1} \yb \cdot \| \bmu \|_2^2 / 2 -  2n  \tr( \bSigma )^{-1}  \cdot \| \bmu \|_{\bSigma} \big]^2 }{ \yb^\top \Ab^{-1} \yb \cdot \big[ \yb^\top \Ab^{-1} \yb \cdot \| \bmu \|_2^2 + 2 \big]}
\end{align*}
where we utilize Lemma~\ref{lemma:concentrationbounds} to derive the condition 
\begin{align*}
    | \yb^\top \Ab^{-1} \bnu | \leq \frac{ n  + c \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma} \leq  \frac{ 2n  }{ \tr( \bSigma ) } \cdot \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1.
\end{align*}
% $\yb^\top \Ab^{-1} \bnu \leq \frac{ n  + c \sqrt{n\log( n)} }{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \bmu \|_{\bSigma} \leq  \frac{ 2n  }{ \tr( \bSigma ) } \cdot \| \bmu \|_{\bSigma} \leq \sqrt{2} - 1$.
Applying the bound on $\yb^\top \Ab^{-1} \yb$ in Lemma~\ref{lemma:concentrationbounds}, we obtain %have $\yb^\top \Ab^{-1} \yb \cdot \| \bmu \|_2^2 \leq \frac{n}{ \tr( \bSigma )  - \epsilon_{\lambda} } \cdot \| \bmu \|_2^2 \leq 1$. Therefore

\begin{align*}
    \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &\geq \frac{ \big( n \tr( \bSigma )^{-1} \cdot \| \bmu \|_2^2 / 4 -  2n  \tr( \bSigma )^{-1}  \cdot \| \bmu \|_{\bSigma} \big)^2 }{ 2n \tr( \bSigma )^{-1} \cdot \big( 2 n \tr( \bSigma )^{-1} \cdot \| \bmu \|_2^2 + 2 \big) } \\
    &=  \frac{ n\cdot \big(  \| \bmu \|_2^2 / 4 -    2 \| \bmu \|_{\bSigma} \big)^2 }{ 2 \tr( \bSigma )  \cdot \big( 2 n \tr( \bSigma )^{-1} \cdot \| \bmu \|_2^2 + 2 \big) } \\
    &=  \frac{ n\cdot \big(  \| \bmu \|_2^2  - 8 \| \bmu \|_{\bSigma} \big)^2 }{ 64 \big(  n \cdot \| \bmu \|_2^2 + \tr( \bSigma )  \big) }.
\end{align*}
This completes the proof. 

% \begin{align*}
%     \frac{[\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu]^2 }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} \geq  \frac{ (\yb^\top \Ab^{-1} \yb)^2 \cdot \| \bmu \|_2^4 / 4  }{ \yb^\top \Ab^{-1} \yb \cdot 3} = \yb^\top \Ab^{-1} \yb \cdot \| \bmu \|_2^4 / 12 \geq \frac{ n \| \bmu \|_2^4  }{24 \tr(\bSigma)},
% \end{align*}


% Denote $\overline\bmu = \Vb^\top \bmu$, then%$\tilde\bmu = \Lambda^{1/2} \Vb^\top \bmu$
% \begin{align*}
%     \| \bmu \|_2^2 = \| \overline\bmu \|_2^2  = \sum_{j=1}^p \lambda_j \overline\mu_j^2 / \lambda_j \geq 
% \end{align*}


\end{proof}






\begin{theorem}\label{thm:benignoverfitting}
Suppose that  $ \tr( \bSigma ) \geq C n^{3/2} \| \bmu \|_{\bSigma}$, $\tr( \bSigma ) \geq C  \sqrt{n} \epsilon_{\lambda}$, and $\| \bmu \|_2 \geq C \| \bSigma\|_{2} $ for some large enough absolute constant $C$. Then
\begin{align*}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg(- \frac{1 }{256  \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
\end{align*}
\end{theorem}

\begin{proof}[Proof of Theorem~\ref{thm:benignoverfitting}]
The proof follows by a direct combination of Lemma~\ref{lemma:riskbound} and Lemma~\ref{lemma:riskboundssimplify}. 
\end{proof}

\begin{remark}
Theorem~\ref{thm:benignoverfitting} establishes a risk bound for interpolating classifiers. It implies the result of \citet{wang2020benign} when setting $\lambda_j = 1$, $j\in [p]$. \citet{chatterji2020finite} gives an $\exp(-\Omega( \| \bmu \|_2^4 / p ))$ risk bound under the conditions $p = \Omega( n \| \bmu \|_2^2 ) $ and $\EE( \| \qb \|_2^2) \geq \Omega(p)$. Under the same conditions, the bound in Theorem~\ref{thm:benignoverfitting} reduces to $\exp(-\Omega( n \| \bmu \|_2^4 / p ))$, which is clearly sharper than the bound in \citet{chatterji2020finite}. Moreover, our result also holds for more general settings without these conditions, and is applicable to the infinite-dimensional setting.




% Compared with \citet{chatterji2020finite} which requires $\EE( \| \qb \|_2^2) \geq \Omega(p)$, our result does not rely on this assumption and therefore works for the infinite-dimensional setting. Moreover, 


% Compared with the results in \citet{chatterji2020finite} and \citet{wang2020benign},
% Compared with the results of \citet{wang2020benign}, our result is applicable to a more general setting that allows anisotropic, non-Gaussian data and allow infinite-dimensional features. Under the setting when $\lambda_i = 1$ for $i\in[p]$, Theorem~\ref{thm:interpolationregression} recovers  \CC{( the corrected version of)} the results in \citet{wang2020benign}. 

\end{remark}







\section{Improving the Condition for Classification-Regression Equivalence}

We consider the isotropic Gaussian mixture setting.

\begin{lemma}[Lemma 2 in \citet{muthukumar2020classification}]\label{lemma:eigenvalueconcentration_isotropic}
For any vector $\ub\in S^{n-1}$, with probability at least $1 - 2n^{-2}$, 
\begin{align*}
    & \ub^\top \Ab^{-1} \ub \geq \frac{1}{d'(n) + \sqrt{4\ln(n) d'(n)} + 4\ln(n)}, \\
    & \ub^\top \Ab^{-1} \ub \leq \frac{1}{d'(n) - \sqrt{4\ln(n) d'(n)}} \leq \frac{1}{d'(n) - \sqrt{4\ln(n) d'(n)} - 4\ln(n)},
\end{align*}
where $d'(n) = d - n + 1$.
\end{lemma}

Denote $\epsilon_{\lambda} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$. Then by Lemma~\ref{lemma:eigenvalueconcentration_isotropic}, we have
\begin{align}\label{eq:isotropic_eigenvalueconcentration}
    \frac{1}{d'(n) - \epsilon_{\lambda}} \geq \ub^\top \Ab^{-1} \ub \geq \frac{1}{d'(n) + \epsilon_{\lambda}}.
\end{align}



\begin{lemma}\label{lemma:isotropicbound1}
With probability at least $1 - 4n^{-1}$, 
\begin{align*}
    \big| \bmu^\top\Qb^\top \Ab^{-1} \eb_i y_i \big| \leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{\lambda} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{\lambda}^2  }
\end{align*}
for all $i\in [n]$, where $d'(n) = d - n + 1$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropicbound1}] 
\begin{align}
    \bmu^\top\Qb^\top\Ab^{-1} \eb_i y_i &= \frac{1}{\| \Qb\bmu \|_2}\cdot (\Qb\bmu)^\top \Ab^{-1} (\| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & = \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ab^{-1} (\Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    &\quad - \frac{1}{4\| \Qb\bmu \|_2}\cdot (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i)^\top \Ab^{-1} (\Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i) \nonumber \\
    & \leq \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{\| \Qb\bmu + \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)- \epsilon_{\lambda}  } - \frac{\| \Qb\bmu - \| \Qb\bmu \|_2\cdot \eb_i y_i \|_2^2}{ d'(n)+ \epsilon_{\lambda}  }     \bigg] \nonumber\\
    &= \frac{1}{4\| \Qb\bmu \|_2}\cdot \bigg[  \frac{2 \| \Qb\bmu \|_2^2 + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)- \epsilon_{\lambda}  } - \frac{2\| \Qb\bmu \|_2^2  - 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu }{ d'(n)+ \epsilon_{\lambda}  } \bigg] \nonumber \\
    &= \frac{1}{2\| \Qb\bmu \|_2}\cdot \frac{2\| \Qb\bmu \|_2^2 \cdot \epsilon_{\lambda} + 2 y_i\| \Qb\bmu \|_2\cdot \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{\lambda}^2  }\nonumber\\
    &= \frac{\| \Qb\bmu \|_2 \cdot \epsilon_{\lambda} +  y_i \eb_i^\top \Qb\bmu \cdot d'(n)}{ d'(n)^2 - \epsilon_{\lambda}^2  },\label{eq:fgbounds_anisotropic_eq1}
\end{align}
where the second equality holds due to the polarization identity $\ab^\top \Mb \bbb = \tfrac{1}{4}(\ab+\bbb)^\top \Mb (\ab+\bbb) - \tfrac{1}{4}(\ab-\bbb)^\top \Mb (\ab-\bbb)$ for symmetric $\Mb$, and the first inequality follows by \eqref{eq:isotropic_eigenvalueconcentration}.
Under the isotropic Gaussian mixture setting, $\eb_j^\top \Qb\bmu$, $j=1,\ldots, n$, are i.i.d.\ Gaussian random variables with variance $\| \bmu \|_2^2$. Therefore, by Bernstein's inequality, with probability at least $1 - n^{-1}$ we have
\begin{align*}
    \big| \| \Qb\bmu \|_2^2 - n \| \bmu \|_2^2 \big| \leq  n\cdot c_1 \| \bmu \|_2^2 \sqrt{\frac{\log(n)}{n}} = c_1 \| \bmu \|_2^2 \sqrt{n\log(n)}, 
\end{align*}
where $c_1$ is an absolute constant. 
Therefore, when $n$ is large enough, $\| \Qb\bmu \|_2^2 \leq 2n \| \bmu \|_2^2$. Similarly, by the Gaussian tail bound and a union bound, with probability at least $1 - n^{-1}$, 
\begin{align*}
    | \eb_i^\top \Qb\bmu | \leq c_2\sqrt{\log(n)}
\end{align*}
for all $i\in[n]$, where $c_2$ is an absolute constant. Therefore we have
\begin{align*}
    \bnu^\top \Ab^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{\lambda} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{\lambda}^2  }.
\end{align*}
With the exact same proof, we also have
\begin{align*}
    -\bnu^\top \Ab^{-1} \eb_i y_i &\leq \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{\lambda} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{\lambda}^2  }.
\end{align*}
This completes the proof.
\end{proof}








\begin{lemma}\label{lemma:isotropicbound2}
With probability at least $1 - 2n^{-1}$, 
\begin{align*}
    \yb^\top \Ab^{-1} \eb_i y_i  \geq \frac{ d'(n) - \sqrt{n} \epsilon_{\lambda} }{d'(n)^2  - \epsilon_{\lambda}^2}
\end{align*}
for all $i\in [n]$. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropicbound2}]
The proof is very similar to the proof of Lemma~\ref{lemma:isotropicbound1}. We have
\begin{align*}
    \yb^\top \Ab^{-1} \eb_i y_i &= \frac{1}{4\sqrt{n}} (\yb + \sqrt{n}\eb_i y_i )^\top \Ab^{-1} (\yb + \sqrt{n}\eb_i y_i ) - \frac{1}{4\sqrt{n}} (\yb - \sqrt{n}\eb_i y_i )^\top \Ab^{-1} (\yb - \sqrt{n}\eb_i y_i )\\
    &\geq \frac{1}{4\sqrt{n}} \bigg[ \frac{\| \yb + \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  + \epsilon_{\lambda}} - \frac{\| \yb - \sqrt{n}\eb_i y_i \|_2^2}{d'(n)  - \epsilon_{\lambda}}  \bigg]\\
    &= \frac{1}{4\sqrt{n}} \bigg[ \frac{2n + 2\sqrt{n}}{d'(n)  + \epsilon_{\lambda}} - \frac{2n - 2\sqrt{n}}{d'(n)  - \epsilon_{\lambda}}  \bigg]\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{(n + \sqrt{n}) (d'(n)  - \epsilon_{\lambda}) - (n - \sqrt{n}) (d'(n)  + \epsilon_{\lambda}) }{d'(n)^2  - \epsilon_{\lambda}^2}\\
    & = \frac{1}{2\sqrt{n}} \cdot  \frac{2\sqrt{n} d'(n) - 2n \epsilon_{\lambda} }{d'(n)^2  - \epsilon_{\lambda}^2}\\
     & = \frac{ d'(n) - \sqrt{n} \epsilon_{\lambda} }{d'(n)^2  - \epsilon_{\lambda}^2},
\end{align*}
where we use the polarization identity $\ab^\top \Mb \bbb = \tfrac{1}{4}(\ab+\bbb)^\top \Mb (\ab+\bbb) - \tfrac{1}{4}(\ab-\bbb)^\top \Mb (\ab-\bbb)$ for symmetric $\Mb$ in the first equality and use \eqref{eq:isotropic_eigenvalueconcentration} in the first inequality. This completes the proof.
\end{proof}


\begin{lemma}\label{lemma:isotropic_equivalencecondition}
Suppose that $d > C n \log(n)$ and $d >  C n \| \bmu \|_2$ for some large enough absolute constant $C$. Then with probability at least $1 - 4n^{-1}$, $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i> 0$ for all $i \in [n]$, and therefore the hard margin SVM equals the minimum norm interpolator. 
\end{lemma}

\begin{proof}[Proof of Lemma~\ref{lemma:isotropic_equivalencecondition}]
By Lemma~\ref{lemma:matrixcalculation}, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}].
\end{align*}
Moreover, by Lemma~4 in \citet{wang2020benign}, if $d = \Omega(n \| \bmu \|_2) $ and $d = \Omega(n) $, then with probability at least $1 - n^{-1}$, $\yb^\top \Ab^{-1} \bnu \geq -1/2$, $\yb^\top \Ab^{-1} \yb \leq c_1 n/d$ for some absolute constant $c_1$.
Therefore, as long as $d'(n) > \epsilon_{\lambda}$, we have
\begin{align*}
    \yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i &= D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \eb_i y_i  - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}\eb_i y_i ] \\
    &\propto  ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \eb_i y_i  - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}\eb_i y_i\\
    &\geq (1/2)\cdot \yb^\top \Ab^{-1} \eb_i y_i  - c_1 (n/d)\cdot |\bnu^\top \Ab^{-1}\eb_i y_i|\\
    &\geq  \frac{1}{2} \cdot \frac{ d'(n) - \sqrt{n} \epsilon_{\lambda} }{d'(n)^2  - \epsilon_{\lambda}^2} -  \frac{c_1n}{d}\cdot \frac{\sqrt{2n} \| \bmu \|_2 \cdot \epsilon_{\lambda} +  c_2\sqrt{\log(n)} \cdot d'(n)}{ d'(n)^2 - \epsilon_{\lambda}^2  },
\end{align*}
where $c_1,c_2$ are absolute constants. 
Therefore $\yb^\top (\Xb\Xb^\top)^{-1} \eb_i y_i >0$ if there exists a large enough constant $c_3$ such that
\begin{align*}
    &d'(n) > c_3 \sqrt{n} \epsilon_{\lambda},\\
    &d'(n) > c_3 (n^{3/2} / d) \cdot  \| \bmu \|_2 \cdot \epsilon_{\lambda},\\
    &d'(n) > c_3 (n/d)\cdot \sqrt{\log(n)} \cdot d'(n).
\end{align*}
Plugging in the definitions $d'(n) = d - n + 1$ and 
$\epsilon_{\lambda} = \sqrt{4\ln(n) d'(n)} + 4\ln(n)$, we see that the above conditions are satisfied when $d > c_4 n \log(n)$ and $d >  c_4 n \| \bmu \|_2$ for some large enough absolute constant $c_4$. This completes the proof. 
\end{proof}



\section{Some Calculations Regarding Lemma~\ref{lemma:riskbound}}



% \begin{theorem}\label{thm:BOnew}
% Suppose that $\tr( \bSigma ) \geq C \max\{ \epsilon_{\lambda} , n, n \|\bSigma \|_2 , n \| \bmu \|_{\bSigma} \}$ and $\| \bmu \|_2 \geq C \| \bSigma\|_{2}$ for some large enough constant $C$. Then when $n$ is large enough, with probability at least $1 - n^{-1}$, 
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq  \exp\bigg( - \frac{  C' n\cdot \| \bmu \|_2^4  }{  n \cdot \| \bmu \|_{\bSigma}^2+ \| \bSigma\|_F^2 +  n\cdot \| \bSigma\|_2^2 } \bigg).
% \end{align*}
% \end{theorem}

% \begin{remark}
% Theorem~\ref{thm:BOnew} is stronger than Theorem~\ref{thm:benignoverfitting} where the bound is 
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg(- \frac{1 }{256  \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
% \end{align*}
% By $\tr(\bSigma^2) \leq \| \bSigma \|_2\cdot \tr(\bSigma)$, $\| \bSigma^2\|_F \leq  \| \bSigma \|_2\cdot \| \bSigma\|_F$, $\| \bmu \|_{\bSigma}^2 \leq \| \bSigma \|_2  \| \bmu \|_2^2$ and the assumption $ \tr(\bSigma) > C\max\{ n\cdot  \| \bSigma \|_2, \sqrt{n}\cdot  \|\bSigma \|_F\}$, we obtain from Theorem~\ref{thm:BOnew} that
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) &\leq  \exp\bigg[ - \frac{  C'' n\cdot \| \bmu \|_2^4  }{  n \cdot\| \bSigma \|_2  \| \bmu \|_2^2+  \| \bSigma \|_2\cdot \tr(\bSigma)} \bigg]\\
%     &\leq \exp\bigg(- \frac{C''' }{ \| \bSigma\|_{2} } \cdot \min\bigg\{ \| \bmu \|_2^2 , \frac{n \| \bmu \|_2^4 }{\tr(\bSigma)} \bigg\}\bigg).
% \end{align*}
% Therefore the bound in Theorem~\ref{thm:BOnew} can be much tighter, e.g., when $d$ is large, $ \tr(\bSigma)$ can be close to infinity while $ \tr(\bSigma^2)$ can be bounded by a constant.
% \end{remark}

% \begin{remark}
% The Bayes optimal 
% \end{remark}


\begin{proof}[Proof of Theorem~\ref{thm:BOnew}]


We have
\begin{align}
    \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg].\label{eq:refinedBOderivation_eq0}
\end{align}


In the following, we will give a lower bound of the numerator and an upper bound of the denominator in the exponential term of \eqref{eq:refinedBOderivation_eq0} respectively. 



\end{proof}
% \begin{proof}[Proof of Lemma~\ref{lemma:riskboundcalculation}]
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}
% \end{align*}

% By definition, we have $\Xb = \yb \bmu^\top + \Qb$, and therefore
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu = \yb^\top (\Xb\Xb^\top)^{-1} (\yb \bmu^\top + \Qb) \bmu = \| \bmu \|_2^2 \cdot \yb^\top (\Xb\Xb^\top)^{-1} \yb 
%     + \yb^\top (\Xb\Xb^\top)^{-1}  \Qb \bmu.
% \end{align*}
% Hence we have
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} =  \| \bmu \|_2^2 + \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}.
% \end{align*}
% By Lemma~\ref{lemma:matrixcalculation}, we have
% \begin{align*}
%     \yb^\top (\Xb\Xb^\top)^{-1} = D^{-1} [ ( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1}].
% \end{align*}
% Therefore
% \begin{align*}
%     \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb} &=  \| \bmu \|_2^2 + \frac{\yb^\top (\Xb\Xb^\top)^{-1} \Qb \bmu }{\yb^\top (\Xb\Xb^\top)^{-1} \yb}\\
%     &=  \| \bmu \|_2^2 + \frac{( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \bnu - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1} \bnu }{( 1 + \yb^\top \Ab^{-1} \bnu ) \yb^\top \Ab^{-1} \yb - \yb^\top \Ab^{-1} \yb \cdot \bnu^\top \Ab^{-1} \yb}
% \end{align*}
% \end{proof}























% Here the second inequality above follows by the definition of $\epsilon_{\lambda}'$ in Lemma~\ref{lemma:eigenvalue_concentration2} and the assumption that $\tr(\bSigma) > C \epsilon_{\lambda}'$ for some large enough constant $C$. 

% Plugging the bounds \eqref{eq:refinedBOderivation_I1bound} and \eqref{eq:refinedBOderivation_I2bound} into \eqref{eq:refinedBO_denominator_eq0}, we obtain
% \begin{align}
%     &\yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb \nonumber  \\
%     &\qquad \leq 8D^{-2}  \cdot \frac{n^2 \cdot \| \bmu \|_{\bSigma}^2}{ [\tr( \bSigma )]^2} + c_4\cdot D^{-2}\cdot  \frac{ n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2} \nonumber\\
%     &\qquad \leq c_5 D^{-2}  \cdot  \frac{ n^2 \cdot \| \bmu \|_{\bSigma}^2+  n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2} \label{eq:refinedBO_denominator_bound}
% \end{align}
% Finally, plugging the bounds \eqref{eq:refinedBO_numerator_bound}, \eqref{eq:refinedBO_denominator_bound} into \eqref{eq:refinedBOderivation_eq0}, we obtain
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) &\leq \exp\bigg[ - \frac{ [ \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bmu  ]^2}{  2 \yb^\top (\Xb\Xb^\top)^{-1} \Xb \bSigma \Xb^\top (\Xb\Xb^\top)^{-1} \yb} \bigg]\\
%     &\leq \exp\bigg[ - \frac{ D^{-2}\cdot \frac{n^2}{ 64 [\tr( \bSigma )]^2 } \cdot   \| \bmu \|_2^4  }{  c_6 D^{-2}  \cdot  \frac{ n^2 \cdot \| \bmu \|_{\bSigma}^2+  n\cdot \tr(\bSigma^2) +  n^2\cdot \| \bSigma\|_2^2  + n^{3/2}\cdot \| \bSigma^2\|_F}{ [\tr(\bSigma) ]^2}} \bigg]\\
%     &\leq  \exp\bigg[ - \frac{  c_7 n\cdot \| \bmu \|_2^4  }{  n \cdot \| \bmu \|_{\bSigma}^2+ \tr(\bSigma^2) +  n\cdot \| \bSigma\|_2^2  + n^{1/2}\cdot \| \bSigma^2\|_F} \bigg]\\
%     &\leq  \exp\bigg[ - \frac{  c_7 n\cdot \| \bmu \|_2^4  }{  n \cdot \| \bmu \|_{\bSigma}^2+ \| \bSigma\|_F^2 +  n\cdot \| \bSigma\|_2^2  + n^{1/2}\cdot \| \bSigma^2\|_F} \bigg].
%     % \\
%     % &\leq \exp\bigg[ - c_8 \max\bigg\{\frac{\| \bmu \|_2^2}{\| \bSigma\|_2} , \frac{  n\cdot \| \bmu \|_2^4  }{ \tr(\bSigma^2) +  n\cdot \| \bSigma\|_2^2  + n^{1/2}\cdot \| \bSigma^2\|_F} \bigg\} \bigg]
% \end{align*}
% Finally, note that
% \begin{align*}
%     n^{1/2}\cdot \| \bSigma^2\|_F \leq n^{1/2} \cdot  \| \bSigma\|_2 \cdot  \| \bSigma\|_F \leq  (n \cdot  \| \bSigma\|_2^2 +  \| \bSigma\|_F^2) /2.
% \end{align*}
% Therefore we have
% \begin{align*}
%     \PP( y\cdot \hat\btheta_{\text{SVM}}^\top \xb < 0 ) \leq \exp\bigg( - \frac{  c_8 n\cdot \| \bmu \|_2^4  }{  n \cdot \| \bmu \|_{\bSigma}^2+ \| \bSigma\|_F^2 +  n\cdot \| \bSigma\|_2^2} \bigg)
% \end{align*}
% for some absolute constant $c_8$.
% This completes the proof.


