

%\theontk*

\subsection{Proof of Lemma~\ref{lemm:highproblambda}}
\label{ssec:highprobablambda}
We start with a result which explicitly shows that the output $\| \alpha^{(l)}\|_2$ is sub-Gaussian for Gaussian random weights. We recall that we are assuming $\phi(0)=0$, and note that the result can be extended to the general case straightforwardly.

\begin{lemm}
Let $A^{(l)} = [ \alpha^{(l)}(\x_i)] \in \R^{n \times m_l}$ be the outputs of layer $l$.
For $g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})$, let $\vartheta^{(l)} = \phi(\frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g ) \in \R^{n}$. Then, $\| \vartheta^{(l)}\|_2$ is a sub-Gaussian random variable with 
\begin{align*}
\| \|\vartheta^{(l)}\|_2 \|_{\psi_2} = \left\| \left\| \phi\left(\frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2 \right\|_{\psi_2}\leq c \frac{\sqrt{2} (\sqrt{\log n} + 1)\sigma}{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F
\end{align*}
for some absolute constant $c>0$.
\label{lemm:gammasubg}
\end{lemm}
\proof First, note that since $\phi$ is 1-Lipschitz and $\phi(0) = 0$, we have $\norm{\phi( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g )}_2 \leq \norm{\frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g}_2$. Now, 
\begin{align*}
\P  \left( \left\| \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right\|_2^2 \geq \epsilon^2 \frac{1}{m_{l-1}} \| A^{(l-1)} \|_F^2 \right)
 &= \P \left( \frac{1}{m_{l-1}} \sum_{i=1}^n \langle \alpha_i^{(l-1)}, g \rangle^2 \geq \epsilon^2 \frac{1}{m_{l-1}} \sum_{i=1}^n \| \alpha_i^{(l-1)} \|_2^2 \right)\\
& \leq \sum_{i=1}^n \P \left( \frac{1}{m_{l-1}} \langle \alpha_i^{(l-1)}, g \rangle^2 \geq \epsilon^2 \frac{1}{m_{l-1}} \| \alpha_i^{(l-1)} \|_2^2 \right) \\
& = \sum_{i=1}^n \P \left( |\langle \alpha_i^{(l-1)}, g \rangle| \geq \epsilon \| \alpha_i^{(l-1)} \|_2 \right)\\
& \leq 2n \exp\left( - \frac{\epsilon^2}{2\sigma^2} \right)~.
\end{align*}
In other words, with $\tilde{\epsilon} = \frac{\epsilon}{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F$, for all $\tilde{\epsilon} > 0$ we have 
\begin{align*}
\P \left( \left\| \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right\|_2 \geq \tilde{\epsilon} \right) \leq 2n \exp\left( - \frac{m_{l-1} \tilde{\epsilon}^2}{2\sigma^2 \| A^{(l-1)} \|^2_F } \right) ~.
\end{align*}
Now, with $\tilde{\epsilon} = \frac{\sigma \sqrt{2 \log n} }{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F + \epsilon $, for all $\epsilon > 0$ we have
\begin{align*}
\P \left( \left\| \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right\|_2  \geq  \frac{\sigma \sqrt{2 \log n}}{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F + \epsilon  \right) & \leq 2n \exp \left( - \log n \right)  \exp\left( - \frac{m_{l-1} \epsilon^2}{2\sigma^2 \| A^{(l-1)} \|^2_F } \right) \\
\Rightarrow ~~~\P \left( \left\| \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2  \geq  \frac{\sigma \sqrt{2 \log n }}{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F + \epsilon  \right) & \leq 2 \exp\left( - \frac{m_{l-1} \epsilon^2}{2\sigma^2 \| A^{(l-1)} \|^2_F } \right)~.
\end{align*}
Then, from Proposition~\ref{prop:subgsum}, it follows that $\| \phi( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g ) \|_2$ is sub-Gaussian with 
\begin{align*}
\left\| \left\| \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2 \right\|_{\psi_2} \leq c \frac{\sqrt{2} (\sqrt{\log n} + 1)\sigma}{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F~,
\end{align*}
for some absolute constant $c$. This completes the proof. \qed 

\begin{prop}
Let $a_1, a_2 > 0$. If a non-negative random variable $Z$ satisfies $\P( Z \geq a_1 + \epsilon) \leq 2 \exp(-\epsilon^2/a_2^2)$, then $\| Z \|_{\psi_2} \leq c(a_1 + a_2)$, where $c$ is an absolute constant.
\label{prop:subgsum}
\end{prop}
\proof Note that $Z-a_1 = [Z-a_1]_- + [Z-a_1]_+$. Since $Z$ is non-negative, $|[Z-a_1]_-| \leq a_1$ implying $\|[Z-a_1]_-\|_{\psi_2} \leq c_1 a_1$, where $c_1$ is an absolute constant. Further, by definition, $[Z-a_1]_+$ is sub-Gaussian with $\| [Z-a_1]_+ \|_{\psi_2} \leq c_2 a_2$, where $c_2$ is an absolute constant. Now, by triangle inequality
\begin{align*}
\| Z \|_{\psi_2} & = \| a_1 + [Z-a_1]_- + [Z - a_1]_+ \|_{\psi_2} \\
& \leq a_1 + \|[Z-a_1]_-\|_{\psi_2} + \| [Z-a_1]_+ \|_{\psi_2} \leq a_1 + c_1 a_1 + c_2 a_2\\
& \leq c(a_1+a_2)~,
\end{align*}
where $c = \max(1+c_1, c_2)$. That completes the proof. \qed

% \abcomment{update the analysis -- can be done more directly, using Hoeffding(?)} Let $U = \left\{u_i = \frac{\alpha_i}{\| \alpha_i\|_2}, i \in [n]\right\}$ so that $u_i \in \R^{m_l}, i \in [n]$. Then, from Lemma~\ref{lem:newaux1}, with probability at least $(1-2\exp(-\tau^2/(2\sigma_0^2)))$, we have 
% \begin{align*}
% \frac{1}{\| \alpha_i^{(l-1)} \|_2} \| W_0^{(l)} \alpha_i^{(l)} \|_2  =  \| W_0^{(l)} u_i \|_2 \leq \sigma_0(\sqrt{m} + \sqrt{\log n}) + \tau~,
% \end{align*}
% since $W_0^{(l)} \in \R^{m_{l} \times m_{l-1}}$ has entries $w_{0,ij}^{(l)} \sim \cN(0,\sigma_0^2)$.
% % Then, from the proof on Lemma~\ref{lemm:outl2} and Assumption~\ref{asmp:ginit}, with probability at least $(1-2\exp(-\tau^2/(2\sigma_0^2)))$, we have 
% % \begin{align*}
% % ~\frac{1}{\| \alpha_i^{(l-1)} \|_2} \| W^{(l)} \alpha_i^{(l-1)} \|_2  \leq \sqrt{m}\left( \sigma_1 + \frac{\rho}{\sqrt{m}} \right) + \tau = \sqrt{m} \gamma + \tau~.
% % \end{align*}
% Since $\phi(0)=0$, with probability at least $(1-2\exp(-\tau^2/(2\sigma_0^2)))$, we have 
% \begin{align*}
% \| \alpha_i^{(l)} \|_2 & \leq ~\left\| \frac{1}{\sqrt{m_{l-1}}} W_0^{l} \alpha_i^{(l-1)} \right\|_2 
%  \leq \left(\gamma + \frac{\tau  }{\sqrt{m}} \right) \| \alpha_i^{(l-1)} \|_2~.
% \end{align*}
% Then, with probability at least $(1-2\exp(-\tau^2/(2\sigma_0^2)))$, we have 
% \begin{align*}
% \| \Gamma^{(l)} \|^2_2 & = \sum_{i=1}^n \left\| \phi\left( \frac{1}{\sqrt{m}} \langle \alpha_i^{(l-1)}, g \rangle\right) \right\|^2_2 \leq  \left(\gamma + \frac{\tau  }{\sqrt{m}} \right)^2 \| A^{(l-1)} \|_F^2~\\
% \Rightarrow \qquad \| \Gamma^{(l)} \|_F & \leq \gamma \| A^{(l-1)} \|_F + \frac{ \tau \| A^{(l-1)} \|_F}{\sqrt{m}}~.
% \end{align*}
% Then, with $t = \frac{ \| A^{(l-1)} \|_F}{\sqrt{m}} \tau$, we have with probability at least $(1 - 2\exp(-\frac{m t^2}{2 \sigma_0^2 \| A^{(l-1)} \|_F^2})$, we have 
% \begin{align*}
% \| \Gamma^{(l)} \|_F \leq \gamma \| A^{(l-1)} \|_F + t~,
% \end{align*}
% Then, from \abedit{Proposition ??} it follows that $\| \Gamma^{(l)} \|_F$ is a sub-Gaussian random variable with $\| \Gamma^{(l)} \|_{\psi_2} \leq \gamma \| A^{(l-1)}\|_F + \frac{\sigma_0 \| A^{(l-1)} |_F}{\sqrt{m}} \leq \left( \gamma + \frac{\sigma_1}{\sqrt{m}} \right) \| A^{(l-1)}\|_F$, since $\sigma_0 \leq \sigma_1$. That completes the proof. \qed 

We are now ready to prove Lemma~\ref{lemm:highproblambda}.
%
%\highproblambda*

\proof[Proof of Lemma~\ref{lemm:highproblambda}] Let $\vartheta^{(l)} = \phi(\frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g) \in \R^{n}$. If $m_l < n$, then $\lambda_{\min}(A^{(l)} (A^{(l)})^\top) = 0$. So, we assume $m_l \geq n$.
%
Let $w_j \in \R^{m_{l-1}}$ denote the $j$-th row of $W_0^{(l)}$.
For a given $t>0$, let $\hat{A}^{(l)} \in \R^{n \times m_l}$ be such that $\hat{A}_{:,j}^{(l)} = \phi( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} w_j^\top ) \1_{[\| \phi(\frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} w_j^\top) \|_2 \leq t]}\in\R^{n}$ and $\hat{\vartheta}^{(l)} = \phi( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g ) \1_{[\| \phi(\frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g) \|_2 \leq t]} \in \R^n$. Then, we have 
%\lzcomment{I did not find the proof of $\lambda_{\max}$?}
\begin{align*}
(i)\;\; &
\lambda_{\min}( A^{(l)} ( A^{(l)})^\top ) \geq \lambda_{\min}( \hat{A}^{(l)} ( \hat{A}^{(l)})^\top )  ~,\\
(ii)\;\;&\lambda_{\max}( \hat{A}_{:,j}^{(l)} ( \hat{A}_{:,j}^{(l)})^\top ) \leq t^2~,
\end{align*}
where (i) follows immediately by definition of $\hat{A}^{(l)}$ and (ii) follows since for any unit vector $v \in \R^n$, 
%\begin{align*}
$v^\top \hat{A}_{:,j}^{(l)} ( \hat{A}_{:,j}^{(l)})^\top v = \langle v, \hat{A}_{:,j}^{(l)} \rangle^2 \leq \| \hat{A}_{:,j}^{(l)} \|_2^2 \leq t^2$.
%\end{align*}

From Lemma~\ref{lemm:gammasubg}, we know that $\| \| \vartheta^{(l)} \|_2 \|_{\psi_2} \leq c_1  \frac{\sqrt{2}(\sqrt{\log n}+1)\sigma}{\sqrt{m_{l-1}}} \| A^{(l-1)} \|_F$.  Recall that for any sub-Gaussian random variable $Z$, $\P(Z \geq t) \leq \exp(1-c t^2/\| Z \|_{\psi_2}^2)$ for some absolute constant $c$. For our analysis with $\| \vartheta^{(l)} \|_2$, for a suitable constant $a > 0$ we will use 
\begin{align}
    t = \frac{  \sqrt{2} (\sqrt{\log n}+1) \sigma \| A^{(l-1)} \|_F}{\sqrt{cm_{l-1}}} \sqrt{ \max \left(1 , \log \frac{2a (\sqrt{\log n} + 1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}{ c \lambda_l m_{l-1}} \right) }~.
\label{eq:thres}
\end{align}

Let
\begin{align*}
\hspace*{-10mm}
G_l & := \E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})}\left[ \vartheta^{(l)} (\vartheta^{(l)})^\top \right] = \E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})}\left[ \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right)^\top \right] ~, \\
\hat{G}_l & := \E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})}\left[ \hat{\vartheta}^{(l)} (\hat{\vartheta}^{(l)})^\top \right] \\
& ~~= \E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})}\left[ \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g\right) \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right)^\top \1_{[\| \phi ( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g) \|_2 \leq t]} \right] ~. 
\end{align*}
Note that $\lambda_l = \lambda_{\min}(G_l)$.

By Matrix Chernoff bound, for any $\epsilon \in [0,1)$, we have
\begin{align*}
\P\left( \lambda_{\min}( \hat{A}^{(l)} ( \hat{A}^{(l)})^\top ) \leq (1-\epsilon) \lambda_{\min}( \E_{W_0^{(l)}}[ \hat{A}^{(l)} ( \hat{A}^{(l)})^\top ]) \right) \leq n \left(\frac{e^{-\epsilon}}{(1-\epsilon)^{1-\epsilon}} \right)^{\lambda_{\min}\left(\E_{W_0^{(l)}}[\hat{A}^{(l)} ( \hat{A}^{(l)})^\top ]\right)/t^2}~.
\end{align*}
For $\epsilon=1/2$, with $c_3 = \frac{1}{2}(1 - \log 2)$, we have 
\begin{align*}
\P\left( \lambda_{\min}( \hat{A}^{(l)} ( \hat{A}^{(l)})^\top ) \leq \frac{m_l}{2} \lambda_{\min}(\hat{G}_l) \right) \leq \exp\left( - \frac{c_3 m_l}{t^2} \lambda_{\min}(\hat{G}_l) + \log n \right)~,
\end{align*}
where we used the fact that $\E_{W_0^{(l)}}[\hat{A}^{(l)}(\hat{A}^{(l)})^\top]=m_l\E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})}[\hat{\vartheta}^{(l)}(\hat{\vartheta}^{(l)})^\top]=m_l\hat{G}_l$. 

With $m_l \geq \frac{t^2}{c_3 \lambda_{\min}(\hat{G}_l)} \log \frac{n}{\bar{\delta}}$, with probability at least $(1-\bar{\delta})$ we have 
\begin{align}
\label{eq:lowerbound_hat_A_l}
\lambda_{\min}( \hat{A}^{(l)} ( \hat{A}^{(l)})^\top ) \geq \frac{m_l}{2} \lambda_{\min}(\hat{G}_l) ~.
\end{align}
Now, note that
\begin{align*}
\hspace*{-25mm}
\| \hat{G}_l - G_l \|_2 & \leq 
 \E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})} \left\|  \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g\right) \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right)^\top \1_{[\| \phi ( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g) \|_2 \leq t]} \right. \\
 &  \left. \phantom{\E   \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g\right) \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right)^\top } 
 -  \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \phi\left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right)^\top \right\|_2 \\
& = \E_{g \sim \cN(\bm{0}_{m_{l-1}},\sigma^2 \I_{m_{l-1}})}\left[ \left\| \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2^2 \1_{[\| \phi ( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g) \|_2 > t]} \right]    \\
& \overset{(a)}{=} \int_{s=0}^{\infty} \P\left( \left\| \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2 \1_{[\| \phi ( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g) \|_2 > t]} > \sqrt{s} \right) ds \\
& = \int_{s=0}^{\infty} \P\left( \left\| \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2 > t \right) \P\left( \left\| \phi \left( \frac{1}{\sqrt{m_{l-1}}} A^{(l-1)} g \right) \right\|_2 > \sqrt{s} \right) ds \\
& \overset{(b)}{\leq}\exp(2) \exp\left( -c \frac{m_{l-1} t^2 }{2(\sqrt{\log n} +1)^2 \sigma^2 \| A^{(l-1)} \|_F^2} \right) \int_{s=0}^{\infty} \exp\left( -c \frac{m_{l-1} s }{2(\sqrt{\log n} +1)^2 \sigma^2 \| A^{(l-1)} \|_F^2} \right) ds \\
& \overset{(c)}{=} \exp(2) \exp\left( -  \max\left(1 , \log \frac{2a (\sqrt{\log n}  + 1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}{c \lambda_l m_{l-1}} \right)  \right) \frac{2(\sqrt{\log n} +1)^2 \sigma^2 \|A^{(l-1)}\|_F^2}{cm_{l-1}} ~,
\end{align*}
where (a) follows since for any non-negative random variable $Z$,  $\E[Z] = \int_0^{\infty} \P(Z \geq s) ds$, (b) follows from Lemma~\ref{lemm:gammasubg}, and (c) follows from our choice of  $t$ in \eqref{eq:thres} and since for $b > 0$, $\int_0^\infty \exp(-s/b) ds = b$. To simplify further, we consider the following two exhaustive cases:

{\bf Case 1.} Assume
\begin{align*}
\frac{2a(\sqrt{\log n}  + 1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}{c \lambda_l m_{l-1}} \leq \exp(1) \quad \Rightarrow \quad \frac{2 (\sqrt{\log n}  + 1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}{c m_{l-1} }  \leq \frac{\lambda_l}{a} \exp(1)~. 
\end{align*}
Then,
\begin{align*}
\| \hat{G}_l - G_l \|_2 & \leq \exp(2) \exp(-1) \frac{2(\sqrt{\log n} +1)^2 \sigma^2 \|A^{(l-1)}\|_F^2}{cm_{l-1}}  \leq \exp(2) \exp(-1) \exp(1) \frac{\lambda_l}{a} = \frac{\exp(2)}{a} \lambda_l \overset{(a)}{\leq} \frac{\lambda_l}{2}~,
\end{align*}
where (a) follows if $a \geq 2 \exp(2)$.

{\bf Case 2.} On the other hand, assume
\begin{align*}
\frac{2a(\sqrt{\log n}  + 1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}{c m_{l-1} \lambda_l} \geq \exp(1) ~. 
\end{align*}
Then,
\begin{align*}
\| \hat{G}_l - G_l \|_2 & \leq \exp(2) \frac{c \lambda_l m_{l-1}}{2a (\sqrt{\log n}+1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}  \frac{2(\sqrt{\log n} +1)^2 \sigma^2 \|A^{(l-1)}\|_F^2}{cm_{l-1}} = \frac{\exp(2)}{a} \lambda_l \overset{(a)}{\leq} \frac{\lambda_l}{2}~,
\end{align*}
where (a) follows if $a \geq 2 \exp(2)$. Thus, choosing $a = 15$ in \eqref{eq:thres} ensures $\| \hat{G}_l - G_l \|_2 \leq \frac{\lambda_l}{2}$. As a result, 
\begin{equation}
    \label{eq:lowerbound_hat_A_to_A_l}
\lambda_{\min}(\hat{G}_l) \geq \lambda_{\min}(G_l) - \|\hat{G}_l - G_l \|_2 \geq \lambda_l/2.
\end{equation}
Then, for $m_l \geq \frac{2t^2}{c_3 \lambda_l} \log \frac{n}{\bar{\delta}}$, with probability at least $(1-\bar{\delta})$ we have 
\begin{align*}
\lambda_{\min}(A^{(l)} (A^{(l)})^\top ) & \geq \lambda_{\min}(\hat{A}^{(l)} (\hat{A}^{(l)})^\top ) \overset{(a)}{\geq} \frac{m_l}{2} \lambda_{\min}(\hat{G}_l) \overset{(b)}{\geq} \frac{m_l}{4} \lambda_l~,
\end{align*}
where (a) follows from~\eqref{eq:lowerbound_hat_A_l} and (b) from~\eqref{eq:lowerbound_hat_A_to_A_l}. Finally, note that we have used $m_l \geq n$ and $m_l \geq \frac{t^2}{c_3 \lambda_{\min}(\hat{G}_l)} \log \frac{n}{\bar{\delta}}$ in the above analysis. Then, with $v = \frac{2 (\sqrt{ \log n} +1)^2 \sigma^2 \| A^{(l-1)} \|_F^2}{c_3 \lambda_l m_{l-1} }$, the choice of $t$ in \eqref{eq:thres}, and $a=15$, and noting that $\lambda_{\min}(\hat{G}_l) \geq \lambda_l/2$, the analysis holds if we have
\begin{align*}
m_l & \geq \max \left( n,c_2 v \max(1, \log (15 v)) \log \frac{n}{\bar{\delta}} \right)~.
\end{align*}
for some constant $c_2 > 0$. Choosing $\bar{\delta} = \frac{\delta}{L}$ completes the proof. \qed 


\subsection{Proof of Lemma~\ref{lemm:alphainit1}}
\label{ssec:alphainit}



% We present two versions of a result which establishes $\| \alpha^l(\x)\|_2^2 = \Theta(m)$. For the first, we assume the Gaussian variance is $\frac{\sigma_0^2}{c_{\phi,\sigma_0}}$ where $c_{\phi,\sigma_0} := \E_{z \sim \cN(0,\sigma_0^2)}[\phi^2(z)]$. For the second form, we keep the Gaussian variance as $\sigma_0^2$ but used a scaled activation function: $\bar{\phi}(a) = \frac{1}{\sqrt{c_{\phi,\sigma_0}}} \phi(a)$. Both versions yield effectively the same result and are equivalent.  
% \begin{remark}
% \abcomment{update ... their analysis was incorrect} Such scaling has appeared in earlier work with smooth activation functions [Du et al.].
% \end{remark}

% \alphainit*

% \proof We do the proof by induction. For $l=0$, $\| \alpha^{(0)}(\x) \|_2^2 = \| x \|_2^2 =  c_{\phi,\sigma_0} d$.  For $l=0$ and $m_0 = d$, so the result is satisfied at $l=0$ almost surely.

% Assume that the result holds for a certain $l$, so that
% \begin{align}
% c_{\phi,\sigma_0} \left(1 -  \frac{h_C(l)}{2h_C(L)}\right) m_l  \leq \| \alpha^l(\x) \|_2^2 \leq c_{\phi,\sigma_0} \left(1 +  \frac{h_C(l)}{2h_C(L)}\right) m_l \nonumber  \\
% \Rightarrow \quad - \frac{h_C(l)}{2h_C(L)}  \leq \frac{\| \alpha^l(\x) \|_2^2}{c_{\phi,\sigma_0} m_l} - 1 \leq  \frac{h_C(l)}{2h_C(L)} ~.
% \label{eq:rip1}
% \end{align}
% We condition on $\{ W_0^{(l')}, l' \in [l] \}$, and focus on layer $\alpha^{(l+1)}$ with random weights $W_0^{(l+1)} = [w_{1,:}; \cdots;w_{m_{l+1},:}] \in \R^{m_{l+1} \times m_l}$. Note that $\|\alpha^{(l+1)}(\x) \|_2^2 = \sum_{j=1}^{m_{l+1}} (\alpha^{(l+1)}_j(\x))^2$.  Since $\phi$ is 1-Lipschitz and $\phi(0)=0$, $|\phi(a)| \leq |a|$, we have
% \begin{align}
% \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 & = \sum_{j=1}^{m_{l+1}} \E_{w_{j,:}} [(\alpha^{(l+1)}_j(\x))^2] = m_{l+1} \E_{g \sim \cN\left(0,\frac{\sigma_0^2}{c_{\phi,\sigma_0}} \I\right)}\left[ \phi^2\left(\frac{1}{\sqrt{m_l}}\langle g, \alpha^{(l)}(\x) \rangle \right) \right]~\nonumber \\
% & = m_{l+1} \E_{g \sim \cN\left(0,\frac{\sigma_0^2}{c_{\phi,\sigma_0}} \I\right)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{m_l}} \left\langle g, \frac{\alpha^{(l)}(\x)}{\| \alpha^{(l)}(\x)\|_2} \right\rangle \right) \right] \nonumber \\
% & = m_{l+1}  \E_{z \sim \cN\left(0,\frac{\sigma_0^2}{c_{\phi,\sigma_0}}\right)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{m_l}} z \right) \right] \\
% & = m_{l+1}  \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0}m_l}} z \right) \right]
% \label{eq:rip2}
% \end{align}
% From Proposition~\ref{prop:gauss2} with $\beta = \frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0 m_l}}}$, we have 
% \begin{align}
% c_{\phi,\sigma_0} - \sigma_0^2 \left| \frac{\| \alpha^l(\x) \|_2^2}{c_{\phi,\sigma_0} m_l} - 1 \right| & \leq \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0}m_l}} z \right) \right] 
%     \leq c_{\phi,\sigma_0} + \sigma_0^2 \left| \frac{\| \alpha^l(\x) \|_2^2}{c_{\phi,\sigma_0} m_l} - 1 \right| \nonumber \\
% \overset{(a)}{\Rightarrow} \quad c_{\phi,\sigma_0}\left( 1 - \frac{c_0 h_C(l)}{2h_C(L)} \right) & \leq \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0} m_l}} z \right) \right] 
%   \leq c_{\phi,\sigma_0}\left( 1 + \frac{c_0 h_C(l)}{2h_C(L)} \right)~,  
% %\Rightarrow \quad c_{\phi,\sigma_0}\left( 1 - \frac{h_C(l+1)}{2h_C(L)} \right) & \leq \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{m_l}} z \right) \right]   \leq c_{\phi,\sigma_0}\left( 1 + \frac{h_C(l+1)}{2h_C(L)} \right)~,
%  \label{eq:rip3}
% \end{align}
% where (a) follows from \eqref{eq:rip1}. 
% %and (b) follows by the definition of $h_C(l)$.

% Combining \eqref{eq:rip2} and \eqref{eq:rip3}, we have 
% \begin{align}
% c_{\phi,\sigma_0} \left(1 -  \frac{c_0 h_C(l)}{2h_C(L)}\right) m_{l+1}  \leq \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 \leq c_{\phi,\sigma_0} \left(1 +  \frac{c_0 h_C(l)}{2h_C(L)}\right) m_{l+1}~.
% \label{eq:rip4}
% \end{align}

% Let $\kappa:= \| (\alpha_j^{(l+1)}(\x))^2 \|_{\psi_1}$. Then, from Lemma~\ref{lemm:gammasubg} and \eqref{eq:rip1}, we have
% $\kappa  = \| (\alpha_j^{(l+1)}(\x)) \|^2_{\psi_2} \leq \frac{4 \sigma^2}{m_l} \| \alpha^{(l)} \|_2^2 \leq 6 c_{\phi,\sigma_0} \sigma^2 = \tilde{O}(1)$. Conditioned on $\{ W_0^{(l')}, l' \in [l] \}$, since $\alpha_j^{(l+1)}(\x), j \in [m_{l+1}]$ are independent, from Bernstein's inequality we have 
% \begin{align*}
% \P\left( \left| \sum_{j=1}^{m_{l+1}} (\alpha_j^{(l+1)}(\x))^2 - \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2  \right| \geq t \right) \leq 2 \exp\left[-c \min\left(\frac{t^2}{m_{l+1} \kappa^2} , \frac{t}{\kappa} \right) \right]
% \end{align*}
% Choosing $t = \frac{c_{\phi,\sigma_0}}{2h_C(L)} \frac{\kappa}{\min(c,\sqrt{c})} (m_{l+1})^{3/4} \log m_{l+1} \leq \frac{c_{\phi,\sigma_0}}{2h_C(L)} m_{l+1}$, we have 
% \begin{align*}
% \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 - \frac{c_{\phi,\sigma_0}}{2h_C(L)} m_{l+1}  & ~\leq~  \| \alpha^{(l+1)}(\x) \|_2^2  ~\leq~  E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 + \frac{c_{\phi,\sigma_0}}{2h_C(L)} m_{l+1} \\
% \overset{(a)}{\Rightarrow} \quad  c_{\phi,\sigma_0} \left( 1 - \frac{1 + c_0 h_C(l)}{2h_C(L)} \right) m_{l+1} & ~\leq ~  \| \alpha^{(l+1)}(\x) \|_2^2  ~ \leq ~  c_{\phi,\sigma_0} \left( 1 - \frac{1 + c_0 h_C(l)}{2h_C(L)} \right) m_{l+1} \\
% \overset{(b)}{\Rightarrow} \quad c_{\phi,\sigma_0} \left( 1 - \frac{h_C(l+1)}{2h_C(L)} \right) m_{l+1} & ~\leq ~  \| \alpha^{(l+1)}(\x) \|_2^2  ~ \leq ~  c_{\phi,\sigma_0} \left( 1 - \frac{h_C(l+1)}{2h_C(L)} \right) m_{l+1}~,
% \end{align*}
% where (a) follows from \eqref{eq:rip4} and (b) follows since $h_C(l+1) = 1 + c_0 h_C(l)$, with probability at least 
% \begin{align*}
% 1 - 2 \exp\left[ - \min \left( \frac{c^2_{\phi,\sigma_0} \sqrt{m_{l+1}}}{8 h_C(L)^2}, \frac{c_{\phi,\sigma_0} \sqrt{m_{l+1}}}{4 h_C(L)}  \right) 2\log m_{l+1}\right] \leq 1 - \frac{2}{m_l}~.
% \end{align*}
% Applying union bound over all layers completes the proof. \qed 

% \abedit{Let $a_0 = c_0^2$. Recall that $L \leq \log n$, we have $h_C(L) = \frac{c_0^L-1}{c_0 - 1} \leq L c_0^L$, so that $h_C^2(L) \leq L^2 (a_0)^L \leq (\log n)^2 a_0^L \leq (\log n)^2 n^{\log a_0}$. Then, it suffices to have $m_{l+1} = \tilde{\Omega}(n^{\log a_0})$ where $a_0 = c_0^2 = \frac{\sigma_0^4}{c_{\phi,\sigma}^2}$. For $\phi(a)=a$, $c_0=a_0=1$, so from this analysis, $\tilde{\Omega}(1)$ width suffices (but linear may break due to other reasons). For ReLU, $c_0 = 2$, so $a_0=4$, and $\tilde{\Omega}(n^{1.39})$ width suffices, for this part of the analysis. For GeLU, $c_0$ will be a bit smaller, so mildly larger width will be needed}
%
% \abcomment{write the above formally ... we have the result!}

%\alphainit*

\proof[Proof of Lemma~\ref{lemm:alphainit1}] We prove the result by induction on $l$. Let $\x \in \{\x_i, i \in [n]\}$. For $l=0$, we have $\| \alpha^{(0)}(\x_i) \|_2^2 = \| \x_i \|_2^2 =  c_{\phi,\sigma_0} d$ for all $i \in [n]$. Since $m_0 = d$, the result is satisfied at $l=0$ almost surely.

Assume that the result holds for a certain $l$, so that for any $i \in [n]$
\begin{align}
c_{\phi,\sigma_0} \left(1 -  \frac{h_C(l)}{2h_C(L)}\right) m_l   \leq \min_{i \in [n]} \| \alpha^{(l)}(\x_i) \|_2^2 & \leq \max_{i \in [n]} \| \alpha^{(l)}(\x_i) \|_2^2 \leq c_{\phi,\sigma_0} \left(1 +  \frac{h_C(l)}{2h_C(L)}\right) m_l \nonumber  \\
\Rightarrow \quad \max_{i \in [n]} \left| \frac{\| \alpha^{(l)}(\x_i) \|_2^2}{c_{\phi,\sigma_0} m_l} - 1 \right| & \leq  \frac{h_C(l)}{2h_C(L)} ~.
\label{eq:rip1}
\end{align}
We condition on $\{ W_0^{(l')}, l' \in [l] \}$, and focus on layer $\alpha^{(l+1)}$ with random weights $W_0^{(l+1)} = [w_{1,:}; \cdots;w_{m_{l+1},:}] \in \R^{m_{l+1} \times m_l}$. Note that $\|\alpha^{(l+1)}(\x) \|_2^2 = \sum_{j=1}^{m_{l+1}} (\alpha^{(l+1)}_j(\x))^2$.  Since $\phi$ is 1-Lipschitz and $\phi(0)=0$, $|\phi(a)| \leq |a|$, we have
\begin{align}
\E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 & = \sum_{j=1}^{m_{l+1}} \E_{w_{0,j,:}} [(\alpha^{(l+1)}_j(\x))^2] = m_{l+1} \E_{g \sim \cN\left(\bm{0}_{m_l},\frac{\sigma_0^2}{c_{\phi,\sigma_0}} \I_{m_l}\right)}\left[ \phi^2\left(\frac{1}{\sqrt{m_l}}\langle g, \alpha^{(l)}(\x) \rangle \right) \right]~\nonumber \\
& = m_{l+1} \E_{g \sim \cN\left(\bm{0}_{m_l},\frac{\sigma_0^2}{c_{\phi,\sigma_0}} \I_{m_l}\right)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{m_l}} \left\langle g, \frac{\alpha^{(l)}(\x)}{\| \alpha^{(l)}(\x)\|_2} \right\rangle \right) \right] \nonumber \\
& = m_{l+1}  \E_{z \sim \cN\left(0,\frac{\sigma_0^2}{c_{\phi,\sigma_0}}\right)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{m_l}} z \right) \right] \\
& = m_{l+1}  \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0}m_l}} z \right) \right].
\label{eq:rip2}
\end{align}
Now, from Proposition~\ref{prop:gauss2} with $\beta = \frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0} m_l}}$, we have 
\begin{align}
c_{\phi,\sigma_0} - \sigma_0^2 \left| \frac{\| \alpha^{(l)}(\x) \|_2^2}{c_{\phi,\sigma_0} m_l} - 1 \right| & \leq \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0}m_l}} z \right) \right] 
    \leq c_{\phi,\sigma_0} + \sigma_0^2 \left| \frac{\| \alpha^{(l)}(\x) \|_2^2}{c_{\phi,\sigma_0} m_l} - 1 \right| \nonumber \\
\overset{(a)}{\Rightarrow} \quad c_{\phi,\sigma_0}\left( 1 - \frac{\vartheta_0^2 h_C(l)}{2h_C(L)} \right) & \leq \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{c_{\phi,\sigma_0} m_l}} z \right) \right] 
  \leq c_{\phi,\sigma_0}\left( 1 + \frac{\vartheta_0^2 h_C(l)}{2h_C(L)} \right)~,  
%\Rightarrow \quad c_{\phi,\sigma_0}\left( 1 - \frac{h_C(l+1)}{2h_C(L)} \right) & \leq \E_{z \sim \cN(0,\sigma_0^2)}\left[ \phi^2\left(\frac{\| \alpha^{(l)}(\x)\|_2}{\sqrt{m_l}} z \right) \right]   \leq c_{\phi,\sigma_0}\left( 1 + \frac{h_C(l+1)}{2h_C(L)} \right)~,
 \label{eq:rip3}
\end{align}
where (a) follows from \eqref{eq:rip1}. 
%and (b) follows by the definition of $h_C(l)$.

Combining \eqref{eq:rip2} and \eqref{eq:rip3}, we have 
\begin{align}
c_{\phi,\sigma_0} \left(1 -  \frac{\vartheta_0^2 h_C(l)}{2h_C(L)}\right) m_{l+1}  \leq \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 \leq c_{\phi,\sigma_0} \left(1 +  \frac{\vartheta_0^2 h_C(l)}{2h_C(L)}\right) m_{l+1}~.
\label{eq:rip4}
\end{align}

Let $\kappa:= \| (\alpha_j^{(l+1)}(\x))^2 \|_{\psi_1}$. Then, we have
$\kappa  = \| (\alpha_j^{(l+1)}(\x)) \|^2_{\psi_2} \overset{(a)}{\leq} \frac{4\bar{c} \sigma_0^2}{m_lc_{\phi,\sigma_0}} \| \alpha^{(l)}(\x) \|_2^2 \overset{(b)}{\leq} 6\bar{c}\sigma_0^2 = \tilde{O}(1)$,
where (a) follows from an argument similar to the proof of Lemma~\ref{lemm:gammasubg}, with $\bar{c}>0$ an absolute constant, and (b) follows from~\eqref{eq:rip1}. Conditioned on $\{ W_0^{(l')}, l' \in [l] \}$, since $\alpha_j^{(l+1)}(\x)$, $j \in [m_{l+1}]$, are independent, from Bernstein's inequality we have 
\begin{align*}
\P\left( \left| \sum_{j=1}^{m_{l+1}} (\alpha_j^{(l+1)}(\x))^2 - \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2  \right| \geq t \right) \leq 2 \exp\left[-c \min\left(\frac{t^2}{m_{l+1} \kappa^2} , \frac{t}{\kappa} \right) \right]
\end{align*}
for some absolute constant $c>0$. Then, by union bound 
\begin{align*}
\P\left( \max_{i \in [n]} ~\left| \sum_{j=1}^{m_{l+1}} (\alpha_j^{(l+1)}(\x_i))^2 - \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x_i) \|_2^2  \right| \geq t \right) \leq 2n \exp\left[-c \min\left(\frac{t^2}{m_{l+1} \kappa^2} , \frac{t}{\kappa} \right) \right]~.
\end{align*}
Choosing $t = \frac{c_{\phi,\sigma_0}}{2h_C(L)} \frac{\kappa}{\min(c,\sqrt{c})} (m_{l+1})^{3/4} (\log m_{l+1} + \log n) \leq \frac{c_{\phi,\sigma_0}}{2h_C(L)} m_{l+1}$, we have 
\begin{align*}
\E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 - \frac{c_{\phi,\sigma_0}}{2h_C(L)} m_{l+1}  & \leq \min_{i \in [n]} \| \alpha^{(l+1)}(\x_i) \|_2^2 \leq \max_{i \in [n]} \| \alpha^{(l+1)}(\x_i) \|_2^2 \leq  \E_{W_0^{(l+1)}}\| \alpha^{(l+1)}(\x) \|_2^2 + \frac{c_{\phi,\sigma_0}}{2h_C(L)} m_{l+1} \\
\overset{(a)}{\Rightarrow} \quad  c_{\phi,\sigma_0} \left( 1 - \frac{1 + \vartheta_0^2 h_C(l)}{2h_C(L)} \right) m_{l+1} & \leq \min_{i \in [n]} \| \alpha^{(l+1)}(\x_i) \|_2^2 \leq \max_{i \in [n]} \| \alpha^{(l+1)}(\x_i) \|_2^2 \leq c_{\phi,\sigma_0} \left( 1 + \frac{1 + \vartheta_0^2 h_C(l)}{2h_C(L)} \right) m_{l+1} \\
\overset{(b)}{\Rightarrow} \quad c_{\phi,\sigma_0} \left( 1 - \frac{h_C(l+1)}{2h_C(L)} \right) m_{l+1} & \leq \min_{i \in [n]} \| \alpha^{(l+1)}(\x_i) \|_2^2 \leq \max_{i \in [n]} \| \alpha^{(l+1)}(\x_i) \|_2^2 \leq  c_{\phi,\sigma_0} \left( 1 + \frac{h_C(l+1)}{2h_C(L)} \right) m_{l+1}~,
\end{align*}
where (a) follows from \eqref{eq:rip4} and (b) follows since $h_C(l+1) = 1 + \vartheta_0^2 h_C(l)$, with probability at least 
\begin{align*}
1 - 2n \exp\left[ - \min \left( \frac{c^2_{\phi,\sigma_0} m_{l+1}^{1/2}}{8 h_C(L)^2}, \frac{c_{\phi,\sigma_0}m_{l+1}^{3/4}}{4 h_C(L)}  \right) 2(\log m_{l+1}+\log n) \right] \geq 1 - \frac{2}{m_l^2}~.
\end{align*}
%\pccomment{When I was trying to derive the last inequality from the equation above, it seems we need to use the condition $\min \left( \frac{c^2_{\phi,\sigma_0} \pc{m_{l+1}^{3/4}}}{8 h_C(L)^2}, \frac{c_{\phi,\sigma_0}\pc{m_{l+1}^{3/4}}}{4 h_C(L)}  \right)\geq 1$, can this be double checked, please?} \abcomment{the first term inside the $\min$ should be a $m_{l+1}^{1/2}$ (reverted), now look at the (lower bound) condition on $m_{l+1}$ in the statement of the Theorem. Also, take a look at Remark 6.1, which shows the lower bound on $m_l$.}
Applying union bound over all layers completes the proof. \qed 

\begin{prop}
Let $c_{\phi,\sigma_0} := \E_{z\sim \cN(0,\sigma_0^2)}[\phi^2(z)]$. Then, for any $\beta \geq 0$,
%and $c_0 = \frac{\sigma_0^2}{c_{\phi,\sigma_0}}$. Then, 
\begin{align*}
c_{\phi,\sigma_0} - \sigma_0^2 |\beta^2 -1 | \leq \E_{z \sim \cN(0,\sigma_0^2)}[ \phi^2(\beta z)] \leq c_{\phi,\sigma_0} + \sigma_0^2 |\beta^2 -1| ~.
\end{align*}
\label{prop:gauss2}
\end{prop}
\proof We have 
\begin{align*}
\left| \E_{z\sim \cN(0,\sigma_0^2)} [\phi^2( \beta z)] -  \E_{z\sim \cN(0,\sigma_0^2)}[\phi^2(z)] \right| 
& \leq  \E_{z\sim \cN(0,\sigma_0^2)}[|\phi^2 (\beta z) - \phi^2(z)|] \\
& = \E_{z\sim \cN(0,\sigma_0^2)}[|\phi(\beta z) -  \phi(z)| |\phi(\beta z) + \phi(z)|] \\ 
& \overset{(a)}{\leq} |\beta - 1| \E_{z\sim \cN(0,\sigma_0^2)}[| z| (|\phi(\beta z)| + |\phi(z)|)] \\
& \overset{(b)}{\leq} |\beta - 1| \E_{z \sim \cN(0,\sigma_0^2)} [|z| (\beta+1)|z|] \\
&\leq |\beta^2 - 1| \E_{z \sim \cN(0,\sigma_0^2)} [|z|^2] \\
& = \sigma_0^2 |\beta^2 - 1|  ~,
%& = \frac{1}{c_{\phi}}  |\beta -1| 2\E_{z \sim \cN(0,1)}[\phi^2(z)]~,
\end{align*}
where (a) and (b) follow from the $1$-Lipschitzness of $\phi$ together with $\phi(0)=0$. As a result, we have 
\begin{align*}
%c_{\phi}^2 - 2|\beta-1| c_{\phi}  \leq  c_{\phi}  \E_{z\sim \cN(0,1)} [\phi^2( \beta z) ] \leq \c_{\phi}^2 +  2|\beta-1| c_{\phi} \\
%\Righatarrow \quad 
c_{\phi,\sigma_0} - \sigma_0^2 |\beta^2-1| & \leq  \E_{z\sim \cN(0,\sigma_0^2)} [\phi^2( \beta z) ] \leq c_{\phi,\sigma_0} +  \sigma_0^2 |\beta^2-1| ~.
% \\
% \Rightarrow \quad c_{\phi,\sigma_0}( 1 - c_0 |\beta^2 -1 |) & \leq \E_{z \sim \cN(0,\sigma_0^2)}[ \phi^2(\beta z)] \leq c_{\phi,\sigma_0}(1 + c_0|\beta^2 -1 |)~.
\end{align*}
This completes the proof. \qed 




\subsection{Proof of Lemma~\ref{lemm:lambdahermite}}
\label{ssec:lambdahermite}

We start with a specific consequence of the Schur product theorem~\cite[Lemma 6.5]{oymak2020hermite} applied to the $r$-th order Hadamard product of positive semidefinite matrices.
\begin{prop}
Let $B = AA^\top$ where $A \in \R^{n \times p}$. Let $b_0 = \min_{i \in [n]} B_{ii}$. Then, for any $r \geq 1$, $\lambda_{\min}( (A A^\top)^{\odot r}) \geq b_0^{r-1} \lambda_{\min}(AA^\top)$.
\label{lemm:schurhad}
\end{prop}
\proof Recall that for PSD matrices $P, Q$, it holds that $\lambda_{\min}(P \odot Q) \geq \min_{i \in [n]} Q_{ii} \cdot \lambda_{\min}(P)$. Further, note that $B_{ii} \geq 0$ and $b_0 \geq 0$ by construction. Then,
\begin{align*}
\lambda_{\min}( (A A^\top)^{\odot r}) & = \lambda_{\min}(B^{\odot (r-1)} \odot B) \geq \min_{i \in [n]} (B^{\odot (r-1)})_{ii} \cdot \lambda_{\min}(B)  \geq  b_0^{r-1} \lambda_{\min}(AA^\top)~.
\end{align*}
That completes the proof. \qed 


Now, we are ready to prove Lemma~\ref{lemm:lambdahermite}.
%
%\lambdahermite*

\proof[Proof of Lemma~\ref{lemm:lambdahermite}] For convenience, let
\begin{align*}
\lambda_{l+1} := \lambda_{\min}\left(  \E_{g \sim 
\cN(\bm{0}_{m_{l}},\sigma^2 \I_{m_{l}})
}\left[ \phi\left( \frac{1}{\sqrt{m_{l}}} A^{(l)} g \right) \phi\left( \frac{1}{\sqrt{m_{l}}} (A^{(l)} g)^\top\right) \right] \right)
\end{align*}
Let $U_l \in \R^{n \times m_l}$ have $i$th row $U_{l,i:} = \frac{\alpha^{(l)}(\x_i)}{\|\alpha^{(l)}(\x_i)\|_2}$, so that $U_l$ is a row normalized version of $A^{(l)}$. Let $C_l = \diag(c_{l,i})$ where $c_{l,i} = \frac{\|\alpha^{(l)}(\x_i)\|_2}{\sqrt{m_l}}$. Note that $\frac{1}{\sqrt{m_l}} A^{(l)} = C_l U_l$. Further, from Lemma~\ref{lemm:alphainit1}, $\min_{i,l} c_{l,i} \geq \sqrt{\frac{c_{\phi,\sigma_0}}{2}}$ and $\max_{i,l} c_{l,i} \leq \sqrt{\frac{3c_{\phi,\sigma_0}}{2}}$ with probability at least $1 - 2n\sum^L_{l=1}\frac{1}{m_l}$.
Let $M^{(l)}_r(\phi) = \diag\left( \mu_r^{[c_{l,i}^2 \sigma^2]}(\phi) \right)$, and let $(\mu_{r,0}^{(l)})^2 = \min_{i \in [n]} \left( \mu_r^{[c_{l,i}^2 \sigma^2]}(\phi) \right)^2$.
Then, for any integer $r > 0$, we have 
\begin{align*}
\lambda_{l+1} & = \lambda_{\min} \left( \E_{g \sim 
\cN(\bm{0}_{m_{l}},\sigma^2 \I_{m_{l}})
}\left[ \phi\left( C_l U_l g\right) \phi\left( C_l U_l g\right)^\top \right] \right) \\
& \overset{(a)}{\geq} \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \lambda_{\min} \left( (M_r^{(l)}(\phi) (U_l)^{\star r}) (M_r^{(l)}(\phi) (U_l)^{\star r})^\top \right)~\\ 
& \overset{(b)}{\geq} (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \lambda_{\min} \left( ((U_l)^{\star r}) ((U_l)^{\star r})^\top \right)~\\ 
& \geq (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \lambda_{\min} \left(( U_l U_l^\top)^{\odot r} \right)~\\ 
& \overset{(c)}{\geq} (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \lambda_{\min} \left( U_l U_l^\top \right) \\
& = (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \frac{1}{m_l}  \lambda_{\min} \left( C_l^{-1} A^{(l)} (A^{(l)})^\top C_l^{-1} \right) \\
& \geq (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \frac{2}{3 c_{\phi,\sigma_0}m_l} \lambda_{\min}  \left( A^{(l)} (A^{(l)})^\top \right) \\
& \overset{(d)}{\geq} (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \frac{1}{6 c_{\phi,\sigma_0}} \lambda_l~,
\end{align*}
where (a) follows from Lemma~\ref{lemm:hermseries},
\begin{align*}
\E_{g \sim \cN(\bm{0}_{m_l},\sigma^2 \I_{m_l})}\left[ \phi(C_l U_l g) \phi(C_l U_l g)^\top \right] &\succeq \sum_{r'=0}^{\infty}  \sigma^{6r'} (\min_i c_{l,i})^{6r'} (M_{r'}^{(l)}(\phi) (U_l)^{\star r'}) (M_{r'}^{(l)}(\phi) (U_l)^{\star r'})^\top\\
&\succeq 
\sigma^{6r} \left(\frac{c_{\phi,\sigma_0}}{2}\right)^{3r} (M_r^{(l)}(\phi) (U_l)^{\star r}) (M_r^{(l)}(\phi) (U_l)^{\star r})^\top
\end{align*}
for any $r>0$; (b) follows since for a diagonal matrix $M$ with $\mu_0^2 = \min_{i \in [n]} M^2_{ii}$ and a compatible matrix $U$,  
\begin{align*}
\inf_{v : \|v \|_2=1} v^\top (M U) (MU)^\top v & = \inf_{v : \|v \|_2=1} v^\top M (U U^\top) M^\top v \geq \inf_{v : \|v \|_2=1} v^\top M M^\top v  \inf_{w : \|w \|_2=1} w^\top U U^\top w \\
& \geq \mu_0^2 \lambda_{\min}(UU^\top)~;
\end{align*}
(c) follows from Proposition~\ref{lemm:schurhad}; and (d) follows from Lemma~\ref{lemm:highproblambda}.
% Let
% \begin{align*}
% \overline{\lambda}_{l} := \lambda_{\min}\left(  \E_{g \sim \sigma^2 \I_{m_{l}}}\left[ \phi\left( U_{l-1} g \right) \phi\left( U_{l-1} g)^\top\right) \right] \right)
% \end{align*}
% be the normalized version of $\lambda_l$. Then, from Lemma~\ref{lemm:highproblambda}, with probability at least $(1-\frac{\delta}{L})$, we have 
% \begin{align*}
% \lambda_{\min}( U_l U_l^\top ) & \geq \frac{ \overline{\lambda}_l}{4} \\
% & = \frac{1}{4} \lambda_{\min} \left( \E_{g \sim \sigma^2 \I_{m_l}}\left[ \phi\left( U_{l-1} g\right) \phi\left( U_{l-1} g\right)^\top \right] \right) \\
% & \overset{(a)}{\geq} \left( \frac{\mu_r^{[\sigma^2]}(\phi)}{2} \right)^2 \sigma^{6r} \lambda_{\min}(U_{l-1} U_{l-1}^\top)
% \end{align*}
% where (a) follows from Lemma~\ref{lemm:hermseries}. 

Proceeding recursively, using $\sigma^2 = \nu_0^2 = \frac{\sigma_0^2}{c_{\phi,\sigma_0}}$, we have
\begin{align*}
\lambda_{l+1} & \geq\frac{ (\mu_{r,0}^{(l)})^2}{6 c_{\phi,\sigma_0}} \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \lambda_l \\
& \geq \frac{(\mu_{r,0}^{(l)})^2}{6 c_{\phi,\sigma_0}} \left( \frac{\sigma_0^2}{2} \right)^{3r} \lambda_l \\
& \geq  \left( \frac{(\mu_{r,0}^{(l)})^2}{6 c_{\phi,\sigma_0}} \right)^2 \left( \frac{\sigma_0^2}{2} \right)^{6r} \lambda_{l-1} \\
& \geq \left( \frac{(\mu_{r,0}^{(l)})^2}{6 c_{\phi,\sigma_0}} \right)^l \left( \frac{\sigma_0^2}{2} \right)^{3rl} \lambda_{1}~.
% & \geq (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \lambda_{\min} \left( U_l U_l^\top \right) \\
% & \geq (\mu_{r,0}^{(l)})^2 \sigma^{6r} \left( \frac{c_{\phi,\sigma_0}}{2} \right)^{3r} \left( \frac{\mu_r^{[\sigma^2]}(\phi)}{2} \right)^2 \sigma^{6r}  \lambda_{\min} \left( U_{l-1} U_{l-1}^\top \right) \\
% & \geq (\mu_{r,0}^{(l)})^2 \left( \frac{\sigma_0^2}{2} \right)^{3r} \sigma^{6rl} \left( \frac{\mu_r^{[\sigma^2]}(\phi)}{2} \right)^{2l} \lambda_{\min}(XX^\top) \\
% & \geq (\mu_{r,0}^{(l)})^2 \left( \frac{\sigma_0^2}{2} \right)^{3r} \left( \frac{\sigma_0^2}{c_{\phi,\sigma_0}} \right)^{3rl} \left( \frac{\mu_r^{[\sigma^2]}(\phi)}{2} \right)^{2l} \lambda_{\min}(XX^\top)~.
\end{align*}
That completes the proof. \qed 


% \begin{align}
% \lambda_{l+1} & \geq \left( \mu_r^{[\sigma^2]}(\phi) \right)^2 \frac{\sigma_0^{6r}c_{\phi,\sigma_0}^{r} m_l^r}{c_{\phi,\sigma_0}^{3r} m_{l}^r} c_{\phi,\sigma_0}^{r-1} m_l^{r-1} \frac{m_l }{4} \lambda_{l} \nonumber \\
% & \geq \frac{\left( \mu_r^{[\sigma^2]}(\phi) \right)^2}{4 c_{\phi,\sigma_0}} \left( \frac{m_l}{\nu_0} \right)^r \lambda_l \nonumber \\
% & \geq \left(\frac{\left( \mu_r^{[\sigma^2]}(\phi) \right)^2}{4 c_{\phi,\sigma_0}} \right)^l \left( \frac{m}{\nu_0} \right)^{rl} \lambda_1~.
% \label{eq:rec1}
% \end{align}
% From Lemma~\ref{lemm:highproblambda}, with probability at least $1- \frac{2L}{m^2}$, using $m_0 =d$ we have 
% \begin{align}
% \lambda_1 & \geq \left( \mu_r^{[\sigma^2]}(\phi) \right)^2 \frac{\sigma^{6r}c_{0}^{2r}}{m_{0}^r} \lambda_{\min}( {A^{(0)}}^{\star r} ({A^{(0)}}^{\star r})^\top  ) \nonumber \\
% & \geq \left( \mu_r^{[\sigma^2]}(\phi) \right)^2 \frac{\sigma_{0}^{6r}c_{\phi,\sigma_0}^{r} m_0^r}{c_{\phi,\sigma_0}^{3r} m_{0}^r} c_{\phi,\sigma_0}^{r-1} m_0^{r-1}\lambda_{\min}( {A^{(0)}} ({A^{(0)}})^\top  ) \nonumber \\
% & = \frac{ \left( \mu_r^{[\sigma^2]}(\phi) \right)^2}{d c_{\phi,\sigma_0}} \left( \frac{d}{\nu_0} \right)^r \lambda_{\min}( X X^\top  )~.
% \label{eq:rec2}
% \end{align}
% Combining \eqref{eq:rec1} and \eqref{eq:rec2}, we have
% \begin{align*}
% \lambda_{l+1} & \geq \left(\frac{\left( \mu_r^{[\sigma^2]}(\phi) \right)^2}{4 c_{\phi,\sigma_0}} \right)^{l+1} \left( \frac{m}{\nu_0} \right)^{rl} \frac{1}{d} \left( \frac{d}{\nu_0} \right)^r \lambda_{\min}( X X^\top  )~.
% \end{align*}
% That completes the proof. \qed

%\abcomment{the lower bound $c_l$ can possibly be obtained from the SubG property of $\alpha_i^{(l)}$ ... variant of Lemma~\ref{lemm:gammasubg}}





\subsection{Background on Hermite Polynomials and Hermite Series Expansions}
\label{ssec:ghermite}
Let $L^2(\R,w(\x))$ denote the set of all functions $f : \R \to \R$ such that 
\begin{align}
    \int_{-\infty}^{\infty} f^2(\x) w(\x) dx < \infty~.
\end{align}
{\bf Probabilist's and Physicist's Hermite Polynomials.} The normalized \emph{probabilist's Hermite  polynomials} are given by:
\begin{align}
H_r(\x) = \frac{(-1)^r}{\sqrt{r!}} e^{\frac{x^2}{2}} \frac{d^r}{dx^r} e^{-\frac{x^2}{2}}~.
\label{eq:probherm}
\end{align}
The polynomials are orthogonal with respect to the weight function $w(\x) = e^{-\frac{x^2}{2}}$ in the sense that
\begin{align}
\int_{-\infty}^{\infty} H_r(\x) H_{r'}(\x) w(\x) dx & = \sqrt{2\pi} \delta_{r r'}~,
\end{align}
where $\delta_{r r'} = 1$, if $r=r'$, and 0 otherwise, i.e., the Kronecker delta. The corresponding unnormalized probabilist's Hermite polynomials are given by $\bar{H}_r(\x) = \sqrt{r!} H_r(\x)$. 

Similarly, the normalized \emph{physicist's Hermite polynomials} are given by:
\begin{align}
\tilde{H}_r(\x) = \frac{(-1)^r}{\sqrt{r!}} e^{x^2} \frac{d^r}{dx^r} e^{-x^2}~.
\label{eq:phyherm}
\end{align}
The polynomials are orthogonal with respect to the weight function $\tilde{w}(\x) = e^{-x^2}$ in the sense that
\begin{align}
\int_{-\infty}^{\infty} \tilde{H}_r(\x) \tilde{H}_{r'}(\x) \tilde{w}(\x) dx & = \sqrt{\pi} 2^r \delta_{r r'}~,
\end{align}
where $\delta_{r r'}$ is the Kronecker delta. The corresponding unnormalized physicist's Hermite polynomials are given by $\bar{\tilde{H}}_r(\x) = \sqrt{r!} \tilde{H}_r(\x)$. 

{\bf Generalized Hermite Polynomials.} Our analysis of potentially inhomogeneous activation functions will need the substantially more flexible notion of normalized \emph{generalized Hermite polynomials} $H_r^{[q]}(\x)$, for a given $q>0$, which are orthogonal with respect to $w^{[q]}(\x) = \frac{1}{\sqrt{2\pi q}} e^{-x^2/2q}$, and are given by
\begin{align}
    H_r^{[q]}(\x) =  \frac{(-1)^r}{\sqrt{r!}} e^{\frac{x^2}{2q}} \frac{d^r}{dx^r} e^{-\frac{x^2}{2q}}~.
\label{eq:genherm}
\end{align}
It is easy to see that $H_r^{[1]}(\x) = H_r(\x)$, the probabilist's Hermite polynomial in \eqref{eq:probherm}, and $H_r^{[\frac{1}{2}]} = \tilde{H}_r(\x)$, the physicist's Hermite polynomial in \eqref{eq:phyherm}. Furthermore, the generalized Hermite polynomials can be written as scaled versions of probabilist's Hermite polynomials as
\begin{align}
    H_r^{[q]}(\x) = q^{\frac{r}{2}} H_r\left( \frac{x}{\sqrt{q}} \right)~.
\label{eq:hermtrans}
\end{align}

{\bf Hermite Series.} The polynomials $\{ H_r(\x) \}_{r=0}^{\infty}$ form an orthonormal basis for $L^2\left(\R,\frac{e^{-x^2/2}}{\sqrt{2\pi}}\right)$ which is a Hilbert space with inner product
\begin{align}
\langle \phi_1, \phi_2 \rangle = \int_{-\infty}^{\infty} \phi_1(\x) \phi_2(\x) \frac{e^{-x^2/2}}{\sqrt{2\pi}} dx ~.
\end{align}
Thus, any function in $L^2\left(\R,\frac{e^{-x^2/2}}{\sqrt{2\pi}}\right)$ can be represented as a Hermite series expansion
\begin{align}
    \phi(\x) = \sum_{r=0}^{\infty} \mu_r(\phi) H_r(\x) ~,
\end{align}
where $\mu_r(\phi)$ is the $r$-th Hermite coefficient given by
\begin{align}
\mu_r(\phi) = \int_{-\infty}^{\infty} \phi(z) H_r(z) \frac{e^{-z^2/2}}{\sqrt{2\pi}} dz ~.
\end{align}
Note that $\phi \in L^2\left(\R,\frac{e^{-x^2/2}}{\sqrt{2\pi}}\right)$ if and only if $\| \phi \|^2 = \langle \phi,\phi \rangle = \sum_{r=0}^{\infty} \mu_r^2(\phi) < \infty$.

For our analysis with inhomogeneous activation functions, we will need to use Hermite series expansions with generalized Hermite polynomials. The polynomials $\{ H_r^{[q]}(\x) \}_{r=0}^{\infty}$ form an orthonormal basis for $L^2\left(\R,\frac{e^{-x^2/2q}}{\sqrt{2\pi q}}\right)$ which is a Hilbert space with inner product
\begin{align}
\langle \phi_1, \phi_2 \rangle = \int_{-\infty}^{\infty} \phi_1(\x) \phi_2(\x) \frac{e^{-x^2/2q}}{\sqrt{2\pi q}} dx ~.
\end{align}
Any function in $L^2\left(\R,\frac{e^{-x^2/2q}}{\sqrt{2\pi q}}\right)$ can be represented as a Hermite series expansion:
\begin{align}
    \phi(\x) = \sum_{r=0}^{\infty} \mu_r^{[q]}(\phi) H_r^{[q]}(\x) ~,
\end{align}
where $\mu_r^{[q]}(\phi)$ is the $r$-th Hermite coefficient given by
\begin{align}
\mu_r^{[q]}(\phi) = \int_{-\infty}^{\infty} \phi(z) H_r^{[q]}(z) \frac{e^{-z^2/2q}}{\sqrt{2\pi q}} dz ~.
\end{align}
Note that $\phi \in L^2\left(\R,\frac{e^{-x^2/2q}}{\sqrt{2\pi q}}\right)$ if and only if $\| \phi \|^2 = \langle \phi,\phi \rangle = \sum_{r=0}^{\infty} (\mu_r^{[q]}(\phi))^2 < \infty$.


\subsection{Expectation of Product of Hermite Polynomials}
Our NTK analysis for general activation functions, including inhomogeneous functions, depends on the following key result on expectation of product of Hermite polynomials. The equivalent prior analysis in~\citep{oymak2020hermite,ng2020hermite1,ng2021hermite2} only works for homogeneous functions, and uses basic Hermite polynomials. Our general analysis instead uses generalized Hermite polynomials.

\begin{lemm}
Let $\u_x,\u_y \in \R^d$ be unit vectors, and let $c_x, c_y \in \R_{++}$ be positive constants. Then, for $r,r'=0,1,\ldots$ and $\delta_{rr'}$ denoting the Kronecker delta, we have
\begin{align}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[ H_r^{[c_x^2 \sigma^2]}(c_x \langle \tilde{\g}, \u_x\rangle) H_{r'}^{[c_y^2 \sigma^2]}(c_y \langle \tilde{\g}, \u_y \rangle) \right] = \sigma^{6r}  c_x^{3r} c_y^{3r} \langle \u_x, \u_y \rangle^r \delta_{rr'}~.
\end{align}
\label{lemm:hermprod}
\end{lemm}
\proof Let $\g \sim \cN(\bm{0},\I_d)$ so that $\sigma\g$ is identically distributed as $\tilde{\g} \sim \cN(\bm{0},\sigma^2 \I_d)$, and consider any $s,t\in\R$. Then,
\begin{align*}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[ \exp(s c_x \langle \tilde{\g},\u_x\rangle + t c_y \langle \tilde{\g},\u_y\rangle) \right] 
& = \E_{\g \sim \cN(\bm{0}_d,\I_d)}\left[ \exp(s\sigma c_x \langle \g,\u_x\rangle + t \sigma c_y \langle \g,\u_y\rangle) \right]\\
& = \prod_{j=1}^d \E_{\g \sim \cN(\bm{0}_d,\I_d)}\left[ \exp(s \sigma c_x g_j u_{x,j} + t \sigma c_y g_j u_{y,j}) \right] \\
& = \prod_{j=1}^d \exp\left( \frac{\sigma^2 (s c_x u_{x,j} + t c_y u_{y,j})^2}{2} \right) \\
& = \exp\left( \frac{s^2 \sigma^2 c_x^2}{2} \| \u_x\|_2^2 + \frac{t^2 \sigma^2 c_y^2}{2} \| \u_y \|_2^2 + st \sigma^2 c_x c_y \langle \u_x, \u_y \rangle \right) ~,
\end{align*}
so that, since $\|\u_x\|_2^2 = \|\u_y \|_2^2 = 1$, we have
\begin{align}
\E_{\g \sim \cN(\bm{0}_d,\I_d)}\left[ \exp\left(s\sigma c_x \langle \g,\u_x\rangle - \frac{s^2\sigma^2 c_x^2}{2}  \right) \exp \left( t \sigma c_y \langle \g,\u_y\rangle - \frac{t^2 \sigma^2 c_y^2}{2}  \right) \right] =  \exp\left( st \sigma^2 c_x c_y \langle \u_x, \u_y \rangle \right) ~.
\label{eq:hermprod1}
\end{align}
We consider the functions $f,h : \R \to \R$ (for a fixed realization of $\g$) defined as
\begin{align}
    f(s) = \exp \left( s\sigma c_x \langle \g,\u_x \rangle - \frac{s^2\sigma^2 c_x^2}{2} \right)~, \qquad h(t) = \exp \left( t\sigma c_y \langle \g,\u_y\rangle - \frac{t^2\sigma^2 c_y^2}{2} \right)~.
\end{align}
Consider the Taylor expansion of $f(s)$ around $s=0$ given by
\begin{align}
f(s) = \sum_{r=0}^{\infty} f_r(0) \frac{s^r}{\sqrt{r!}}~, \qquad \text{where} \qquad f_r(0) = \frac{1}{\sqrt{r!}} \left. \frac{d^r}{ds^r} e^{s \sigma c_x \langle \g, \u_x \rangle - \frac{s^2\sigma^2 c_x^2}{2}  } \right|_{s=0}~.
\end{align}
With $z = \langle \g, \u_x \rangle$ and $\tilde{z} = \frac{z}{\sigma c_x}$, we have 
\begin{align*}
f_r(0) & = \frac{1}{\sqrt{r!}} \left. \frac{d^r}{ds^r} e^{s \sigma c_x z - \frac{s^2\sigma^2 c_x^2}{2}  } \right|_{s=0}  \\
& = \frac{1}{\sqrt{r!}} \left. e^{\frac{z^2}{2}} \frac{d^r}{d s^r} e^{-\frac{1}{2} (z - s \sigma c_x)^2} \right|_{s=0} \\
& = \frac{1}{\sqrt{r!}} \left. e^{\frac{z^2}{2}} \frac{d^r}{d s^r} e^{-\frac{\sigma^2 c_x^2}{2} \left( \frac{z}{\sigma c_x} - s \right)^2} \right|_{s=0} \\
& = \frac{1}{\sqrt{r!}} \left. e^{\frac{\sigma^2 c_x^2 \tilde{z}^2}{2}} \frac{d^r}{d s^r} e^{-\frac{\sigma^2 c_x^2}{2} ( \tilde{z} - s )^2} \right|_{s=0} \\
& \overset{(a)}{=} \frac{(-1)^r}{\sqrt{r!}} \left. e^{\frac{\sigma^2 c_x^2 \tilde{z}^2}{2}} \frac{d^r}{d {\tilde{z}}^r} e^{-\frac{\sigma^2 c_x^2}{2} ( \tilde{z} - s )^2} \right|_{s=0} \\
& = \frac{(-1)^r}{\sqrt{r!}} e^{\frac{\sigma^2 c_x^2 \tilde{z}^2}{2}} \frac{d^r}{d {\tilde{z}}^r} e^{-\frac{\sigma^2 c_x^2 \tilde{z}^2}{2}}  \\
& \overset{(b)}{=} H^{\left[\frac{1}{\sigma^2 c_x^2}\right]}_r(\tilde{z})~,
\end{align*}
where (a) follows by the transport or advection equation $\frac{d}{ds} \psi(\tilde{z} - s) = (-1) \frac{d}{d \tilde{z}} \psi(\tilde{z}-s)$ and the equality of mixed partial derivatives for any sufficiently smooth function $\psi$, and (b) follows by definition of generalized Hermite polynomials in \eqref{eq:genherm}. Now, note that
\begin{align*}
H^{\left[\frac{1}{\sigma^2 c_x^2}\right]}_r(\tilde{z}) & \overset{(a)}{=} \frac{1}{\sigma^r c_x^r} H_r(\sigma c_x \tilde{z}) \\
& = \frac{1}{\sigma^r c_x^r} H_r(z) \\
& = \frac{1}{\sigma^r c_x^r} H_r\left( \langle \g, \u_x\rangle \right) \\
& = \frac{1}{\sigma^r c_x^r} H_r\left( \frac{c_x}{\sigma c_x} \langle \tilde{\g}, \u_x \rangle \right) \\
& \overset{(b)}{=} \frac{1}{\sigma^{2r} c_x^{2r}} H^{[c_x^2\sigma^2]}_r\left(c_x \langle \tilde{\g}, \u_x\rangle \right)~,
\end{align*}
where (a) and (b) follow from~\eqref{eq:hermtrans}. Thus,
\begin{align}
f(s) = \sum_{r=0}^{\infty} f_r(0) \frac{s^r}{\sqrt{r!}}~, \qquad \text{where} \qquad f_r(0) 
%= H^{\left[\frac{1}{\sigma^2 \|\x\|^2}\right]}_r\left( \frac{1}{\sigma \|\x\|_2} \langle \g,\x \rangle \right) 
= \frac{1}{\sigma^{2r} c_x^{2r}} H^{[c_x^2 \sigma^2]}_r\left( c_x \langle \tilde{\g}, \u_x\rangle \right)~.
\end{align}

Similarly, considering the Taylor expansion of $h(t)$ around $t=0$, we have 
\begin{align}
h(t) = \sum_{r'=0}^{\infty} h_{r'}(0) \frac{t^{r'}}{\sqrt{r'!}}~, \qquad \text{where} \qquad h_{r'}(0) 
%= H^{\left[\frac{1}{c_y^2 \sigma^2}\right]}_r \left( \frac{1}{\sigma \|\y\|_2} \langle \g,\y \rangle \right)
= \frac{1}{\sigma^{2r'} c_y^{2r'}} H^{[c_y^2\sigma^2]}_{r'} \left( c_y \langle \tilde{\g}, \u_y \rangle \right)~.
\end{align}
Then, from \eqref{eq:hermprod1}, we have 
\begin{align*}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}&\left[ \left( \sum_{r=0}^{\infty} \frac{1}{\sigma^{2r} c_x^{2r}} H^{[c_x^2 \sigma^2]}_r( c_x \langle \tilde{\g}, \u_x\rangle) \frac{s^r}{\sqrt{r!}} \right)  \left( \sum_{r'=0}^{\infty} \frac{1}{\sigma^{2r'} c_y^{2r'}} H^{[c_y^2 \sigma^2]}_{r'}\left( c_y \langle \tilde{\g}, \u_y \rangle \right) \frac{t^{r'}}{\sqrt{r'!}} \right) \right]\\
&=  \sum_{r=0}^{\infty} \frac{\sigma^{2r} c_x^r c_y^r \langle \u_x, \u_y \rangle^r}{r!} s^r t^r~.
\end{align*}
Since the equality holds for arbitrary $s,t \in \R$, equating coefficients of  $s^r t^{r'}$ on both sides, we have 
\begin{align}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[ H_r^{[c_x^2 \sigma^2]}(c_x \langle \tilde{\g}, \u_x\rangle) H_{r'}^{[c_y^2 \sigma^2]}(c_y \langle \tilde{\g}, \u_y \rangle) \right] = \sigma^{6r} c_x^{3r} c_y^{3r} \langle \u_x, \u_y \rangle^r \delta_{rr'}~,
\end{align}
where $\delta_{rr'}$ is the Kronecker delta. That completes the proof. \qed 

The following result is an important consequence of Lemma~\ref{lemm:hermprod}.

\begin{lemm}
Let $\phi$ be an inhomogeneous activation function. Let $\u_x, \u_y\in\R^d$ be unit vectors and $c_x, c_y$ be positive constants. Then,  we have
\begin{align}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[ \phi(c_x\langle \tilde{\g}, \u_x \rangle) \phi(c_y\langle \tilde{\g}, \u_y \rangle) \right] = \sum_{r=0}^{\infty} \mu_r^{[c_x^2 \sigma^2]}(\phi) \mu_r^{[c_y^2 \sigma^2]}(\phi) \sigma^{6r} c_x^{3r} c_y^{3r}  \langle \u_x, \u_y \rangle^r ~. 
\end{align}
Further, let $U = [\u_1,\cdots,\u_n]^\top \in \R^{n \times m}$ be such that $\| \u_i \|_2 =1, i \in [n]$. Let $C =\diag(c_i)\in\R^{n\times n}, c_i > 0$, and $c_0 = \min_{i \in [n]} c_i > 0$. Let $M_r(\phi) =  \diag\left( \mu_r^{[c_i^2 \sigma^2]}(\phi)\right)$. Then,
\begin{align}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[ \phi(C U \tilde{\g}) \phi(C U \tilde{\g})^\top \right] \succeq \sum_{r=0}^{\infty}  \sigma^{6r} c_0^{6r} (M_r(\phi) U^{\star r}) (M_r(\phi) U^{\star r})^\top~,
\end{align}
where $U^{\star r} \in \R^{n \times m^r}$ is such that the $i$th row $U^{\star r}_{i,:} = (\u_i^{\odot r})^\top$\pccomment{I used to say 
 "$\otimes$" instead of "$\odot$", but I think this is the right notation to keep consistency.}, i.e., $r$ times Kronecker product of $\u_i$ with itself.
\label{lemm:hermseries}
\end{lemm}
\proof Consider the generalized Hermite series expansion of $\phi(\cdot)$ in terms of the generalized Hermite polynomials $H_r^{[c_x^2 \sigma^2]}$:
\begin{align}
\phi(c_x\langle \tilde{\g}, \u_x \rangle) = \sum_{r=0}^{\infty} \mu_r^{[c_x^2 \sigma^2]}(\phi) H_r^{[c_x^2 \sigma^2]}(c_x \langle \tilde{\g}, \u_x \rangle)~.
\end{align}
Then, we have 
\begin{align*}
\E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}&\left[ \phi(c_x\langle \tilde{\g}, \u_x \rangle) \phi(c_y\langle \tilde{\g}, \u_y \rangle) \right] \\
& = \E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[ \left( \sum_{r=0}^{\infty} \mu_r^{[c_x^2 \sigma^2]}(\phi) H_r^{[c_x^2 \sigma^2]}(c_x\langle \tilde{\g}, \u_x \rangle) \right) \left( \sum_{r'=0}^{\infty} \mu_{r'}^{[c_y^2 \sigma^2]}(\phi) H_{r'}^{[c_y^2 \sigma^2]}(c_y\langle \tilde{\g}, \u_y \rangle) \right) \right] ~\\
& =  \sum_{r,r'=0}^{\infty} \mu_r^{[c_x^2 \sigma^2]}(\phi) \mu_{r'}^{[c_y^2 \sigma^2]}(\phi) \E_{\tilde{\g} \sim \cN(\bm{0}_d,\sigma^2 \I_d)}\left[H_r^{[c_x^2 \sigma^2]}(c_x \langle \tilde{\g}, \u_x \rangle) H_{r'}^{[c_y^2 \sigma^2]}(c_y \langle \tilde{\g}, \u_y \rangle) \right] \\
& \overset{(a)}{=} \sum_{r=0}^{\infty} \mu_r^{[c_x^2 \sigma^2]}(\phi) \mu_r^{[c_y^2 \sigma^2]}(\phi)  \sigma^{6r} c_x^{3r} c_y^{3r} \langle \u_x, \u_y \rangle^r~.
\end{align*}
%\pccomment{So now the exponent of $c$ is $4r$ instead of $2r$, please double check.}\abcomment{the scaling is $c^{2r}$, similar to $a^{2r}$, not $c^{4r}$.} 
where (a) follows from Lemma~\ref{lemm:hermprod}. 

The matrix case result follows by noting that for any $i, j \in [n]$, $c_i^{3r} c_j^{3r} \geq c_0^{6r}$, $\langle \u_i , \u_j \rangle^r = \langle \u_i^{\otimes r}, \u_j^{\otimes r} \rangle$, and $\mu_r^{[c_i^2 \sigma^2]}(\phi)$ form the diagonal elements of $M_r(\phi)$. That completes the proof. \qed 


\subsection{Additional Remarks on $\lambda_1$}
\label{ssec:lambda1}
The main result in Theorem~\ref{theo:ntk0} establishes $\lambda_{\min}( K_{\ntk}(\cdot ;\theta_0) ) \geq c_0 \lambda_1$ where 
\begin{align*}
\lambda_1 = \lambda_{\min}\left(\E_{g \sim 
\cN(\bm{0}_{d},\sigma^2 \I_{d})}\left[ \phi\left(\frac{1}{\sqrt{d}} X g\right) \phi\left(\frac{1}{\sqrt{d}} X g\right)^\top\right]\right)~,
\end{align*}
where $\sigma^2 = \nu_0^2 = \frac{\sigma_0^2}{c_{\phi,\sigma_0}}$.
We made some high level informal remarks on why and when $\lambda_1 > 0$ in Remark~\ref{rem:lambda1}. Such results have been studied in the recent literature~\citep{SD-JL-HL-LW-XZ:19,DZ-YC-DZ-QG:20,ZAZ-YL-ZS:19,oymak2020hermite,ng2021hermite2}. %YLA-YL:18
We provide additional details on the topic.

Related to assumptions in \cite{SD-JL-HL-LW-XZ:19}, the simplest analysis comes from assuming $\lambda_{\min}(XX^\top) = c_{\phi,\sigma_0} d \lambda_0 > 0$ for some positive constant $\lambda_0$,  where the scaling is simply because $\|\x_i\|_2^2 = c_{\phi,\sigma_0} d$. Let $\bar{X}:=\frac{1}{\sqrt{d c_{\phi,\sigma_0}}}X$, so that the rows of $\bar{X}$ satisfy $\|\bar{\x}_i\|_2 = 1$ and $\lambda_{\min}(\bar{X}\bar{X}^\top) = \lambda_0 > 0$.
%Let $U_l \in \R^{n \times m_l}$ have rows $\u_{l,i} = \frac{\alpha^{(l)}(\x_i)}{\|\alpha^{(l)}(\x_i)\|_2}$, so that $U_l$ is a row normalized version of $A^{(l)}$. 
Let $C_0 = \diag(c_{0,i})$ where $c_{0,i} = \sqrt{c_{\phi,\sigma_0}}$. Note that $\frac{1}{\sqrt{d}} X = C_0 \bar{X}$. 
%Further, from Lemma~\ref{lemm:alphainit1}, $\min_{i,l} c_{l,i} \geq \sqrt{\frac{c_{\phi,\sigma_0}}{2}}$ and $\max_{i,l} c_{l,i} \leq \sqrt{\frac{3c_{\phi,\sigma_0}}{2}}$ with probability at least $1 - \frac{2L}{m^2}$.
Let $M^{(0)}_r(\phi) = \mu_r^{[\sigma_0^2]}(\phi) \diag( 1)$ \pccomment{Is $\diag( 1)$ the same as the identity matrix? If so, we can perhaps use a notation for the identity matrix such as $I$, though we might need to introduce such notation, maybe just here; I don't think we have introduced the notation $\diag(\cdot)$ either.)}, and let $(\mu_{r,0}^{(0)})^2 =  \left( \mu_r^{[\sigma_0^2]}(\phi) \right)^2$. From Lemma~\ref{lemm:hermseries}, for any integer $r > 0$, we have 
\begin{align*}
\lambda_{1} & = \lambda_{\min} \left( \E_{g \sim \cN(\bm{0}_{d},\sigma^2 \I_{d})}\left[ \phi\left( \frac{1}{\sqrt{d}} X g\right) \phi\left( \frac{1}{\sqrt{d}} X g\right)^\top \right] \right) \\
& \geq \sigma^{6r} \left( c_{\phi,\sigma_0} \right)^{3r} \lambda_{\min} \left( (M_r^{(0)}(\phi) (\bar{X})^{\star r}) (M_r^{(0)}(\phi) (\bar{X})^{\star r})^\top \right)~\\ 
& \geq (\mu_{r,0}^{(0)})^2 \sigma^{6r} \left( c_{\phi,\sigma_0} \right)^{3r} \lambda_{\min} \left( ((\bar{X})^{\star r}) ((\bar{X})^{\star r})^\top \right)~\\ 
& = (\mu_{r,0}^{(0)})^2 \sigma^{6r} \left( c_{\phi,\sigma_0} \right)^{3r} \lambda_{\min} \left( (\bar{X} \bar{X}^\top)^{\odot r} \right)~\\ 
& \geq (\mu_{r,0}^{(0)})^2 \sigma^{6r} \left( c_{\phi,\sigma_0} \right)^{3r} \lambda_{\min} \left( \bar{X} \bar{X}^\top \right) \\
& \geq (\mu_{r,0}^{(0)})^2 \sigma^{6r} \left( c_{\phi,\sigma_0} \right)^{3r} \lambda_0~,
\end{align*}
which gives the desired result.


$\lambda_1$ can also be lower bounded by making assumptions on the activation function $\phi$, e.g., the separability and/or the distribution of $x_i$~\citep{YL-YL:18,ZAZ-YL-ZS:19,oymak2020hermite,ng2021hermite2}. For any unit vector $v$, 
\begin{align*}
\lambda_1(v) & := v^\top \E_{g \sim \cN\left(\bm{0}_{d},\frac{\sigma_0^2}{c_{\phi,\sigma_0}}\I_d\right)}[\phi(\sqrt{c_{\phi,\sigma_0}} \bar{X}g) \phi(\sqrt{c_{\phi,\sigma_0}}  \bar{X}g)^\top] v \\
& = v^\top \E_{g \sim \cN(\bm{0}_d,\sigma_0^2\I_d)}[\phi(\bar{X}g) \phi(\bar{X}g)^\top] v \\
& = \E_{g \sim \cN(\bm{0}_d,\sigma_0^2\I_d)}[ \| \phi(\bar{X}g)^\top v \|_2^2]~.
\end{align*}
Note that with $\tilde{g} = \bar{X}g~, $
it suffices to show $\E_{\tilde{g}}[\langle \phi(\tilde{g}), v \rangle^2] = \E_{Z = \langle \phi(\tilde{g}), v \rangle}[Z^2] \geq \chi_0 > 0$, for some uniform positive constant $\chi_0$ since $\lambda_1 = \inf_v \lambda_1(v)$. For any $c > 0$, by Markov's inequality, we have
\begin{align*}
\P( \| \phi(\bar{X}g)^\top v \|_2 \geq c ) & = \P ( \| \phi(\bar{X}g)^\top v \|_2^2 \geq c^2) \leq \frac{ \E[\| \phi(\bar{X}g)^\top v \|_2^2]}{c^2} \\
\Rightarrow \qquad \E[\| \phi(\bar{X}g)^\top v \|_2^2] & \geq c^2 \P( \| \phi(\bar{X}g)^\top v \|_2 \geq c )~.
\end{align*}
Thus, the problem boils down to lower bounding $\P( \| \phi(\bar{X}g)^\top v \|_2 \geq c )$ for a suitable choice of $c$, or, more conveniently $\P( \| \phi(\bar{X}g)^\top v \|_2 \geq c \|v\|_{\infty} )$ and using $\| v \|_{\infty} \geq \frac{1}{\sqrt{n}}$. Proceeding further rigorously needs specific assumptions on the activation $\phi$, as has been done in recent related work~\citep{oymak2020hermite,ZAZ-YL-ZS:19,YL-YL:18}. 