
Consider a training set $\cD = \{(\x_i,y_i)\}_{i=1}^n, \x_i \in \cX \subseteq \R^d, y_i \in \cY \subseteq \R$. We will denote by $X\in\R^{n\times d}$ the matrix whose $i$th row is $\x_i^\top$. 
%For a suitable loss function $\ell$, the goal is to minimize
%the empirical loss:   $\cL(\theta)  = \frac{1}{n} \sum_{i=1}^n \ell(  y_i, \hat{y}_i) = \frac{1}{n} \sum_{i=1}^n \ell (y_i,f(\theta;\x_i))$, where the prediction $\hat{y}_i:= f(\theta;\x_i)$ is from a deep model, and the parameter vector $\theta\in\R^p$. 
In our setting $f$ is a feed-forward multi-layer (fully-connected) neural network with depth $L$\footnote{The network has $L$ hidden layers, and so has depth $L+1$ if the output layer is counted. However, since $L$ appears more frequently than $L+1$ in our results, we refer to $L$ as the \emph{depth} for convenience.}
 and widths $m_l, l \in [L] := \{1,\ldots,L\}$ given by
\begin{equation}
\begin{split}
    &\alpha^{(0)}(\x)  = \x~, \\
     &\alpha^{(l)}(\x) = \phi \left( \frac{1}{\sqrt{m_{l-1}}} W^{(l)} \alpha^{(l-1)}(\x) \right)~,~~~l=1,\ldots,L~,\\
    &f(\theta;\x)  = \alpha^{(L+1)}(\x) = \frac{1}{\sqrt{m_L}} \v^\top \alpha^{(L)}(\x)~,
    \end{split}
\label{eq:DNN}    
\end{equation}
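For concreteness, in the simplest case $L=1$ (a single hidden layer), the model in~\eqref{eq:DNN} reduces to
\begin{equation*}
    f(\theta;\x) = \frac{1}{\sqrt{m_1}}\, \v^\top \phi\left( \frac{1}{\sqrt{m_0}}\, W^{(1)} \x \right)~, \qquad m_0 = d~,
\end{equation*}
with $\phi$ applied entrywise; the $1/\sqrt{m_{l-1}}$ factors keep the scale of each layer's output controlled as the widths grow.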
where $W^{(l)} \in \R^{m_l \times m_{l-1}}, l \in [L]$, are the layer-wise weight matrices, $\v\in\R^{m_{L}}$ is the last-layer weight vector, $\phi(\cdot)$ is the smooth (pointwise) activation function, and the total set of parameters is represented by the weight vector
\begin{align}
\theta &:= (\vec(W^{(1)})^\top,\ldots,\vec(W^{(L)})^\top, \v^\top )^\top \nonumber \\
&~~~~~~\in \R^{\sum_{k=1}^L m_k m_{k-1}+m_{L}}~,
\label{eq:theta_def}
\end{align}
with $m_0=d$. For simplicity, we consider deep models with a single (scalar) output, i.e., $f(\theta;\x) \in \R$ as in~\citep{SD-JL-HL-LW-XZ:19}, but our results can be extended to multi-dimensional outputs as in~\citep{DZ-QG:19}, using $\V \in \R^{k \times m_L}$ at the last layer for $k$ outputs. We use the notation $\alpha^{(l)}(\x)=\phi(\tilde{\alpha}^{(l)}(\x))$, with $\alpha^{(l)}$ being the output and $\tilde{\alpha}^{(l)}$ the pre-activation at layer $l$.
We also let $A^{(l)} \in \R^{n \times m_{l}}$ be the matrix whose $i$th row is $A^{(l)}_{i,:} := \alpha^{(l)}(\x_i)^\top$, i.e., $A^{(l)}$ is the output matrix of layer $l\in[L]$ over the input dataset $\x_i, i\in[n]$; the weight vector $\theta$ at which it is evaluated will be clear from the context. Likewise, we let $A^{(L+1)}\in \R^n$ be the vector of outputs over the input dataset. Let $\bm{0}_p$ be the zero vector of dimension $p$ and $\I_p$ the $p\times p$ identity matrix.
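As an illustration of this notation, by~\eqref{eq:DNN} the output vector satisfies $A^{(L+1)} = \frac{1}{\sqrt{m_L}} A^{(L)} \v$, with $A^{(L+1)}_i = f(\theta;\x_i)$. Moreover, if, for instance, all hidden layers share a common width $m$, i.e., $m_l = m$ for $l\in[L]$, then the parameter vector in~\eqref{eq:theta_def} has dimension $p = md + (L-1)m^2 + m$.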


We denote the gradient and Hessian of $f(\cdot;\x_i):\R^p \to \R$ as
$\nabla_i f := \frac{\partial f(\theta;\x_i)}{\partial \theta}$, and $\nabla^2_i f := \frac{\partial^2 f(\theta;\x_i)}{\partial \theta^2}$. The {\em neural tangent kernel} (NTK) matrix $K_{\ntk}( \cdot ; \theta) \in \R^{n \times n}$ corresponding to parameter $\theta$ is defined entrywise as
\begin{align}
K_{\ntk}(\x_i,\x_j; \theta) = \langle \nabla_i f, \nabla_j f \rangle.
\label{eq:ntk}
\end{align}
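Splitting $\nabla_i f$ into the parameter blocks of~\eqref{eq:theta_def} and noting from~\eqref{eq:DNN} that $\frac{\partial f(\theta;\x)}{\partial \v} = \frac{1}{\sqrt{m_L}} \alpha^{(L)}(\x)$, the NTK decomposes layer-wise as
\begin{equation*}
    K_{\ntk}(\x_i,\x_j; \theta) = \sum_{l=1}^{L} \left\langle \vec\left(\frac{\partial f(\theta;\x_i)}{\partial W^{(l)}}\right), \vec\left(\frac{\partial f(\theta;\x_j)}{\partial W^{(l)}}\right) \right\rangle + \frac{1}{m_L}\, \alpha^{(L)}(\x_i)^\top \alpha^{(L)}(\x_j)~,
\end{equation*}
so that the last-layer contribution to the NTK matrix on the training data is $\frac{1}{m_L} A^{(L)} (A^{(L)})^\top$.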

We make the following assumption regarding the activation function $\phi$:
\begin{asmp}[\textbf{Activation function}]
The activation $\phi$ is 1-Lipschitz, i.e., $|\phi'| \leq 1$, and $\beta_\phi$-smooth, i.e., $|\phi''| \leq \beta_\phi$.
\label{asmp:actinit}
\end{asmp}
\begin{remark}
Our analysis holds for any $\varsigma_{\phi}$-Lipschitz smooth activation, with a dependence on $\varsigma_{\phi}$ in most key results. The main (qualitative) conclusions remain true if $\varsigma_{\phi} \leq 1 + o(1)$ or $\varsigma_{\phi} = \text{poly}(L)$, which is typically satisfied by commonly used smooth activations and moderate values of $L$. \qed
\end{remark}
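For example, $\phi = \tanh$ satisfies Assumption~\ref{asmp:actinit}: $|\tanh'(x)| = 1 - \tanh^2(x) \leq 1$ and $|\tanh''(x)| = 2\,|\tanh(x)|\left(1-\tanh^2(x)\right) \leq \frac{4}{3\sqrt{3}} < 1$, so one can take $\beta_\phi = 1$.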
%
%

\begin{asmp}[\textbf{Input data scaling}]
\label{asmp:scaling}
Every input $\x_i \in \R^d$, $i\in[n]$, satisfies $\norm{\x_i}_2^2 = d$.
\end{asmp}
The previous assumption is made for convenience; scaling assumptions of this form are common in the literature~\citep{ZAZ-YL-ZS:19,oymak2020hermite,ng2021hermite2}.
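In particular, the normalization can be enforced by a simple preprocessing step: any nonzero input can be rescaled as $\x_i \mapsto \sqrt{d}\, \x_i / \norm{\x_i}_2$, which yields $\norm{\x_i}_2^2 = d$.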

%\subsection*{Spectral Norm of the Hessian}
%\label{sec:arXiv_hessian}
%\input{AISTATS_NTK/sec/arx1_hessian}


The initialization $\theta_0$ of the weights, including the variance parameter $\sigma_0^2$ used to sample the layer-wise weights at initialization, is specified in the statement of Theorem~4.1.