\subsection{Convergence Analysis: Unknown Desired Loss} 

\lemmNTKstep*


\proof
Observe that $K_{\ntk}(\cdot;\theta)=J(\theta) J(\theta)^\top$, where the Jacobian
$$
J(\theta)=
\begin{bmatrix}
\left(\frac{\partial f(\theta;x_1)}{\partial W^{(1)}}\right)^\top&\dots& \left(\frac{\partial f(\theta;x_1)}{\partial W^{(L+1)}}\right)^\top\\
\vdots &\ddots&\vdots\\
\left(\frac{\partial f(\theta;x_n)}{\partial W^{(1)}}\right)^\top&\dots& \left(\frac{\partial f(\theta;x_n)}{\partial W^{(L+1)}}\right)^\top
\end{bmatrix}\in\R^{n\times (m+Lm^2)}~.
$$
Then, the spectral norm of the change in the NTK is given by
\begin{equation}
\label{eq:lower_NTK_aux1}
\begin{aligned}
\norm{K_{\ntk}(\cdot;\theta_{t+1})-K_{\ntk}(\cdot;\theta_t)}_2
    &=\norm{J(\theta_{t+1}) J(\theta_{t+1})^\top - J(\theta_t) J(\theta_t)^\top}_2\\
    &=\norm{J(\theta_{t+1}) (J(\theta_{t+1})-J(\theta_t))^\top + (J(\theta_{t+1}) - J(\theta_t)) J(\theta_t)^\top}_2\\
    &\leq (\norm{J(\theta_{t+1})}_2 + \norm{J(\theta_t)}_2) \norm{J(\theta_{t+1})-J(\theta_t)}_2~.
    \end{aligned}
\end{equation}
Now, for any $\theta\in B_{\rho,\rho_1}^{\spec}(\theta_0)$,
$$
\norm{J(\theta)}_2^2\leq \norm{J(\theta)}_F^2 = \sum^n_{i=1}\norm{\frac{\partial f(\theta;x_i)}{\partial \theta}}_2^2 \overset{(a)}{\leq} n \varrho^2
$$
where (a) follows from Lemma~\ref{cor:gradient-bounds} with $\varrho$ defined therein. Assuming $\theta_t, \theta_{t+1} \in B_{\rho,\rho_1}^{\spec}(\theta_0)$, we have $\norm{J(\theta_t)}_2, \norm{J(\theta_{t+1})}_2 \leq \sqrt{n} \varrho$, so that from \eqref{eq:lower_NTK_aux1} we get
\begin{equation}
\label{eq:lower_NTK_aux2}
\begin{aligned}
\norm{K_{\ntk}(\cdot;\theta_{t+1})-K_{\ntk}(\cdot;\theta_t)}_2 \leq 2\sqrt{n} \varrho \norm{J(\theta_{t+1})-J(\theta_t)}_2~.
    \end{aligned}
\end{equation}
Now, note that
\begin{align*}
\norm{J(\theta_{t+1})-J(\theta_t)}_2
& \leq \norm{J(\theta_{t+1})-J(\theta_t)}_F \\
& \leq \sqrt{\sum^n_{i=1}\norm{\frac{\partial f(\theta_{t+1};x_i)}{\partial \theta} - \frac{\partial f(\theta_t;x_i)}{\partial \theta}}_2^2}\\
    &\overset{(a)}{\leq} \sqrt{n} \sup_{\tilde{\theta}_t, i} \norm{\frac{\partial^2 f(\tilde{\theta}_t;x_i)}{\partial \theta^2}}_2 \norm{\theta_{t+1}-\theta_t}_2 \\
    &\overset{(b)}{\leq}\frac{c_H \sqrt{n} }{\sqrt{m}}  \norm{\theta_{t+1}-\theta_t}_2\\
    &\overset{(c)}{=}  \frac{c_H \sqrt{n} }{\sqrt{{m}}} \eta_t \norm{ \nabla \bar{\cL}(\theta_t)}_2 \\
    & \overset{(d)}{\leq} \frac{2 c_H \sqrt{n}  \varrho}{\sqrt{m}} \eta_t \sqrt{\bar{\cL}(\theta_t)}~,
\end{align*}
where (a) follows from the mean-value theorem with $\tilde{\theta}_t\in\{(1-\xi) \theta_t+\xi \theta_{t+1} : \xi\in[0,1]\}$, (b) follows from Theorem~\ref{theo:bound-Hess} since $\tilde{\theta}_t\in B_{\rho,\rho_1}^{\spec}(\theta_0)$, (c) follows from the gradient descent update, and (d) follows from Corollary~\ref{cor:total-bound}. 

Then, using ~\eqref{eq:lower_NTK_aux2}, we have
\begin{equation}
\label{eq:lower_NTK_aux3}    
\norm{K_{\ntk}(\cdot;\theta_{t+1})-K_{\ntk}(\cdot;\theta_t)}_2
\leq 4 c_H \varrho^2  \frac{n}{\sqrt{m}} \eta_t \sqrt{\bar{\cL}(\theta_t)}~.
\end{equation}
Then, by triangle inequality 
\begin{align*}
\lambda_{\min}(K_{\ntk}(\cdot;\theta_{t+1})) 
& \geq \lambda_{\min}(K_{\ntk}(\cdot;\theta_t)) - \norm{K_{\ntk}(\cdot;\theta_{t+1})-K_{\ntk}(\cdot;\theta_t)}_2 \\
&\overset{(a)}{\geq} \lambda_{\min}(K_{\ntk}(\cdot;\theta_t)) -  4 c_H \varrho^2  \frac{n}{\sqrt{m}} \eta_t \sqrt{\bar{\cL}(\theta_t)}~,
\end{align*}
where (a) follows from~\eqref{eq:lower_NTK_aux3}. That completes the proof.  \qed 

\abdelete{
\corrNTKinit*
%
\proof  \qed }




% \lemmNTKBall*

% \proof
% Observe that $K_{\ntk}(\cdot;\theta)=J(\theta)(J(\theta))^\top$, where
% $$
% J(\theta)=
% \begin{bmatrix}
% \left(\frac{\partial f(\theta;x_1)}{\partial W^{(1)}}\right)^\top&\dots& \left(\frac{\partial f(\theta;x_1)}{\partial W^{(L+1)}}\right)^\top\\
% \vdots &\ddots&\vdots\\
% \left(\frac{\partial f(\theta;x_n)}{\partial W^{(1)}}\right)^\top&\dots& \left(\frac{\partial f(\theta;x_n)}{\partial W^{(L+1)}}\right)^\top
% \end{bmatrix}\in\R^{n\times m+Lm^2}.
% $$
% Then, 
% \begin{equation}
%     \label{eq:lower_NTK_aux_1}
%     \begin{aligned}
%     \norm{K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0)}_2
%     &=\norm{J(\theta)(J(\theta))^\top-J(\theta_0)(J(\theta_0))^\top}_2\\
%     &=\norm{J(\theta)(J(\theta)-J(\theta_0))^\top-(J(\theta)-J(\theta_0))(J(\theta_0))^\top}_2\\
%     &\leq (\norm{J(\theta)}_2+\norm{J(\theta_0)}_2)\norm{J(\theta)-J(\theta_0)}_2.
%     \end{aligned}
% \end{equation}
% Now, for any $\theta\in B^2_\rho(\theta_0)$,
% $$
% \norm{J(\theta)}_2^2\leq \norm{J(\theta)}_F^2 = \sum^n_{i=1}\norm{\frac{\partial f(\theta;x_i)}{\partial \theta}}_2^2 \overset{(a)}{\leq} n \varrho^2
% $$
% where (a) follows from~\citep[Lemma 4.1 ]{AB-PCV-LZ-MB:22} with $\varrho$ defined therein. Using this result in~\eqref{eq:lower_NTK_aux_1},
% \begin{equation}
%     \label{eq:lower_NTK_aux_2}
%     \begin{aligned}
%     \norm{K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0)}_2
%     \leq 2\sqrt{n} \varrho
%   \norm{J(\theta)-J(\theta_0)}_2.
%     \end{aligned}
% \end{equation}
% Now, observe that
% \begin{align*}
%     \norm{J(\theta)-J(\theta_0)}_2
%     &=\norm{J(\theta)-J(\theta_0)}_F\\
%     &=\sqrt{\sum^n_{i=1}\norm{\frac{\partial f(\theta;x_i)}{\partial \theta} - \frac{\partial f(\theta_0;x_i)}{\partial \theta}}_2^2}\\
%     &\overset{(a)}{\leq} \sqrt{\sum^n_{i=1}\norm{\frac{\partial f(\tilde{\theta};x_i)}{\partial \theta}}_2^2\norm{\theta-\theta_0}_2^2}\\
%     %
%     &\overset{(b)}{\leq}\frac{c_H}{\sqrt{m}}\sqrt{\sum^n_{i=1}\norm{\theta-\theta_0}_2^2}\\
%     &\leq\frac{c_H\sqrt{n}}{\sqrt{m}}\rho.
% \end{align*}
% where (a) follows from the mean-value theorem with $\tilde{\theta}=\theta_0+\xi(\theta-\theta_0)$ for some $\xi\in[0,1]$; (b) follows from Theorem~\ref{theo:bound-Hess} since $\tilde{\theta}\in B^2_\rho(\theta_0)$. Then, using this result back in~\eqref{eq:lower_NTK_aux_2}, we obtain
% \begin{equation}
% \label{eq:lower_NTK_aux_3}    
% \norm{K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0)}_2
% \leq 2c_H\varrho\frac{n}{\sqrt{m}}\rho.
% \end{equation}

% Now,
% \begin{align*}
% |\lambda_{\min}(K_{\ntk}(\cdot;\theta))|&=
% |\lambda_{\min}(K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0)+K_{\ntk}(\cdot;\theta_0))|\\
% &=\min_{\norm{x}_2=1}\norm{(K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0))x+K_{\ntk}(\cdot;\theta_0)x}_2\\
% &\geq\min_{\norm{x}_2=1}\norm{K_{\ntk}(\cdot;\theta_0)x}_2-\norm{(K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0))x}_2\\
% &\geq\min_{\norm{x}_2=1}\norm{K_{\ntk}(\cdot;\theta_0)x}_2-\max_{\norm{x}_2=1}\norm{(K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0))x}_2\\
% &=|\lambda_{\min}(K_{\ntk}(\cdot;\theta_0))| - \norm{K_{\ntk}(\cdot;\theta)-K_{\ntk}(\cdot;\theta_0)}_2\\
% &\overset{(a)}{\geq} |\lambda_{\min}(K_{\ntk}(\cdot;\theta_0))| -  2c_H\varrho\frac{n}{\sqrt{m}}\rho,
% \end{align*}
% where (a) follows from~\eqref{eq:lower_NTK_aux_3}. Under the theorem's conditions, we have that $\varrho=O(\poly(L))$ and $c_H=O(\poly(L)(1+\gamma^{2L}))$. This finishes the proof.
% %
% \qed



\theoNTKconv*


%\begin{proof}
% We first notice that by~\cite[Corollary~4.1]{AB-PCV-LZ-MB:22} and using induction, we can set $\eta_t \leq \frac{\rho}{c\,T}$ for some appropriate $c=O(\poly(L))$, to ensure that with high-probability $\norm{\theta_t-\theta_0}\leq \rho$ for every $t\in[T]$.

% Now, let us choose $m=\Omega(n^2)$ such that 
% %$\lambda_{\min}(K_{\ntk}(\cdot;\theta))\geq \frac{\lambda_{\min}(K_{\ntk}(\cdot;\theta_0))}{K}$
% $\lambda_{\min}(K_{\ntk}(\cdot;\theta))\geq \frac{\lambda_0}{K}$
% for some constant $K>0$. Notice that this choice of $m$ also ensures we can obtain the NTK condition at initialization, since $\sqrt{n}\geq \log(n)$ as $n$ grows.

\proof 
\pcedit{First note that with probability at least $\left(1-\frac{2(L+1)}{m}\right)$, all the bounds in Theorem~\ref{theo:bound-Hess}, Lemma~\ref{cor:gradient-bounds}, Lemma~\ref{lem:BoundTotalLoss} and Corollary~\ref{cor:total-bound} hold (and so 
does Lemma~\ref{lemm:NTK_step}, since its proof uses these bounds).}

\pcedit{Before proceeding further, we note that for $L=O(1)$ we obtain the constant $c_H = O((1+\rho_1)(1+(4\nu_0+\frac{\rho}{\sqrt{m}})^{O(1)}))=O((1+\rho_1))$ following Theorem~\ref{theo:bound-Hess}, where the last equality follows from the fact that $m=\Omega(\frac{n^4}{\lambda_0^2})> \Omega(\frac{n}{\lambda_0^2})$ and $\rho=\Theta(\frac{\sqrt{n}}{\lambda_0})$. Moreover, since $\rho_1=\Theta(\frac{\sqrt{n}}{\lambda_0})$, we obtain $c_H=O(1+\frac{\sqrt{n}}{\lambda_0})$. 
%
We will use 
$$c_H \leq  c_2 (1+\frac{\sqrt{n}}{\lambda_0})$$ for some suitable constant $c_2>0$.}

\pcedit{
Likewise, we also note that $\varrho^2=O(1+\frac{1}{m}(1+\rho_1)^2(1+\gamma^{2(L+1)}))$ (see definition in Lemma~\ref{cor:gradient-bounds}). Then, using the fact that $L=O(1)$, $\rho_1,\rho=\Theta(\frac{\sqrt{n}}{\lambda_0})$ and $m=\Omega(\frac{n^4}{\lambda_0^2})> \Omega(\frac{n}{\lambda_0^2})$, we obtain that $\varrho^2=O(1)$. We will use 
$$\varrho^2 \leq  c_3$$ for some suitable constant $c_3>0$.
}

\pcedit{
Finally, we observe that $\bar{c}_{\rho_1,\gamma}=O((1+\rho_1^2)(1+\gamma^L))$ (see definition in Lemma~\ref{lem:BoundTotalLoss}). Then, taking the definition of $\beta$ (as in Theorem~\ref{theo:smoothnes}), we have that $\beta=b\varrho^2+\frac{1}{\sqrt{m}}O(\poly(L)(1+\gamma^{3L})(1+\rho_1^2))$. Again, in a similar fashion as in the analysis of the expressions $c_H$ and $\varrho$, we have that in our problem setting $\beta=O(1+\frac{\sqrt{n}}{\lambda_0})$. We will use 
$$\beta \leq  c_4 (1+\frac{\sqrt{n}}{\lambda_0})$$ for some suitable constant $c_4>0$.
} \abcomment{$\beta$ should stay a constant, dependence on $\frac{\rho_1^2}{\sqrt{m}}$; also, $\beta$ is the smoothness of the normalized loss}


We now proceed to do the proof by induction. First, for $t=1$, we show that, based on the choice of the step size, $\theta_{1} \in B_{\rho,\rho_1}^{\spec}(\theta_0)$. To see this, note that
\begin{align*}
\| \theta_{1} - \theta_0 \|_2 & = \eta  \| \nabla \bar{\cL}(\theta_{0})\|_2 \\
& \overset{(a)}{\leq}2 \varrho \eta    \sqrt{\bar{\cL}(\theta_0)} \\
& \overset{(b)}{\leq} 2\varrho \eta \sqrt{n \bar{c}_{0,4\nu_0}}\\
& = 2\varrho \eta \lambda_0 \frac{\sqrt{n \bar{c}_{0,4\nu_0}}}{\lambda_0}\\
& \leq 2\varrho \sqrt{\bar{c}_{0,4\nu_0}}\frac{\sqrt{n} }{\lambda_0} \\
& \overset{(c)}{\leq} \rho, \rho_1~,
\end{align*}
where (a) follows from Corollary~\ref{cor:total-bound}, 
(b) follows from Lemma~\ref{lem:BoundTotalLoss}, and (c) follows since $\rho, \rho_1 = \Theta(\frac{\sqrt{n}}{\lambda_0})$. \pccomment{I wonder if there is a circular argument here. Since $\varrho$ depends on both $\rho$ and $\rho_1$, I used the fact that $\rho_1,\rho=O(\frac{\sqrt{n}}{\lambda_0})$ to show that $\varrho = O(1)$ (independent of $n$, of course) --- yet, at the same time, we have that $\rho,\rho_1$ is \textbf{defined} to be an upper bound to the quantity $2\varrho\sqrt{\bar{c}_{0,4\nu_0}}$ above, i.e., of $\rho$ itself!!! This could be OK since actually $\rho_1,\rho=O(\frac{\sqrt{n}}{\lambda_0})$, but I wonder if the constants match up correctly on their scaling!  I think the derivation might be fine, since for upper bounding $\varrho$ $m$ may be able to be chosen appropriately sufficiently large, but still this needs to be checked!} Hence, $\theta_1 \in B_{\rho,\rho_1}^{\spec}(\theta_0)$. We now take the smoothness property from Theorem~\ref{theo:smoothnes}, %with $\beta=O((1+\frac{\sqrt{n}}{\lambda_0}))$ given that $L=O(1)$, 
and further obtain
\begin{equation}
\begin{aligned}
\bar{\cL}(\theta_{1})-\bar{\cL}(\theta_{0})&\leq \langle \theta_{1}-\theta_0,\nabla_{\theta}\bar{\cL}(\theta_0)\rangle  +\frac{\beta\pcedit{n}}{2}\norm{\theta_{1}-\theta_0}^2_2\\
& \overset{(a)}{\leq} -\eta\norm{\nabla_\theta \bar{\cL}(\theta_0)}_2^2 +\frac{\beta\eta^2\pcedit{n}}{2}\norm{\nabla_\theta \bar{\cL}(\theta_0)}^2_2\\
&= - \eta \left(1 - \frac{\beta\eta\pcedit{n}}{2} \right)\norm{\nabla_\theta \bar{\cL}(\theta_0)}^2_2\\
%
&\overset{(b)}{\leq} - \frac{\eta}{2}\norm{\nabla_\theta \bar{\cL}(\theta_0)}^2_2 \\
&\overset{(c)}{\leq} - \frac{\eta}{2} \ell_0'^\top K_{\ntk}(\theta_0)\ell_0' \\
&\overset{(d)}{\leq} - \frac{\eta}{2} ~\lambda_{\min}(K_{\ntk}(\theta_0))~ \|\ell'_0 \|_2^2 \\
%&\overset{(c)}{\leq} - \frac{\eta_t}{2} \left( \lambda_0 - \tilde{c} \frac{n}{\sqrt{m}} \sum_{\tau=1}^t \eta_{\tau} \right) \|\ell'_t \|_2^2 \\
&\overset{(e)}{\leq} - \frac{\eta}{2}  \lambda_0 4 \bar{\cL}(\theta_0) \\
&\leq - \eta\lambda_0 \bar{\cL}(\theta_0)\\
\implies \qquad \bar{\cL}(\theta_{1})&\leq \left(1- \eta  \lambda_0 \right) \bar{\cL}(\theta_0),
\end{aligned}
\end{equation}
where (a) follows from the gradient descent update;  
%
(b) follows from our choice of step-size $\eta  \leq \frac{1}{\beta n}$ so that $-(1-\frac{\beta \eta \pcedit{n}}{2}) \leq -\frac{1}{2}$; 
%
(c) follows from the following property valid for any iterate $\theta_t\in\R^p$, 
\begin{align*}
\left\| \nabla_\theta \bar{\cL}(\theta_t) \right\|_2^2 = \left\| \sum_{i=1}^n \ell'_{t,i} \nabla_\theta f(\theta_t;\x_i) \right\|_2^2 =  \sum_{i=1}^n \sum_{j=1}^n \ell'_{t,i} \ell'_{t,j} \langle \nabla_{\theta} f(\theta_t;\x_i), \nabla_{\theta} f(\theta_t;\x_j) \rangle = \ell_t'^T K_{\ntk}(\cdot;\theta_t) \ell_t'~,
\end{align*}
where $\ell_t' := [\ell'_{t,i}] \in \R^n$, with $\ell'_{t,i} = -2(y_i-f(\theta_t;\x_i))$ and  $\ell_{t,i} = (y_i - f(\theta_t;\x_i))^2$; 
%
(d)  follows from the definition of minimum eigenvalue;
%
and (e) follows from the following property valid for any iterate $\theta_t\in\R^p$,
\abdelete{
\begin{align*}
\tilde{c} \frac{n}{\sqrt{m}} \sum_{\tau=1}^t \eta_{\tau} \leq \tilde{c} \frac{n}{\sqrt{m}} T \eta \leq \frac{\lambda_0}{2}~,
\end{align*}
so that $\left(\frac{\beta\eta_t^2}{2}-\eta_t\right)<0$ and $\lambda_{\min}(K_{\ntk}(\cdot;\theta))\geq \frac{\lambda_0}{K}$; }
%
\begin{equation}
\norm{\ell'_t}^2_2 =\sum_{i=1}^n \ell'^2_{t,i} = 4\sum_{i=1}^n (y_i - f(\theta_t;\x_i))^2 = 4 \bar{\cL}(\theta_t)~.
\label{eq:sqlossgrad-step}
\end{equation}
Notice that, from our choice of step-size $\eta < \frac{1}{\lambda_0}$, we have that $1-\eta\lambda_0\in(0,1)$.


Continuing with our proof by induction, we take the following induction hypothesis: we assume that 
\begin{align}
\bar{\cL}(\theta_t) \leq \left(1- \eta \lambda_0 \right)^{t-1} \bar{\cL}(\theta_0)
\end{align}
and that $\theta_\tau\in B^{\spec}_{\rho,\rho_1}(\theta_0)$ for $\tau\leq t$.

First, based on the choice of the step sizes, we show that $\theta_{t+1} \in B_{\rho,\rho_1}^{\spec}(\theta_0)$. To see this, note that, using similar inequalities as in our analysis for the case $t=1$,
\begin{align*}
\| \theta_{t+1} - \theta_0 \|_2 & \leq \sum_{\tau=0}^{t} \| \theta_{\tau+1} - \theta_{\tau} \|_2 \\
& = \sum_{\tau=0}^t \eta \| \nabla_\theta \bar{\cL}(\theta_{\tau})\|_2 \\
& \leq 2 \varrho \eta   \sum_{\tau=0}^t  \sqrt{\bar{\cL}(\theta_\tau)} \\
& \overset{(a)}{\leq} 2 \varrho \eta   \left(\sum_{\tau=0}^t \left(1 - \eta \lambda_0 \right)^{\tau/2} \right) \sqrt{\bar{\cL}(\theta_0)}\\
& \leq 2 \varrho \eta \frac{\sqrt{\bar{\cL}(\theta_0)}}{1 - \sqrt{1- \eta \lambda_0}} \\
& \overset{(b)}{\leq} \frac{4 \varrho \sqrt{\bar{\cL}(\theta_0)}}{\lambda_0}\\
& \leq 4 \varrho \sqrt{\bar{c}_{0,4\nu_0}} \frac{\sqrt{n} }{\lambda_0} \\
& \overset{(c)}{\leq} \abedit{\rho, \rho_1}~,
\end{align*}

%\abcomment{need to handle $\rho_1$ suitably, or can we make $\rho_1 = \rho$ for this work}
where (a) follows from our induction hypothesis, (b) follows from $\frac{x}{1-\sqrt{1-x\lambda_0}}\leq \frac{2}{\lambda_0}$ for $x<\frac{1}{\lambda_0}$, and (c) follows since $\rho, \rho_1 = \Theta(\frac{\sqrt{n}}{\lambda_0})$.   
%\abcomment{update, $\rho = O(\sqrt{n})$, $m = \Omega(n)$, so we can choose $m$ such that $\rho < \sqrt{m}$}
%
\abdelete{\pccomment{Now, I am thinking about something. Imagine we have some positive constant $C>1$, then, we would like to choose step-size $\eta$ such that $\frac{1}{1-\sqrt{1-\eta \lambda_0}}<C$, under the condition that $\eta<\frac{1}{\lambda_0}$. Doing some algebraic work, we can obtain that this holds iff $\frac{1}{\lambda_0}\left(1-\left(1-\frac{1}{C}\right)^2\right)<\eta$. As we can see, this still satisfies the initial condition $\eta<\frac{1}{\lambda_0}$. Then, we can have a not very attractive statement like this: 
"Let $C>1$ be a sufficiently large constant such that the interval
\begin{align*}
\mathcal{I}=\left(\frac{1}{\lambda_0}\left(1-\left(1-\frac{1}{C}\right)^2\right)\;,\; \min\left( \frac{1}{\beta} , \frac{1}{\lambda_0}, \frac{1}{2 \sqrt{\bar{c}_{0,4\nu_0}} \varrho} \min\left( \frac{\lambda_0}{n c_H} , \pcedit{\frac{\rho}{\sqrt{n}}} \right) \right)\right),
%\\ &= \abedit{\Theta \left( \frac{1}{n} \right)}~, 
\end{align*}
is well-defined and non-empty. Then, choose a step-size $\eta_t\equiv\eta\in\mathcal{I}$."  
The nice thing is that then we would not need to worry about $\rho=\Omega(n)$}}

Now, we have
\begin{align*}
\lambda_{\min}(K_{\ntk}(\cdot;\theta_t)) 
& \overset{(a)}{\geq} \lambda_{\min}(K_{\ntk}(\cdot;\theta_{t-1})) - 4 c_H \varrho^2 \frac{n}{\sqrt{{m}}} \eta \sqrt{\bar{\cL}(\theta_{t-1})} \\
& \geq \lambda_{\min}(K_{\ntk}(\cdot;\theta_0)) - 4 c_H \varrho^2 \eta  \frac{n}{\sqrt{m}} \sum_{\tau=0}^{t-1}  \sqrt{\bar{\cL}(\theta_{\tau})}\\
& \overset{(b)}{\geq} \lambda_0 - 4 c_2 \varrho^2 \eta \frac{n}{ \sqrt{m}}\pcedit{(1+\frac{\sqrt{n}}{\lambda_0})} \left(\sum_{\tau=0}^t \left(1 - \eta \lambda_0 \right)^{\tau/2} \right) \sqrt{\bar{\cL}(\theta_0)}\\
& \geq \lambda_0 - 8 c_2 \varrho^2 \frac{\pcedit{n} \sqrt{\bar{\cL}(\theta_0)}}{ \sqrt{m}}\pcedit{(1+\frac{\sqrt{n}}{\lambda_0})} \frac{\eta}{1 - \sqrt{1- \eta \lambda_0}} \\
& \overset{(c)}{\geq} \lambda_0 -  \frac{\pcedit{1}}{\sqrt{m}}\pcedit{(n^{3/2}+\frac{n^2}{\lambda_0})} \frac{\bar{c} \eta }{1 - \sqrt{1- \eta \lambda_0}} \\
& \geq \lambda_0 -  2\bar{c} \frac{\pcedit{1}}{\sqrt{m}}\pcedit{(\frac{n^{3/2}}{\lambda_0}+\frac{n^2}{\lambda_0^2})}\\
&\pcedit{ \geq \lambda_0 -  4\bar{c} \frac{\pcedit{1}}{\sqrt{m}}\pcedit{\max\{\frac{n^{3/2}}{\lambda_0},\frac{n^2}{\lambda_0^2}\}}}~,
\end{align*}
%\abcomment{ongoing, because [LZB'20] does not help, since their Theorem 2 needs $\| H \|_2 \leq \frac{c}{n^2}$ which needs $m = n^4$}
%
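where (a) follows from Lemma~\ref{lemm:NTK_step}, (b) follows by the induction hypothesis together with $c_H \leq c_2(1+\frac{\sqrt{n}}{\lambda_0})$, and (c) follows from $\varrho^2\leq c_3$ and $\bar{\cL}(\theta_0)\leq n\bar{c}_{0,4\nu_0}$ (Lemma~\ref{lem:BoundTotalLoss}), with $\bar{c} = 8 c_2 \pcedit{c_3} \sqrt{\bar{c}_{0,4\nu_0}}$. %\varrho^2$. 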
Then, with $m \pcedit{\geq 64\bar{c}^2\max\{\frac{n^{3}}{\lambda_0^4},\frac{n^4}{\lambda_0^6}\}}$ we have $\lambda_{\min}(K_{\ntk}(\cdot;\theta_t)) \geq \lambda_0/2$.

%\newpage 

Since $\theta_t,\theta_{t+1}\in B_{\rho,\rho_1}^{\spec}(\theta_0)$, we now take the smoothness property and further obtain, using similar inequalities as in our analysis for the case $t=1$,
\begin{equation}
\begin{aligned}
\bar{\cL}(\theta_{t+1})-\bar{\cL}(\theta_{t})&\leq \langle \theta_{t+1}-\theta_t,\nabla_{\theta}\bar{\cL}(\theta_t)\rangle  +\frac{\beta\pcedit{n}}{2}\norm{\theta_{t+1}-\theta_t}^2_2\\
& \overset{(a)}{\leq} -\eta\norm{\nabla_\theta \bar{\cL}(\theta_t)}_2^2 +\frac{\beta\eta^2\pcedit{n}}{2}\norm{\nabla_\theta \bar{\cL}(\theta_t)}^2_2\\
&= - \eta \left(1 - \frac{\beta\eta\pcedit{n}}{2} \right)\norm{\nabla_\theta \bar{\cL}(\theta_t)}^2_2\\
%
&\leq - \frac{\eta}{2} \ell_t'^\top K_{\ntk}(\theta_t)\ell_t' \\
&\leq - \frac{\eta}{2} ~\lambda_{\min}(K_{\ntk}(\theta_t))~ \|\ell'_t \|_2^2 \\
%&\overset{(c)}{\leq} - \frac{\eta_t}{2} \left( \lambda_0 - \tilde{c} \frac{n}{\sqrt{m}} \sum_{\tau=1}^t \eta_{\tau} \right) \|\ell'_t \|_2^2 \\
&\overset{(b)}{\leq} - \frac{\eta}{2}  \frac{\lambda_0}{2} 4 \bar{\cL}(\theta_t) \\
\implies \qquad \bar{\cL}(\theta_{t+1})&\leq \left(1- \eta  \lambda_0 \right) \bar{\cL}(\theta_t),
\end{aligned}
\end{equation}
where (a) follows from the gradient descent update, and (b) from our recently derived result. \qed 


\subsection{Convergence Analysis: Known Desired Loss} 


\lemmNTKball*
%
\proof
Observe that $K_{\ntk}(\theta)=J(\theta) J(\theta)^\top$, where the Jacobian
$$
J(\theta)=
\begin{bmatrix}
\left(\frac{\partial f(\theta;x_1)}{\partial W^{(1)}}\right)^\top&\dots& \left(\frac{\partial f(\theta;x_1)}{\partial W^{(L+1)}}\right)^\top\\
\vdots &\ddots&\vdots\\
\left(\frac{\partial f(\theta;x_n)}{\partial W^{(1)}}\right)^\top&\dots& \left(\frac{\partial f(\theta;x_n)}{\partial W^{(L+1)}}\right)^\top
\end{bmatrix}\in\R^{n\times (m+Lm^2)}~.
$$
Then, for $\theta\in B_{\rho,\rho_1}^{\spec}(\theta_0)$, the spectral norm of the change in the NTK relative to initialization satisfies
\begin{equation}
\label{eq:lower_NTK_aux21}
\begin{aligned}
\norm{K_{\ntk}(\theta)-K_{\ntk}(\theta_0)}_2
    &=\norm{J(\theta) J(\theta)^\top - J(\theta_0) J(\theta_0)^\top}_2\\
    &=\norm{J(\theta) (J(\theta)-J(\theta_0))^\top + (J(\theta) - J(\theta_0)) J(\theta_0)^\top}_2\\
    &\leq (\norm{J(\theta)}_2 + \norm{J(\theta_0)}_2) \norm{J(\theta)-J(\theta_0)}_2~.
    \end{aligned}
\end{equation}
Now, for any $\theta\in B_{\rho,\rho_1}^{\spec}(\theta_0)$,
$$
\norm{J(\theta)}_2^2\leq \norm{J(\theta)}_F^2 = \sum^n_{i=1}\norm{\frac{\partial f(\theta;x_i)}{\partial \theta}}_2^2 \overset{(a)}{\leq} n \varrho^2
$$
where (a) follows from Lemma~\ref{cor:gradient-bounds} with $\varrho$ defined therein. For $\theta \in B_{\rho,\rho_1}^{\spec}(\theta_0)$, we have $\norm{J(\theta)}_2 \leq \sqrt{n} \varrho$, so that from \eqref{eq:lower_NTK_aux21} we get
\begin{equation}
\label{eq:lower_NTK_aux22}
\begin{aligned}
\norm{K_{\ntk}(\theta)-K_{\ntk}(\theta_0)}_2 \leq 2\sqrt{n} \varrho \norm{J(\theta)-J(\theta_0)}_2~.
    \end{aligned}
\end{equation}
Now, note that
\begin{align*}
\norm{J(\theta)-J(\theta_0)}_2
& \leq \norm{J(\theta)-J(\theta_0)}_F \\
& \leq \sqrt{\sum^n_{i=1}\norm{\frac{\partial f(\theta;x_i)}{\partial \theta} - \frac{\partial f(\theta_0;x_i)}{\partial \theta}}_2^2}\\
    &\overset{(a)}{\leq} \sqrt{n} \sup_{\tilde{\theta},i} \norm{\frac{\partial^2 f(\tilde{\theta};x_i)}{\partial \theta^2}}_2 \norm{\theta -\theta_0}_2 \\
    &\overset{(b)}{\leq}\frac{c_H\sqrt{n} }{\sqrt{m}}  \norm{\theta - \theta_0}_2\\
    &\overset{(c)}{\leq}  \frac{c_H \sqrt{n} (L\rho + \rho_1)}{\sqrt{{m}}}~,
%    & \overset{(d)}{\leq} \frac{2 c_H \varrho}{\sqrt{m}} \eta_t \sqrt{\cL(\theta_t)}~,
\end{align*}
where (a) follows from the mean-value theorem with $\tilde{\theta}\in\{(1-\xi) \theta_0+\xi \theta : \xi\in[0,1]\}$, (b) follows from Theorem~\ref{theo:bound-Hess} since $\tilde{\theta}\in B_{\rho,\rho_1}^{\spec}(\theta_0)$, and (c) follows since $\theta \in B_{\rho,\rho_1}^{\spec}(\theta_0)$.
%from the gradient descent update, and (d) follows from Corollary~\ref{cor:total-bound}. 
%
Then, using ~\eqref{eq:lower_NTK_aux22}, we have
\begin{equation}
\label{eq:lower_NTK_aux23}    
\norm{K_{\ntk}(\theta)-K_{\ntk}(\theta_0)}_2
\leq 2 c_H \varrho (L \rho+\rho_1)  \frac{n}{\sqrt{m}} ~.
\end{equation}
Then, by triangle inequality 
\begin{align*}
\lambda_{\min}(K_{\ntk}(\theta)) 
& \geq \lambda_{\min}(K_{\ntk}(\theta_0)) - \norm{K_{\ntk}(\theta)-K_{\ntk}(\theta_0)}_2 \\
&\overset{(a)}{\geq} \lambda_{\min}(K_{\ntk}(\theta_0)) -  2 c_H \varrho (L\rho + \rho_1) \frac{n}{ \sqrt{m}} ~,
\end{align*}
where (a) follows from~\eqref{eq:lower_NTK_aux23}. That completes the proof.  \qed 

\theoNTKconvBall*

\proof 

\pcedit{First note that with probability at least $\left(1-\frac{2(L+1)}{m}\right)$, all the bounds in Theorem~\ref{theo:bound-Hess}, Lemma~\ref{cor:gradient-bounds}, Lemma~\ref{lem:BoundTotalLoss} and Corollary~\ref{cor:total-bound} hold (and so 
does Lemma~\ref{lemm:NTK_step}, since its proof uses these bounds).}

\pcedit{Before proceeding further, we note that for $L=O(1)$ we obtain the constant $c_H = O((1+\rho_1)(1+(4\nu_0+\frac{\rho}{\sqrt{m}})^{O(1)}))=O((1+\rho_1))$ following Theorem~\ref{theo:bound-Hess}, where the last equality follows from the fact that $m=\Omega(\frac{n}{\lambda_0^2})$ and $\rho=\Theta(\frac{\sqrt{n}}{\lambda_0})$. Moreover, since $\rho_1=\Theta(\frac{\sqrt{n}}{\lambda_0})$, we obtain $c_H=O(1+\frac{\sqrt{n}}{\lambda_0})$. 
%
We will use 
$$c_H \leq  c_2 (1+\frac{\sqrt{n}}{\lambda_0})$$ for some suitable constant $c_2>0$.}

\pcedit{
Likewise, we also note that $\varrho^2=O(1+\frac{1}{m}(1+\rho_1)^2(1+\gamma^{2(L+1)}))$ (see definition in Lemma~\ref{cor:gradient-bounds}). Then, using the fact that $L=O(1)$, $\rho_1,\rho=\Theta(\frac{\sqrt{n}}{\lambda_0})$ and $m=\Omega(\frac{n}{\lambda_0^2})$, we obtain that $\varrho^2=O(1)$. We will use 
$$\varrho^2 \leq  c_3$$ for some suitable constant $c_3>0$.
}

\pcedit{
Finally, we observe that $\bar{c}_{\rho_1,\gamma}=O((1+\rho_1^2)(1+\gamma^L))$ (see definition in Lemma~\ref{lem:BoundTotalLoss}). Then, taking the definition of $\beta$ (as in Theorem~\ref{theo:smoothnes}), we have that $\beta=b\varrho^2+\frac{1}{\sqrt{m}}O(\poly(L)(1+\gamma^{3L})(1+\rho_1^2))$. Again, in a similar fashion as in the analysis of the expressions $c_H$ and $\varrho$, we have that in our problem setting $\beta=O(1+\frac{\sqrt{n}}{\lambda_0})$. We will use 
$$\beta \leq  c_4 (1+\frac{\sqrt{n}}{\lambda_0})$$ for some suitable constant $c_4>0$.
}


We proceed by induction to show
\begin{align}
\bar{\cL}(\theta_{t+1}) \leq \left(1- \eta \lambda_0 \right)^{t} \bar{\cL}(\theta_0)~,
\end{align}
\pcedit{and $\theta_{t+1}\in B^{\spec}_{\rho,\rho_1}(\theta_0)$, for $t\leq T$.} The base case of the induction follows from the proof of Theorem~\ref{thm:conv-NTK}.
To continue with our proof by induction, we take the following induction hypothesis: we assume that 
\begin{align}
\bar{\cL}(\theta_t) \leq \left(1- \eta \lambda_0 \right)^{t-1} \bar{\cL}(\theta_0)
\end{align}
and that $\theta_\tau\in B^{\spec}_{\rho,\rho_1}(\theta_0)$ for $\tau\leq t$.

First, based on the choice of the step sizes, we show that $\theta_{t+1} \in B_{\rho,\rho_1}^{\spec}(\theta_0)$. To see this, note that, using similar inequalities as in our analysis for the case $t=1$,
\begin{align*}
\| \theta_{t+1} - \theta_0 \|_2 & \leq \sum_{\tau=0}^{t} \| \theta_{\tau+1} - \theta_{\tau} \|_2 \\
& = \sum_{\tau=0}^t \eta \| \nabla_\theta \bar{\cL}(\theta_{\tau})\|_2 \\
& \leq 2 \varrho \eta   \sum_{\tau=0}^t  \sqrt{\bar{\cL}(\theta_\tau)} \\
& \overset{(a)}{\leq} 2 \varrho \eta   \left(\sum_{\tau=0}^t \left(1 - \eta \lambda_0 \right)^{\tau/2} \right) \sqrt{\bar{\cL}(\theta_0)}\\
& \leq 2 \varrho \eta \frac{\sqrt{\bar{\cL}(\theta_0)}}{1 - \sqrt{1- \eta \lambda_0}} \\
& \overset{(b)}{\leq} \frac{4 \varrho \sqrt{\bar{\cL}(\theta_0)}}{\lambda_0}\\
& \leq 4 \varrho \sqrt{\bar{c}_{0,4\nu_0}} \frac{\sqrt{n} }{\lambda_0} \\
& \overset{(c)}{\leq} \abedit{\rho, \rho_1}~,
\end{align*}

%\abcomment{need to handle $\rho_1$ suitably, or can we make $\rho_1 = \rho$ for this work}
where (a) follows from our induction hypothesis, (b) follows from $\frac{x}{1-\sqrt{1-x\lambda_0}}\leq \frac{2}{\lambda_0}$ for $x<\frac{1}{\lambda_0}$, and (c) follows since $\rho, \rho_1 = \Theta(\frac{\sqrt{n}}{\lambda_0})$.  \pccomment{Again, I wonder if there is a circular argument here. Since $\varrho$ depends on both $\rho$ and $\rho_1$, I used the bounds on $\Theta$ dependency on $\rho, \rho_1$ to show that $\varrho$ is upper bounded by a constant without $n$ being explicit --- yet, at the same time, we have that $\rho,\rho_1$ is defined as an upper bound to $\varrho$. I think the derivation is fine since $m$ can be chosen sufficiently large; still, the whole derivation must be double-checked.} 

% Based on the choice of the step sizes, we show that $\theta_{t+1} \in B_{\rho,\rho_1}^{(\spec}(\theta_0), t \in [T]$ for $T = \log \frac{1}{\epsilon}$ \abcomment{update}. To see this, note that for $t \in [T]$
% \begin{align*}
% \| \theta_{t+1} - \theta_0 \|_2 & \leq \sum_{\tau=0}^{t} \| \theta_{\tau+1} - \theta_{\tau} \|_2 \\
% & = \sum_{\tau=0}^t \eta \| \nabla \bar{\cL}(\theta_{\tau} \|_2 \\
% & \overset{(a)}{\leq} 2 \varrho \eta   t \sqrt{n c_{\rho_1,\gamma}} \\
% & \overset{(b)}{\leq} 2 \varrho^2 \eta c' \log \frac{1}{\epsilon} \sqrt{n c_{\rho_1,\gamma}}\\
% & \leq \rho~,
% \end{align*}
% since $T = c' \log \frac{1}{\epsilon}$ for some $c' > 0$.

\pcdelete{Before proceeding further, we note that for $L=O(1)$, the constant $c_H = O(\rho_1)$ following Remark 4.3 in \cite{baner22sgld} and we will use $c_H \leq  c_2 \frac{\sqrt{n}}{\lambda_0}$ for some suitable constant $c_2>0$. %Further, $L\rho + \rho_1 \leq c_3 \frac{\sqrt{n}}{\lambda_0}$. 
}

Now, we have
\begin{align*}
\lambda_{\min}(K_{\ntk}(\cdot;\theta_t)) 
& \overset{(a)}{\geq} \lambda_{\min}(K_{\ntk}(\cdot;\theta_{t-1})) - 4 c_H \varrho^2 \frac{n}{\sqrt{m}} \eta \sqrt{\bar{\cL}(\theta_{t-1})} \\
& \geq \lambda_{\min}(K_{\ntk}(\cdot;\theta_0)) - 4 c_H \varrho^2 \eta   \frac{n}{\sqrt{m}} \sum_{\tau=0}^{t-1}  \sqrt{\bar{\cL}(\theta_{\tau})}\\
& \overset{(b)}{\geq} \lambda_0 - 4 c_2  \varrho^2 \eta \frac{n^{3/2}}{\lambda_0\sqrt{m}} t \sqrt{\bar{\cL}(\theta_0)}\\
%& \pcedit{\geq \lambda_0 - 4 c_2 c_3 \eta \frac{n^{5/2} }{\sqrt{m}}\pcedit{\poly(\frac{1}{\lambda_0})} t \sqrt{\bar{\cL}(\theta_0)} }\\
& \overset{(c)}{\geq} \lambda_0 - 4 c_2 c_3 \varrho \eta \frac{n^2 \tilde{c}}{\lambda_0 \sqrt{m}} \log \frac{1}{\epsilon} \\
%& \overset{(c)}{\geq} \lambda_0 -  \frac{n^{3/2}}{\lambda_0 \sqrt{m}} \frac{\bar{c} \eta }{1 - \sqrt{1- \eta \lambda_0}} \\
& \overset{(d)}{\geq} \lambda_0 -  \frac{\sqrt{n}}{2 \sqrt{m}} \lambda_0  ~,
\end{align*}
%\abcomment{ongoing, because [LZB'20] does not help, since their Theorem 2 needs $\| H \|_2 \leq \frac{c}{n^2}$ which needs $m = n^4$}
%
where (a) follows from Lemma~\ref{lemm:NTK_step}, (b) follows by the induction hypothesis implying $\bar{\cL}(\theta_{\tau}) \leq \bar{\cL}(\theta_0)$, 
%
and (c) follows 
since $t \leq c_3 \log \frac{n}{\epsilon}$ and (d) follows since $\eta \leq \frac{\lambda_0^2}{c' n^2}$ for  some suitable constant $c'$.
Then, with $m = \Omega(n)$ we have $\lambda_{\min}(K_{\ntk}(\cdot;\theta_t)) \geq \lambda_0/2$.


Since $\theta_t,\theta_{t+1}\in B_{\rho,\rho_1}^{\spec}(\theta_0)$, we now take the smoothness property shown by~\cite[Theorem~5.2]{AB-PCV-LZ-MB:22}, with $\beta=O(1)$ given that $L=O(1)$, and further obtain
\begin{equation}
\begin{aligned}
\bar{\cL}(\theta_{t+1})-\bar{\cL}(\theta_{t})&\leq \langle \theta_{t+1}-\theta_t,\nabla_{\theta}\bar{\cL}(\theta_t)\rangle  +\frac{\beta\pcedit{n}}{2}\norm{\theta_{t+1}-\theta_t}^2_2\\
& \overset{(a)}{\leq} -\eta_t\norm{\nabla_\theta \bar{\cL}(\theta_t)}_2^2 +\frac{\beta\eta_t^2\pcedit{n}}{2}\norm{\nabla_\theta \bar{\cL}(\theta_t)}^2_2\\
&= - \eta_t \left(1 - \frac{\beta\eta_t\pcedit{n}}{2} \right)\norm{\nabla_\theta \bar{\cL}(\theta_t)}^2_2\\
%
&\overset{(b)}{\leq} - \frac{\eta}{2} \ell_t'^\top K_{\ntk}(\theta_t)\ell_t' \\
&\overset{(c)}{\leq} - \frac{\eta}{2} ~\lambda_{\min}(K_{\ntk}(\theta_t))~ \|\ell'_t \|_2^2 \\
%&\overset{(c)}{\leq} - \frac{\eta_t}{2} \left( \lambda_0 - \tilde{c} \frac{n}{\sqrt{m}} \sum_{\tau=1}^t \eta_{\tau} \right) \|\ell'_t \|_2^2 \\
&\overset{(d)}{\leq} - \frac{\eta}{2}  \frac{\lambda_0}{2} 4 \bar{\cL}(\theta_t) \\
\implies \qquad \bar{\cL}(\theta_{t+1})&\leq \left(1- \eta  \lambda_0 \right) \bar{\cL}(\theta_t),
\end{aligned}
\end{equation}
where (a) follows from the gradient descent update;  
%
(b) follows from our choice of step-size and 
\begin{align*}
\left\| \nabla \bar{\cL}(\theta_t) \right\|_2^2 = \left\| \sum_{i=1}^n \ell'_{t,i} \nabla f(\theta_t;\x_i) \right\|_2^2 =  \sum_{i=1}^n \sum_{j=1}^n \ell'_{t,i} \ell'_{t,j} \langle \nabla_{\theta} f(\theta_t;\x_i), \nabla_{\theta} f(\theta_t;\x_j) \rangle = \ell_t'^T K_{\ntk}(\theta_t) \ell_t'~,
\end{align*}
where $\ell_t' := [\ell'_{t,i}] \in \R^n$ evaluated at $\theta_t$ and also using $\eta_t  \leq \frac{1}{\beta n}$ so that $-(1-\frac{\beta \eta_t n}{2}) \leq -\frac{1}{2}$; 
%
(c)  follows by the definition of minimum eigenvalue;
%
and (d) follows since 
\abdelete{
\begin{align*}
\tilde{c} \frac{n}{\sqrt{m}} \sum_{\tau=1}^t \eta_{\tau} \leq \tilde{c} \frac{n}{\sqrt{m}} T \eta \leq \frac{\lambda_0}{2}~,
\end{align*}
so that $\left(\frac{\beta\eta_t^2}{2}-\eta_t\right)<0$ and $\lambda_{\min}(K_{\ntk}(\cdot;\theta))\geq \frac{\lambda_0}{K}$; }
%
with $\ell_{t,i} = (y_i - f(\theta_t;\x_i))^2$, we have $\ell'_{t,i} = -2(y_i-f(\theta_t;\x_i))$ so that with $\ell'_t := [\ell'_{t,i}]$, we have 
\begin{equation}
\norm{\ell'_t}^2_2 =\sum_{i=1}^n \ell'^2_{t,i} = 4\sum_{i=1}^n (y_i - f(\theta_t;\x_i))^2 = 4 \bar{\cL}(\theta_t)~.
\label{eq:sqlossgrad}
\end{equation}
Repeating the same argument for $t \in [T]$ for 
\begin{align}
T = \Omega\left( \frac{\log \frac{\bar{\cL}(\theta_0)}{\epsilon}}{\log \frac{1}{1-\eta \lambda_0}} \right) ~,
%= \Omega\left( \log \frac{1}{\epsilon}\right)~, 
\end{align}
we have 
\begin{align*}
\bar{\cL}(\theta_{T+1}) \leq (1-\eta \lambda_0)^T \bar{\cL}(\theta_0) \leq \epsilon~.
\end{align*}
That  completes the proof. \qed 





