\section{Hessian for \texorpdfstring{$Y$}{Y}}\label{sec:hessian_Y}
In Section~\ref{sub:hessian_Y:hessian}, we present the Hessian property with respect to $Y$. In Section~\ref{sub:hessian_Y:one_j0i0}, we compute the Hessian matrix with respect to $Y$ for one $j_0,i_0$.
\subsection{Hessian Property}\label{sub:hessian_Y:hessian}

In this section, we analyze the Hessian properties. 

\begin{lemma}\label{lem:hessian_property:y}
If the following conditions hold
\begin{itemize}
    \item Let $B_{j_0}(x) = f(x)_{j_0} f(x)_{j_0}^\top \in \R^{n \times n}$ (because of Lemma~\ref{lem:hessian_y_j_0_i_0})
    \item Let $B(x) = \sum_{j_0=1}^n B_{j_0}(x)$
    \item Let $H_{j_0,i_0} = \frac{\d^2 L_{j_0,i_0}}{ \d y_{i_0} \d y_{i_0}} = A_3^\top B_{j_0}(x) A_3 \in \R^{d \times d}$
    \item Let $H_{i_0} \in \R^{d \times d}$ be $H_{i_0} = \frac{\d^2 L}{\d y_{i_0} \d y_{i_0}} = \sum_{j_0=1}^n H_{j_0,i_0} $
    \item Let $H_{\reg,i_0} = A_3^\top ( B(x) + W^2 ) A_3$ where $W \in \R^{n \times n}$ is a positive diagonal matrix
    \item Let $H(y) \in \R^{d^2 \times d^2} $ be $ H(y)= \begin{bmatrix}
    H_{1} & 0 & \cdots & 0 \\
    0 & H_2& \cdots & 0 \\
    \vdots & \vdots & \ddots & \vdots \\
    0 & 0 & \cdots & H_d
    \end{bmatrix}$
\end{itemize}
Then, we have
\begin{itemize}
    \item {\bf Part 1.}
    \begin{align*}
        0 \preceq B_{j_0}(x) \preceq I_n
    \end{align*}
    \item {\bf Part 2.}
    \begin{align*}
        0 \preceq B(x) \preceq n \cdot I_n
    \end{align*}
    \item {\bf Part 3.} If $\min_{j_1 \in [n]} w_{j_1,j_1}^2 \geq \frac{l}{\sigma_{\min}(A_3)^2} $
    \begin{align*}
        H_{\reg, i_0} \succeq l \cdot I_{d}, ~~~ H(y) \succeq l\cdot I_{d^2}
    \end{align*}
    \item {\bf Part 4.} If $\min_{j_1 \in [n]} w_{j_1,j_1}^2 \geq \frac{l}{\sigma_{\min}(A_3)^2} + 100 n$
    \begin{align*}
        0.9  (W^2+B(x)) \preceq    W^2 \preceq 1.1 (W^2+B(x))
    \end{align*}
    \item {\bf Part 5.} Lipschitz. Since $H(y)$ is independent of $y$, we have
    \begin{align*}
        \| H(y) - H(\wt{y}) \| \leq \| y - \wt{y} \|_2
    \end{align*}
\end{itemize}
\end{lemma}
\begin{proof}
The closed form of the Hessian follows from Lemma~\ref{lem:hessian_y_j_0_i_0}.

The proofs are straightforward, so we omit the details here.
\end{proof}

\subsection{Hessian for One \texorpdfstring{$j_0,i_0$}{j0, i0}}\label{sub:hessian_Y:one_j0i0}

In this section, we analyze the Hessian for the matrix $Y$ with one $j_0,i_0$.

\begin{lemma}\label{lem:hessian_y_j_0_i_0}
If the following conditions hold
\begin{itemize}
    \item We define a temporary notation here $v := f(x)_{j_0}$ (for simplicity, we drop the index $j_0$ in the statement; note that $v$ may have a different meaning in other sections).
    \item Let $f(x)_{j_0}$ be defined as Definition~\ref{def:f}.
    \item Let $c(:,y)_{j_0,i_0}$ be defined as Definition~\ref{def:c}.
    \item Let $h(y)_{i_0}$ be defined as Definition~\ref{def:h}.
    \item Let $L_{j_0,i_0}$ be defined as Definition~\ref{def:f}.
\end{itemize}
Then, we have
\begin{itemize}
    \item {\bf Part 1.} For $i_1 = i_2$, the diagonal case
    \begin{align*}
        \frac{\d^2 L_{j_0,i_0}}{\d y_{i_0,i_1} \d y_{i_0,i_1}} = A_{3,*,i_1}^\top v v^\top A_{3,*,i_1}
    \end{align*}
    \item {\bf Part 2.} For $i_1 \neq i_2$, the off-diagonal case
    \begin{align*}
        \frac{\d^2 L_{j_0,i_0} }{ \d y_{i_0,i_1} \d y_{i_0,i_2} } = A_{3,*,i_1}^\top v v^\top A_{3,*,i_2}
    \end{align*}
    \item {\bf Part 3.} The Hessian $\frac{\d^2 L_{j_0,i_0}}{\d y_{i_0} \d y_{i_0}} \in \R^{d \times d}$ is
    \begin{align*}
        \frac{\d^2 L_{j_0,i_0}}{\d y_{i_0} \d y_{i_0}} = A_{3}^\top vv^\top A_3
    \end{align*}
\end{itemize}
\end{lemma}
\begin{proof}

{\bf Proof of Part 1.}

\begin{align*}
\frac{\d^2 L_{j_0,i_0}}{\d y_{i_0,i_1} \d y_{i_0,i_1}} 
= & ~ \frac{\d }{\d y_{i_0,i_1}} ( \frac{\d }{\d y_{i_0,i_1}} L_{j_0,i_0} ) \\
= & ~ \frac{\d }{\d y_{i_0,i_1}} ( c(:,y)_{j_0,i_0} \langle v, A_{3,*,i_1} \rangle ) \\
= & ~ \langle v, A_{3,*,i_1} \rangle \cdot \langle v, A_{3,*,i_1} \rangle \\
= & ~ A_{3,*,i_1}^\top v v^\top A_{3,*,i_1}
\end{align*}
where the first step follows from simple algebra, the second step follows from Lemma~\ref{lem:gradient_y}, the third step follows from Lemma~\ref{lem:gradient_y}, and the last step follows from Fact~\ref{fac:circ_rules}.

{\bf Proof of Part 2.}

\begin{align*}
\frac{\d^2 L_{j_0,i_0}}{\d y_{i_0,i_2} \d y_{i_0,i_1}} 
= & ~ \frac{\d }{\d y_{i_0,i_2}} ( \frac{\d }{\d y_{i_0,i_1}} L_{j_0,i_0} ) \\
= & ~ \frac{\d }{\d y_{i_0,i_2}} ( c(:,y)_{j_0,i_0} \langle v, A_{3,*,i_1} \rangle ) \\
= & ~ \langle v, A_{3,*,i_2} \rangle \cdot \langle v, A_{3,*,i_1} \rangle \\
= & ~ A_{3,*,i_1}^\top v v^\top A_{3,*,i_2}
\end{align*}
where the first step follows from simple algebra, the second step follows from Lemma~\ref{lem:gradient_y}, the third step follows from Lemma~\ref{lem:gradient_y}, and the last step follows from Fact~\ref{fac:circ_rules}.

{\bf Proof of Part 3.}

It follows directly by combining the above two parts.
\end{proof}