\section{Hessian for \texorpdfstring{$X$}{} Is PSD}\label{sec:psd_H_xx}

In Section~\ref{sub:psd_H_xx:main_res}, we present the main result on the PSD bound for the Hessian. In Section~\ref{sub:psd_H_xx:psd}, we show the PSD bound for $B(x)$. In this section, our focus is on establishing the PSD bound for $H_{x,x}$. Throughout this section, we use the symbol $H$ to represent $H_{x,x}$ for the sake of simplicity.
\subsection{Main Result}\label{sub:psd_H_xx:main_res}

In this section, we introduce the main result of the PSD bound for Hessian.

\begin{lemma}\label{lem:hessian_property:x}
If the following conditions hold
\begin{itemize}
    \item Let $j_0 \in [n]$
    \item Let $i_0 \in [d]$
    \item Let $H_{j_0,i_0} = \frac{\d^2 L_{j_0,i_0}}{\d x \d x} \in \R^{d^2 \times d^2}$
    \item Let $B_{j_0,i_0}(x) \in \R^{n \times n}$ be defined as Definition~\ref{def:B(x)}.
    \begin{itemize}
        \item Therefore, $H_{j_0,i_0} = \A_{j_0}^\top B_{j_0,i_0}(x) \A_{j_0} \in \R^{d^2 \times d^2}$
    \end{itemize}
    \item Let $\max_{j_0 \in [n]} \| \A_{j_0} \| \leq R$
    \item Let $\sigma_{\min}$ be the smallest singular value. We define $\sigma_{\min}(\A_{\min}) := \min_{j_0 \in [n]} \sigma_{\min} (\A_{j_0})$.
    \item Let $H = \sum_{j_0=1}^n \sum_{i_0=1}^d H_{j_0,i_0}$ 
    \item Let $H_{\reg,j_0,i_0} =  \A_{j_0}^\top ( B_{j_0,i_0}(x) + W^2) \A_{j_0}$ where $W \in \R^{n \times n}$ is a positive diagonal matrix.
    \item Let $H_{\reg} = \sum_{j_0=1}^n \sum_{i_0=1}^d H_{\reg,j_0,i_0}$
    \item Let $C_0: = 30 R^8$ (be a local parameter in this lemma)
    \item Let $l > 0$ (denote the strongly convex parameter for the Hessian)
\end{itemize}
Then, we have
\begin{itemize}
\item {\bf Part 1.} For each $j_0 \in [n]$, for each $i_0 \in [d]$
\begin{align*}
    - C_0 I_n \preceq B_{j_0,i_0}(x) \preceq C_0 I_n
\end{align*}
\item {\bf Part 2.} For each $j_0 \in [n]$, for each $i_0 \in [d]$
\begin{align*}
    \| H_{j_0,i_0}(x) \| \leq C_0 R^2. 
\end{align*}
\item {\bf Part 3.} For each $j_0 \in [n]$, $i_0 \in [d]$, if $\min_{j_1 \in [n]} w_{j_1,j_1} \geq \frac{ l }{ \sigma_{\min}(\A_{j_0})^{2} } + C_0 $, then we have
\begin{align*}
    H_{\reg,j_0,i_0}(x) \succeq l \cdot I_{d^2}
\end{align*}
\item {\bf Part 4.} For each $j_0 \in [n]$, $i_0 \in [d]$, if $\min_{j_1 \in [n]} w_{j_1,j_1} \geq \frac{ l }{ \sigma_{\min}(\A_{j_0})^2 } + 100 \cdot C_0 $, then we have
\begin{align*}
 1.1 \cdot ( B_{j_0,i_0}(x) + W^2 ) \succeq W^2 \succeq 0.9 \cdot ( B_{j_0,i_0}(x) + W^2 ) 
\end{align*}
and 
\begin{align*}
   1.1 H_{j_0,i_0} \succeq H_{\reg,j_0,i_0} \succeq 0.9 H_{j_0,i_0}
\end{align*}
\item {\bf Part 5.}
For each $j_0 \in [n]$, $i_0 \in [d]$, if $\min_{j_1 \in [n]} w_{j_1,j_1} \geq \frac{ l }{ nd \sigma_{\min}(\A_{\min})^{2} } + C_0 $, then we have
\begin{align*}
    H_{\reg}(x) \succeq l \cdot I_{d^2}
\end{align*}
\item {\bf Part 6.} For each $j_0 \in [n]$, $i_0 \in [d]$, if $\min_{j_1 \in [n]} w_{j_1,j_1} \geq \frac{ l }{ nd \sigma_{\min}(\A_{\min})^{2} } + 100 \cdot C_0 $, then we have
\begin{align*}
1.1 H \succeq H_{\reg} \succeq 0.9  H
\end{align*}
\end{itemize}
\end{lemma}
\begin{proof}
{\bf Proof of Part 1.}

It directly follows from Lemma~\ref{lem:hessian_X_psd_tool}.

{\bf Proof of Part 2.}
We have
\begin{align*}
   \| H_{j_0,i_0} \| 
   = & ~ \| \A_{j_0}^\top B_{j_0,i_0}(x) \A_{j_0} \| \\
   \leq & ~ \| \A_{j_0} \|^2 \cdot \| B_{j_0,i_0}(x) \| \\
   \leq & ~ R^2 \cdot \| B_{j_0,i_0}(x) \| \\
   \leq & ~ 30 R^{10} 
\end{align*}
where the first step follows from $H_{j_0,i_0} = \A_{j_0}^\top B_{j_0,i_0}(x) \A_{j_0}$, the second step follows from Fact~\ref{fac:matrix_norm}, the third step follows from $\max_{j_0 \in [n]} \| \A_{j_0} \| \leq R$, and the last step follows from {\bf Part 1} and $C_0 = 30 R^8$.

{\bf Proof of Part 3.}

The proof is similar to \cite{dls23}.

{\bf Proof of Part 4.}


The proof is similar to \cite{dls23}.

{\bf Proof of Part 5 and Part 6.}
It is because we can write $H$ as a summation of $nd$ terms $H_{j_0,i_0}$ for all $j_0 \in [n]$, $i_0 \in [d]$.
\end{proof}

\subsection{PSD Bound}\label{sub:psd_H_xx:psd}

In this section, we analyze the PSD bound for each of the $B_{\rank}$ and $B_{\diag}$.

\begin{lemma}\label{lem:hessian_X_psd_tool}
If the following conditions hold
\begin{itemize}
    \item $B_{\diag}^1 := (1-\gamma_{j_0}(x)) \cdot c(x,:)_{j_0,i_0} \cdot \diag( f(x)_{j_0} \circ v )$
    \item $B_{\rank}^1 := -( 2 \gamma_{j_0}(x) + c(x,:)_{j_0,i_0}) \cdot ( ( f(x)_{j_0} \circ v ) f(x)_{j_0}^\top + f(x)_{j_0} (f(x)_{j_0} \circ v)^\top )$
    \item $B_{\rank}^2 := ( 2 \gamma_{j_0}(x) c(x,:)_{j_0,i_0} + \gamma_{j_0}(x)^2 ) \cdot f(x)_{j_0} f(x)_{j_0}^\top $
    \item $B_{\rank}^3 := (f(x)_{j_0} \circ v) \cdot ( f(x)_{j_0} \circ v )^\top$
    \item $|\gamma(x)_{j_0}| \leq R^2$
    \item $|c(x,:)_{j_0,i_0}| \leq 2R^2$
    \item $\| v \|_2 \leq R^2$
\end{itemize}
Then, we have
\begin{itemize}
    \item Part 1.
    \begin{align*}
      -8R^6 \cdot I_n \preceq B_{\diag}^1 \preceq 8 R^6 \cdot I_n
    \end{align*}   
    \item Part 2. 
    \begin{align*}
        -16 R^8 \cdot I_n \preceq B_{\rank}^1 \preceq 16 R^8 \cdot I_n
    \end{align*}   
    \item Part 3.
    \begin{align*}
        -8 R^4 \cdot I_n \preceq B_{\rank}^2 \preceq 8 R^4 \cdot I_n
    \end{align*} 
    \item Part 4.
    \begin{align*}
        0 \cdot I_n \preceq B_{\rank}^3 \preceq 8 R^4 \cdot I_n
    \end{align*}
\end{itemize}
\end{lemma}
\begin{proof}
{\bf Proof of Part 1.}
\begin{align*}
     B_{\diag}^1 = & ~(1-\gamma_{j_0}(x)) \cdot c(x,:)_{j_0,i_0} \cdot \diag( f(x)_{j_0} \circ v ) \\
     \preceq & ~ |1-\gamma_{j_0}(x) | \cdot |c(x,:)_{j_0,i_0}| \cdot \|f(x)_{j_0}\|_2 \cdot \| v \|_2 \cdot I_n \\
    \preceq & ~ 8R^6 \cdot I_n
\end{align*}
where the first step follows from the definition of $B_{\diag}^1$, the second step follows from Fact~\ref{fac:psd_rule}, and the last step follows from Lemma~\ref{lem:upper_bound}, $|\gamma(x)_{j_0}| \leq R^2,~|c(x,:)_{j_0,i_0}| \leq 2R^2$, and $\| v \|_2 \leq R^2$. 

{\bf Proof of Part 2.}
\begin{align*}
    B_{\rank}^1 
    = & ~ -( 2 \gamma_{j_0}(x) + c(x,:)_{j_0,i_0}) \cdot ( ( f(x)_{j_0} \circ v ) f(x)_{j_0}^\top + f(x)_{j_0} (f(x)_{j_0} \circ v)^\top ) \\
    \succeq & ~ - |2 \gamma_{j_0}(x) + c(x,:)_{j_0,i_0} | \cdot (( f(x)_{j_0} \circ v )\cdot  (f(x)_{j_0} \circ v )^{\top} + f(x)_{j_0}f(x)_{j_0}^\top) \\
    \succeq & ~ -4R^2 \cdot (\|f(x)_{j_0} \circ v\|_2^2 + \| f(x)_{j_0}\|_2^2) I_n \\
    \succeq & ~ -4R^2(\| f(x)_{j_0}\|_2^2 \| v\|_2^2 + \| f(x)_{j_0}\|_2^2) I_n \\
    \succeq & ~ -16R^8 \cdot I_n
\end{align*}
where the first step follows from the definition of $B_{\rank}^1$, the second step follows from Fact~\ref{fac:psd_rule}, the third step follows from $|\gamma(x)_{j_0}| \leq R^2,~
    |c(x,:)_{j_0,i_0}| \leq 2R^2$ and Fact~\ref{fac:psd_rule}, the fourth step follows from Fact~\ref{fac:circ_rules}, and last step follows from $\|f(x)_{j_0}\|_2 \leq 1$ (see {\bf Part 4} of Lemma~\ref{lem:upper_bound}) and $\| v\|_2 \leq R^2$. 

{\bf Proof of Part 3.}
\begin{align*}
    B_{\rank}^2 
    = & ~( 2 \gamma_{j_0}(x) c(x,:)_{j_0,i_0} + \gamma_{j_0}(x)^2 ) \cdot f(x)_{j_0} f(x)_{j_0}^\top \\
    \preceq & ~ |2 \gamma_{j_0}(x) c(x,:)_{j_0,i_0} + \gamma_{j_0}(x)^2 | \cdot \|f(x)_{j_0} \|_2^2 \cdot I_n \\
    \preceq & ~ 8R^4 \cdot I_n
\end{align*}
where the first step follows from definition of $B_{\rank}^2$, the second step follows from Fact~\ref{fac:psd_rule}, and the last step follows from $|\gamma(x)_{j_0}| \leq R^2,~
    |c(x,:)_{j_0,i_0}| \leq 2R^2$ and Lemma~\ref{lem:upper_bound}.

{\bf Proof of Part 4.}
\begin{align*}
    B_{\rank}^3 
    = & ~ (f(x)_{j_0} \circ v) \cdot ( f(x)_{j_0} \circ v )^\top \\
    \preceq & ~ \|f(x)_{j_0} \circ v\|_2^2 \cdot I_n \\
    \preceq & ~ \|f(x)_{j_0} \|_2^2 \|  v\|_2^2 \cdot I_n \\
    \preceq & ~ 8R^4 \cdot I_n
\end{align*}
where the first step follows from definition of $B_{\rank}^3$, the second step follows from Fact~\ref{fac:psd_rule}, the third step follows from Fact~\ref{fac:circ_rules}, and the last step follows from $\| v\|_2 \leq R^2$ and Lemma~\ref{lem:upper_bound}.

\end{proof}