
\paragraph{Roadmap.}

In Section~\ref{sec:preli}, we present the basic notations we use, some mathematical facts, and helpful definitions that support the following proof. In Section~\ref{sec:gradient}, we compute the gradients of the helpful functions defined earlier. In Section~\ref{sec:hessian}, we define the Hessian for further discussion. In Section~\ref{sec:hessian_X}, we compute the Hessian matrix with respect to $X$. In Section~\ref{sec:lips_H_xx}, we demonstrate that the Hessian for $X$ is Lipschitz. In Section~\ref{sec:psd_H_xx}, we show that the Hessian matrix with respect to $X$ is positive semidefinite (PSD). In Section~\ref{sec:hessian_Y}, we compute the Hessian matrix with respect to $Y$ and show that it is Lipschitz and positive semidefinite (PSD). In Section~\ref{sec:hessian_XY}, we compute the Hessian matrix with respect to both $X$ and $Y$. In Section~\ref{sec:lips_H_xy}, we demonstrate that the Hessian matrix with respect to both $X$ and $Y$ is Lipschitz. In Section~\ref{sec:tensorsketch}, we introduce some tensor sketch techniques to obtain fast approximations of the Hessian. In Section~\ref{sec:newton}, we introduce the Newton step.


\section{Preliminaries}\label{sec:preli}

In Section~\ref{sub:preli:basic_facts}, we present the basic mathematical properties of vectors, norms and matrices. In Section~\ref{sub:preli:general_def}, we provide a definition of $L(X,Y)$. In Section~\ref{sub:preli:help_def_x}, we define a series of helpful functions with respect to $X$. In Section~\ref{sub:preli:help_def_y}, we define a series of helpful functions with respect to $Y$. In Section~\ref{sub:preli:help_def_xy}, we define a series of helpful functions with respect to both $X$ and $Y$. In Section~\ref{sub:preli:regularization}, we define the regularization function. In Section~\ref{sub:preli:fast_matrix_multi}, we introduce facts related to fast matrix multiplication.

\paragraph{Notation}

Now we define the basic notations we use in this paper. 

First, we define the notations related to the sets. We use $\mathbb{N}$ to denote the set of positive integers, namely $\mathbb{N} := \{1, 2, 3, \dots\}$. Let $n$ and $d$ be in $\mathbb{N}$. We define $[n] := \{1, 2, \dots, n\}$. We use $\R, \R^n, \R^{n \times d}$ to denote the set containing all real numbers, all $n$-dimensional vectors, and $n \times d$ matrices, whose entries are all in $\R$. We use $\R_+$ to denote the set containing all positive real numbers.

Then, we define the notations related to vectors. Let $x,y \in \R^d$. For all $i \in [d]$, we define $x_i \in \R$ as the $i$-th entry of $x$. We define $\langle \cdot, \cdot \rangle : \R^d \times \R^d \to \R$ as $\langle x, y \rangle := \sum_{i = 1}^d x_i y_i$, which is called the inner product between $x$ and $y$. We define $x \circ y \in \R^d$ as $(x \circ y)_i := x_i \cdot y_i$, for all $i \in [d]$. For all $p \in \{1, 2\}$, we define $\|x\|_p : = (\sum_{i \in [d]} |x_i|^p)^{1/p}$, which is the $\ell_p$ norm of $x$, and we define $\|x\|_\infty := \max_{i \in [d]} |x_i|$, which is the $\ell_\infty$ norm of $x$. We use ${\bf 1}_d$ and ${\bf 0}_d$ to denote the $d$-dimensional vectors whose entries are all $1$'s and $0$'s, respectively.

After that, we define the notations related to matrices. Let $A \in \R^{n \times d}$. For all $i \in [n]$ and $j \in [d]$, we use $A_{i, j} \in \R$ to denote the entry of $A$ at the $i$-th row and $j$-th column, and use $A_{i, *} \in \R^d$ and $A_{*, j} \in \R^n$ to denote vectors, where $(A_{i, *})_j = A_{i, j} = (A_{*, j})_i$. We use $A^\top \in \R^{d \times n}$ to denote the transpose of the matrix $A$, where $(A^\top)_{j, i} = A_{i, j}$. For $X \in \R^{d \times d}$, we define $x = \vect(X) \in \R^{d^2}$ as $X_{i,j} = \vect(X)_{(i - 1) \times d + j}$. For $x \in \R^d$, we define $\diag(x) \in \R^{d \times d}$ as $\diag(x)_{i, i} = x_i$, for all $i \in [d]$ and other entries of $\diag(x)$ are all $0$'s. $\| A \|_F \in \R$ and $\|A\| \in \R$ denote the Frobenius norm and the spectral norm of $A \in \R^{n \times d}$, respectively, where $\|A\|_F := \sqrt{\sum_{i \in [n]} \sum_{j \in [d]} |A_{i, j}|^2}$ and $\| A \| := \max_{x \in \R^d \setminus \{ {\bf 0}_d \}} \| A x \|_2 / \| x \|_2$. Let $\A \in \R^{n^2 \times d^2}$. For each $j_1 \in [n]$, we use $\A_{j_1} \in \R^{n \times d^2}$ to denote one $n \times d^2$ block from $\A \in \R^{n^2 \times d^2}$. Let $C, D \in \R^{d \times d}$ be symmetric matrices. We write $C \succeq D$ if for all $y \in \R^{d}$, $y^\top C y \geq y^\top D y$. $C$ is said to be a positive semidefinite (PSD) matrix if $y^\top C y \geq 0$ for all $y \in \R^d$. We use $I_d$ to denote the $d \times d$ identity matrix. $\nnz(A)$ represents the number of entries in the matrix $A$ that are not equal to zero. ${\bf 0}_{n \times n} \in \R^{n \times n}$ is a matrix, where for all $i,j \in [n]$, $({\bf 0}_{n \times n})_{i, j} = 0$.

Let $n_1, n_2, d_1, d_2$ be positive integers. Let $A \in \R^{n_1 \times d_1}$ and $B \in \R^{n_2 \times d_2}$. We define the Kronecker product between matrices $A$ and $B$, denoted $A \otimes B \in \R^{n_1 n_2 \times d_1 d_2}$, as $(A \otimes B)_{(i_1 - 1) n_2 + i_2, (j_1-1)d_2+j_2}$ 
is equal to $A_{i_1,j_1} B_{i_2,j_2}$, where $i_1 \in [n_1], j_1 \in [d_1], i_2 \in [n_2], j_2 \in [d_2]$. $\mathrm{mat} : \R^{n^2} \to \R^{n \times n}$ is defined by $X_{i, j} = \mathrm{mat}(x)_{i, j} := x_{(i - 1) \cdot n + j}$, and $\vect = \mathrm{mat}^{-1}$.

\begin{figure}[!ht]
    \centering
    \includegraphics[width = 0.8\linewidth]{vec_mat.pdf}
    \caption{The visualization of the functions $\mathrm{mat} : \R^{n^2} \to \R^{n \times n}$ and $\vect = \mathrm{mat}^{-1} : \R^{n \times n} \to \R^{n^2}$. We have $x \in \R^{n^2}$ and $X \in \R^{n \times n}$. In this figure, we give an example of $n = 3$. In the left figure, by the function $\mathrm{mat}$, the first three entries of the vector $x$ are mapped to $X_{1, 1}$, $X_{1, 2}$, and $X_{1, 3}$ respectively, the second three entries of the vector $x$ are mapped to $X_{2, 1}$, $X_{2, 2}$, and $X_{2, 3}$ respectively, and the third three entries of the vector $x$ are mapped to $X_{3, 1}$, $X_{3, 2}$, and $X_{3, 3}$ respectively. For the right figure, every entry in $X$ is mapped to $x$ by $\vect$ in the reverse pattern of $\mathrm{mat}$.}
    \label{fig:vec_mat}
\end{figure}

\subsection{Basic Facts}\label{sub:preli:basic_facts}

In this section, we will introduce the basic mathematical facts. 

\begin{fact}\label{fac:circ_rules}
    Let $a, b \in \R$.
    
    For all vectors $u,v,w,z \in \R^{n}$, we have 
    \begin{itemize}
        \item $\langle u,v \rangle = \langle u \circ v, {\bf 1}_n \rangle =  u^\top \mathrm{diag}(v)  {\bf 1}_n $
        \item $\langle u \circ v, w\rangle = \langle u \circ w, v\rangle$
        \item $\langle u \circ v, w \rangle =  \langle u \circ v \circ w, {\bf 1}_n  \rangle = u^\top \diag(v) w$
        \item $\langle u \circ v \circ w \circ z , {\bf 1}_n \rangle = u^\top \diag(v \circ w) z$
        \item $u \circ  v = v \circ u = \diag (u) \cdot v = \diag (v) \cdot u$ 
        \item $u^{\top}(v \circ w) = v^{\top}(u \circ w) = w^{\top}(u \circ v)= u^{\top}\diag(v) w = v^{\top}\diag(u) w = w^{\top}\diag(u) v$
        \item $ \diag (u)^{\top} = \diag (u)$
        \item $\diag (u) \cdot \diag (v) \cdot {\bf 1}_n = \diag(u) v$
        \item $\diag (u \circ v) = \diag (u) \diag (v)$
        \item $\diag (u) + \diag (v) = \diag (u +v)$
        \item $\langle u,v \rangle = \langle v,u \rangle$
        \item $\langle u,v \rangle = u^\top v = v^\top u$
        \item $a\langle w, v \rangle + b\langle u, v \rangle = \langle aw + bu, v \rangle = \langle v, aw + bu \rangle = a\langle v, w \rangle + b\langle v, u \rangle$.
    \end{itemize}
\end{fact}
\begin{fact}\label{fac:vector_norm}
Let $R > 0$ be a real number.

For vectors $x,y \in \R^n$ and $\alpha \in \R$, we have
\begin{itemize}
    \item $\| x \circ y \|_2 \leq \| x \|_{\infty} \cdot \| y \|_2$
    \item $\| x \|_{\infty} \leq \| x \|_2 \leq \sqrt{n} \| x \|_{\infty}$
    \item $\| \exp(x) \|_{\infty} \leq \exp(\| x \|_2)$ 
    \item $\|x + y \|_2 \leq \| x\|_2 + \| y\|_2$
    \item $ \| \alpha x \|_2 \leq |\alpha| \cdot \| x\|_2$
    \item For any $\| x \|_2, \|y\|_2 \leq R$, we have $\| \exp(x) - \exp(y) \|_2 \leq \exp(R) \cdot \| x - y \|_2$
\end{itemize}
\end{fact}

\begin{fact}\label{fac:matrix_norm}
For any matrices $X ,Y \in \R^{n \times n}$ and for any vector $x \in \R^n$, we have 
\begin{itemize}
    \item $\| X^\top \| = \| X \|$ 
    \item $\| X \| \geq \| Y \| - \| X - Y \|$
    \item $\| X + Y \| \leq \| X \| + \| Y \|$
    \item $\| X \cdot Y \| \leq \| X \| \cdot \| Y \|$ 
    \item If $X \preceq \alpha \cdot Y$, then $\| X \| \leq \alpha \cdot \| Y \|$, for $X$ and $Y$ being PSD matrices and $\alpha > 0$.
    \item $\|Yx \|_2 \leq \| Y\| \cdot \|x \|_2$
\end{itemize}
\end{fact}
\begin{fact}\label{fac:psd_rule}
    For any vectors $u,v \in \R^n$, we have
    \begin{itemize}
        \item Part 1. $u u^{\top} \preceq \| u\|_2^2 \cdot I_n $
        \item Part 2. $\diag(u) \preceq \|u\|_2 \cdot I_n$
        \item Part 3. $\diag(u \circ u) \preceq \|u\|_2^2 \cdot I_n$
        \item Part 4. $uv^{\top} + vu^{\top} \preceq uu^
        \top + vv^{\top}$
        \item Part 5. $uv^{\top} + vu^{\top} \succeq -( uu^
        \top + vv^{\top})$
        \item Part 6. $(v \circ u) (v \circ u)^{\top} \preceq \| v\|^2_{\infty} u u^{\top}$
        \item  Part 7. $\diag(u \circ v) \preceq \|u\|_2\|v\|_2 \cdot I_n$
    \end{itemize}
\end{fact}

\begin{fact} \label{fac:exponential_der_rule}
Let $g, f: \R^d \to \R^n$ and $q: \R^d \to \R$. 

Let $x \in \R^d$ be an arbitrary vector.

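Let $a \in \R$ be an arbitrary real number, and let $t$ denote a scalar variable with respect to which we differentiate (e.g., an entry of $x$).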

Then, we have
    \begin{itemize}
        \item $\frac{\d q(x)^a}{\d x} =  a\cdot q(x)^{a-1} \cdot \frac{\d q(x)}{\d x}$
        \item $\frac {\d \|f(x) \|^2_2}{\d t} = 2 \langle f(x) , \frac{\d f(x)}{\d t} \rangle $
        \item $\frac{\d \langle f(x), g(x) \rangle}{\d t} = \langle \frac{\d f(x)}{\d t} , g(x) \rangle + \langle f(x), \frac{\d g(x)}{\d t} \rangle$
        \item $\frac{\d (g(x) \circ f(x))}{\d t} = \frac{\d g(x)}{\d t} \circ f(x) + g(x) \circ \frac{\d f(x)}{\d t}$ (product rule for Hadamard product)
    \end{itemize}
\end{fact}

\subsection{General Definitions}\label{sub:preli:general_def}

In this section, we introduce some general definitions. 

\begin{figure}[!ht]
    \centering
    \includegraphics[width = \linewidth]{softmax.pdf}
    \caption{The visualization of a variation of Definition~\ref{def:attention}. Let $A_1, A_2, A_3, B \in \R^{n \times d}$, $X, Y \in \R^{d \times d}$, $D(X) \in \R^{n \times n}$ (see Figure~\ref{fig:attention_optimization} and Definition~\ref{def:attention}), and $\A = A_1 \otimes A_2 \in \R^{n^2 \times d^2}$. $\mathrm{mat} : \R^{n^2} \to \R^{n \times n}$ is defined by $X_{i, j} = \mathrm{mat}(x)_{i, j} := x_{(i - 1) \cdot n + j}$, and $\vect = \mathrm{mat}^{-1}$. We first get that $(D(X) \otimes I_n)^{-1} \in \R^{n^2 \times n^2}$, multiply $\A$ with $\vect(X)$, and apply the entrywise exponential to obtain $\exp(\A \cdot \vect(X)) \in \R^{n^2}$. Then, we multiply $(D(X) \otimes I_n)^{-1} \in \R^{n^2 \times n^2}$ with $\exp(\A \cdot \vect(X)) \in \R^{n^2}$, which gives us a vector in $\R^{n^2}$. We use $\mathrm{mat}$ to transform that into a matrix in $\R^{n \times n}$. After that, we multiply this matrix with $A_3 Y \in \R^{n \times d}$. Finally, we compute the minimum of the Frobenius norm of $\mathrm{mat}((D(X) \otimes I_n)^{-1} \cdot \exp(\A \vect(X))) A_3 Y - B$. In this figure, we give an example when $n = 3$: in the matrix $D(X) \otimes I_n$, the three light green squares (and their nearby white area) make up the first chunk, the three middle green squares (and their nearby white area) make up the second chunk, and the three dark green squares (and their nearby white area) make up the third chunk. The blue rectangles represent the matrices in $\R^{n \times d}$. The red rectangle represents the matrix in $\R^{d \times d}$.}
    \label{fig:softmax}
\end{figure}


\begin{figure}[!ht]
    \centering
    \includegraphics[width = 0.8\linewidth]{svm.pdf}
    \caption{The visualization of Eq.~\eqref{eq:svm}. Let $A_1, A_2, A_3, B \in \R^{n \times d}$ and $X, Y \in \R^{d \times d}$. We have $\A = A_1 \otimes A_2 \in \R^{n^2 \times d^2}$ and $\A_{j_0} \in \R^{n \times d^2}$ is the $j_0$-th block of $\A$. $x = \vect(X) \in \R^{d^2}$. First, we use the definition of $f(x)_{j_0} \in \R^n$ (see Definition~\ref{def:f}) and $h(Y)_{i_0} \in \R^n$ (see Definition~\ref{def:h}) to compute them. Then, we find their inner product and subtract the entry of $B$ at the $j_0$-th row and $i_0$-th column from the inner product. Finally, we compute the square of this difference and add all of them from $i_0 = 1$ to $i_0 = d$ and from $j_0 = 1$ to $j_0 = n$. In this figure, we use blue rectangles to represent vectors, where the dark blue represents $f(x)_{j_0}$ and $h(Y)_{i_0}$, and the light blue represents the terms used to compute $f(x)_{j_0}$ and $h(Y)_{i_0}$. The green square represents the scalar. The red rectangle represents the matrix.}
    \label{fig:svm}
\end{figure}


\begin{definition}[Index summary]
We use $i$ to denote indices in $[d^2]$, and $j$ to denote indices in $[n^2]$.

We use $i_0, i_1, i_2$ to denote indices in $[d]$, and $j_0, j_1, j_2$ to denote indices in $[n]$.
\end{definition}

\begin{definition}\label{def:L}
If the following conditions hold
\begin{itemize}
    \item Let $A_1 \in \R^{n \times d}$.
    \item Let $A_2 \in \R^{n \times d}$.
    \item Let $\mathsf{A} \in \R^{n^2 \times d^2}$ denote the Kronecker product between $A_1, A_2$
    \begin{itemize}
        \item For each $j_0 \in [n]$, we use $\A_{j_0} \in \R^{n \times d^2}$ to be one $n \times d^2$ block from $\A \in \R^{n^2 \times d^2}$ (see Remark~\ref{rem:block}).
    \end{itemize}
    \item Let $A_3 \in \R^{n \times d}$.
    \item Let $B \in \R^{n \times d}$ and $b_{j_0,i_0}$ denote the $(j_0,i_0)$-th entry in $B \in \R^{n \times d}$ for each $j_0 \in [n]$ and $i_0 \in [d]$.  
    \item Let $X \in \R^{d \times d}$.
    \item Let $Y \in \R^{d \times d}$.
\end{itemize}
 Our final goal is to study the loss function, defined as:
\begin{align*}
   L(X,Y) := 0.5 \cdot \| \underbrace{ D(X)^{-1} }_{n \times n} \underbrace{ \exp(A_1 X A_2^\top) }_{n \times n} \underbrace{ A_3 }_{n \times d} \underbrace{Y}_{d \times d} - \underbrace{ B  }_{n \times d} \|_F^2
\end{align*}
where
\begin{itemize}
    \item $D(X) \in \R^{n \times n}$ is defined as $D(X) := \diag( \exp(A_1 X A_2^\top ) {\bf 1}_n )$ and
    \item for each $j_0 \in [n]$, $D(X)_{j_0} \in \R$ is $\langle \exp( \A_{j_0} x ), {\bf 1}_n \rangle$, $\A_{j_0} \in \R^{n \times d^2}$ is the $j_0$-th block of $\A \in \R^{n^2 \times d^2}$, and $x \in \R^{d^2}$ is the vectorization of $X \in \R^{d \times d}$  
\end{itemize}
Further, for each $j_0 \in [n], i_0 \in [d]$, we define $L(X,Y)_{j_0,i_0}$ as follows: 
\begin{align*}
    L(X,Y)_{j_0,i_0} := 0.5 ( \langle \langle \exp(\A_{j_0} x ) , {\bf 1}_n \rangle^{-1} \exp (\A_{j_0} x ) , A_3 Y_{*,i_0} \rangle - b_{j_0,i_0})^2 
\end{align*}
 

Using the tensor trick in~\cite{gsx23_incontext,gsy23_coin}, we can see that
\begin{align*}
 L(X,Y) = \sum_{j_0=1}^n \sum_{i_0=1}^d L(X,Y)_{j_0,i_0}.
\end{align*}
\end{definition}

\subsection{Helpful Definitions With Respect to \texorpdfstring{$X$}{}}\label{sub:preli:help_def_x}

Now, we introduce a few helpful definitions related to $X \in \R^{d \times d}$.

\begin{definition}\label{def:u}
Let $\A = A_1 \otimes A_2 \in \R^{n^2 \times d^2}$, where $A_1, A_2 \in \R^{n \times d}$, and $\A_{j_0} \in \R^{n \times d^2}$ be one $n \times d^2$ block from $\A$.


We define $u(x)_{j_0}: \R^{d^2} \rightarrow \R^n$ as follows:
\begin{align*}
    u(x)_{j_0} := \underbrace{ \exp( \A_{j_0} x ) }_{n \times 1}.
\end{align*}
\end{definition}

\begin{definition}\label{def:alpha}
Let $\A = A_1 \otimes A_2 \in \R^{n^2 \times d^2}$, where $A_1, A_2 \in \R^{n \times d}$, and $\A_{j_0} \in \R^{n \times d^2}$ be one $n \times d^2$ block from $\A$.


We define $\alpha(x)_{j_0}: \R^{d^2} \rightarrow \R$ as:
\begin{align*}
  \alpha(x)_{j_0}:= \langle \underbrace{ \exp( \A_{j_0} x ) }_{n \times 1} , \underbrace{ {\bf 1}_n }_{n \times 1} \rangle.
\end{align*}
\end{definition}

\begin{definition}\label{def:f}

Let $\alpha(x)_{j_0} \in \R$ be defined as in Definition~\ref{def:alpha}.

Let $u(x)_{j_0} \in \R^n$ be defined as in Definition~\ref{def:u}.

We define $f(x)_{j_0} : \R^{d^2} \rightarrow \R^n$ as follows:
\begin{align*}
    f(x)_{j_0} := \underbrace{ \alpha(x)_{j_0}^{-1} }_{ \mathrm{scalar} } \underbrace{ u(x)_{j_0} }_{ n \times 1 } .
\end{align*}
\end{definition}


\subsection{A Helpful Definition With Respect to \texorpdfstring{$Y$}{}}\label{sub:preli:help_def_y}

In this section, we introduce a helpful definition related to $Y \in \R^{d \times d}$.


\begin{definition}\label{def:h}
For each $i_0 \in [d]$, we define $h(Y)_{i_0} : \R^{d \times d} \rightarrow \R^n$ as:
\begin{align*}
    h(Y)_{i_0}:= \underbrace{ A_3 }_{n \times d} \underbrace{ Y_{*,i_0} }_{d \times 1}.
\end{align*}

\end{definition}


\subsection{Helpful Definitions With Respect to Both \texorpdfstring{$X$}{} and \texorpdfstring{$Y$}{}} \label{sub:preli:help_def_xy}

In this section, we introduce some helpful definitions related to both $X \in \R^{d \times d}$ and $Y \in \R^{d \times d}$.


\begin{definition}\label{def:c}
We define $c(x,y)_{j_0,i_0}: \R^{d^2} \times \R^{d^2} \rightarrow \R$ as follows:
\begin{align*}
    c(x,y)_{j_0,i_0}:= \langle f(x)_{j_0}, h(y)_{i_0} \rangle - b_{j_0,i_0}.
\end{align*}
Furthermore, we define $c(x, :)_{j_0,i_0}$ as follows
\begin{align*}
    c(x,:)_{j_0,i_0} := \langle f(x)_{j_0}, v \rangle - b_{j_0,i_0}
\end{align*}
for some fixed vector $v \in \R^n$ which doesn't depend on $x$ and also doesn't depend on $y$. 

Similarly, we also define $c(:,y)_{j_0,i_0}$ as follows
\begin{align*}
    c(:,y)_{j_0,i_0} := \langle v, h(y)_{i_0} \rangle - b_{j_0,i_0}
\end{align*}
for some fixed vector $v \in \R^n$ which doesn't depend on $x$ and also doesn't depend on $y$.
\end{definition}


\begin{definition}\label{def:l}
We define
\begin{align*}
 L(x,:)_{j_0,i_0} := 0.5 c(x,:)_{j_0,i_0}^2
\end{align*}
and 
\begin{align*}
     L(:,y)_{j_0,i_0} := 0.5 c(:,y)_{j_0,i_0}^2
\end{align*}
and 
\begin{align*}
     L(x,y)_{j_0,i_0} := 0.5 c(x,y)_{j_0,i_0}^2
\end{align*}
\end{definition}

\subsection{Regularization}\label{sub:preli:regularization}

In this section, we define the regularization loss we use.

\begin{definition}\label{def:reg_term}
Let $W \in \R^{n \times n}$ denote a positive diagonal matrix. 
We use the following regularization loss
\begin{align*}
   \| (W \otimes I_n) (A_1 \otimes A_2)x \|_2^2 + \| W A_3 y \|_F^2 
\end{align*}
Note that $\| W A_3 y \|_F^2 = \sum_{i_0=1}^d \| W A_3 y_{i_0} \|_2^2$.
\end{definition}

Adding this regularization term to the loss function $L(X, Y)$ (see Definition~\ref{def:L}), we can ensure that the Hessian of the resulting loss function is positive definite (see Lemma~\ref{lem:hessian_property:y} and Lemma~\ref{lem:hessian_property:x}).

\subsection{Fast Matrix Multiplication}\label{sub:preli:fast_matrix_multi}

We use $\Tmat(a,b,c)$ to denote the time of multiplying an $a \times b$ matrix with another $b \times c$ matrix. Fast matrix multiplication~\cite{c82,w12,lg14,gu18,cglz20,aw21,dwz23,lg23,wxxz23} is a fundamental tool in theoretical computer science.

\begin{fact}\label{fac:Tmat}
$O(\Tmat(a,b,c)) = O( \Tmat(b,a,c) ) = O( \Tmat(a,c,b) )$.
\end{fact}


For $k \in \R_+$, we define $\omega(k) \in \R_+$ to be the value such that $\forall n \in \mathbb{N}$, $\Tmat(n,n,n^k) = O(n^{\omega(k)})$.

For convenience, we define three special values of $\omega(k)$. We define $\omega$ to be the fast matrix multiplication exponent, i.e., $\omega := \omega(1)$. We define $\alpha \in \R_+$ to be the dual exponent of matrix multiplication, i.e., $\omega(\alpha) = 2$. We define $\beta := \omega(2)$.

The following fact can be found in Lemma 3.6 of \cite{jkl+20}, also see \cite{bcs97}.
\begin{fact}[Convexity of $\omega(k)$]\label{fact:omega_k_convex}
The function $\omega(k)$ is convex.
\end{fact}