%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\input{math.tex}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\DeclareMathOperator{\asto}{\xrightarrow{\text{a.s.}}}
\DeclareMathOperator{\toind}{\xrightarrow{\mathcal{D}}}
\DeclareMathOperator{\mat}{Mat}
\DeclareMathOperator{\vect}{vec}
\DeclareMathOperator{\rank}{rank}
\DeclareMathOperator{\EE}{\mathbb{E}}
\DeclareMathOperator{\CPD}{CPD}
\DeclareMathOperator{\tr}{tr}
\DeclareMathOperator{\var}{\mathbb{V}ar}
\DeclareMathOperator{\concat}{concat}
\DeclareMathOperator{\plog}{polylog}
\DeclareMathOperator{\supp}{sup}
\usepackage{color}
%\usepackage{authblk}
\usepackage{comment}
\usepackage{algorithm}
\usepackage{algpseudocode}
%\usepackage[ruled, lined, linesnumbered, commentsnumbered, longend]{algorithm2e}
%\SetKwInOut{Input}{Input}
%\SetKwInOut{KwOut}{Output}
%\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
%\newtheorem{theorem}[theorem]{theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
\usetikzlibrary{patterns}
\usepackage{url}
 \definecolor{mitred}{rgb}{0.78, 0.39, 0.07}
 \usepackage{pifont}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning from Low Rank Tensor Data:\\ A Random Tensor Theory Perspective\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<mohamed.seddik@tii.ae>?Subject=Your UAI 2023 paper}{Mohamed El Amine Seddik}{}}
%\author[1]{Mohamed El Amine Seddik}
\author[2]{Malik Tiomoko}
\author[2]{Alexis Decurninge}
\author[1]{Maxim Panov}
\author[3]{Maxime Guillaud}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Technology Innovation Institute\\
    PO Box: 9639, Masdar City\\
    Abu Dhabi, UAE
}
\affil[2]{%
    Huawei Technologies France\\
    Paris, France
}
\affil[3]{%
Inria / CITI Laboratory\\
     6 avenue des Arts\\
     69621 Villeurbanne, France
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\begin{abstract}
This supplementary material recalls some tensor operations (Section \ref{sec_notations}) used throughout the paper and random tensor theory tools presented in Section \ref{sec_random_tensor_theory}. The main proofs are then presented in Section \ref{sec_proofs}. Finally, some extensions of our results to a more general data model are discussed in Section \ref{sec_general}.
\end{abstract}

\section{Tensor operations}\label{sec_notations}
We briefly recall in this section some tensor notations and operations that are used throughout the paper. 
\paragraph{Inner product and norm:} The inner product of two same-sized order $k$ tensors $\tX, \tY \in \sR^{p_1\times \cdots \times p_k}$ is the sum of the products of their entries and is denoted as $\langle \tX, \tY \rangle = \sum_{i_1, \ldots, i_k} X_{i_1 \cdots i_k} Y_{i_1 \cdots i_k}$.
In particular, the norm $\Vert \tX \Vert$ of $\tX\in \sR^{p_1\times \cdots \times p_k}$ is $\Vert \tX \Vert^2 = \langle \tX, \tX \rangle$.
\paragraph{Rank-one tensors:} An order $k$ tensor $\tX\in \sR^{p_1\times \cdots \times p_k}$ is said to be a \textit{rank-one} tensor if it can be written as the outer product of $k$ vectors $\va_1,\ldots, \va_k$ with $\va_j\in \sR^{p_j}$, i.e., $\tX = \bigotimes_{j=1}^k \va_j = \va_1 \otimes \cdots \otimes \va_k$, where the outer product $\bigotimes_{j=1}^k \va_j$ is defined such that $\left( \bigotimes_{j=1}^k \va_j \right)_{i_1 \ldots i_k} = \prod_{j=1}^k (\va_j)_{i_j}$, i.e., each element of the rank-one tensor is the product of the elements of the corresponding vectors.
\paragraph{Tensor multiplication:} The $j$-mode (matrix) product of a tensor $\tX\in \sR^{p_1\times \cdots \times p_k}$ with a matrix $\mM \in \sR^{m\times p_j}$ is denoted $\tX \times_j \mM $ and is a tensor of size $p_1\times \cdots \times p_{j-1} \times m \times p_{j+1} \times \cdots \times p_k$. Element-wise, the $j$-mode (matrix) product is defined as $\left( \tX \times_j \mM  \right)_{i_1 \cdots i_{j-1} \ell\, i_{j+1}\cdots i_k} = \sum_{i_j=1}^{p_j} X_{i_1 \cdots i_k} M_{\ell i_j}$ for $\ell \in [m]$.
Similarly, the $j$-mode (vector) product or \textit{contraction} of an order $k$ tensor $\tX\in \sR^{p_1\times \cdots \times p_k}$ with a vector $\vv \in \sR^{p_j}$ is also denoted as $\tX \times_j \vv$ and results in a tensor of order $k-1$ of dimension $p_1 \times \cdots \times p_{j-1} \times p_{j+1} \times \cdots \times p_k$. Element-wise, the $j$-mode contraction is defined as $\left( \tX \times_j \vv \right)_{i_1 \cdots i_{j-1} i_{j+1}\cdots i_k} = \sum_{i_j=1}^{p_j} X_{i_1 \cdots i_k} v_{i_j}$,
which basically consists in computing the inner product of each mode-$j$ \textit{fiber} with the vector $\vv$.
\paragraph{Tensor Rank and the CANDECOMP/PARAFAC Decomposition (CPD):} The CP decomposition \citep{hitchcock1927expression, landsberg2012tensors} produces a decomposition of a tensor $\tX\in \sR^{p_1\times \cdots \times p_k}$ into a sum of rank-one tensors, i.e., $\tX = \sum_{i=1}^r \bigotimes_{j=1}^k \va_j^{(i)}$.
The rank of $\tX$ denoted $\rank(\tX)$ is defined as the smallest possible integer $r$ for which $\tX$ decomposes as above.


\section{Random tensor theory}\label{sec_random_tensor_theory}
The random tensor theory consists of generalizing classical random matrix theory \citep{marvcenko1967distribution, baik2005phase} to random tensor models. The first line of research on this topic was proposed by \cite{montanari2014statistical} who introduced the concept of tensor PCA. Afterward, many works have focused on the analysis of \textit{symmetric} random tensors \citep{perry2020statistical, lesieur2017statistical, handschy2019phase, jagannath2020statistical, de2021random}. However, symmetric random tensor models have limited applications in machine learning since real data structures do not necessarily have such symmetric properties. In a very recent work by \cite{seddik2021random}, a study of \textit{asymmetric} spiked random tensors has been carried out. It considers an observed $k$-order tensor $\tT$ of the form
\begin{align}\label{eq_spiked_tensor}
    \tT = \beta \bigotimes_{j=1}^k \vu_j + \frac{1}{\sqrt{\sum_{i=1}^k p_i}} \tZ \in \sR^{p_1\times \cdots \times p_k},
\end{align}
where $\vu_j\in \sR^{p_j}$ for $j\in [k]$ are unit-norm vectors, $\tZ$ is a random tensor with i.i.d.\@ $\mathcal{N}(0, 1)$ entries and $\beta>0$ is a parameter controlling the signal-to-noise ratio (SNR). The study has provided asymptotic evaluation of $\lambda$ and $\langle \vu_j, \vv_j \rangle$ with $\lambda \bigotimes_{j=1}^k \vv_j$ being the best rank-one approximation of $\tT$ given by the maximum likelihood estimator (MLE) as
\begin{align}
    \argmin_{\lambda >0,\{ \vv_j \,\mid\, \Vert \vv_j\Vert = 1,\, j\in [k]\}} \left\Vert \tT - \lambda \bigotimes_{j=1}^k \vv_j \right\Vert_{\text{F}}^2.
\end{align}
This study was carried out in the high-dimensional regime, where $p_j\to \infty$ with $\frac{p_j}{\sum_{i=1}^k p_i}\to c_j\in [0, 1]$. Precisely, \cite{seddik2021random} provided the following results which will be subsequently applied in order to assess the performance of the learning algorithms studied in the present work.
\subsection{$k$-order spiked random tensors}
\begin{theorem}[Theorem 8 in \citep{seddik2021random}]\label{thm_main_tensors} As $p_j\to \infty$ with $\frac{p_j}{\sum_{i=1}^k p_i}\to c_j\in [0, 1]$, for $k\geq 3$, there exists $\beta_s$ such that for $\beta > \beta_s$,
\begin{align*}
    \begin{cases}
    \lambda \asto \lambda^\infty(\beta),\\
    \vert \langle \vu_j, \vv_j \rangle \vert \asto q_j(\lambda^\infty(\beta)),
    \end{cases}
\end{align*}
where $\lambda^\infty(\beta)$ satisfies\footnote{We will sometimes omit the dependence on $\beta$ for simplicity.} $f(\lambda^\infty(\beta), \beta) = 0$ with $f(z, \beta) = z + g(z) - \beta \prod_{j=1}^k q_j(z)$, $q_j(z) = \sqrt{1 - \frac{g_j^2(z)}{c_j} }$, $g_j(z) = \frac{g(z) + z}{2} - \frac{\sqrt{4c_j + (g(z) + z)^2}}{2}$ and $g(z)$ being the unique solution to $g(z) = \sum_{j=1}^k g_j(z)$.
\end{theorem}
In essence, for an SNR $\beta$ large enough, Theorem \ref{thm_main_tensors} predicts a non-zero correlation between the signal components (i.e., the $\vu_j$'s) and their estimated counterparts (i.e., the $\vv_j$'s) by the MLE. We refer the reader to \citep{seddik2021random} for a more detailed discussion.
\subsection{Cubic spiked random tensors}
In the case of cubic tensors, i.e., $k=3$ and all the tensor dimensions are equal ($p_1=p_2=p_3$), $\lambda^\infty$ and $q_j(\lambda^\infty)$ in Theorem \ref{thm_main_tensors} have closed form expressions in terms of $\beta$.
\begin{corollary}[Corollary 3 in \citep{seddik2021random}]\label{cor_cubic_tensors} As $p_j\to \infty$, for $\beta >  \frac{2\sqrt 3}{3}$,
\begin{align*}
\begin{cases}
\lambda \asto \lambda^\infty(\beta) = \sqrt{\frac{\beta^{2}}{2} + 2 + \frac{\sqrt{3} \sqrt{\left(3 \beta^{2} - 4\right)^{3}}}{18 \beta}},\\
\vert \langle \vu_j, \vv_j \rangle\vert \asto \bar{q}(\beta),
\end{cases}
\end{align*}
with $\bar{q}(\beta)  = \frac{\sqrt{9 \beta^{2} - 12 + \frac{\sqrt{3} \sqrt{\left(3 \beta^{2} - 4\right)^{3}}}{\beta}} + \sqrt{9 \beta^{2} + 36 + \frac{\sqrt{3} \sqrt{\left(3 \beta^{2} - 4\right)^{3}}}{\beta}}}{6\sqrt{2} \beta}$.
\end{corollary}

\subsection{Spiked random matrices}
For $k=2$, the model in \eqref{eq_spiked_tensor} becomes a so-called \textit{spiked random matrix} which has been extensively studied using random matrix theory \citep{baik2005phase, benaych2011eigenvalues, capitaine2009largest, peche2006largest, arous2021long}. Theorem \ref{thm_main_tensors} covers also such models by not letting all tensor dimensions go to infinity which yields the following corollary.
\begin{corollary}[Corollary 5 in \citep{seddik2021random}]\label{cor_matrix_case} As $p_1,p_2\to \infty$ with $\frac{p_1}{p_1+p_2}\to c \in [0, 1]$, for $\beta> \sqrt[4]{c(1-c)} $,
\begin{align*}
\begin{cases}
\lambda \asto \lambda^\infty(\beta) = \sqrt{\beta^2 + 1 + \frac{c(1-c)}{\beta^2}},\\
\left\vert \langle \vu_1, \vv_1 \rangle\right\vert \asto \frac{1}{\kappa(\beta, c)},\quad 
\left\vert \langle \vu_2, \vv_2 \rangle\right\vert \asto \frac{1}{\kappa(\beta, 1 - c)},
\end{cases}
\end{align*}
where $\kappa(\beta, c) = \beta \sqrt{\frac{   \beta^{2} \left(\beta^{2} + 1\right) - c \left(c - 1\right)  }{ (\beta^4 + c(c-1)) \left( \beta^{2} + 1 -  c \right)}}$.
\end{corollary}


\section{Main Proofs}\label{sec_proofs}
\subsection{Proof of Theorem 3.1}\label{proof_prop_MF}
Recall $\vw = \vect(\tW)$, $\mX = \mat(\tX)$, $p=\sum_{j=1}^k p_j$ and $P=\prod_{j=1}^k p_j$, hence $\vw = \frac{1}{\sqrt{ n p }} \mX \vy$. Denoting $\tilde\vx_i = \vect(\tilde\tX_i)$ for some $\tilde\tX_i \in \mathcal{C}_a$ with $a\in \{1, 2 \}$ independent of the training data $\tX$, the decision function writes as $f_{\text{R}}(\tilde\vx_i) = \vw^\top \tilde\vx_i = \sum_{j=1}^P w_j \tilde x_{ij}$. Thus, by Lyapunov's central limit theorem \citep{billingsley2008probability}, the decision function has a Gaussian distribution for large $n$; we therefore need to compute its expectation and variance.
\paragraph{Computation of $\mathbb{E}[f_{\text{R}}(\tilde\vx_i)]$:} Let $\vmu = \vect(\tM)$, then $\tilde\vx_i = (-1)^a \vmu + \vz_i$ with $\vz_i\sim \mathcal{N}(\vzero, \mI_P)$ and 
\begin{align*}
    \EE[f_{\text{R}}(\tilde\vx_i)] = \frac{1}{\sqrt{n p}} \EE \left[ \vy^\top \mX^\top \tilde\vx_i \right] = \frac{1}{\sqrt{n p}} \vy^\top \vy \vmu^\top (-1)^a\vmu = (-1)^a \sqrt{\frac{n}{p}} \Vert \vmu \Vert^2= (-1)^a \sqrt{\frac{n}{p}} \Vert \tM \Vert^2.
\end{align*}
\paragraph{Computation of $\mathbb{E}[f_{\text{R}}(\tilde\vx_i)^2]$:} Since $\tilde\vx_i$ is independent of $\mX$ and $\EE[\tilde\vx_i \tilde\vx_i^\top] = \mI_P + \vmu \vmu^\top$, we have
\begin{align*}
    \EE \left[ f_{\text{R}}(\tilde\vx_i)^2 \right] = \EE \left[ \frac{1}{np} \vy^\top \mX^\top \tilde \vx_i \tilde \vx_i^\top \mX \vy \right] = \EE \left[ \frac{1}{np} \vy^\top \mX^\top \mX \vy \right] + \EE \left[ \frac{1}{np} \vy^\top \mX^\top \vmu \vmu^\top \mX \vy \right] = E_1 + E_2.
\end{align*}
Since $\mX = \vmu \vy^\top + \mZ$ with $\mZ = [\vz_1, \ldots, \vz_n]= \mat(\tZ) \in \sR^{P\times n}$, we have
\begin{align*}
    E_1 &= \frac{1}{np}  \Vert \vmu \Vert^2 \Vert \vy \Vert^4  + \frac{1}{np} \EE \left[ \vy^\top \mZ^\top \mZ \vy \right] = \frac{n}{p} \Vert \tM\Vert^2 + \frac{P}{p},\\
    E_2 &= \frac{1}{np} \vy^\top \vy \vmu^\top \vmu \vmu^\top \vmu \vy^\top \vy + \frac{1}{np} \EE\left[ \vy^\top \mZ^\top \vmu \vmu^\top \mZ \vy \right] = \frac{1}{np} \Vert \vy \Vert^4 \Vert \vmu \Vert^4 + \frac{1}{np}  \tr\left( \EE\left[\mZ \vy \vy^\top \mZ^\top \right] \vmu \vmu^\top \right),
\end{align*}
where $ \EE\left[\mZ \vy \vy^\top \mZ^\top\right] = \EE\left[ \left( \sum_{i=1}^n y_i \vz_i \right) \left( \sum_{i=1}^n y_i \vz_i^\top \right) \right] =  \sum_{i=1}^n y_i^2 \EE\left[ \vz_i \vz_i^\top \right] = n \mI_P $. Therefore,
\begin{align*}
    \EE \left[ f_{\text{R}}(\tilde\vx_i)^2 \right] = \frac{n}{p} \Vert \tM\Vert^2 + \frac{P}{p} + \frac{n}{p} \Vert \tM\Vert^4 + \frac{1}{p} \Vert \tM\Vert^2,
\end{align*}
and the term $\frac{1}{p} \Vert \tM\Vert^2$ vanishes for large values of $p$ under Assumption 2.2. In particular, the variance of $f_{\text{R}}(\tilde\vx_i)$ is given by $\EE \left[ f_{\text{R}}(\tilde\vx_i)^2 \right] - \EE \left[ f_{\text{R}}(\tilde\vx_i) \right]^2 = \frac{n}{p} \Vert \tM\Vert^2 + \frac{P}{p} $ for large values of $p$.

\subsection{Proof of Theorem 3.3}\label{proof_prop_CPMF}
Denote $\tM = \gamma \bigotimes_{j=1}^k \vu_j$ where $\vu_j = \frac{\vmu_j}{\Vert \vmu_j \Vert}$, as such $\Vert \tM\Vert = \gamma$. Therefore, from the definition of the weight tensor and further denoting $\beta = \Vert \tM \Vert \sqrt{\frac{n}{p}}$, $\tW$ expresses as
\begin{align}\label{eq_weights_tensor}
    \tW = \beta \bigotimes_{j=1}^k \vu_j + \frac{1}{\sqrt{p}}\tilde \tZ.
\end{align}
The best rank-one approximation $\lambda \bigotimes_{j=1}^k \vv_j$ (with the $\vv_j$'s being unit-norm vectors) of $\tW$ is given by the MLE as
\begin{align*}
    \argmin_{\lambda >0,\{ \vv_j \,\mid\, \Vert \vv_j\Vert = 1,\, j\in [k]\}} \left\Vert \tW - \lambda \bigotimes_{j=1}^k \vv_j \right\Vert_{\text{F}}^2.
\end{align*}
As in Section \ref{proof_prop_MF}, for a new test datum $\tilde \tX_i = (-1)^a \tM + \tilde\tZ_i$, the decision function $f_{\text{TR}}(\tilde \tX_i)$ is a Gaussian random variable, the mean of which expresses as follows.
\begin{align*}
    \EE\left[ f_{\text{TR}}(\tilde \tX_i) \right] = \EE\left[ \left\langle \lambda \bigotimes_{j=1}^k \vv_j, \tilde \tX_i \right\rangle \right] = \EE\left[ (-1)^a \Vert \tM \Vert \lambda \prod_{j=1}^k \langle \vu_j, \vv_j \rangle \right] \to (-1)^a \Vert \tM \Vert \lambda^\infty(\beta) \prod_{j=1}^k q_j(\lambda^\infty(\beta) ),
\end{align*}
by Theorem \ref{thm_main_tensors}. Moreover, the variance of $f_{\text{TR}}(\tilde \tX_i)$ expresses as
\begin{align*}
    \var&\left[ f_{\text{TR}}(\tilde \tX_i) \right] = \EE\left[ \left\langle \lambda \bigotimes_{j=1}^k \vv_j, \tilde \tZ_i \right\rangle^2 \right] = \EE\left[ \lambda^2 \left( \sum_{i_1, \ldots, i_k} \prod_{j=1}^k (\vv_j)_{i_j} (\tilde \tZ_i)_{i_1,\ldots,i_k} \right)^2 \right]\\
    &= \EE \left[ \lambda^2 \sum_{i_1, \ldots, i_k,i_1', \ldots, i_k'} \prod_{j=1}^k (\vv_j)_{i_j} (\tilde \tZ_i)_{i_1,\ldots,i_k} \prod_{j=1}^k (\vv_j)_{i_j'} (\tilde \tZ_i)_{i_1',\ldots,i_k'} \right]\\
    &= \EE \left[ \lambda^2 \sum_{i_1, \ldots, i_k} \prod_{j=1}^k (\vv_j)_{i_j}^2 (\tilde \tZ_i)_{i_1,\ldots,i_k}^2 \right] = \EE \left[ \lambda^2 \sum_{i_1, \ldots, i_k} \prod_{j=1}^k (\vv_j)_{i_j}^2 \EE[(\tilde \tZ_i)_{i_1,\ldots,i_k}^2\mid \tZ] \right] = \EE[\lambda^2] \to \lambda^\infty(\beta)^2,
\end{align*}
since $\EE[(\tilde \tZ_i)_{i_1,\ldots,i_k}^2\mid \tZ] = 1$ and $\sum_{i_1, \ldots, i_k} \prod_{j=1}^k (\vv_j)_{i_j}^2 = \prod_{j=1}^k \Vert \vv_j \Vert^2 = 1$.
\subsection{Proof of Theorem 3.5}\label{proof_prop_linear_clustering}
The equivalent random matrix model writes as
\begin{align*}
    \tilde \mX = \sqrt{\frac{n}{P+n}} \vect(\tM) \bar\vy^\top + \frac{1}{\sqrt{P + n}} \mat(\tZ) \in \sR^{P\times n},
\end{align*}
where $\bar \vy = \vy / \sqrt{n}$ and the normalization by $\sqrt{P+n}$ is considered for convenience. Let $\hat{\vy}$ be the right singular vector of $\tilde \mX$ corresponding to its largest singular value. Then, invoking Corollary \ref{cor_matrix_case}, the asymptotic alignment under Assumption 2.2 is given as
\begin{align*}
    \vert \langle \hat\vy, \bar\vy \rangle\vert \asto \alpha = \kappa\left( \Vert \tM \Vert \sqrt{\frac{n}{P+n}}, \frac{n}{P+n} \right)^{-1}.
\end{align*}
Moreover, $\hat\vy$ decomposes as
\begin{align*}
    \hat\vy = \alpha \bar\vy + \sigma \vw,
\end{align*}
where $\vw\in \sR^n$ is a random vector, orthogonal to $\bar\vy$ and of unit norm. Since $\hat\vy$ is of unit norm, $\sigma$ satisfies $1 = \alpha^2 + \sigma^2$, as such $\sigma = \sqrt{1 - \alpha^2}$. Finally, the Gaussianity of the entries of $\hat\vy$ is obtained thanks to similar arguments as in \citep{couillet2016kernel}.

\subsection{Proof of Theorem 3.6}\label{proof_prop_CP_clustering}
The equivalent random tensor model writes as
\begin{align*}
    \tilde\tX = \sqrt{\frac{n}{p+n}} \tM \otimes \bar\vy + \frac{1}{\sqrt{p+n}} \tZ \in \sR^{p_1\times \cdots \times p_k\times n},
\end{align*}
where $\bar \vy = \vy / \sqrt{n}$. As such $\tilde \tX$ is a spiked random tensor of order $k+1$. As in Section \ref{proof_prop_linear_clustering}, we need to express the asymptotic alignment between $\hat\vy$ and $\bar\vy$ with $\hat \vy$ being the $(k+1)$-th mode component of the rank-one tensor approximation of $\tilde \tX$, which is straightforwardly obtained thanks to Theorem \ref{thm_main_tensors}, applied to a $(k+1)$-th order tensor of dimensions $p_1\times \cdots \times p_k\times n$, yielding
\begin{align*}
    \vert \langle \hat\vy, \bar\vy \rangle\vert \asto \alpha = q_{k+1}\left( \lambda^\infty\left( \Vert \tM \Vert \sqrt{\frac{n}{p+n}} \right) \right),
\end{align*}
where $q_{k+1}(\cdot)$ and $\lambda^\infty(\cdot)$ are defined in Theorem \ref{thm_main_tensors}.

\section{Low-rank data model with orthogonal components}\label{sec_general}
Our results generalize to a more complex model of the following form. Suppose that the $\tX_i$'s are distributed in two classes $\mathcal{C}_1$ and $\mathcal{C}_2$ (of cardinality $n_1$ and $n_2$ respectively), such that for $\tX_i\in \mathcal{C}_a$ with $a\in \{1, 2\}$,
\begin{align}\label{eq_general_data_1}
    \tX_i = \sum_{\ell=1}^{r_a} \bigotimes_{j=1}^k \vmu_{j,\ell}^{(a)} + \tZ_i \in \sR^{p_1\times \cdots \times p_k},
\end{align}
where $\tZ_i$ is a random tensor with i.i.d.\@ standard Gaussian entries, $\vmu_{j,\ell}^{(a)}\in \sR^{p_j}$ are independent from $\tZ_i$ such that $\langle \vmu_{j,\ell_1}^{(a)}, \vmu_{j,\ell_2}^{(a)} \rangle = \delta_{\ell_1 \ell_2}$. That is, the data tensors $\tX_i$ have a rank-$r_a$ (with $r_a$ being small) structure with orthogonal components.
\subsection{Supervised setting}
Let us denote $\tM_a = \sum_{\ell=1}^{r_a} \bigotimes_{j=1}^k \vmu_{j,\ell}^{(a)}$. In a supervised setting, it is convenient to center the data by subtracting\footnote{In real scenarios one would first estimate the $\tM_a$'s with their empirical estimates through tensor decomposition.} $\frac12(\tM_1 + \tM_2)$ from each data sample which yields tensors of the form  
\begin{align}\label{eq_general_data}
    \tX_i = (-1)^a \tM + \tZ_i, \qquad \tM = \frac12 \left( \tM_2 - \tM_1 \right),
\end{align}
where $\tM$ is clearly a low-rank tensor (of rank $r_1 + r_2$) with orthogonal components. Stacking all the data samples $\tX_i$ in a data tensor $\tX\in \sR^{p_1\times \cdots \times p_k \times n}$, the $\infty$-Ridge classifier has weights tensor of the form
\begin{align}
    \tW = \frac{1}{\sqrt{np}} \tX \times_{k+1} \vy = \sqrt{\frac{n}{p}} \tM + \frac{1}{\sqrt{p}}\tilde \tZ,
\end{align}
where $\tilde \tZ = \frac{1}{\sqrt{n}} \sum_{i=1}^n y_i \tZ_i$ and $\tM = \frac12 ( \tM_2 - \tM_1 ) = \sum_{\ell=1}^{r_1 + r_2} \bigotimes_{j=1}^k \vmu_{j,\ell}$ is a rank-$(r_1 + r_2)$ tensor (the signs and the factor $\frac12$ being absorbed in the $\vmu_{j,\ell}$'s). Therefore, the Tensor-Ridge classifier for this case relies on a low-rank approximation of $\tW$ of rank $r_1 + r_2$ which might be performed using tensor power iteration with deflation procedure. We, therefore, have the following theorem characterizing the performance of the Tensor-Ridge classifier in this case.
\begin{theorem}[Performance of the Tensor-Ridge classifier for data model in \eqref{eq_general_data}] Under Assumption 2.2, for $\tilde \tX_i \in \mathcal{C}_a $ with $a \in \{1, 2\}$ independent from the training set $\tX$,
\begin{align*}
    \frac{1}{\sqrt{\sum_{\ell=1}^{r_1 + r_2} \sigma_\ell^2}} \left( f_{\text{TR}}(\tilde \tX_i) - m_a \right) \toind \mathcal{N}(0, 1),
\end{align*}
where $m_a = (-1)^a \sum_{\ell=1}^{r_1 + r_2} \sigma_\ell \mu_\ell \prod_{j=1}^k q_j(\sigma_\ell, \mu_\ell \sqrt{\frac{n}{p}} )$, $\mu_\ell = \Vert \bigotimes_{j=1}^k \vmu_{j,\ell} \Vert$ and $\sigma_\ell$ satisfies $f(\sigma_\ell, \mu_\ell \sqrt{\frac{n}{p}}) = 0$, with $q_j$ and $f$ defined in Theorem \ref{thm_main_tensors}. Furthermore, the misclassification error verifies with probability one $\sP \left( (-1)^a f_{\text{TR}}(\tilde\tX_i) < 0 \mid \tilde\tX_i \in \mathcal{C}_a \right) - Q\left( \frac{ \vert m_a\vert }{\sqrt{\sum_{\ell=1}^{r_1 + r_2} \sigma_\ell^2}} \right) \to 0$.
\end{theorem}
\begin{proof}
The proof strategy is the same as for Theorem 3.3.
\end{proof}

\subsection{Unsupervised setting}
The generalization to the unsupervised setting is more challenging since the data tensor $\tX$ for the model in \eqref{eq_general_data_1} does not follow a CP decomposition but rather a block-term decomposition \citep{rontogiannis2021block} which is more challenging to analyze theoretically and is therefore left for a future investigation.
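% The bibliography style (plainnat) is already selected in the preamble;
% a second \bibliographystyle command would make BibTeX fail.
\bibliography{uai2023}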
\end{document}
