%
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography


\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath} \allowdisplaybreaks[4]
% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{multirow}
%\usepackage{subfigure}
\usepackage{caption,subcaption}
\usepackage{mathrsfs}
\usepackage{bbm}
\usepackage{bm}
\usepackage{eufrak}
\usepackage{bookmark}
\usepackage{mathtools}
\usepackage{hyperref}       % hyperlinks
%\usepackage{subfigure}
\usepackage{enumitem}
\hypersetup{colorlinks=true,citecolor=blue,linkcolor=blue}
% \circled{<text>}: typesets <text> inside a circle (via \textcircled) at \small size.
% \small is a declaration, not a command with an argument, so it is confined to
% a group here; otherwise the size change would leak into the text after the macro.
\newcommand{\circled}[1]{{\small\raisebox{.6pt}{\textcircled{\raisebox{-.8pt}{#1}}}}}
\usepackage{xr}
\externaldocument{Yang_517-supp}
%\graphicspath{{illustrations/}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \blfootnote{<text>}: footnote without a visible marker.
% \thefootnote is emptied inside a group so that only this footnote loses its
% mark, and the footnote counter is stepped back by one afterwards so that the
% numbering of subsequent footnotes is not shifted.
\newcommand\blfootnote[1]{%
  \begingroup
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \endgroup
}
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>; <sep> defaults to "-".
% NOTE(review): looks like a leftover from the class template -- confirm it is unused.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \aff{<arg>}: upright "aff" followed by <arg> in auto-sized parentheses.
% Uses \left/\right, so it must be used in math mode.
\def\aff#1{\textup {aff}\left(#1\right)}

\title{Noisy $\ell^{0}$-Sparse Subspace Clustering on Dimensionality Reduced Data}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author{ \hspace{0.1in} Yingzhen Yang \hspace{2.5in} Ping Li \\
\hspace{-.4in} School of Computing and Augmented Intelligence \hspace{1in} Cognitive Computing Lab \\
\hspace{0.2in} Arizona State University \hspace{1.9in} Baidu Research \\
\hspace{0.6in} 699 S Mill Ave. Tempe, AZ 85281, USA \hspace{0.8in} 10900 NE 8th ST. Bellevue, WA 98004, USA \\
\hspace{0.3in} \texttt{yingzhen.yang@asu.edu} \hspace{1.6in} \texttt{pingli98@gmail.com}
}




\input{symbol_tit.tex}
\begin{document}

\maketitle

\begin{abstract}\vspace{-0.1in}
\footnote{Yingzhen Yang's work was conducted as a consulting researcher at Baidu Research - Bellevue,~WA,~USA.\vspace{-0.1in}}Sparse subspace clustering methods with sparsity induced by $\ell^{0}$-norm, such as $\ell^{0}$-Sparse Subspace Clustering ($\ell^{0}$-SSC)~\citep{YangFJYH16-L0SSC-ijcv}, are demonstrated to be more effective than their $\ell^{1}$ counterparts such as Sparse Subspace Clustering (SSC)~\citep{ElhamifarV13}. However, the theoretical analysis of $\ell^{0}$-SSC is restricted to clean data that lie exactly in subspaces. Real data often suffer from noise and they may lie close to subspaces. In this paper, we show that an optimal solution to the optimization problem of noisy $\ell^{0}$-SSC achieves subspace detection property (SDP), a key element with which data from different subspaces are separated, under the deterministic and semi-random models. Our results provide theoretical guarantee on the correctness of noisy $\ell^{0}$-SSC in terms of SDP on noisy data for the first time, which reveals the advantage of noisy $\ell^{0}$-SSC in terms of much less restrictive condition on subspace affinity. In order to improve the efficiency of noisy $\ell^{0}$-SSC, we propose Noisy-DR-$\ell^{0}$-SSC which provably recovers the subspaces on dimensionality reduced data. Noisy-DR-$\ell^{0}$-SSC first projects the data onto a lower dimensional space by random projection, then performs noisy $\ell^{0}$-SSC on the projected data for improved efficiency. Experimental results demonstrate the effectiveness of Noisy-DR-$\ell^{0}$-SSC. \vspace{-0.05in}
\end{abstract}

\vspace{-0.07in}
\section{Introduction}
\vspace{-0.07in}

Clustering is an important unsupervised learning procedure for analyzing a broad class of scientific data. High-dimensional data, such as facial images and gene expression data, often lie in low-dimensional subspaces, and clustering in accordance with the underlying subspace structure is particularly important. Among various subspace clustering algorithms, the ones that employ sparsity prior, such as Sparse Subspace Clustering (SSC)~\citep{ElhamifarV13} and $\ell^{0}$-Sparse Subspace Clustering ($\ell^{0}$-SSC)~\citep{YangFJYH16-L0SSC-ijcv}, have been proven to be effective in separating the data in accordance with the subspaces that the data lie in under certain assumptions. Furthermore, Sparse Additive Subspace Clustering~\citep{yuan2014sparse} considers a nonlinear transformation of each data point such that the transformed point can be linearly represented by data in the same subspace as that point, extending the usual linear representation by SSC.

Sparse subspace clustering methods construct the sparse similarity matrix by sparse representation of the data. Subspace detection property (SDP) defined in Section~\ref{sec::setup-method} ensures that the similarity between data from different subspaces vanishes in the sparse similarity matrix, and applying spectral clustering~\citep{Ng01} on such sparse similarity matrix leads to compelling clustering performance. \citet{ElhamifarV13} prove that when the subspaces are independent or disjoint, SDP can be satisfied by solving the canonical sparse linear representation problem using data as the dictionary, under certain conditions on the rank, or singular value of the data matrix and the principal angle between the subspaces. Under the independence assumption on the subspaces, low rank representation~\citep{LiuLY13,liu2014recovery,liu2016low} is also proposed to recover the subspace structures. Relaxing the assumptions on the subspaces to allowing overlapping subspaces, the Greedy Subspace Clustering~\citep{ParkCS14} and the Low-Rank Sparse Subspace Clustering~\citep{Wang13-lrr-ssc} achieve subspace detection property with high probability. The geometric analysis in~\citet{Soltanolkotabi2012} shows the theoretical results on subspace recovery by SSC. In the following, we use the term SSC or $\ell^{1}$-SSC interchangeably to indicate the Sparse Subspace Clustering method in~\citet{ElhamifarV13}.


\begin{table*}[t]
\centering
\scriptsize
\caption{\small Comparison between Different Subspace Clustering Methods in terms of Conditions Required under the Semi-Random Model. Please refer to Section~\ref{sec::contributions} for the definition of notations.\vspace{-0.05in}}
\resizebox{1.0\linewidth}{!}{
\begin{tabular}{|c|c|c|}
  \hline
   Methods or Assumptions &Allowing Overlapping Subspaces &Subspace Affinity \\ \hline
Greedy Subspace Clustering (GSC)~\citep{ParkCS14} &Yes &$\max_{k,l \in [K]}\text{aff}(\mathcal S_k,\mathcal S_l) < \frac{C_2\log {n/K}}{\log {(d_0L \delta^{-1})} \cdot \log {(nd_0\delta^{-1})}} \overset{d_0 \to \infty}{\longrightarrow} 0$ \\ \hline
$\ell^1$-SSC~\citep{ElhamifarV11}, Noisy SSC~\citep{WangX13} &Yes & $\max_{k,l \in [K]}\text{aff}(\mathcal S_k,\mathcal S_l) < \sqrt{d_0} \cdot \frac{\bar c \sqrt{\log{\rho}}}{8{\sqrt 2} \log{n}}   \overset{d_0 \to \infty}{\longrightarrow} 0$ \\ \hline
Affine Sparse Subspace Clustering (ASSC)~\citep{Li2018-ASSC,YouLRV19-ASSC} &No & NA \\ \hline
Noisy $\ell^{0}$-SSC~\citep{YangFJYH16-L0SSC-ijcv} &Yes &$\max_{k,l \in [K]}\text{aff}(\mathcal S_k,\mathcal S_l) <   \frac{\sigma'^2_{\min}}{r_0-1} > 0$ for sufficiently large $d_0$ \\ \hline
\end{tabular}
}
\label{table:theoretical-results}
\end{table*}

%We also develop the accelerated algorithm, named noisy $\ell^{0}$-SSC by %Random Projection (noisy $\ell^{0}$-SSC-RP), for the optimization of noisy %$\ell^{0}$-SSC by randomized matrix decomposition via random projection. %Noisy $\ell^{0}$-SSC-RP constitutes an efficient and provable algorithm %for noisy $\ell^{0}$-SSC.
Real data often suffer from noise. The correctness of noisy SSC is analyzed in~\citet{WangX13} which handles noisy data that lie close to disjoint or overlapping subspaces, and the original optimization problem of noisy SSC is proposed in~\citet{ElhamifarV13}. While~\citet{YangFJYH16-L0SSC,Yang18-DRL0SSC} prove the correctness of $\ell^{0}$-SSC or its dimensionality reduced variant on clean data based on a constrained $\ell^0$-minimization problem, they empirically solve an unconstrained $\ell^{0}$-regularized problem to handle noise in data, and they lack theoretical analysis on the correctness of $\ell^{0}$-SSC on noisy data.
%Similarly, the efficient optimization method of $\ell^{0}$-SSC~\citep{Yang18-DR-L0SSC} only considers noiseless data.
%Real data often suffer from noise. Noisy SSC proposed in~\citet{WangX13}
Noisy SSC proposed in~\citet{WangX13} handles noisy data that lie close to disjoint or overlapping subspaces. While $\ell^{0}$-SSC~\citep{YangFJYH16-L0SSC-ijcv} has guaranteed clustering correctness via subspace detection property under much milder assumptions than previous subspace clustering methods including SSC, it assumes that the observed data lie exactly in the subspaces and does not handle noisy data. In this paper, we present noisy $\ell^{0}$-SSC, which enhances $\ell^{0}$-SSC by theoretical guarantee on the correctness of clustering on noisy data. It should be emphasized that while $\ell^{0}$-SSC on clean data~\citep{YangFJYH16-L0SSC-ijcv} empirically adopts a form of optimization problem robust to noise, it lacks theoretical analysis on the correctness of $\ell^{0}$-SSC on noisy data.
In this paper, the correctness of noisy $\ell^{0}$-SSC on noisy data in terms of the subspace detection property is established. Our analysis is under both deterministic model and semi-random model, which are the models employed in the geometric analysis of SSC~\citep{Soltanolkotabi2012}. Our randomized analysis demonstrates the advantage of noisy $\ell^{0}$-SSC over its $\ell^{1}$ counterpart as more general assumption on data distribution can be adopted. Moreover, we present Noisy Dimensionality Reduced $\ell^{0}$-Sparse Subspace Clustering (Noisy-DR-$\ell^{0}$-SSC), an efficient version of noisy $\ell^{0}$-SSC which also enjoys robustness to noise. Noisy-DR-$\ell^{0}$-SSC first projects the data onto a lower dimensional space by random projection, then performs noisy $\ell^{0}$-SSC on the dimensionality reduced data. Noisy-DR-$\ell^{0}$-SSC provably recovers the underlying subspace structure in the original data from the projected data under deterministic model. Experimental results show the effectiveness of both noisy $\ell^{0}$-SSC and Noisy-DR-$\ell^{0}$-SSC.

\vspace{-0.03in}
\subsection{Notations}
\vspace{-0.03in}

We use bold letters for matrices and vectors, and regular lowercase letters for scalars throughout this paper. The bold letter with superscript indicates the corresponding column of a matrix, e.g. $\bA^i$ is the $i$-th column of matrix $\bA$, and the bold letter with subscript indicates the corresponding element of a matrix or vector. $\|\cdot\|_F$ and $\|\cdot\|_p$ denote the Frobenius norm and the vector $\ell^{p}$-norm or the matrix $p$-norm, and $\norm{\cdot}{0}$ is the $\ell^{0}$-norm, that is, the number of nonzero elements of a vector. ${\rm diag}(\cdot)$ indicates the diagonal elements of a matrix. $\bH_{\bT} \subseteq \RR^d$ indicates the subspace spanned by the columns of $\bT$, and $\bA_{\bI}$ denotes a submatrix of $\bA$ whose columns correspond to the nonzero elements of $\bI$ (or with indices in $\bI$ without confusion). $\sigma_{t}(\cdot)$ denotes the $t$-th largest singular value of a matrix, and $\sigma_{\min}(\cdot)$ indicates the smallest singular value of a matrix. ${\rm supp}(\cdot)$ is the support of a vector, $\mathbb P_{\cS'}$ is the operator of orthogonal projection onto the subspace $\cS'$. $[n]$ represents all the natural numbers between $1$ and $n$ inclusively. $\unitsphere{d-1}$ denotes the unit sphere in $\RR^{d}$. $\Theta(a)$ denotes a number such that there exist two constants $c_1$ and $c_2$ such that $\Theta(a) \in [c_1a,c_2a]$.

\vspace{-0.03in}
\subsection{Contributions}\label{sec::contributions}
\vspace{-0.03in}

First, the correctness of noisy $\ell^{0}$-SSC on noisy data in terms of the subspace detection property is established for the first time, which is presented in Section~\ref{sec::noisy-l0ssc-correctness} of this paper. Our analysis is under both deterministic model and semi-random model, which are also the models employed by the geometric analysis of SSC~\citep{Soltanolkotabi2012}. Our randomized analysis demonstrates the significant advantage of noisy $\ell^{0}$-SSC over its $\ell^{1}$ counterpart and other competing subspace clustering methods in terms of much less restrictive condition on the subspace affinity. Table~\ref{table:theoretical-results} below demonstrates the conditions under which SDP holds for representative subspace clustering methods under the semi-random model, including Greedy Subspace Clustering (GSC)~\citep{ParkCS14},  $\ell^1$-SSC~\citep{ElhamifarV11}, Noisy SSC~\citep{WangX13}, Affine Sparse Subspace Clustering (ASSC)~\citep{Li2018-ASSC,YouLRV19-ASSC}, Noisy $\ell^{0}$-SSC~\citep{YangFJYH16-L0SSC-ijcv}. When the size of data $n$ grows exponentially in terms of the common subspace dimension $d_0$ (the dimension of every subspace is $d_0$), in particular, $n = \Theta(e^{d_0^{\tau}})$  for $\tau \in (0.5,0.9)$, then all the competing subspace clustering methods other than Noisy $\ell^{0}$-SSC either do not allow overlapping subspaces, or require the maximum pairwise subspace affinity to go to $0$ when $d_0 \to \infty$, which means that these methods require all the subspaces to be almost pairwise orthogonal when the common subspace dimension $d_0$ is very large. Instead, Noisy $\ell^{0}$-SSC allows the subspace affinity to be bounded away from $0$, suggesting that Noisy $\ell^{0}$-SSC is still able to recover subspaces which are not orthogonal when $d_0$ is very large. $\sigma'_{\min}$ in Table~\ref{table:theoretical-results} is defined in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda-random}.

In Table~\ref{table:theoretical-results}, it is preferred that a subspace clustering method requires milder conditions, namely allowing overlapping subspaces and a larger upper bound for the maximum subspace affinity, denoted by $\max_{k,l \in [K]}\text{aff}(\mathcal S_k,\mathcal S_l)$ where $\set{\cS_k}_{k=1}^K$ are $K$ subspaces, so that the underlying subspaces can be recovered for overlapping subspaces and for subspaces which are closer to each other (larger subspace affinity). Here $\text{aff}$ denotes subspace affinity, $d_0$ is the common subspace dimension, $\delta$ is a small positive constant (see~\citet{ParkCS14}). $r_0 > 1$ is an upper bound for the support of an optimal solution to the noisy $\ell^0$-SSC problem for all data points.  In this table, $n = \Theta(e^{d_0^{\tau}})$ for $\tau \in (0.5,0.9)$ when $d_0 \to \infty$. Note that two subspaces are overlapping subspaces if the dimension of their intersection is larger than $1$. When a subspace clustering method does not allow overlapping subspaces, then no condition on subspace affinity is presented in the subspace clustering literature.



Second, we propose Noisy Dimensionality Reduced $\ell^{0}$-Sparse Subspace Clustering (Noisy-DR-$\ell^{0}$-SSC) to accelerate noisy $\ell^{0}$-SSC with provable robustness to noise. Noisy-DR-$\ell^{0}$-SSC first projects the data onto a lower dimensional space by random projection, then performs noisy $\ell^{0}$-SSC on the dimensionality reduced data. Two types of random projections are used in Noisy-DR-$\ell^{0}$-SSC, which are the random projection induced by randomized low-rank approximation and the sparse random projection, in particular the ``Count-Sketch (SC) Projections''.

It should be emphasized that~\citet{Yang18-DRL0SSC} also studies dimensionality reduced $\ell^{0}$-SSC. However, the analysis of this work is performed on a constrained $\ell^0$-minimization problem and only for clean data without noise.
%\bal\label{eq:constrained-dr-l0ssc}%
%\mathop {\min }\limits_{{\bZ}}  \norm{\bZ}{0} \quad s.t.\;{\tilde \bX} = {{\tilde \bX}}{\bZ},\,\, {\rm diag}(\bZ) = \bzero,
%\eal%
%where $\tilde \bX$ is the dimensionality reduced version of clean data by random projection.
On the other hand, the actual optimization problem that~\citet{Yang18-DRL0SSC} solves is an unconstrained $\ell^{0}$-regularized problem.
%\bal\label{eq:noisy-dr-l0ssc-intro}%
%\mathop {\min }\limits_{{\bZ}} \ltwonorm{\tilde %\bX - \tilde \bX \bZ}^2 + \lambda %\norm{\bZ}{0}\quad s.t.\;,\,\, {\rm diag}(\bZ) = %\bzero.
%\eal%
In contrast, we analyze noisy $\ell^{0}$-SSC on the unconstrained $\ell^{0}$-regularized problem with noisy data which reveals the advantage of noisy $\ell^0$ SSC over $\ell^1$-SSC. Our analysis also suggests that a larger $\lambda$ tends to guarantee the subspace detection property (Remark~\ref{remark::lambda}), verified by experiments. Throughout the paper, we refer to $\ell^{0}$-SSC for noisy data with the unconstrained $\ell^{0}$-regularized problem as noisy $\ell^{0}$-SSC.

\section{Problem Setup}\label{sec::setup-method}


Sparse Subspace Clustering (SSC) methods, such as~\citep{ElhamifarV11,Soltanolkotabi2012,WangX13,yuan2014sparse}, construct a sparse similarity matrix by sparse representation of the data, and then perform clustering on the sparse similarity matrix.

We hereby introduce the notations for subspace clustering on noisy data considered in this paper. The uncorrupted data matrix is denoted by ${\bY}=[ {{\by_1},\ldots ,{\by_n}} ] \in {\RR^{d \times n}}$, where $d$ is the dimensionality and $n$ is the size of the data. The uncorrupted data $\bY$ lie in a union of $K$ distinct subspaces $\{\cS_k\}_{k=1}^K$ of dimensions $\{d_k\}_{k=1}^K$ with $d_{\max} \defeq \max_{k \in [K]} d_k$ and $d_{\min} \defeq \min_{k \in [K]} d_k$. The observed noisy data is $\bX = \bY + \bN$, where ${\bN}=[ {{\bn_1},\ldots ,{\bn_n}} ] \in {\RR^{d \times n}}$ is the additive noise. $\bx_i = \by_i + \bn_i$ is the noisy data point that is corrupted by the noise $\bn_i$. We let $\bY^{(k)} \in \RR^{d \times n_k}$ denote the data belonging to subspace $\cS_k$ with $\sum\limits_{k=1}^K n_k = n$, and denote the corresponding columns in $\bX$ by $\bX^{(k)}$. Let $\bU^{(k)} \in \RR^{d \times d_k}$ be the orthogonal basis of $\cS_k$ for all $k \in [K]$. The data $\bX$ are normalized such that each column has unit $\ell^{2}$-norm in our deterministic analysis. We consider deterministic noise model where the noise $\bN$ is fixed and $\max_{i \in [n]}\ltwonorm{\bn_i} \le \delta$.

%Note that our analysis can be extended to a random noise model which is common and also considered by noisy SSC~\citep{WangX13}, and the random noise model assumes that columns of $\bN$ are i.i.d. samples with maximum $\ell_2$-norm bounded by $\delta$ with high probability. Note that such random noise model does not require spherical symmetric noise as that in~\citet{WangX13}.




Formally, given observed data $\bX \in \RR^{d \times n}, \bX = \bth{\bx_1, \ldots, \bx_n}$, where $\bx_i \in \RR^d$, SSC solves the following optimization problem for each $i \in [n]$:
\bsal\label{eq:ssc-opt}
\min_{\bbeta \in \RR^n} {\lonenorm{\bbeta}} \quad \text{s.t. } \bx_i =  \bX \bbeta, \bbeta_i = 0.
\esal%
In order to handle noisy data, noisy SSC~\citep{WangX13} was proposed to solve the $\ell^{1}$ regularized problem:
\bsal\label{eq:noisy-ssc-opt}
\min_{\bbeta \in \RR^n} ||\bx_i -  \bX \bbeta||^2 + \lambda \lonenorm{\bbeta}, \quad \text{s.t. }  \bbeta_i = 0.
\esal%
The sparse code $\bbeta$ of the data point $\bx_i$ is obtained by solving (\ref{eq:ssc-opt}) or (\ref{eq:noisy-ssc-opt}) for SSC or noisy SSC.  A coefficient matrix $\bZ \in \RR^{n \times n}$ is then formed by concatenating the sparse codes of all the data points, and the $i$-th column of $\bZ$ is the sparse code of $\bx_i$. The sparse similarity matrix is then computed~by $\bW = \frac{|\bZ | + |\bZ ^{\top}|}{2}$. Subspace detection property (SDP, formally defined later) ensures that the similarity between data from different subspaces vanishes in the sparse similarity matrix. If SDP holds, then the similarity between~data points from different clusters vanishes in $\bW$. As a result, performing spectral clustering on $\bW$ leads to compelling clustering~results.

Under the independence assumption on the subspaces, low rank representation~\citep{LiuLY13} is proposed to recover the subspace structures. Relaxing the assumptions on the subspaces to allowing overlapping subspaces, the Greedy Subspace Clustering~\citep{ParkCS14} and the Low-Rank Sparse Subspace Clustering~\citep{Wang13-lrr-ssc} achieve subspace detection property with high probability. The geometric analysis in~\citet{Soltanolkotabi2012} shows the theoretical results on subspace recovery by SSC. In the following text, we use  SSC or $\ell^{1}$-SSC interchangeably to indicate the Sparse Subspace Clustering method in~\citet{Soltanolkotabi2012,ElhamifarV13}.

$\ell^{0}$-SSC~\citep{YangFJYH16-L0SSC-ijcv} proposes to solve the following $\ell^{0}$ sparse representation problem
\bsal\label{eq:l0ssc}
\min_{\bZ \in \RR^{n \times n}} \norm{\bZ}{0} \quad s.t. \,\, \bX = \bX \bZ, \,\, {\textup {diag}}(\bZ)= \bzero,
\esal%
and it proves that SDP is satisfied with a globally optimal solution to problem (\ref{eq:l0ssc}). In~\citet{YangFJYH16-L0SSC-ijcv}, the $\ell^{0}$ regularized sparse approximation problem below is solved so as to handle noisy data for $\ell^{0}$-SSC, which is the optimization problem of noisy $\ell^{0}$-SSC:
\bsal\label{eq:l0ssc-lasso}
\min_{\bZ \in \RR^{n \times n}, {\textup {diag}}(\bZ) = \bzero}  L(\bZ) = \norm{\bX - \bX \bZ}{F}^2 + \lambda\norm{\bZ}{0}.
\esal%
The optimization problem of noisy $\ell^{0}$-SSC (\ref{eq:l0ssc-lasso}) is separable. For each $i \in [n]$, the optimization problem with respect to the sparse code $\bbeta$ of $i$-th data point is
\bal\label{eq:noisy-l0ssc-i}
&\mathop {\min }\limits_{{\bbeta \in \RR^n,\bbeta_i = 0}} L(\bbeta) = {\|\bx_i - \bX \bbeta\|_2^2 + {\lambda}\|{\bbeta}\|_0}.
\eal%%
The sparse similarity matrix $\bW$ is then computed in the same way as $\ell^{1}$-SSC by $\bW = \frac{|\bZ| + |\bZ ^{\top}|}{2}$, and the subspace clustering result of noisy $\ell^{0}$-SSC is achieved by performing spectral clustering on $\bW$.

In the following text, we always use $\bbeta^*$ to denote an optimal solution to (\ref{eq:noisy-l0ssc-i}), and define $r^* \defeq \norm{\bbeta^*}{0}$.

The subspace detection property for noisy $\ell^{0}$-SSC and noiseless $\ell^{0}$-SSC, i.e., $\ell^{0}$-SSC on noiseless data, is defined in Definition~\ref{def::subspace-detection} below.
%\subsection{Optimization Problem for noisy $\ell^{0}$-SSC}
\begin{definition}\label{def::subspace-detection}
({Subspace detection property for noisy and noiseless $\ell^{0}$-SSC})
Let $\bZ^*$ be an optimal solution to (\ref{eq:l0ssc-lasso}). The subspaces $\{\cS_k\}_{k=1}^K$ and the data $\bX$ satisfy the Subspace Detection Property (SDP) for noisy $\ell^{0}$-SSC if ${\bZ^*}^i$ is a nonzero vector, and nonzero elements of ${\bZ^*}^i$ correspond to the columns of $\bX$ from the same subspace as $\by_i$ for all $1 \le i \le n$. %A matrix $\bZ \in \RR^{n \times n}$ satisfies the subspace detection property if $\bZ^i$ satisfies subspace detection property for $\by_i$ with $1 \le i \le n$.
%Similarly, in the noiseless setting where $\bX = \bY$, let $\bZ^*$ be an optimal solution to (\ref{eq:l0ssc}). The subspaces $\{\cS_k\}_{k=1}^K$ and the data $\bX$ satisfy the SDP for noiseless $\ell^{0}$-SSC if $\bZ^i$ is a nonzero vector, and nonzero elements of ${\bZ^*}^i$ correspond to the columns of $\bX$ that from the same subspace as $\by_i$ for all $1 \le i \le n$.
We say that SDP holds for $\bx_i$ if nonzero elements of ${\bZ^*}^i$, which is $\bbeta^*$ for problem (\ref{eq:noisy-l0ssc-i}), correspond to the data that lie in the same subspace as $\by_i$, for either noisy $\ell^{0}$-SSC or noiseless $\ell^{0}$-SSC.
\end{definition}



\section{Analysis for Noisy $\ell^{0}$-SSC}
\label{sec::noisy-l0ssc-correctness}

Similar to~\citet{Soltanolkotabi2012}, we introduce the deterministic and the semi-random model for the analysis of noisy $\ell^{0}$-SSC.
\begin{itemize}[leftmargin=*]
\item {\textbf{Deterministic Model:}} the subspaces and the data in each subspace are fixed.
\item {\textbf{Semi-Random Model:}} the subspaces are fixed but the data are independent and identically distributed in each of the subspaces.
%\item {\textbf{Fully-Random Model:}} both the subspaces and the data of each subspace are independent and identically distributed.
\end{itemize}
The data in the above definitions refer to clean data without noise. Both the deterministic model and the semi-random model are extensively employed to analyze the subspace detection property in the subspace learning literature~\citep{Soltanolkotabi2012,Wang13-lrr-ssc,WangX13,Wang16-graphconnectivity}.
%\begin{figure}[!hbt]
%\begin{center}
%\includegraphics[width=0.18\textwidth]{sphere.eps}
%\end{center}
%   \caption{Illustration of an external subspace. All the data $\bY$ are %normalized to have unit norm for illustration purpose, so they lie on the %surface of the sphere. $\cS_1$ and $\cS_2$ are two subspaces in the three-dimensional %ambient space. The subspace spanned by $\by_i \in \cS_1$ and $\by_j \in %\cS_2$ is an external subspace, and the intersection of this external subspace %and $\cS_1$ is a dashed line ${\by_i}{\bO}{A}$.  }
%\label{fig:external-subspace}
%\end{figure}
%\subsection{Noiseless $\ell^{0}$-SSC}
%Suppose the data ${\bX}=[ {{\bx_1},\ldots ,{\bx_n}} ] \in {\RR^{d \times n}}$ lie in a union of $K$ distinct subspaces $\{\cS_k\}_{k=1}^K$ of dimensions $\{d_k\}_{k=1}^K$. Let $\bX^{(k)} \in \RR^{d \times n_k}$ denote the data that belong to subspace $\cS_k$, and $\sum\limits_{k=1}^K n_k = n$.
%In addition to the theoretical results for noiseless $\ell^{0}$-SSC under semi-random model in~\citet{YangFJYH16-L0SSC-ijcv} (see supplementary), we present result under deterministic model in Theorem~\ref{theorem::l0ssc-deterministic}. %For semi-random model, noiseless $\ell^{0}$-SSC only requires assumptions of distinct subspaces and continuous data distribution in each subspace, which are much milder than the assumptions required by previous subspace clustering methods including SSC in terms of subspaces and random data generation (refer to more details in~\citet{YangFJYH16-L0SSC-ijcv}).

%Let $\cD_k$ indicates the data distribution of subspace $\cS_k$. and the data distribution in each subspace is only required to be continuous, which indicates that the data distribution in each subspace is non-degenerate in the sense that the probability measure of any subspace of dimension less than that of the subspace is $0$.
%not ``degenerate'', i.e. the probability measure of any low-dimensional subspace in each subspace is $0$. This assumption is defined in Definition~\ref{def::zero-prob-subspace}. For example, it holds if each subspace has continuous data distribution.
%\begin{definition}\label{def::zero-prob-subspace}
%(\textit{Assumption of Zero-Probability Hyperplane})
%For any $1 \le k \le K$, the data distribution $\cD_k$ of the subspace $\cS_k$ is said to satisfy the zero-probability subspace assumption, if the probability measure of any subspace $H \subset \cS_k$ of dimension less than $d_k$ is $0$, i.e. $\Pr[H] = 0$ for subspace $H \subset \cS_k$ and ${\rm Dim}[H] < d_k$.
%\end{definition}



%Note that the correctness of noiseless $\ell^{0}$-SSC under deterministic model in Theorem~\ref{theorem::l0ssc-deterministic} requires that the data in each subspace are away from the external subspaces, which is much easier to check than the conditions involving inradius and incoherence required by noiseless SSC in~\citet{Soltanolkotabi2012}. Also, it can be verified that Theorem $1$ in~\citet{YangFJYH16-L0SSC-ijcv} follows from Theorem~\ref{theorem::l0ssc-deterministic} with the observation that the external subspaces in $\cH_k$ are the confusion area where the subspace detection property fails to hold, and the probability measure of the intersection of a subspace and any of its associated external subspaces is $0$ due to the continuous data distribution in each subspace. This indirectly demonstrates the merit of Theorem~\ref{theorem::l0ssc-deterministic}, since the correctness of noiseless $\ell^{0}$-SSC under semi-random model in Theorem $1$ in~\citet{YangFJYH16-L0SSC-ijcv} only requires assumptions of distinct subspaces and continuous data distribution in each subspace, which are much milder than the assumptions required by previous subspace clustering methods, such as those in terms of subspace affinity and uniform data distribution in each subspace, and please refer to~\citet{YangFJYH16-L0SSC-ijcv} for more details.


\subsection{Noisy $\ell^{0}$-SSC: Deterministic Analysis}


We first introduce the definition of general position and external subspace before our analysis on noisy $\ell^{0}$-SSC.
\begin{definition}\label{def::general-position}
{\rm (General position)}
For any $1 \le k \le K$, the data $\bY^{(k)}$ are in general position if any subset of $L \le d_k$ data points (columns) of $\bY^{(k)}$ are linearly independent. $\bY$ are in general position if $\bY^{(k)}$ are in general position for $1 \le k \le K$.
\end{definition}
The assumption of general position is rather mild. In fact, if the data points in $\bY^{(k)}$ are independently distributed according to any continuous distribution, then they are almost surely in general position.

Let the distance between a point $\bx \in \RR^d$ and a subspace $\cS \subseteq \RR^d$ be defined as $d(\bx,\cS) = \inf_{\by \in \cS} \|\bx-\by\|_2$. The definition of external subspaces is presented as follows. %Figure~\ref{fig:external-subspace} illustrates an example of external subspace. %External subspace is also introduced in~\citet{Yang18-DR-L0SSC} for analyzing SDP of $\ell^{0}$-SSC on dimensionality reduced clean data, and it is essential for our analysis on the subspace detection property of noisy $\ell^{0}$-SSC.
\begin{definition}\label{def::external-subspace}
{\rm (External subspace of limited dimension)}
For a point $\by \in \bY^{(k)}$, a subspace $\bH_{\{\by_{i_j}\}_{j=1}^L}$ spanned by a set of linearly independent points $\{\by_{i_j}\}_{j=1}^L \subseteq \bY$ is defined to be an external subspace of $\by$ if $\{\by_{i_j}\}_{j=1}^L \not \subseteq \bY^{(k)}$ and $\by \notin \{\by_{i_j}\}_{j=1}^L$. The set of all external subspaces of $\by$ of dimension no greater than $r$ with $r \ge 1$ is denoted by $\cH_{\by,r}$, that is, $\cH_{\by,r} = \{\bH \colon \bH = \bH_{\{\by_{i_j}\}_{j=1}^L}, {\rm dim}[\bH] = L, L \le r, \{\by_{i_j}\}_{j=1}^L \not \subseteq \bY^{(k)}, \by \notin \{\by_{i_j}\}_{j=1}^L \}$. The point $\by$ is said to be away from its external subspaces of dimension $r$ if $\min_{\bH \in \cH_{\by, r}} d(\by, \bH) > 0$. All the data points in $\bY^{(k)}$ are said to be away from the external subspaces if each of them is away from its associated external subspaces.
\end{definition}

We also need the following definitions related to the spectrum of $\bX$ and $\bY$. In the following analysis, we use $\bbeta$ to denote the sparse code of datum $\bx_i$, which is a simpler notation than $\bZ^i$.

\begin{definition}\label{def::minimum-eigenvalue}
The minimum restricted eigenvalue of the uncorrupted data is defined as
\bals%\label{eq:restricted-minimum-eigenvalue}
&\sigma_{\bY,r} \defeq \min_{\bbeta: \|\bbeta\|_0 = r, {\rm rank}(\bY_{\bbeta}) = \|\bbeta\|_0} \sigma_{\min}(\bY_{\bbeta})
\eals%%
for $r \ge 1$. In addition, the normalized minimum restricted eigenvalue of the uncorrupted data is defined by
\bals%\label{eq:normalzied-restricted-minimum-eigenvalue}
&\bar \sigma_{\bY,r} \defeq \frac{\sigma_{\bY,r}}{\sqrt{r}}.
\eals%%
\end{definition}
Moreover, the following quantities are defined for our analysis. We define
\bal\label{eq:equivalence-noisy-l0ssc-tau0}
&\tau_0 \defeq \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*} + \tau_1,
\eal%%
where
\bal\label{eq:equivalence-noisy-l0ssc-tau1}
&\tau_1 \defeq \frac{\delta} {\bar \sigma_{\bY}^* - \delta}, \quad \sigma_{\bX}^* \defeq \sigma_{\min}(\bX_{\bbeta^*}),
\eal%%
with $\delta < \bar \sigma_{\bY}^*$, and $\bar \sigma_{\bY}^*$ is defined as
\bal\label{eq:l0ssc-bar-sigma-star}
&\bar \sigma_{\bY}^* \defeq \min_{1 \le r < r^*} \bar \sigma_{\bY,r}.
\eal%%

Now we present our main result on  noisy $\ell^{0}$-SSC. %let $\sigma_1 = \min(\sigma, \sigma_{\bX}^*)$
%\begin{theorem}\label{theorem::noisy-l0ssc-subspace-detection}
%{\rm (Subspace detection property holds for noisy $\ell^{0}$-SSC)}
%Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$, and $c^* \defeq \|\bx_i - \bX \bbeta^*\|_2$. Suppose $\bY$ is in general position, $\by_i \in \cS_k$ for some $1 \le k \le K$, $\delta < \bar \sigma_{\bY}^*$, ${\rm HS}(\by_i,\bY,\bbeta^*) > \tau_0+2\tau_1+2\delta$, $d(\bx_i, \cS_k) \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$, $\bB(\by_i, \delta+c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}) \cap \bH = \emptyset$ for any $\bH \in \cH_{\by_i, d_k}$. Then the subspace detection property holds for $\bx_i$ with $\bbeta^*$. Here $\tau_0$, $\tau_1$, $\bar \sigma_{\bY}^*$ and $\sigma_{\bX}^*$ are defined in Lemma~\ref{lemma::equivalence-noisy-l0ssc}.
%\end{theorem}
\begin{theorem}\label{theorem::noisy-l0ssc-subspace-detection}
{\rm (Subspace detection property holds for noisy $\ell^{0}$-SSC)}
Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* \ge 1$, and $c^* \defeq \|\bx_i - \bX \bbeta^*\|_2$. Suppose $\bY$ is in general position, $\by_i \in \cS_k$ for some $1 \le k \le K$, $\delta < \bar \sigma_{\bY}^*$, $\lambda > \tau_0$, $\bB(\by_i, \delta+c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}) \cap \bH = \emptyset$ for any $\bH \in \cH_{\by_i, r^*}$. Then the subspace detection property holds for $\bx_i$ with $\bbeta^*$. Here $\tau_0$, $\tau_1$, $\bar \sigma_{\bY}^*$ and $\sigma_{\bX}^*$ are defined in (\ref{eq:equivalence-noisy-l0ssc-tau0}), (\ref{eq:equivalence-noisy-l0ssc-tau1}) and (\ref{eq:l0ssc-bar-sigma-star}).
\end{theorem}
\begin{remark}
When $\delta = 0$ and there is no noise in the data $\bX$, the conditions for the correctness of noisy $\ell^{0}$-SSC in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection} almost reduce to that for noiseless $\ell^{0}$-SSC. To see this, the conditions are reduced to $\bB(\by_i,c^*) \cap \bH = \emptyset$, which are exactly the conditions required by noiseless $\ell^{0}$-SSC in Lemma~\ref{lemma::l0ssc-deterministic} in the supplementary, namely data are away from the external subspaces by choosing $\lambda \to 0$ and it follows that $c^* = 0$.%Also, the correctness of noisy $\ell^{0}$-SSC under the deterministic model only requires that the subspaces are distinct, and such assumption on subspaces is rather mild compared to SSC~\citep{ElhamifarV13} and noisy SSC~\citep{WangX13}.
\end{remark}

While Theorem~\ref{theorem::noisy-l0ssc-subspace-detection} establishes geometric conditions under which the subspace detection property holds for noisy $\ell^{0}$-SSC, it can be seen that these conditions are often coupled with an optimal solution $\bbeta^*$ to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}). In the following theorem, the correctness of noisy $\ell^{0}$-SSC is guaranteed in terms of $\lambda$, the weight for the $\ell^{0}$ regularization term in (\ref{eq:noisy-l0ssc-i}), and the geometric conditions independent of an optimal solution to (\ref{eq:noisy-l0ssc-i}).

Let $M_i >0$ be the minimum distance between $\by_i \in \cS_k$ and its external subspaces of dimension no greater than $d_k$ when $\by_i$ is away from these external subspaces, that is,
\bal\label{eq:Mi}
&M_{i} \defeq \min\{d(\by_i, \bH) \colon \bH \in \cH_{\by_i,d_k}\},
\eal%%
%Define
%\bal\label{eq:D-distance-to-subspaces}
%&D_{i,\bX,\bbeta,\bbeta'} \defeq d(\bx_i,\bH_{\bX_{\bbeta}}) - d(\bx_i,\bH_{\bX_{\bbeta'}}),
%\eal%%
%then
%$\tilde d_{i,r}$ is defined to be the minimum absolute difference between distance of $\bx_i$ to subspaces spanned by linearly independent columns of $\bX$ of different size bounded by $r$:
%\bal\label{eq:tilde-di}
%&\tilde D_{i,r} \defeq \min\{ D_{i,\bX,\bbeta,\bbeta'} \colon D_{i,\bX,\bbeta,\bbeta'} \ge (\|\bbeta'\|_0 - \|\bbeta\|_0) \lambda, \nonumber \\
%& \|\bbeta\|_0 < \|\bbeta'\|_0 \le r,
%{\rm rank}(\bX_{\bbeta'}) = \|\bbeta'\|_0, {\rm rank}(\bX_{\bbeta}) = \|\bbeta\|_0 \}
%\eal%%
The following two quantities related to the spectrum of clean and noisy data, $\mu_{r}$ and $\sigma_{\bX,r}$, are defined as follows with $r > 1$ for the analysis in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda}.
\bal\label{eq:mu-r}
&\mu_{r} \defeq \frac{\delta} {\min_{1 \le r' < r} \bar \sigma_{\bY,r'} - \delta},
\eal%%
\bal\label{eq:sigma-X-r}
&\sigma_{\bX,r} \defeq \min\{\sigma_{\min}(\bX_{\bbeta}) \colon 1 \le \|\bbeta\|_0 \le r\}.
\eal%%

\begin{theorem}\label{theorem::noisy-l0ssc-subspace-detection-lambda}
{\rm (Subspace detection property holds for noisy $\ell^{0}$-SSC under deterministic model with conditions in terms of $\lambda$)}
Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$, $n_k \ge d_k+1$ for every $k \in [K]$, and there exists $1 < r_0 \le \floor{\frac{1}{\lambda}}$ such that $r^* \le r_0$. Suppose $\bY$ is in general position, $\by_i \in \cS_k$ for some $1 \le k \le K$, $\delta < \bar \sigma_{\bY}^*$, and $M_{i,\delta} \defeq M_i - \delta$. Suppose
\bal\label{eq:noisy-l0ssc-sdp-M}
&M_{i,\delta}  > \frac{2\delta}{\sigma_{\bX,r_0}},
\eal%%
and
\bal\label{eq:noisy-l0ssc-sdp-mu}
&\mu_{r_0} < 1-\frac{2\delta}{\sigma_{\bX,r_0}}.
\eal%%
Then if
\bal\label{eq:noisy-l0ssc-sdp-lambda}
& \lambda_{0} < \lambda < 1,
\eal%%
where $\lambda_0 \defeq \max\{\lambda_1,\lambda_2\}$ and
\bal
&\lambda_{1} \defeq \inf\{0 < \lambda < 1 \colon \sqrt{1-\lambda} + \frac{2\delta}{\sigma_{\bX,r_0} \sqrt{\lambda}}< M_{i,\delta}\}, \label{eq:noisy-l0ssc-sdp-min-lambda-1} \\
&\lambda_{2} \defeq \inf\{0 < \lambda < 1 \colon \lambda - \frac{2\delta}{\sigma_{\bX,r_0}} \frac{1}{\sqrt{\lambda}} > \mu_{r_0}\}, \label{eq:noisy-l0ssc-sdp-min-lambda-2}
\eal%%
the subspace detection property holds for $\bx_i$ with $\bbeta^*$. Here $M_i$, $\mu_{r_0}$, $\sigma_{\bX,r_0}$ are defined in (\ref{eq:Mi}), (\ref{eq:mu-r}), (\ref{eq:sigma-X-r}) respectively.
\end{theorem}
\begin{remark}
The two conditions (\ref{eq:noisy-l0ssc-sdp-M}) and (\ref{eq:noisy-l0ssc-sdp-mu}) are induced by two conditions, $\bB(\by_i, \delta+c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}) \cap \bH = \emptyset$ for any $\bH \in \cH_{\by_i, d_k}$ and $\lambda > \tau_0$ respectively, which are required by Theorem~\ref{theorem::noisy-l0ssc-subspace-detection}. Note that when (\ref{eq:noisy-l0ssc-sdp-M}) and (\ref{eq:noisy-l0ssc-sdp-mu}) hold, $\lambda_{1}$ and $\lambda_{2}$ can always be chosen in accordance with (\ref{eq:noisy-l0ssc-sdp-min-lambda-1}) and (\ref{eq:noisy-l0ssc-sdp-min-lambda-2}).
\end{remark}
\begin{remark}\label{remark::lambda}
It can be observed from condition (\ref{eq:noisy-l0ssc-sdp-lambda}) that noisy $\ell^{0}$-SSC encourages sparse solution by a relatively large $\lambda$ so as to guarantee the subspace detection property. This theoretical finding is consistent with the empirical study shown in the experimental results.% demonstrated in~\citet{YangFJYH16-L0SSC-ijcv}, where the authors chooses $\lambda = 0.5$ empirically throughout all the experiments, and the average number of nonzero elements of the sparse code for each data point is around $3$ for most cases.
\end{remark}

\subsection{Noisy $\ell^{0}$-SSC: Randomized Analysis}
The correctness of noisy $\ell^{0}$-SSC is analyzed under the semi-random model, in which the data in subspace $\cS_k$ are i.i.d.\ according to the uniform distribution on the unit sphere, $\unitsphere{d_k-1}$, of $\RR^{d_k}$ centered at the origin for all $k \in [K]$. This setting is employed extensively in the subspace learning literature~\citep{Soltanolkotabi2012,Wang13-lrr-ssc,WangX13,Wang16-graphconnectivity}. We then have the major theorem below stating the theoretical guarantee of the subspace detection property of noisy $\ell^{0}$-SSC under the semi-random model. Before stating this theorem, we introduce the following definition of subspace affinity, which is widely used in the analysis of the semi-random model in the sparse subspace clustering literature.

\begin{definition}
\label{def::subspaces-affinity}
(\textup{Subspace affinity})
The affinity between two subspaces, $\cS_k$ and $\cS_l$ with $k,l \in [K]$, is defined by
\bsals%\label{eq:subspaces-affinity}
&{\rm aff}(\cS_k,\cS_l) = \sqrt{\sum\limits_{t=1}^{\min\{d_k,d_l\}} \cos^2 \theta_{kl}^{(t)}},
\esals%
where $\cos \theta_{kl}^{(t)}$ is the cosine of the $t$-th canonical angle between $\cS_k$ and $\cS_l$ defined in~\citet{Soltanolkotabi2012}. Let $\bU^{(k)}$ and $\bU^{(l)}$ be orthonormal bases for $\cS_k$ and $\cS_l$ respectively, then
\begin{small}\begin{align*}
&\cos \theta_{kl}^{(t)} = \sup_{\bu \in \cS_k, \bv \in \cS_l} \frac{\bu^{\top}\bv}{\|\bu\|_2 \|\bv\|_2} = \frac{{\bu^t}^{\top} \bv^t}{\|\bu^t\|_2 \|\bv^t\|_2},
\end{align*}\end{small}%
with orthogonality:  $\bu^{\top} \bu^j=0$, $\bv^{\top} \bv^j=0$, $j = 1,\ldots,t-1$.
It can be verified that ${\rm aff}(\cS_k,\cS_l) = \|{\bU^{(k)}}^{\top}\bU^{(l)}\|_F$.
\end{definition}

\begin{theorem}\label{theorem::noisy-l0ssc-subspace-detection-lambda-random}
{\rm (Subspace detection property holds for noisy $\ell^{0}$-SSC under semi-random model with conditions in terms of $\lambda$)}
Under the semi-random model, let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$, $n_k \ge d_k+1$ for every $1\le k \le K$, and there exists $1 < r_0 \le \floor{\frac{1}{\lambda}}$ such that $r^* \le r_0$. Suppose $c_1 > 0$ is an arbitrarily small constant, $\eps_0, \eps_1 > 0$ are small constants, and $d_k$ is large enough such that $d_k \ge \floor{\frac{1}{\lambda}}$, $2d^{-0.05}_k + 2d^{-0.1}_k \le \eps_0$ and $\sqrt{\frac{1}{\lambda d_k}} + \sqrt{ \frac{2}{\lambda d_k} \log{\frac{en_k}{r_0}}} \le \eps_1$ hold for all $k \in [K]$. Define
\bsal\label{eq:sigma-min-clean-data}
\sigma'_{\min} \defeq \frac{1}{1+\eps_0} \pth{1-\sqrt{c_1} - \eps_1},
\esal%,
$c \defeq \sqrt{\frac{\sigma'^2_{\min} - (r_0-1)\max_{t_1,t_2 \in [K], t_1 \neq t_2}{\rm aff}(\cS_{t_1}, \cS_{t_2})}{r_0}}$. For $t>0$ such that $\frac{1}{d_{\max}} - 2t \sqrt{1 - \frac 1 {d_{\max}}}-t^2 > 0$, suppose
\begin{align}\label{eq:noisy-l0ssc-sdp-random-subspaces-affnity}
&\max_{t_1,t_2 \in [K], t_1 \neq t_2} {\rm aff}(\cS_{t_1}, \cS_{t_2}) < \frac{\sigma'^2_{\min}}{r_0-1},\\\label{eq:noisy-l0ssc-sdp-random-cond1}
&\hspace{0.8in}\delta < c,\\\label{eq:noisy-l0ssc-sdp-random-cond2}
&\hspace{0in}\delta + \frac{2\delta}{\sqrt{r_0} (c-\delta) } \le \frac{1}{d_{\max}} - 2t\sqrt{1 - \frac 1 {d_{\max}}}-t^2,\\\label{eq:noisy-l0ssc-sdp-random-cond3}
&\hspace{0.3in}\frac{\delta}{c-\delta} + \frac{2\delta}{\sqrt{r_0} (c-\delta)} < 1,\\\label{eq:noisy-l0ssc-sdp-lambda-random}
&\hspace{0.8in} \lambda'_{0} < \lambda < 1,
\end{align}
where $M \defeq \sup_{0 \le t < 1} \frac{2 t^3 \arccos t}{\pi} < 1$, $\lambda'_0 \defeq \max\{\lambda'_1,\lambda'_2\}$,
\bals
&\lambda'_{1} \defeq \inf\{0 < \lambda < 1 \colon \sqrt{1-\lambda} + \frac{2\delta}{\sqrt{r_0} (c-\delta) \sqrt{\lambda}} \nonumber \\
&< \frac{1}{d_{\max}} - 2t\sqrt{1 - \frac 1 {d_{\max}}}-t^2 -\delta\},
%\label{eq:noisy-l0ssc-sdp-min-lambda-1-random}
\\
&\lambda'_{2} \defeq \inf\{0 < \lambda < 1 \colon \lambda - \frac{2\delta}{\sqrt{r_0} (c-\delta)} \frac{1}{\sqrt{\lambda}} > \frac{\delta}{c-\delta}\}. %\label{eq:noisy-l0ssc-sdp-min-lambda-2-random}
\eals%% %Lemma~\ref{lemma::point-to-subspace-concentration}
When the conditions in Lemma~\ref{lemma::point-to-subspace-concentration} of the supplementary hold for all $k \in [K]$ and every point $\by \in \bY^{(k)}$, then with probability at least $1-\sum_{k=1}^K \pth{\exp(-c_1 d_k)+2n_k\exp\pth{-d^{0.9}_k}} -8\sum\limits_{k=1}^K n_k\exp(-\frac{d_k t^2}{2})$, the subspace detection property holds for $\bx_i$ with $\bbeta^*$ for all $i \in [n]$.
\end{theorem}

\begin{remark}[\textup{Advantage of Noisy $\ell^{0}$-SSC in terms of Subspace Affinity}]
\label{remark::subspace-affinity}
It is well known that the difficulty of achieving the subspace detection property increases with larger affinity between subspaces, that is, when the subspaces are closer to each other.
Our analysis reveals the significant advantage of noisy $\ell^{0}$-SSC over $\ell^1$-SSC in terms of the maximum subspace affinity. To the best of our knowledge, the best theoretical result of $\ell^1$-SSC, including its geometrical analysis~\citep{Soltanolkotabi2012} and the subsequent works on noisy or dimensionality-reduced data~\citep{WangX13,Wang2015-dr-l1ssc}, requires that the maximum subspace affinity satisfies
\bsal\label{eq:subspace-affinity-l1ssc}
&\max_{t_1,t_2 \in [K], t_1 \neq t_2} {\rm aff}(\cS_{t_1}, \cS_{t_2}) < \sqrt{d_0} \cdot \frac{\bar c \sqrt{\log{\rho}}}{8{\sqrt 2} \log{n}},
\esal
under the setting in~\citet{Soltanolkotabi2012} that $d_k = d_0$ and $n_k = \rho d_0 + 1$ for all $k \in [K]$, so that $n = K (\rho d_0 + 1)$. When $n  > \exp\pth{d_0^\tau}$ for $\tau \in (0.5,0.9)$, then (\ref{eq:subspace-affinity-l1ssc}) requires $\max_{t_1,t_2 \in [K], t_1 \neq t_2} {\rm aff}(\cS_{t_1}, \cS_{t_2}) \to 0$ when $d_0 \to \infty$, while the condition of noisy $\ell^{0}$-SSC, (\ref{eq:noisy-l0ssc-sdp-random-subspaces-affnity}), only requires that $\max_{t_1,t_2 \in [K], t_1 \neq t_2} {\rm aff}(\cS_{t_1}, \cS_{t_2})  < \frac{\sigma'^2_{\min}}{r_0-1}$ when $d_0$ is sufficiently large. Such a less restrictive condition on the maximum subspace affinity reveals the theoretical advantage of noisy $\ell^{0}$-SSC over $\ell^1$-SSC.
\end{remark}
%In addition, while~\citep{Soltanolkotabi2012,WangX13} require data in each subspace are i.i.d according to uniform distribution on unit sphere, our randomized result requires data in each subspace are i.i.d. isotropic random vectors on sphere of radius $\sqrt{d_k}$. Note that i.i.d samples uniformly distributed on sphere of radius $\sqrt{d_k}$ centered at the origin are in fact isotropic, our assumption is less restrictive after scaling the data by a factor of $\sqrt{d_k}$.



\section{Noisy $\ell^{0}$-SSC on Dimensionality Reduced Data: Noisy-DR-$\ell^{0}$-SSC}
\label{sec::noisy-dr-l0ssc}
Albeit the theoretical guarantee and compelling empirical performance of noisy $\ell^{0}$-SSC to be shown in the experimental results, the computational cost of noisy $\ell^{0}$-SSC is high with the high dimensionality of the data. In this section, we propose Noisy Dimensionality Reduced $\ell^{0}$-SSC (Noisy-DR-$\ell^{0}$-SSC) which performs noisy $\ell^{0}$-SSC on dimensionality reduced data. The theoretical guarantee on the correctness of Noisy-DR-$\ell^{0}$-SSC under the deterministic model as well as its empirical performance are presented. %Our analysis in Section~\ref{sec::dr-l0ssc-correctness} shows the correctness of DR-$\ell^{0}$-SSC under both the semi-random model and the deterministic model, and these models are introduced in Section~\ref{sec::models}. Section~\ref{sec::randomized-low-rank} and Section~\ref{sec::general-rp} provides theoretical guarantee on the correctness of DR-$\ell^{0}$-SSC using two different randomized linear transformation, i.e. the random projection by randomized low-rank approximation of the data and the random projection that approximately preserves the $\ell^{2}$-norm. This analysis is under the deterministic model wherein the subspaces and the data in each subspace are nonrandom, which is also the model employed by~\citep{ElhamifarV13}.

\subsection{Method}
\label{sec::noisy-dr-l0ssc-method}
Noisy-DR-$\ell^{0}$-SSC performs subspace clustering by the following two steps: 1) obtain the dimension reduced data $\tilde \bX = \bP \bX$ with a linear transformation $\bP \in \RR^{p \times d}$ ($p < d$). 2) perform noisy $\ell^{0}$-SSC on the compressed data $\tilde \bX$:
\bal\label{eq:noisy-dr-l0ssc-i}
&\mathop {\min }\limits_{{\tilde \bbeta \in \RR^n, \tilde \bbeta_i = 0}} L(\tilde \bbeta) = {\|\tilde \bx_i - \tilde \bX \tilde \bbeta\|_2^2 + {\tilde \lambda}\|{\tilde \bbeta}\|_0}.
\eal%%
If $p < d$, Noisy-DR-$\ell^{0}$-SSC operates on the compressed data $\tilde \bX$ rather than on the original data, so that the efficiency is improved. We  introduce two  types of random projection for Noisy-DR-$\ell^{0}$-SSC in the following two subsections.

\subsection{Randomized Low-Rank Approximation}
\label{sec::low-rank-approx}
High-dimensional data often exhibits low-dimensional structures, which often leads to low-rankness of the data matrix. Intuitively, if the data is low rank, then it could be safe to perform noisy $\ell^{0}$-SSC on its dimensionality reduced version by the linear projection $\bP$, and it is expected that $\bP$ can preserve the information of the subspaces contained in the original data as much as possible, while effectively removing uninformative dimensions. To this end, we propose to choose $\bP$ as a random projection induced by randomized low-rank approximation of the data.

The  merit of random projection (RP) is highlighted by the celebrated Johnson-Lindenstrauss Lemma~\citep{Article:JL84}.  In the past 20 years or more, RP has been used extensively in dimension reduction, approximate near neighbor search, compressed sensing, computational biology, etc~\citep{Proc:Dasgupta_UAI00,Proc:Bingham_KDD01,Article:Buher_01,Article:Achlioptas_JCSS03,Proc:Fern_ICML03,Proc:Datar_SCG04,Article:Candes_IT06,Article:Donoho_IT06,Proc:Frund_NIPS07,li2007very,li2017simple,li2019sign}. In particular,  RP has been employed to accelerate  numerical matrix computation and matrix optimization problems, including matrix decomposition~\citep{Frieze2004-fast-monto-carlo-lowrank,Drineas2004-large-graph-svd,Sarlos2006-large-matrix-random-projection,
Drineas2006-fast-monto-carlo-lowrank,Drineas2008-matrix-decomposition,Mahoney2009-matrix-decomposition,
Drineas2011-fast-least-square-approximation,Lu2013-fast-ridge-regression-random-subsample}.

Formally, a random matrix $\bT \in \RR^{n \times p}$ is generated such that each element $\bT_{ij}$ is sampled independently according to the Gaussian distribution $\cN(0,1)$. QR decomposition is then performed on $\bX \bT$ to obtain the basis of its column space, namely $\bX \bT = \bQ \bR$ where $\bQ \in \RR^{d \times p}$ is a matrix of rank $p$ with orthonormal columns and $\bR \in \RR^{p \times p}$ is an upper triangular matrix. The columns of $\bQ$ form an orthonormal basis for the column space of the sample matrix $\bX \bT$. An approximation of $\bX$ is then obtained by projecting $\bX$ onto the column space of $\bX \bT$: $\bQ\bQ^{\top}\bX = \bQ\bW = \hat \bX$ where $\bW = \bQ^{\top}\bX \in \RR^{p \times n}$. In this manner, a randomized low-rank decomposition of $\bX$ is achieved by
\bals%\label{eq:low-rank-decomposition}
&\hat \bX = \bQ\bW.
\eals%%
It is proved that the low-rank approximation $\hat \bX$ is close to $\bX$ in spectral norm~\citep{Halko2011-random-matrix-decomposition}. We present a probabilistic result in Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection} on the correctness of Noisy-DR-$\ell^{0}$-SSC using the random projection induced by randomized low-rank decomposition of the data $\bX$, namely $\bP = \bQ^{\top}$. In the sequel, $\tilde \bx = \bP \bx$ for any $\bx \in \RR^d$. To guarantee the subspace detection property on the dimensionality-reduced data $\tilde \bX$, it is crucial to ensure that the conditions, such as (\ref{eq:noisy-l0ssc-sdp-M}) and (\ref{eq:noisy-l0ssc-sdp-mu}) in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda}, still hold after the linear transformation.

Each subspace $\cS_k$ is transformed into $\tilde \cS_k = \bP (\cS_k)$ with dimension $\tilde d_k$. We denote by $\tilde \bbeta^*$ an optimal solution to (\ref{eq:noisy-dr-l0ssc-i}), and define $C_{p,p_0} \defeq \big(1+17\sqrt{1+\frac{p_0}{p-p_0}}\big) \sigma_{p_0+1} + \frac{8\sqrt{p}}{p-p_0+1} (\sum\limits_{j > p_0} \sigma_j^2)^{\frac{1}{2}}$ with $p_0 \ge 2$. We also define the following quantities for the convenience of our analysis, which correspond to $M_i$, ${\bar \sigma}_{\bY,r}$, $\sigma_{\bX,r}$ and $\mu_{r}$ used in the analysis on the original data:
\bal\label{eq:tilde-Mi}
&\tilde M_i \defeq \min\{d(\tilde \by_i, \bH) \colon \bH \in \cH_{\tilde \by_i, \tilde d_k}\},
\eal%%
where $\cH_{\tilde \by_i, \tilde d_k}$ is all the external subspaces of $\tilde \by_i$ with dimension no greater than $\tilde d_k$ in the transformed space~by~$\bP$,
\bal
{\bar \sigma}_{\tilde \bY,r} &\defeq \min_{\bbeta: \|\bbeta\|_0 = r, {\rm rank}(\tilde \bY_{\bbeta}) = \|\bbeta\|_0} \sigma_{\min}(\tilde \bY_{\bbeta}),\label{eq:tilde-bar-sigma-Y-r}
 \\
\sigma_{\tilde \bX,r} &\defeq \min\{\sigma_{\min}(\tilde \bX_{\bbeta}) \colon 1 \le \|\bbeta\|_0 \le r\}, \label{eq:tilde-sigma-X-r} \\
\tilde \mu_{r} &\defeq \frac{\delta} {\min_{1 \le r' < r}  {\bar \sigma}_{\tilde \bY,r'} - \delta}. \label{eq:tilde-mu-r}
\eal%%

\begin{theorem}\label{theorem::noisy-dr-l0ssc-subspace-detection}
{\rm (Subspace detection property holds for Noisy-DR-$\ell^{0}$-SSC under deterministic model)}
Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$, $n_k \ge d_k+1$ for every $1\le k \le K$, and there exists $1 < r_0 \le d_k$ such that $r^* \le r_0 \le \floor{\frac 1\lambda}$. Suppose $\bY$ is in general position, $\delta < \min_{1 \le r < r_0} \bar \sigma_{\bY,r}$, and $\tilde M_{i,\delta} \defeq \tilde M_i - \delta$. Furthermore, suppose the following conditions hold:
\begin{enumerate}[leftmargin=*]
\item[(i)]
$C_{p,p_0} + 2\delta \sqrt{\tilde d_{\max}} < \min_{k = 1,\ldots,K} \sigma^{(k)}_{\bY}$, \\
where $\tilde d_{\max} \defeq \max_{k} \tilde d_k$,
$\sigma^{(k)}_{\bY} \defeq \min \{\sigma_{\min}(\bA) \colon \bA \subseteq \bY^{(k)}, \bA \in \RR^{d \times n'}, n' \le \tilde d_k\}$,

\item[(ii)] $\delta (1+2\sqrt{r_0}) <  \min_{1 \le r < r_0} \bar \sigma_{\bY,r} - C_{p,p_0}$,

\item[(iii)] $\min_{1\le r \le \tilde d_k} \sigma_{\bY, r} >  C_{p,p_0} + 2\delta \sqrt{\tilde d_{k}}$ and
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg8}
&M_i - C_{p,p_0} (1+\frac{1}{\min_{1\le r \le \tilde d_k} \sigma_{\bY, r} -  C_{p,p_0} - 2\delta \sqrt{\tilde d_{k}}}) \nonumber \\
&> \delta + \frac{2\delta}{\sigma_{\bX,r_0} - C_{p,p_0}},
\eals%%
for all $\by_i \in \cS_k$ and $1 \le k \le K$,

\item[(iv)] $\min_{1 \le r < r_0}  \bar \sigma_{\bY,r} > C_{p,p_0} + 2\delta \sqrt{r_0} + \delta$ and
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg10}
&\frac{\delta} {\min_{1 \le r < r_0}  \bar \sigma_{\bY,r} - C_{p,p_0} - 2\delta \sqrt{r_0} - \delta} \nonumber \\
&< 1-\frac{2\delta}{\sigma_{\bX,r_0} - C_{p,p_0}}.
\eals%%

\end{enumerate}

If $\tilde \lambda_{0} < \tilde \lambda < 1$,
where $\tilde \lambda_0 = \max\{\tilde \lambda_1, \tilde \lambda_2\}$ and
\bal
&\tilde \lambda_{1} = \inf\{0 < \tilde \lambda < 1 \colon \sqrt{1- \tilde \lambda} + \frac{2\delta}{ \sigma_{\tilde \bX,r_0} \sqrt{\tilde \lambda}}< \tilde M_{i,\delta}\}, \label{eq:noisy-dr-l0ssc-sdp-min-lambda-1}
\\
&\tilde \lambda_{2} = \inf\{0 < \tilde \lambda < 1 \colon \tilde \lambda - \frac{2\delta}{ \sigma_{\tilde \bX,r_0}} \frac{1}{\sqrt{\tilde \lambda}} > \tilde \mu_{r_0}\}, \label{eq:noisy-dr-l0ssc-sdp-min-lambda-2}
\eal%%
then with probability at least $1-6e^{-p}$, the subspace detection property holds for $\tilde \bx_i$ with $\tilde \bbeta^*$. Here $\tilde M_i$, $\tilde \mu_{r_0}$ and $\sigma_{\tilde \bX,r_0}$ are defined in (\ref{eq:tilde-Mi}), (\ref{eq:tilde-mu-r}) and (\ref{eq:tilde-sigma-X-r}) respectively.
\end{theorem}

\subsection{Very Sparse Random Projections}
\label{sec::osnap}
In this subsection, we study the case when the linear transformation $\bP$ for the dimensionality reduced $\ell^0$-SSC problem (\ref{eq:noisy-dr-l0ssc-i}) is a sparse matrix~\citep{Article:Charikar_2004,Article:Cormode_05,li2007very,Proc:Weinberger_ICML2009,Article:Gilbert_IEEE10,Proc:Li_NIPS11_Hashing,NelsonN13-OSNAP,li2022gcwsnet}. In particular, we choose $\bP$ such that each column of $\bP$ only has $1$ nonzero element, in a fashion known as ``count-sketch''~\citep{Article:Charikar_2004}. \citet{Proc:Weinberger_ICML2009} applied count-sketch as a dimension reduction tool for machine learning. The work of~\citet{Proc:Li_NIPS11_Hashing}, in addition to developing a hash learning algorithm based on minwise hashing, also provided a thorough theoretical analysis for count-sketch in the context of estimating inner products. The conclusion from~\citet{Proc:Li_NIPS11_Hashing}  is that,  to estimate inner products, we should use count-sketch (or very sparse random projections~\citep{li2007very})  instead of the original (dense) random projections, because count-sketch is not only computationally much more efficient but also (slightly) more accurate, as far as the task of similarity estimation is concerned.

Using those nice theoretical properties of count-sketch projections, we have the following theorem about the correctness of Noisy-DR-$\ell^{0}$-SSC when $\bP$ has only  $1$ nonzero element in each column. For brevity, we name such a projection matrix ``CSP''.

\begin{theorem}\label{theorem::noisy-dr-l0ssc-subspace-detection-osnap}
{\rm (Subspace detection property holds for Noisy-DR-$\ell^{0}$-SSC under deterministic model with $\bP$ being the CSP)}
Let nonzero vector $\bbeta^*$ be the optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^*$, $n_k \ge d_k+1$ for every $1\le k \le K$, and there exists $1 < r_0 \le \floor{\frac 1\lambda}$ such that $1 < r^* \le r_0$. Suppose $\bY$ is in general position, $\by_i \in \cS_k$ for some $1 \le k \le K$, $\delta < \min_{1 \le r < r_0} \bar \sigma_{\bY,r}$. Let $M_{i,\delta} \defeq M_i - \delta$, $\varepsilon$ be a positive number such that $0 < \varepsilon \le 1$. Suppose
\bal\label{eq:noisy-dr-l0ssc-sdp-M}
M_{i,\delta}  &>  \frac{2 (1+\varepsilon)^3 \delta}{\sigma_{\bX,r_0}},\\\label{eq:noisy-dr-l0ssc-sdp-mu}
\mu_{r,\varepsilon} \defeq \frac{\delta} {\frac{\min_{1 \le r' < r} \bar \sigma_{\bY,r'}}{(1+\varepsilon)^2} - \delta}& < 1-\frac{2(1+\varepsilon)^2\delta}{\sigma_{\bX,r_0}}.
\eal%%
Then if $\tilde \lambda_{0} < \tilde \lambda < 1$, where $\tilde \lambda_0 \defeq \max\{\lambda_1,\lambda_2\}$ and
\bal
&\lambda_{1} \defeq \inf\{0 < \tilde \lambda < 1 \colon \sqrt{1+\varepsilon-\tilde \lambda} + \frac{2\delta}{\sigma_{\tilde \bX,r_0} \sqrt{\tilde \lambda}}< M_{i,\delta}\}, \label{eq:noisy-dr-l0ssc-sdp-min-lambda-1}
\\
&\lambda_{2} \defeq \inf\{0 < \lambda < 1 \colon \lambda - \frac{2\delta}{\sigma_{\tilde \bX,r_0}} \frac{1}{\sqrt{\lambda}} > \tilde \mu_{r_0}\}, \label{eq:noisy-dr-l0ssc-sdp-min-lambda-2}
\eal%%
then with probability at least $1-K\delta'$ for all $\delta' \in (0,\frac 1K)$, the subspace detection property holds for $\tilde \bx_i$ with $\tilde \bbeta^*$. Here $\tilde \mu_{r_0}$ and $\sigma_{\tilde \bX,r_0}$ are defined in (\ref{eq:tilde-mu-r}) and (\ref{eq:tilde-sigma-X-r}) respectively. $\tilde \bbeta^*$ is the optimal solution to (\ref{eq:noisy-dr-l0ssc-i}) with $\bP$ being the CSP described in the beginning of this subsection with $p \ge \frac{d_{\max}^2+d_{\max}}{\delta' (2\varepsilon-\varepsilon^2)^2}$.
\end{theorem}
\begin{table*}[!hbt]
\centering
\caption{\small Clustering results on various data sets, with the best three results in bold.\vspace{-0.05in}}
\resizebox{1\linewidth}{!}{
\begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|}
  \hline
  Data Set

                              &Measure & KM    & SC     &Noisy SSC &Noisy DR-SSC       &SMCE    &SSC-OMP     &Noisy $\ell^{0}$-SSC   &Noisy-DR-$\ell^{0}$-SSC-LR &Noisy-DR-$\ell^{0}$-SSC-CSP\\\hline

  \multirow{2}{*}{COIL-20}    &AC      &0.6554 &0.4278  &0.7854    &0.7764 &0.7549  &0.3389      &\textbf{0.8472} $\pm$ 0.0031   &\textbf{0.8479} $\pm$ 0.0023 &\textbf{0.8472} $\pm$ 0.0019\\ \cline{2-11}
                              &NMI     &0.7630 &0.6217  &0.9148    &0.9219 &0.8754  &0.4853      &\textbf{0.9428} $\pm$ 0.0082   &\textbf{0.9433} $\pm$ 0.0063 &\textbf{0.9429} $\pm$ 0.0037\\ \hline
  \multirow{2}{*}{COIL-100}   &AC      &0.4996 &0.2835  &0.5275    &0.5013 &0.5639  &0.1667      &\textbf{0.7683} $\pm$ 0.0020  &\textbf{0.7039} $\pm$ 0.0087 &\textbf{0.7046} $\pm$ 0.0083\\ \cline{2-11}
                              &NMI     &0.7539 &0.5923  &0.8041    &0.8019 &0.8064  &0.3757      &\textbf{0.9182} $\pm$ 0.0096  &\textbf{0.8706} $\pm$ 0.0109 &\textbf{0.8708} $\pm$ 0.0117\\ \hline
  \multirow{2}{*}{Yale-B}     &AC      &0.0954 &0.1077  &0.7850    &0.7255 &0.3293  &0.7789      &\textbf{0.8480}  $\pm$ 0.0091 &\textbf{0.8231} $\pm$ 0.0173 &\textbf{0.8318} $\pm$ 0.0112\\ \cline{2-11}
                              &NMI     &0.1258 &0.1485  &0.7760    &0.7311 &0.3812  &0.7024      &\textbf{0.8612}  $\pm$ 0.0072  &\textbf{0.8533} $\pm$ 0.0294  &\textbf{0.8593} $\pm$ 0.0133\\ \hline

  \multirow{2}{*}{MPIE S$1$}
                           &AC      &0.1164 &0.1285  &0.5892    &0.3588 &0.1721  &0.1695      &\textbf{0.6741}$\pm$ 0.0413 &\textbf{0.6741}$\pm$ 0.0938 &\textbf{0.6744}$\pm$ 0.0662 \\ \cline{2-11}
                           &NMI     &0.5049 &0.5292  &0.7653    &0.6806 &0.5514  &0.3395      &\textbf{0.8622}$\pm$ 0.0533 &\textbf{0.8622}$\pm$ 0.0834 &\textbf{0.8548}$\pm$ 0.0931\\ \hline

  \multirow{2}{*}{MPIE S$2$}
                           &AC      &0.1315 &0.1410  &0.6994    &0.4611 &0.1898  &0.2093      &\textbf{0.7527}$\pm$ 0.0115 &\textbf{0.7533}$\pm$ 0.0596 &\textbf{0.7517}$\pm$ 0.0813 \\ \cline{2-11}
                           &NMI     &0.4834 &0.5128  &0.8149    &0.7086 &0.5293  &0.4292      &\textbf{0.8939}$\pm$ 0.0389 &\textbf{0.8926} $\pm$ 0.0742 &\textbf{0.8910} $\pm$ 0.0454\\ \hline

  \multirow{2}{*}{MPIE S$3$}
                           &AC      &0.1291 &0.1459  &0.6316    &0.4841 &0.1856  &0.1787      &\textbf{0.7050}$\pm$ 0.0277 &\textbf{0.7123}$\pm$ 0.0812 &\textbf{0.7184}$\pm$ 0.1045 \\ \cline{2-11}
                           &NMI     &0.4811 &0.5185  &0.7858    &0.7340 &0.5155  &0.3415      &\textbf{0.8750}$\pm$ 0.0157 &\textbf{0.8455}$\pm$ 0.0693 &\textbf{0.8457}$\pm$ 0.0913 \\ \hline

  \multirow{2}{*}{MPIE S$4$}
                           &AC      &0.1308 &0.1463  &0.6803    &0.5511 &0.1823  &0.1680      &\textbf{0.7246}$\pm$ 0.0147 &\textbf{0.7137}$\pm$ 0.0605 &\textbf{0.7250}$\pm$ 0.0443 \\ \cline{2-11}
                           &NMI     &0.4866 &0.5280  &0.8063    &0.7955 &0.5294  &0.3345      &\textbf{0.8837}$\pm$ 0.0212 &\textbf{0.8847}$\pm$ 0.0781 &\textbf{0.8834}$\pm$ 0.0517 \\ \hline

  \multirow{2}{*}{MNIST}      &AC      &0.5236  &0.3504  &0.5714   &0.5123 &\textbf{0.6542}   &0.5561       &0.6259 $\pm$ 0.0249   &\textbf{0.6296} $\pm$ 0.1522 &\textbf{0.6310} $\pm$ 0.1031 \\ \cline{2-11}
                              &NMI     &0.4770  &0.3607   &0.6091  &0.5026  &\textbf{0.6796}  &0.5986     &\textbf{0.6501} $\pm$ 0.0196   &0.6440 $\pm$ 0.0259  &\textbf{0.6497} $\pm$ 0.0313 \\ \hline

\end{tabular}
}
\label{table:results}\vspace{-0.05in}
\end{table*}

\subsection{The Algorithm of Noisy-DR-$\ell^{0}$-SSC}
We denote by Noisy-DR-$\ell^{0}$-SSC-LR the Noisy-DR-$\ell^{0}$-SSC with random projection induced by randomized low-rank approximation in Section~\ref{sec::low-rank-approx}, and denote by Noisy-DR-$\ell^{0}$-SSC-CSP the Noisy-DR-$\ell^{0}$-SSC with CSP serving as the random projection in Section~\ref{sec::osnap}.


\begin{algorithm}[h]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand\algorithmicensure {\textbf{Output:} }
\small
\caption{\small Noisy Dimensionality Reduced $\ell^{0}$-Sparse Subspace Clustering by Randomized Low-Rank Approximation (Noisy-DR-$\ell^{0}$-SSC-LR)}
\label{alg:noisy-dr-l0ssc-lr}
%\allowdisplaybreaks
\begin{algorithmic}[1]
\STATE{Generate a Gaussian random matrix $\bT \in \RR^{n \times p}$ where each element $\bT_{ij}$ is sampled independently according to the standard Gaussian distribution $\cN(0,1)$}
\STATE{Perform QR decomposition on $\bX \bT$, $\bX \bT = \bQ \bR$ where $\bQ \in \RR^{d \times p}$}
\STATE{Set the linear transformation $\bP = \bQ^{\top}$, and obtain the dimensionality reduced data $\tilde \bX = \bP \bX$}
\STATE{Perform noisy $\ell^{0}$-SSC on $\tilde \bX$ using Algorithm~\ref{alg:PGD-l0ssc}}
\end{algorithmic}
\end{algorithm}
Noisy-DR-$\ell^{0}$-SSC-LR is described by Algorithm~\ref{alg:noisy-dr-l0ssc-lr}. The algorithm of Noisy-DR-$\ell^{0}$-SSC-CSP is similar to Algorithm~\ref{alg:noisy-dr-l0ssc-lr} except that CSP serves as the random projection $\bP$. Algorithm~\ref{alg:PGD-l0ssc} in Section~\ref{sec::pgd-noisy-l0ssc} of the supplementary describes how to solve the noisy $\ell^0$-SSC problem~(\ref{eq:noisy-l0ssc-i}).

\vspace{-0.05in}
\section{Experiments}
\vspace{-0.05in}

In this section, we demonstrate the performance of Noisy-DR-$\ell^{0}$-SSC-LR and Noisy-DR-$\ell^{0}$-SSC-CSP, in comparison with other competing clustering methods including K-means (KM), Spectral Clustering (SC), noisy SSC, Sparse Manifold Clustering and Embedding (SMCE)~\citep{ElhamifarV11} and SSC-OMP~\citep{Dyer13a}. We will use Noisy-DR-$\ell^{0}$-SSC to refer to its two variants. With the coefficient matrix $\bZ$ obtained by the optimization of noisy $\ell^{0}$-SSC or Noisy-DR-$\ell^{0}$-SSC, a sparse similarity matrix is built as $\bW = \frac{|\bZ| + |\bZ^{\top}|}{2}$, and spectral clustering is performed on $\bW$ to obtain the clustering results. Two measures are used to evaluate the performance of different clustering methods, i.e., the Accuracy (AC) and the Normalized Mutual Information (NMI)~\citep{Zheng04}.

We use randomized rank-$p$ decomposition of the data matrix in Noisy-DR-$\ell^{0}$-SSC-LR with $p = \frac{\min\{d,n\}}{10}$. It can be observed that noisy $\ell^{0}$-SSC and Noisy-DR-$\ell^{0}$-SSC always achieve better performance than other methods in Table~\ref{table:results}, including the noisy SSC on dimensionality reduced data (Noisy DR-SSC)~\citep{Wang2015-dr-l1ssc}. Note that noisy $\ell^{0}$-SSC has the same performance as $\ell^{0}$-SSC~\citep{YangFJYH16-L0SSC-ijcv}. Throughout all the experiments we find that the best clustering accuracy is achieved whenever $\lambda$ is chosen such that $0.5 < \lambda < 0.95$, justifying our theoretical finding claimed in Remark~\ref{remark::lambda} and (\ref{eq:noisy-l0ssc-sdp-lambda}) in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda}. For all the methods that involve random projection, we conduct the experiments $30$ times and report the average performance. Note that the cluster accuracy of SSC-OMP on the extended Yale-B data set is reported according to~\citet{YouRV16-OMP}. We randomly sample $1000$ images from each class of the MNIST data set so as to collect a total number of $10000$ images on which clustering is performed, and the average performance over $10$ random samplings is reported for this data set. The time complexities of noisy $\ell^{0}$-SSC and the two variants of Noisy-DR-$\ell^{0}$-SSC are analyzed in Section~\ref{sec::time-complexity} of the supplementary. The actual running time of both algorithms confirms such time complexity, and we observe that Noisy-DR-$\ell^{0}$-SSC-LR is always $8.7$ times faster than noisy $\ell^{0}$-SSC with the same number of iterations, and the acceleration is boosted to $9.6$ times by Noisy-DR-$\ell^{0}$-SSC-CSP
due to sparse random projections.

\vspace{0.05in}

%\subsection{Justification of Theoretical Analysis for noisy $\ell^{0}$-SSC}
We further demonstrate the practical implication of our theoretical analysis for noisy $\ell^{0}$-SSC. As mentioned in Remark~\ref{remark::lambda}, a relatively large $\lambda$ tends to preserve the subspace detection property. This theoretical finding is consistent with the empirical study shown in this section. We add Gaussian noise of zero mean and different choices of variance $\sigma^2$ to the extended Yale-B data set. In Section~\ref{sec::additional-experiments} of the supplementary, Figure~\ref{fig:yaleb-noisy-level-1} to Figure~\ref{fig:yaleb-noisy-level-6} illustrate SDP violation with respect to $\lambda$ for different noise levels with $\sigma^2$ ranging over $10,20,30,40,50,60$. The SDP violation, as defined by~\citet{WangX13}, is the percentage of pairs of data points which are mistakenly put in the same subspace by the similarity matrix $\bW$, namely the percentage of pairs $(\bx_i,\bx_j)$ with nonzero $\bW_{ij}$ while they are in fact not in the same subspace. We observe that increasing $\lambda$ effectively reduces SDP violation for noisy $\ell^{0}$-SSC, Noisy-DR-$\ell^{0}$-SSC-LR and Noisy-DR-$\ell^{0}$-SSC-CSP, confirming our theoretical prediction.
%\begin{figure}[!hbt]
%\begin{center}
%\includegraphics[width=0.2\textwidth]{noisy-level/yaleb-noise-1.pdf}
%\end{center}
%   \caption{SDP violation with respect to the variance $\sigma^2=10$ on the extended Yale-B data set. }
%\label{fig:sdp-violation-sigma20}
%\end{figure}


\vspace{-0.06in}
\section{Conclusion}
\vspace{-0.06in}


In this paper, we prove that noisy $\ell^{0}$-SSC recovers subspaces from noisy data through $\ell^{0}$-induced sparsity. Our results for the first time reveal the theoretical advantage of noisy $\ell^{0}$-SSC over its $\ell^1$ counterpart and other competing subspace clustering methods in terms of a much less restrictive condition on the subspace affinity, when the size of data grows exponentially in the subspace dimension. We then propose Noisy-DR-$\ell^{0}$-SSC to improve the efficiency of noisy $\ell^{0}$-SSC, which performs noisy $\ell^{0}$-SSC on dimensionality reduced data and still provably recovers the underlying subspaces. Experiments corroborate our theoretical findings on the robustness of noisy $\ell^{0}$-SSC against noise as well as the effectiveness of Noisy-DR-$\ell^{0}$-SSC.

%\newpage\clearpage


\iffalse
\section{Proofs}
\begin{proof}[\textbf{Proof of Theorem~\ref{theorem::l0ssc-deterministic}}]
Let $\bx_i \in \cS_k$. Note that ${\bZ^*}^{i}$ is an optimal solution to the following $\ell^{0}$ sparse representation problem
\bal\label{eq:ssc-l0-i}
\mathop {\min }\limits_{{\bZ^i}} {\| {{\bZ^i}} \|_0}\quad s.t.\;{ \bx_i} = {{[ \bX^{(k)}\setminus  \bx_i \quad  \bX^{(-k)}]}}{\bZ^i},\,\, \bZ_{ii} = 0,
\eal%%
where $\bX^{(-k)}$ denotes the data that lie in all subspaces except $\cS_k$. Let ${\bZ^*}^{i} = \left[ {\begin{array}{*{20}{c}}
\balpha\\
\bbeta
\end{array}} \right]$ where $\balpha$ and $\bbeta$ are sparse codes corresponding to $ \bX^{(k)}\setminus  \bx_i$ and $ \bX^{(-k)}$ respectively.

Suppose $\bbeta \neq \bzero$, then $ \bx_i$ belongs to a subspace $\cS^{'} = \bH_{ \bX_{{\bZ^*}^i}}$ spanned by the projected data points corresponding to nonzero elements of  ${\bZ^*}^{i}$, and $\cS^{'} \neq  \cS_k$, ${\rm dim}[\cS^{'}] \le  d_k$. To see this, if $\cS^{'} = \cS_k$, then the data corresponding to nonzero elements of $\bbeta$ belong to $ \cS_k$, which is contrary to the definition of $\bX^{(-k)}$. Also, if ${\rm dim}[\cS^{'}] >  d_k$, then any $ d_k$ points in $ \bX^{(k)}$ can be used to linearly represent $ \bx_i$ by the condition of general position, contradicting the optimality of ${\bZ^*}^{i}$.

Since the data points (or columns) in $ \bX_{{\bZ^*}^i}$ are linearly independent, it follows that $\bx_i$ lies in an external subspace $\bH_{\bX_{{\bZ^*}^i}}$ spanned by linearly independent points in $\bX_{{\bZ^*}^i}$, and ${\rm dim}[\bH_{\bX_{{\bZ^*}^i}}] = {\rm dim}[\cS^{'}] \le  d_k$. This contradicts the assumption that $\bx_i$ is away from the external subspaces. Therefore, $\bbeta = \bzero$. Performing the above analysis for all $1 \le i \le n$, we can prove that the subspace detection property holds for all $1 \le i \le n$.

\end{proof}

\begin{proof}[\textbf{Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda}}]
One can check that the conditions in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection} hold when (\ref{eq:noisy-l0ssc-sdp-M}) and (\ref{eq:noisy-l0ssc-sdp-mu}) hold and $\lambda$ is chosen according to (\ref{eq:noisy-l0ssc-sdp-lambda}).
\end{proof}


\begin{proof}[\textbf{Proof of Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection}}]
For any matrix $\bA \in \RR^{p \times q}$, we first show that left-multiplying $\bA$ by $\bQ$ does not change its spectrum. To see this, let the singular value decomposition of $\bA$ be $\bA = \bU_{\bA} \bSigma \bV_{\bA}^{\top}$ where $\bU_{\bA}$ and $\bV_{\bA}$ have orthonormal columns with $\bU_{\bA}^{\top}\bU_{\bA} = \bV_{\bA}^{\top}\bV_{\bA} = \bI$. Then $\bQ\bA = \bU_{\bQ\bA} \bSigma \bV_{\bQ\bA}^{\top}$ is the singular value decomposition of $\bQ\bA$ with $\bU_{\bQ\bA} = \bQ\bU_{\bA}$ and $\bV_{\bQ\bA} = \bV_{\bA}$. This is because the columns of $\bU_{\bQ\bA}$ are orthonormal since the columns of $\bQ$ are orthonormal: $\bU_{\bQ\bA}^{\top} \bU_{\bQ\bA} = \bU_{\bA}^{\top}\bQ^{\top} \bQ\bU_{\bA} = \bI$, and $\bSigma$ is a diagonal matrix with nonnegative diagonal elements. It follows that $\sigma_{\min}(\bQ\bA) = \sigma_{\min}(\bA)$ for any $\bA \in \RR^{p \times q}$.

For a point $\bx_i = \by_i + \bn_i$, after projection via $\bP$, we have the projected noise $\tilde \bn_i = \bP \bn_i$. Because
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg1}
&\|\tilde \bn_i\|_2 = \|\bP \bn_i \|_2 = \|\bQ^{\top} \bn_i \|_2 \le \|\bQ\|_2 \|\bn_i\|_2 \le \|\bn_i\|_2 \le \delta,
\eal%%
the magnitude of the noise in the projected data is also bounded by $\delta$. Also,
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg1b}
&\|\tilde \bx_i\|_2 = \|\bQ^{\top} \bx_i \|_2 \le  \|\bx_i\|_2 \le 1.
\eal%%
%\le \tilde d_{\max} = \max_{k=1,\ldots, K} d_k and ${\rm rank}(\bY_{\bbeta}) = r$
Let $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r$, and let $\tilde \bY_{\bbeta} = \bP \bY_{\bbeta}$. Then $\sigma_{\min}(\bQ \tilde \bY_{\bbeta}) = \sigma_{\min}(\tilde \bY_{\bbeta})$. We have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg2}
&|\sigma_{\min}(\tilde \bY_{\bbeta}) - \sigma_{\min}(\bY_{\bbeta})| = |\sigma_{\min}(\bQ \tilde \bY_{\bbeta}) - \sigma_{\min}(\bY_{\bbeta})| \nonumber \\
&\le \|\bQ \tilde \bY_{\bbeta} - \bY_{\bbeta}\|_2 \nonumber \\
& = \|\bQ \bQ^{\top}\bY_{\bbeta}  - \bY_{\bbeta}\|_2 \nonumber \\
&= \|\bQ \bQ^{\top}\bX_{\bbeta}  - \bX_{\bbeta} + \bN_{\bbeta} - \bQ \bQ^{\top} \bN_{\bbeta} \|_2 \nonumber \\
&\le C_{p,p_0} + \|\bN_{\bbeta} \|_F + \|\bQ \bQ^{\top} \bN_{\bbeta} \|_F \nonumber \\
&\le C_{p,p_0} + 2\delta \sqrt{r}.
\eal%%

Let $\sigma^{(k)}_{\bY} = \min \{\sigma_{\min}(\bA) \colon \bA \subseteq \bY^{(k)}, \bA \in \RR^{d \times n'}, n' \le \tilde d_k\}$. Therefore, it follows from (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg2}) that if
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg3}
&C_{p,p_0} + 2\delta \sqrt{\tilde d_{\max}} < \min_{k = 1,\ldots,K} \sigma^{(k)}_{\bY},
\eal%%
where $\tilde d_{\max} = \max_{k=1,\ldots, K} \tilde d_k$, then $\tilde \bY$ is also in general position.

In addition, since $\lambda \ge \frac{1}{r_0}$, we have $\lambda \|\tilde \bbeta^*\|_0 \le L(\bzero) \le 1$, and it follows that $\|\tilde \bbeta^*\|_0 \le \frac{1}{\lambda} \le r_0$.

%For $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r < r_0$, define
%\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg4}
%&\tilde {\bar \sigma}_{\tilde \bY,r} \defeq \min_{\bbeta: \|\bbeta\|_0 = r, {\rm rank}(\tilde \bY_{\bbeta}) = \|\bbeta\|_0} \sigma_{\min}(\tilde \bY_{\bbeta})
%\eal%%

Based on (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg3}) we have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg4}
&|\tilde {\bar \sigma}_{\tilde \bY,r} - {\bar \sigma}_{\bY,r}| \le C_{p,p_0} + 2\delta \sqrt{r_0}.
\eal%%

It follows that $\delta < \min_{1 \le r < r_0} \tilde {\bar \sigma}_{\tilde \bY,r}$ because $\delta < \min_{1 \le r < r_0} \bar \sigma_{\bY,r} - C_{p,p_0} - 2\delta \sqrt{r_0}$.

Again, for $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r \le r_0$, we have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg5}
&|\sigma_{\min}(\tilde \bX_{\bbeta}) - \sigma_{\min}(\bX_{\bbeta})| = |\sigma_{\min}(\bQ \tilde \bX_{\bbeta}) - \sigma_{\min}(\bX_{\bbeta})| \nonumber \\
&\le \|\bQ \tilde \bX_{\bbeta} - \bX_{\bbeta}\|_2 \nonumber \\
& = \|\bQ \bQ^{\top}\bX_{\bbeta}  - \bX_{\bbeta}\|_2  = \|\hat \bX - \bX_{\bbeta}\|_2 \nonumber \\
&\le C_{p,p_0}.
\eal%%

%Define
%\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg6}
%&\tilde \sigma_{\tilde \bX,r} \defeq \min\{\sigma_{\min}(\tilde \bX_{\bbeta}) \colon 1 \le \|\bbeta\|_0 \le r\}
%\eal%%

It can be verified that
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg7}
&|\tilde \sigma_{\tilde \bX,r} - \sigma_{\bX,r}| \le C_{p,p_0}.
\eal%%

Combining (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg7}) and Lemma~\ref{lemma::perturbation-distance-to-subspace-projection}, and noting that $\sigma_{\bX,r_0} - C_{p,p_0} > 0$, since
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg8}
&M_i - C_{p,p_0} (1+\frac{1}{\min_{1\le r \le \tilde d_k} \sigma_{\bY, r} - C_{p,p_0} - 2\delta \sqrt{\tilde d_{k}} }) \nonumber \\
&> \delta + \frac{2\delta}{\sigma_{\bX,r_0} -C_{p,p_0} },
\eal%%
we have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg9}
&\tilde M_{i, \tilde \delta} \defeq \tilde M_i - \delta  > \frac{2\delta}{\tilde \sigma_{\tilde \bX,r_0}},
\eal%%
where $\by_i \in \cS_k$.

%Moreover, define
%\bal\label{eq:tilde-mu-r}
%&\tilde \mu_{r} \defeq \frac{\delta} {\min_{1 \le r < r_0} \tilde {\bar \sigma}_{\bY,r} - \delta}.
%\eal%%

Based on (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg4}) and (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg7}), we have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg10}
&\tilde \mu_{r_0} < 1-\frac{2\delta}{\tilde \sigma_{\tilde \bX,r_0}},
\eal%%
because
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg11}
&\frac{\delta} {\min_{1 \le r < r_0}  \bar \sigma_{\bY,r} - C_{p,p_0} - 2\delta \sqrt{r_0} - \delta} < 1-\frac{2\delta}{\sigma_{\bX,r_0} - C_{p,p_0}}.
\eal%%

\end{proof}
\fi


\bibliography{Yang_517}
%\bibliographystyle{uai2022-template}






\end{document}
