%
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography


\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath} \allowdisplaybreaks[4]
% For algorithms
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{multirow}
%\usepackage{subfigure}
\usepackage{caption,subcaption}
\usepackage{mathrsfs}
\usepackage{bbm}
\usepackage{bm}
\usepackage{eufrak}
\usepackage{bookmark}
\usepackage{mathtools}
\usepackage{hyperref}       % hyperlinks
%\usepackage{subfigure}
\usepackage{enumitem}
\hypersetup{colorlinks=true,citecolor=blue,linkcolor=blue}
% \circled{<x>}: typesets <x> inside a circle at \small size; used to tag
% relations in displayed math, e.g. \stackrel{\circled{1}}{\ge}.
% \small is a declaration, not a one-argument command, so it is placed inside
% the \raisebox: the box content is typeset in text (LR) mode, which keeps the
% size change local to the macro and avoids the "Command \small invalid in
% math mode" warning when \circled is used within a formula.
\newcommand{\circled}[1]{\raisebox{.6pt}{\small\textcircled{\raisebox{-.8pt}{#1}}}}

\usepackage{xr}
\externaldocument{Yang_517}
\usepackage{balance}

%\graphicspath{{illustrations/}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \blfootnote{<text>}: places <text> in a footnote whose marker is empty.
% \thefootnote is redefined to be empty inside a group, so the redefinition
% does not outlive this macro; the footnote counter is then decremented by one
% so that this footnote does not consume a number.
\newcommand\blfootnote[1]{%
  \begingroup
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \endgroup
}
% \swap[<sep>]{<a>}{<b>}: outputs <b><sep><a>, i.e. the two mandatory arguments
% in reverse order around the optional separator (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \aff{<arg>}: typesets an upright "aff" followed by <arg> in auto-sized
% parentheses. Uses \left/\right, so it must be used in math mode.
% NOTE(review): presumably denotes the affine hull -- confirm against the paper.
\def\aff#1{\textup {aff}\left(#1\right)}

\title{Noisy $\ell^{0}$-Sparse Subspace Clustering on Dimensionality Reduced Data Supplementary Material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author{ \hspace{0.1in} Yingzhen Yang \hspace{2.5in} Ping Li \\
\hspace{-.4in} School of Computing and Augmented Intelligence \hspace{1in} Cognitive Computing Lab \\
\hspace{0.2in} Arizona State University \hspace{1.9in} Baidu Research \\
\hspace{0.6in} 699 S Mill Ave. Tempe, AZ 85281, USA \hspace{0.8in} 10900 NE 8th ST. Bellevue, WA 98004, USA \\
\hspace{0.3in} \texttt{yingzhen.yang@asu.edu} \hspace{1.6in} \texttt{pingli98@gmail.com}
}




\input{symbol_tit.tex}
\begin{document}

\maketitle



\section{Proofs}
We provide proofs of the lemmas and theorems of the paper in this section.

%\bal\label{eq:l0ssc}%
%\mathop {\min }\limits_{{\bZ}} {\| {{\bZ}} \|_0}\quad s.t.\;{\bX} = {{\bX}}{\bZ},\,\, {\rm diag}(\bZ) = \bzero,
%\eal%%
%\bal\label{eq:noisy-l0ssc-i}
%&\mathop {\min }\limits_{{\bbeta \in \RR^n,\bbeta_i = 0}} L(\bbeta) = {\|\bx_i - \bX \bbeta\|_2^2 + {\lambda}\|{\bbeta}\|_0}.
%\eal%%

\subsection{Lemma~\ref{lemma::l0ssc-deterministic} and Its Proof}
\begin{lemma}\label{lemma::l0ssc-deterministic}
{\rm (Subspace detection property holds for noiseless $\ell^{0}$-SSC under the deterministic model)}
%i.e. $\cS_k \neq \cS_{k'}$ for $k \neq k'$.
It can be verified that the following statement is true. Under the deterministic model, suppose data is noiseless, $n_k \ge d_k+1$, $\bY^{(k)}$ is in general position. If all the data points in $\bY^{(k)}$ are away from the external subspaces for any $1 \le k \le K$, then the subspace detection property for $\ell^{0}$-SSC holds with an optimal solution $\bZ^*$ to (\ref{eq:l0ssc}).
\end{lemma}
\begin{proof}
%We have ${\rm dim}[H] \le d_k$ for any $H \in \cH_k$ since $H \subseteq \cS_k$. Suppose ${\rm dim}[H] = d_k$, then $H = \cS_k$. Also, since $H = \cS_k \cap H'$ for some inter-subspace subspace $H'$ such that ${\rm dim}[H'] \le d_k$, we must have $H' = \bH_{\{\bx_{i_j}\}} = \cS_k$ for some linear independent set $\{\bx_{i_j}\}$ that does not lie in any subspace of $\{\cS_k\}$, which is a contradiction. Therefore, ${\rm dim}[H] < d_k$ for any $H \in \cH_k$, $1 \le k \le K$.
Let $\bx_i \in \cS_k$. Note that ${\bZ^*}^{i}$ is an optimal solution to the following $\ell^{0}$ sparse representation problem
\bals%\label{eq:ssc-l0-i}
\mathop {\min }\limits_{{\bZ^i}} {\| {{\bZ^i}} \|_0}\quad s.t.\;{ \bx_i} = {{[ \bX^{(k)}\setminus  \bx_i \quad  \bX^{(-k)}]}}{\bZ^i},\,\, \bZ_{ii} = 0,
\eals%%
where $\bX^{(-k)}$ denotes the data that lie in all subspaces except $\cS_k$. Let ${\bZ^*}^{i} = \left[ {\begin{array}{*{20}{c}}
\balpha\\
\bbeta
\end{array}} \right]$ where $\balpha$ and $\bbeta$ are sparse codes corresponding to $ \bX^{(k)}\setminus  \bx_i$ and $ \bX^{(-k)}$ respectively.

Suppose $\bbeta \neq \bzero$. Then $ \bx_i$ belongs to a subspace $\cS^{'} = \bH_{ \bX_{{\bZ^*}^i}}$ spanned by the projected data points corresponding to nonzero elements of  ${\bZ^*}^{i}$, and $\cS^{'} \neq  \cS_k$, ${\rm dim}[\cS^{'}] \le  d_k$. To see this, if $\cS^{'} = \cS_k$, then the data corresponding to nonzero elements of $\bbeta$ belong to $ \cS_k$, which is contrary to the definition of $\bX^{(-k)}$. Also, if ${\rm dim}[\cS^{'}] >  d_k$, then any $ d_k$ points in $ \bX^{(k)}$ can be used to linearly represent $ \bx_i$ by the condition of general position, contradicting the optimality of ${\bZ^*}^{i}$.
%Let $\cS^{''} = \cS^{'} \cap  \cS_k$, then ${\rm dim}[\cS^{''}] \le  d_k$. If ${\rm dim}[\cS^{''}] <  d_k$, then it can be verified that $\bP^{(-1)}(\cS^{''})$ is a subspace in $\cS_k$ and ${\rm dim}[\bP^{(-1)}(\cS^{''})] < d_k$. It follows that $ \bx_i \in \bP^{(-1)}(\cS^{''})$
Since the data points (or columns) in $ \bX_{{\bZ^*}^i}$ are linearly independent, it follows that $\bx_i$ lies in an external subspace $\bH_{\bX_{{\bZ^*}^i}}$ spanned by linearly independent points in $\bX_{{\bZ^*}^i}$, and ${\rm dim}[\bH_{\bX_{{\bZ^*}^i}}] = {\rm dim}[\cS^{'}] \le  d_k$. This contradicts the assumption that $\bx_i$ is away from the external subspaces. Therefore, $\bbeta = \bzero$. Performing the above analysis for every $1 \le i \le n$, we conclude that the subspace detection property holds for all $1 \le i \le n$.

\end{proof}

\subsection{Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection}}
%\begin{theorem}\label{theorem::noisy-l0ssc-subspace-detection}
%{\rm (Subspace detection property holds for noisy $\ell^{0}$-SSC)}
%Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$, and $c^* \defeq \|\bx_i - \bX \bbeta^*\|_2$. Suppose $\bY$ is in general position, $\by_i \in \cS_k$ for some $1 \le k \le K$, $\delta < \bar \sigma_{\bY}^*$, $\lambda > \tau_0$, $\bB(\by_i, \delta+c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}) \cap \bH = \emptyset$ for any $\bH \in \cH_{\by_i, d_k}$. Then the subspace detection property holds for $\bx_i$ with $\bbeta^*$. Here $\tau_0$, $\tau_1$, $\bar \sigma_{\bY}^*$ and $\sigma_{\bX}^*$ are defined in Lemma~\ref{lemma::equivalence-noisy-l0ssc}.
%\end{theorem}

%\begin{definition}\label{def::subspace-separation-margin}
%For a data point $\bx \in \bX$ and a vector $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r > 1$, the subspace separation margin, denoted by ${\rm HS}(\bx,\bX,\bbeta)$, is defined below as minimum difference between the distance of $\bx$ to any subspace spanned by linearly independent subset (columns) of $\bX$ of size less than $r$ and that to the subspace spanned by $\bX_{\bbeta}$, namely
%\bal\label{eq:subspace-separation-margin}
%&{\rm HS}(\bx,\bX,\bbeta) = \min_{\bbeta': \|\bbeta'\|_0 < r, {\rm rank}(\bX_{\bbeta'}) = \|\bbeta'\|_0} d(\bx,\bH_{\bX_{\bbeta'}}) \nonumber \\
%&- d(\bx,\bH_{\bX_{\bbeta}}).
%\eal%%
%Similarly, the subspace separation margin for $\by \in \bY$ is defined as
%\bal\label{eq:subspace-separation-margin-Y}
%&{\rm HS}(\by,\bY,\bbeta) = \min_{\bbeta': \|\bbeta'\|_0 < r, {\rm rank}(\bY_{\bbeta'}) = \|\bbeta'\|_0} d(\by,\bH_{\bY_{\bbeta'}}) \nonumber \\
%&- d(\by,\bH_{\bY_{\bbeta}}).
%\eal%%
%\end{definition}
Before proving this theorem, we introduce the following perturbation bound for the distance between a data point and the subspaces spanned by noisy and noiseless data, which is useful to establish the conditions when the subspace detection property holds for noisy $\ell^{0}$-SSC.

\begin{lemma}\label{lemma::perturbation-distance-to-subspace}
Let $\bbeta \in \RR^n$ be such that $\bY_{\bbeta}$ has full column rank. Suppose $\delta < \bar \sigma_{\bY,r}$ where $r = \|\bbeta\|_0$. Then $\bX_{\bbeta}$ is a full column rank matrix, and
\bal\label{eq:perturbation-distance-to-subspace}
|d(\bx_i, \bH_{\bX_{\bbeta}}) - d(\bx_i, \bH_{\bY_{\bbeta}}) | \le \frac{\delta} {\bar \sigma_{\bY,r} - \delta}
\eal%%
for any $1 \le i \le n$.
\end{lemma}


Lemma~\ref{lemma::equivalence-noisy-l0ssc} shows that an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) is also an optimal solution to an $\ell^{0}$-minimization problem with tolerance to noise.

\begin{lemma}\label{lemma::equivalence-noisy-l0ssc}
%then the subspace separation margin ${\rm HS}(\bx_i,\bX,\bbeta^*) > 0$
Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$. If $\lambda > \tau_0$ where $\tau_0$ is defined as
%${\rm HS}(\bx_i,\bX,\bbeta^*) > \tau_0$
\begin{align*}
&\tau_0 \defeq \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*} + \tau_1,
\end{align*}%
where
\begin{align*}%\label{eq:equivalence-noisy-l0ssc-tau1}
&\tau_1 \defeq \frac{\delta} {\bar \sigma_{\bY}^* - \delta}, \quad \sigma_{\bX}^* \defeq \sigma_{\min}(\bX_{\bbeta^*}),
\end{align*}%
with $\delta < \bar \sigma_{\bY}^*$, and $\bar \sigma_{\bY}^*$ is defined as
\begin{align*}%\label{eq:l0ssc-bar-sigma-star}
&\bar \sigma_{\bY}^* \defeq \min_{r \in [r^*]} \bar \sigma_{\bY,r},
\end{align*}%
then $\bbeta^*$ is an optimal solution to the following sparse approximation problem with the uncorrupted data as the dictionary:
\bal\label{eq:equivalence-noisy-l0ssc-2}
&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\|{\bx_i} - {{\bY}}{\bbeta}\|_2 \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*},  \,\, \bbeta_i = 0,
\eal%%
where $c^* \defeq \|\bx_i - \bX \bbeta^*\|_2$.
\end{lemma}
%For $\by_i \in \cS_k$, suppose $d(\bx_i, \cS_k) \le c_0$ for some positive number $c_0$, and let $\cS_{c_0}$ be the set of all subspaces whose distance to $\bx_i$ is less or equal to $c_0$, i.e. $\cS_{c_0} = \{\cS \in \{\cS_k\}_{k=1}^K \colon d(\bx_i, \cS) \le c_0\}$.


Now we are ready to prove Theorem~\ref{theorem::noisy-l0ssc-subspace-detection}.

\begin{proof}[\textup{\bf Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection}}]
%We first prove that ${\rm HS}(\bx_i,\bX,\bbeta^*) > \tau_0$ if ${\rm HS}(\by_i,\bY,\bbeta^*) > \tau_0+2\tau_1+2\delta$. To see this, let $\bbeta' \in \RR^n$, $\|\bbeta'\|_0 = r \le r^*$, and $\bX_{\bbeta'}$ has full column rank. According to Lemma~\ref{lemma::perturbation-distance-to-subspace}, we have
%\bal\label{eq:noisy-l0ssc-subspace-detection-seg1}
%|d(\bx, \bH_{\bX_{\bbeta'}}) - d(\bx, \bH_{\bY_{\bbeta'}}) | \le \frac{\delta} {\bar \sigma_{\bY,r} - \delta}
%\eal%%
%where $\bx \in \bX$, and $\by = \bx - \bn$ is its corresponding clean datum. Then we have
%\bal\label{eq:noisy-l0ssc-subspace-detection-seg2}
%|d(\bx, \bH_{\bY_{\bbeta'}}) - d(\by, \bH_{\bY_{\bbeta'}}) | \le \|\bx - \bn\|_2 \le \delta
%\eal%%
%According to (\ref{eq:noisy-l0ssc-subspace-detection-seg1}) and (\ref{eq:noisy-l0ssc-subspace-detection-seg2}), we have
%\bal\label{eq:noisy-l0ssc-subspace-detection-seg3}
%|d(\bx, \bH_{\bX_{\bbeta'}}) - d(\by, \bH_{\bY_{\bbeta'}}) | \le \frac{\delta} {\bar \sigma_{\bY,r} - \delta} + \delta
%\eal%%
%
%Now let $\bbeta' = \bbeta^*$ and $\bbeta' = \bbeta$ with $\bbeta \in \RR^n$, $\|\bbeta\|_0 = r < r^*$, and $\bX_{\bbeta}$ has full column rank. It follows from (\ref{eq:noisy-l0ssc-subspace-detection-seg3}) that
%\bal\label{eq:noisy-l0ssc-subspace-detection-seg4}
%&|\big( d(\bx, \bH_{\bX_{\bbeta}}) - d(\bx, \bH_{\bX_{\bbeta^*}}) \big) - \big( d(\by, \bH_{\bY_{\bbeta}}) - d(\by, \bH_{\bY_{\bbeta^*}}) \big) | \nonumber \\
%&\le 2 (\frac{\delta} {\bar \sigma_{\bY,r} - \delta} + \delta) \le 2 (\frac{\delta} {\bar \sigma_{\bY}^* - \delta} + \delta) = 2\tau_1 + 2\delta
%\eal%%
%
%It follows that
%\bal\label{eq:noisy-l0ssc-subspace-detection-seg5}
%&|\big( d(\bx, \bH_{\bX_{\bbeta}}) - d(\bx, \bH_{\bX_{\bbeta^*}}) \big)| \ge {\rm HS}(\by,\bY,\bbeta^*) - 2\tau_1 - 2\delta \nonumber \\
%&> \tau_0+2\tau_1+2\delta - 2\tau_1 - 2\delta = \tau_0
%\eal%%

We first show that $d(\bx_i, \cS_k) \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$. To see this, $\sigma_{\bX}^* = \sigma_{\min}(\bX_{\bbeta^*}) \le 1$ as the columns of $\bX$ have unit $\ell^{2}$-norm. It follows that
\bals%\label{eq:noisy-l0ssc-subspace-detection-seg1}
&c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*} \ge 2 \delta \sqrt{r^*} \ge 2 \delta > \|\bx_i - \by_i\| \ge d(\bx_i, \cS_k).
\eals%%

By Lemma~\ref{lemma::equivalence-noisy-l0ssc}, it can be verified that $\bbeta^*$ is an optimal solution to the following problem
\bal\label{eq:noisy-l0ssc-subspace-detection-seg2}
&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\|{\bx_i} - {{\bY}}{\bbeta}\|_2 \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*},  \,\, \bbeta_i = 0.
\eal%
Let $\bx'$ be the projection of $\bx_i$ onto $\bH_{\overbar {\bY^{(i)}}}$, and let the columns of $\overbar {\bY^{(i)}}$ have column indices $\bI$ in $\bY^{(k)}$, that is, $\bY_{\bI}^{(k)} = \overbar {\bY^{(i)}}$. Then there exists $\bbeta' \in \RR^n$ with $\bbeta'_j = 0$ for all $j \notin \bI$ such that $\bx' = \bY \bbeta'$ and $\norm{\bbeta'}{0} \le r^*$. It is clear that $\bbeta'$ is a feasible solution to (\ref{eq:noisy-l0ssc-subspace-detection-seg2})  because $d(\bx_i,\bH_{\overbar {\bY^{(i)}}}) = \ltwonorm{\bx_i - \bY \bbeta'} \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$; moreover, $\bbeta'$ satisfies SDP for $\bx_i$.

Suppose that there is an optimal solution $\bbeta''$ to (\ref{eq:noisy-l0ssc-subspace-detection-seg2}) which does not satisfy SDP for $\bx_i$; then $\norm{\bbeta''}{0} \le r^*$. The subspace spanned by $\bY_{\bbeta''}$, $\bH_{\bY_{\bbeta''}}$, is then an external subspace of $\by_i$ and $\bH_{\bY_{\bbeta''}} \in \cH_{\by_i,r^*}$, and it follows that $d(\bx_i,\bH_{\bY_{\bbeta''}}) > c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$. However, since $\bbeta''$ is a feasible solution, $d(\bx_i,\bH_{\bY_{\bbeta''}}) \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$. This contradiction shows that every optimal solution to (\ref{eq:noisy-l0ssc-subspace-detection-seg2}) satisfies SDP for $\bx_i$. Since, by Lemma~\ref{lemma::equivalence-noisy-l0ssc}, the optimal solution $\bbeta^*$ to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) is also an optimal solution to (\ref{eq:noisy-l0ssc-subspace-detection-seg2}), $\bbeta^*$ satisfies SDP for $\bx_i$.

\end{proof}

\subsection{Proof of Lemma~\ref{lemma::perturbation-distance-to-subspace}}

The following lemma is used for proving Lemma~\ref{lemma::perturbation-distance-to-subspace}.

\begin{lemma}\label{lemma::perturbation-distance-to-hyperplane}
{\rm (Perturbation of distance to subspaces)}
Let $\bA$, $\bB \in \RR^{m \times n}$ be two matrices with ${\rm rank}(\bA) = r$ and ${\rm rank}(\bB) = s$. Also, let $\bE = \bA-\bB$ and $\|\bE\|_2 \le C$, where $\|\cdot\|_2$ indicates the spectral norm. Then for any point $\bx \in \RR^m$, the difference between the distances of $\bx$ to the column spaces of $\bA$ and $\bB$, i.e., $|d(\bx, \bH_{\bA}) - d(\bx, \bH_{\bB})|$, is bounded by
\bals%\label{eq:perturbation-distance-to-hyperplane}
|d(\bx, \bH_{\bA}) - d(\bx, \bH_{\bB}) | \le \frac{C \|\bx\|_2 }{ \min\{\sigma_{r}(\bA),\sigma_{s}(\bB)\} }.
\eals%%
\end{lemma}
\begin{proof}%[\textbf{{Proof of Lemma~\ref{lemma::perturbation-distance-to-hyperplane}}}]
Note that the projection of $\bx$ onto the subspace $\bH_{\bA}$ is $\bA \bA^{+} \bx$ where $\bA^{+}$ is the Moore--Penrose pseudo-inverse of the matrix $\bA$, so $d(\bx, \bH_{\bA})$ equals the distance between $\bx$ and its projection, namely $d(\bx, \bH_{\bA}) = \|\bx - \bA \bA^{+} \bx\|_2$. Similarly,
$d(\bx, \bH_{\bB}) = \|\bx - \bB \bB^{+} \bx\|_2$.

It follows that
\bal\label{eq:perturbation-distance-to-hyperplane-seg1}
&|d(\bx, \bH_{\bA}) - d(\bx, \bH_{\bB}) | = | \|\bx - \bA \bA^{+} \bx\|_2 -  \|\bx - \bB \bB^{+} \bx\|_2 | \nonumber \\
&\le \|\bA \bA^{+} \bx - \bB \bB^{+} \bx\|_2 \le \|\bA \bA^{+} - \bB \bB^{+}\|_2 \|\bx\|_2.
\eal%%

According to the perturbation bound on the orthogonal projection in~\citet{Chen2016-perturbation-orthogonal-projection,Stewart1977-perturbation-pseudoinverse-projection},
\bal\label{eq:perturbation-distance-to-hyperplane-seg2}
&\|\bA \bA^{+} - \bB \bB^{+}\|_2 \le \max\{\|\bE\bA^{+}\|_2, \|\bE\bB^{+}\|_2\}.
\eal%%

Since $\|\bE\bA^{+}\|_2 \le \|\bE\|_2 \|\bA^{+}\|_2 \le \frac{C}{\sigma_{r}(\bA)}$,
$\|\bE\bB^{+}\|_2 \le \|\bE\|_2 \|\bB^{+}\|_2 \le \frac{C}{\sigma_{s}(\bB)}$, combining (\ref{eq:perturbation-distance-to-hyperplane-seg1}) and (\ref{eq:perturbation-distance-to-hyperplane-seg2}), we have

\bals%\label{eq:perturbation-distance-to-hyperplane-seg3}
|d(\bx, \bH_{\bA}) - d(\bx, \bH_{\bB}) | &\le \max\{ \frac{C}{\sigma_{r}(\bA)},\frac{C}{\sigma_{s}(\bB)}\} \|\bx\|_2 \nonumber \\
& =\frac{C \|\bx\|_2}{ \min\{\sigma_{r}(\bA),\sigma_{s}(\bB)\} }.
\eals%%
This proves the bound stated in Lemma~\ref{lemma::perturbation-distance-to-hyperplane}.
\end{proof}

%\begin{lemma}\label{lemma::perturbation-distance-to-subspace}
%Let $\bbeta \in \RR^n$ and $\bY_{\bbeta}$ has full column rank. Suppose $\delta < \bar \sigma_{\bY,r}$ where $r = \|\bbeta\|_0$, then $\bX_{\bbeta}$ is a full column rank matrix, and
%\bal\label{eq:perturbation-distance-to-subspace}
%|d(\bx_i, \bH_{\bX_{\bbeta}}) - d(\bx_i, \bH_{\bY_{\bbeta}}) | \le \frac{\delta} {\bar \sigma_{\bY,r} - \delta}
%\eal%%
%for any $1 \le i \le n$.
%\end{lemma}

\begin{proof}[\textup{\bf {Proof of Lemma~\ref{lemma::perturbation-distance-to-subspace}}}]
We have $\by_i = \bx_i - \bn_i$, and $\sigma_{\min}(\bY_{\bbeta}^{\top}\bY_{\bbeta})  = \big(\sigma_{\min}(\bY_{\bbeta}) \big)^2 \ge \sigma_{\bY,r}^2$.

By Weyl~\citep{Weyl1912-perturbation-singular-value}, $|\sigma_{i}(\bY_{\bbeta}) - \sigma_{i}(\bX_{\bbeta})| \le \|\bN_{\bbeta}\|_2 \le \|\bN_{\bbeta}\|_F \le \sqrt{r} \delta$. Since $ \sqrt{r} \delta < \sigma_{\bY,r} \le \sigma_{\min}(\bY_{\bbeta}) \le \sigma_{i}(\bY_{\bbeta})$, $\sigma_{i}(\bX_{\bbeta}) \ge \sigma_{i}(\bY_{\bbeta}) - \sqrt{r} \delta \ge \sigma_{\bY,r} - \sqrt{r} \delta > 0$ for $1 \le i \le \min\{d,r\}$. It follows that $\sigma_{\min}(\bX_{\bbeta}) \ge  \sigma_{\bY,r} - \sqrt{r} \delta > 0$ and $\bX_{\bbeta}$ has full column rank.

%Both $\bH_{\bX_{\bbeta}}$ and $\bH_{\bY_{\bbeta}}$ are closed sets in $\RR^d$ with basis $\bX_{\bbeta}$ and $\bY_{\bbeta}$ respectively. It can be verified that $d(\bx_i, \bH_{\bX_{\bbeta}}) = \|\bx_i - {\bX_{\bbeta}} (\bX_{\bbeta}^{\top}\bX_{\bbeta})^{-1} \bX_{\bbeta}^{\top} \bx_i \|_2$ and $d(\bx_i, \bH_{\bY_{\bbeta}}) = \|\bx_i - {\bY_{\bbeta}} (\bY_{\bbeta}^{\top}\bY_{\bbeta})^{-1} \bY_{\bbeta}^{\top} \bx_i \|_2$. Therefore,
Also, $\|\bX_{\bbeta} - \bY_{\bbeta}\|_2 \le \|\bX_{\bbeta} - \bY_{\bbeta}\|_F \le \sqrt{r} \delta$. According to Lemma~\ref{lemma::perturbation-distance-to-hyperplane},
\bals%\label{eq:distance-to-subspace-seg1}
&|d(\bx_i, \bH_{\bX_{\bbeta}}) - d(\bx_i, \bH_{\bY_{\bbeta}}) | \nonumber \\
&\le \frac{\sqrt{r} \delta}{ \min\{\sigma_{\min}(\bX_{\bbeta}),\sigma_{\min}(\bY_{\bbeta})\} } \nonumber \\
&\le \frac{\sqrt{r} \delta} {\sigma_{\bY,r} - \sqrt{r} \delta} = \frac{\delta} {\bar \sigma_{\bY,r} - \delta}.
\eals%

\end{proof}

\subsection{Proof of Lemma~\ref{lemma::equivalence-noisy-l0ssc}}
%then the subspace separation margin ${\rm HS}(\bx_i,\bX,\bbeta^*) > 0$
%\begin{lemma}\label{lemma::equivalence-noisy-l0ssc}
%%then the subspace separation margin ${\rm HS}(\bx_i,\bX,\bbeta^*) > 0$
%Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^* > 1$. If $\lambda > \tau_0$ where $\tau_0$ is defined as
%%${\rm HS}(\bx_i,\bX,\bbeta^*) > \tau_0$
%\begin{align*}
%&\tau_0 \defeq \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*} + \tau_1,
%\end{align*}%
%where
%\begin{align*}%\label{eq:equivalence-noisy-l0ssc-tau1}
%&\tau_1 \defeq \frac{\delta} {\bar \sigma_{\bY}^* - \delta}, \quad \sigma_{\bX}^* \defeq \sigma_{\min}(\bX_{\bbeta^*}),
%\end{align*}%
%with $\delta < \bar \sigma_{\bY}^*$, and $\bar \sigma_{\bY}^*$ is defined as
%\begin{align*}%\label{eq:l0ssc-bar-sigma-star}
%&\bar \sigma_{\bY}^* \defeq \min_{1 \le r < r^*} \bar \sigma_{\bY,r},
%\end{align*}%
%then $\bbeta^*$ is an optimal solution to the following sparse approximation problem with the uncorrupted data as the dictionary:
%\bal\label{eq:equivalence-noisy-l0ssc-2}
%&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\|{\bx_i} - {{\bY}}{\bbeta}\|_2 \le c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*},  \,\, \bbeta_i = 0.\,
%\eal%%
%where $c^* \defeq \|\bx_i - \bX \bbeta^*\|_2$.
%\end{lemma}
\begin{proof}[\textup{\bf Proof of Lemma~\ref{lemma::equivalence-noisy-l0ssc}}]
%According to the proof of Theorem~\ref{theorem::l0-ssc-random}, with probability $1$, $d_k$ points (columns) in $\bY^{(k)}$ other than $\by_i$, denoted by $\bY^{(k,-i)}$, serve as a basis for $\cS_k$. Therefore, $\by_i = \bY^{(k)} \beta'$ for some vector $\beta' \in \RR^n$ with at most $d_k$ nonzero elements. Let $\bbeta^*$ be an optimal solution to the problem (\ref{eq:noisy-l0ssc-i}) of noisy $\ell^{0}$-SSC for $\bx_i$, then $L(\bbeta^*) \le L(\bbeta')$, and we have
%
We have
\begin{align*}
&\|\bx_i - \bX \bbeta^*\|_2^2 + \lambda \|{\bbeta^*}\|_0 \le \|\bx_i - \bX \bzero\|_2^2 + {\lambda}\|{\bzero}\|_0 = 1 \\
&\Rightarrow c^* = \|\bx_i - \bX \bbeta^*\|_2 \le \sqrt{1 - \lambda r^*} < 1.
\end{align*}

We first prove that $\bbeta^*$ is an optimal solution to the sparse approximation problem
\bal\label{eq:equivalence-noisy-l0ssc-1}
&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\|{\bx_i} - {{\bX}}{\bbeta}\|_2 \le c^*,  \,\, \bbeta_i = 0.
\eal%%

To see this, if $r^* = 1$, then $\bbeta^*$ must be an optimal solution to (\ref{eq:equivalence-noisy-l0ssc-1}). If $r^* > 1$, suppose there is a vector $\bbeta'$ such that $\|{\bx_i} - {{\bX}}{\bbeta'}\|_2 \le c^*$ and $\| {\bbeta'} \|_0 < \|{\bbeta^*}\|_0$, then $L(\bbeta') < {c^*}^2 + \lambda \| {\bbeta^*} \|_0 = L(\bbeta^*)$, contradicting the fact that $\bbeta^*$ is an optimal solution to (\ref{eq:noisy-l0ssc-i}).

%Denote by $\bX_{\bbeta^*}$ the submatrix of $\bX$ whose columns correspond to the nonzero elements of $\bbeta^*$. Then
Note that $\bX_{\bbeta^*}$ is a full column rank matrix; otherwise a sparser solution to (\ref{eq:noisy-l0ssc-i}) can be obtained as a vector whose support corresponds to a maximal linearly independent set of columns of $\bX_{\bbeta^*}$.

Also, the distance between $\bx_i$ and the subspace spanned by columns of $\bX_{\bbeta^*}$ equals $c^*$, i.e., $d(\bx_i,\bH_{\bX_{\bbeta^*}}) = c^*$. To see this, it is clear that $d(\bx_i,\bH_{\bX_{\bbeta^*}}) \le c^*$. If there is a vector $\by = \bX {\tilde \bbeta}$ in $\bH_{\bX_{\bbeta^*}}$ with ${\rm supp}({\tilde \bbeta}) \subseteq {\rm supp}({\bbeta^*})$, and $\|\bx_i-\by\|_2 < c^*$, then $L({\tilde \bbeta}) < L(\bbeta^*)$ which contradicts the optimality of $\bbeta^*$. Therefore, $d(\bx_i,\bH_{\bX_{\bbeta^*}}) \ge c^*$, and it follows that $d(\bx_i,\bH_{\bX_{\bbeta^*}}) = c^*$.

%To prove that the subspace separation margin ${\rm HS}(\bx_i,\bX,\bbeta^*) > 0$, suppose ${\rm HS}(\bx_i,\bX,\bbeta^*) \le 0$, so there exists $\bbeta'$ such that $\|\bbeta'\|_0 < r^*$, ${\rm rank}(\bX_{\bbeta'}) = \|\bbeta'\|_0$ and $d(\by_i,\bH_{\bX_{\bbeta'}}) \le d(\by_i,\bH_{\bX_{\bbeta^*}}) \le c^*$. Then $\bbeta'$ is sparser than $\bbeta^*$ and it satisfies the constraint of problem (\ref{eq:equivalence-noisy-l0ssc-1}), contradicting the optimality of $\bbeta^*$.

Since $\|\bx_i - \bX \bbeta^*\|_2 \le 1$ and $\|\bx_i\|_2 = 1$, we have $\|\bX \bbeta^*\|_2 \le 2$. Also, since
\[\sigma_{\min} (\bX_{\bbeta^*}^{\top}{\bX_{\bbeta^*}}) \| \bbeta^*\|_2^2 \le \|\bX \bbeta^*\|_2^2 \le 4,\]
it follows that $\| \bbeta^*\|_2^2 \le \frac{4}{{\sigma_{\bX}^*}^2}$. By the Cauchy--Schwarz inequality, $\| \bbeta^*\|_1 \le \frac{2\sqrt{r^*}}{\sigma_{\bX}^*}$ and $\|\bN \bbeta^*\|_2 \le \| \bbeta^*\|_1 \delta \le \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$. Therefore,

\bals
&\|{\bx_i} - {\bY}{\bbeta^*}\|_2 = \|{\bx_i} - {{\bX}}{\bbeta^*} + {{\bN}}{\bbeta^*}\|_2  \\
& \le \|{\bx_i} - {{\bX}}{\bbeta^*}\|_2 + \|{{\bN}}{\bbeta^*}\|_2 \le c^* + \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*},
\eals%
so that $\bbeta^*$ is feasible for problem (\ref{eq:equivalence-noisy-l0ssc-2}).

To prove that $\bbeta^*$ is an optimal solution to (\ref{eq:equivalence-noisy-l0ssc-2}), we first note that $\bbeta^*$ must be an optimal solution to (\ref{eq:equivalence-noisy-l0ssc-2}) if $r^*=1$. This is because $c^* \le \sqrt{1 - \lambda r^*} \le 1 - \lambda$ and $\lambda > \tau_0 > \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$ so that $c^* + \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*} < 1$, and it follows that $\bzero$ is not feasible to (\ref{eq:equivalence-noisy-l0ssc-2}).

If $r^* > 1$ and suppose $\bbeta^*$ is not an optimal solution to (\ref{eq:equivalence-noisy-l0ssc-2}), then an optimal solution to (\ref{eq:equivalence-noisy-l0ssc-2}) is a vector $\bbeta'$ such that $\|{\bx_i} - {{\bY}}{\bbeta'}\|_2 \le c^*+ \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$ and $\| {\bbeta'} \|_0 = r < r^*$.
%Note that when $\delta < \frac{(1-c^*){\sigma_{\bX}^*}}{2\sqrt{r^*}}$, we must have $r \ge 1$, so such $\bbeta'$ cannot exist if $r^*=1$.
${\bY}_{\bbeta'}$ is a full column rank matrix; otherwise a sparser solution can be obtained as a vector whose support corresponds to a maximal linearly independent set of columns of $\bY_{\bbeta'}$. We have
\begin{align*}
&d(\bx_i, \bH_{\bY_{\bbeta'}}) \le \|{\bx_i} - {\bY}{\bbeta'}\|_2 \le c^* + \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}.
\end{align*}%
According to Lemma~\ref{lemma::perturbation-distance-to-subspace}, we have

\bal\label{eq:equivalence-noisy-l0ssc-seg1}
&|d(\bx_i, \bH_{\bX_{\bbeta'}}) - d(\bx_i, \bH_{\bY_{\bbeta'}})| \le \frac{\sqrt{r} \delta} {\sigma_{\bY,r} - \sqrt{r} \delta} \nonumber \\
& = \frac{\delta} {\bar \sigma_{\bY,r} - \delta} \le \frac{\delta} {\bar \sigma_{\bY}^* - \delta} \nonumber \\
& \Rightarrow d(\bx_i, \bH_{\bX_{\bbeta'}}) \le c^* + \frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*} + \frac{\delta} {\bar \sigma_{\bY}^* - \delta} = c^* + \tau_0.
\eal%
However, according to the optimality of $\bbeta^*$ in the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}), we have

\bal\label{eq:equivalence-noisy-l0ssc-seg2}
&d(\bx_i, \bH_{\bX_{\bbeta'}}) - c^* = d(\bx_i, \bH_{\bX_{\bbeta'}}) - d(\bx_i, \bH_{\bX_{\bbeta^*}}) \nonumber \\
&\stackrel{\circled{1}}{\ge} (r^* - r) \lambda > \tau_0.
\eal%
To see that $\circled{1}$ holds, let $\bbeta'' \in \RR^n$ with $\supp{\bbeta''} \subseteq \supp{\bbeta'}$ be such that $\ltwonorm{\bx_i - \bX \bbeta''} = d(\bx_i, \bH_{\bX_{\bbeta'}})$.
Then by the optimality of $\bbeta^*$,
\bals
\ltwonorm{\bx_i - \bX \bbeta''}  &\ge  d(\bx_i, \bH_{\bX_{\bbeta^*}}) + \lambda r^* - \lambda \abth{\supp{\bbeta''}} \\
&\ge d(\bx_i, \bH_{\bX_{\bbeta^*}}) + (r^* - r) \lambda.
\eals
The contradiction between (\ref{eq:equivalence-noisy-l0ssc-seg1}) and (\ref{eq:equivalence-noisy-l0ssc-seg2}) shows that $\bbeta^*$ is an optimal solution to (\ref{eq:equivalence-noisy-l0ssc-2}).
\end{proof}

\iffalse
\subsection{Proof of Lemma~\ref{lemma::noisy-l0ssc-subspace-detection}}
Define $\bB(\bx_i, c_0) = \{\bx \colon \|\bx-\bx_i\| \le c_0\}$ be the ball centered at $\bx_i$ with radius $c_0$. If $\bB(\bx_i, c_0)$ is away from the corresponding confusion area, i.e. all the external subspaces in $\cH_{\by_i, d_k}$, then subspace detection property holds with the solution to a proper sparse approximation problem where $\bx_i$ is approximated by the uncorrupted data, as shown in the following Lemma.
\begin{lemma}\label{lemma::noisy-l0ssc-subspace-detection}
Suppose $\bY$ is in general position and $\by_i \in \cS_k$ for some $1 \le k \le K$. For positive number $c_0 \defeq c^*+\frac{2\delta\sqrt{r^*}}{\sigma_{\bX}^*}$, suppose $\bB(\bx_i, c_0) \cap \bH = \emptyset$ for any $\bH \in \cH_{\by_i, r^*}$. Then the subspace detection property holds for $\bx_i$ with an optimal solution to the following sparse approximation problem, denoted by $\bbeta^*$, i.e. nonzero elements of $\bbeta^*$ correspond to the columns of $\bX$ from the same subspace as $\by_i$.
\bal\label{eq:noisy-l0ssc-subspace-representation-prob}
&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\|{\bx_i} - {\bY}{\bbeta}\|_2 \le c_0,  \,\, \bbeta_i = 0.
\eal%%
\end{lemma}
\begin{proof}[\textup{\bf Proof of Lemma~\ref{lemma::noisy-l0ssc-subspace-detection}}]
%Define $\cA = \{k: d(\bx_i, \cS_k) \le c_0 \}$, $\by = {\bY}{\bbeta^*}$. Then it can be verified that problem
(\ref{eq:noisy-l0ssc-subspace-representation-prob}) is equivalent to the following problem
\bal\label{eq:noisy-l0ssc-subspace-representation-seg1}
&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\by = {\bY}{\bbeta}, \,\, \|\bx_i-\by\|_2 \le c_0,  \,\, \bbeta_i = 0.
\eal%
We show that the points (columns) of $\bY_{\bbeta^*}$ must come from subspace $\cS_k$. To see this, suppose some columns of $\bY_{\bbeta^*}$ come from different subspaces. We first have $\|\bbeta^*\|_0 \le d_k$. To see this, we can choose some $\by' \in \cS_k$ such that $\|\by'-\bx_i\|_2 \le c_0$ since $c_0 \ge d(\bx_i, \cS_k)$. Also, $d_k$ points in $\bY^{(k)}$ can linearly represent $\by'$ since $\bY^{(k)}$ is in general position, and it follows that $\|\bbeta^*\|_0 \le d_k$ due to the optimality of $\bbeta^*$.

Also, $\bY_{\bbeta^*}$ has full column rank, so that subspace $\bH_{\bY_{\bbeta^*}} \in \cH_{\by_i, d_k}$. Let $\by^* = {\bY}{\bbeta^*}$, then $\by^* \in \bH_{\bY_{\bbeta^*}} \cap \bB(\bx_i, c_0)$ which contradicts the fact that $\bB(\bx_i, c_0) \cap \bH = \emptyset$ for any $\bH \in \cH_{\by_i, d_k}$. Therefore, columns of $\bY_{\bbeta^*}$ must come from $\cS_k$.
%and it follows that $\by^*$ must lie in some subspace among $\cS_{c_0}$, and we are looking for the subspace which leads to the sparsest representation for any point within it. (\ref{eq:noisy-l0ssc-subspace-representation-prob}) is equivalent to
%\bal\label{eq:noisy-l0ssc-subspace-representation-seg2}
%&\mathop {\min }\limits_{{\bbeta}} {\| {\bbeta} \|_0} \quad s.t.\;\by = {\bY^{(k_t)}}{\bbeta}, \,\, \|\bx_i-\by\|_2 \le c_0, \,\, \by \in \cS_{k_t}, \,\, 1 \le t \le T,  \,\, \bbeta_i = 0
%\eal%
%where $\cS_{c_0} = \{\cS_{k_t}\}_{t=1}^T$.
%Suppose $\by \in \cS_{k_t}$ for some $1 \le t \le T$. Since $\bY^{(k_t)}$ are in general position, at least $d_{k_t}$ points in  $\bY^{(t)}$ are required to represent $\by^*$. On the other hand, any $d_t$ points in $\bY^{(t)}$ must represent $\by^*$, so the sparsest $\bbeta$ such that $\by = {\bY^{k_t}}{\bbeta}$ should satisfy $\|\bbeta\|_0 = d_{k_t}$. It follows that we should choose $\by \in \cS_k$ since $\cS_k$ has the lowest dimension among $\cS_{c_0}$ to obtain an optimal solution to (\ref{eq:noisy-l0ssc-subspace-representation-seg2}). Therefore, nonzero elements of $\bbeta^*$ correspond to data in $\bY^{(k)}$ and the subspace detection property holds for $\bx_i$.
\end{proof}
\fi

\subsection{Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda}}
%\begin{theorem}\label{theorem::noisy-l0ssc-subspace-detection-lambda}
%{\rm (Subspace detection property holds for noisy $\ell^{0}$-SSC under deterministic model, with conditions in terms of $\lambda$)}
%Let nonzero vector $\bbeta^*$ be an optimal solution to the noisy $\ell^{0}$-SSC problem (\ref{eq:noisy-l0ssc-i}) for point $\bx_i$ with $\|\bbeta^*\|_0=r^*$, $n_k \ge d_k+1$ for every $1\le k \le K$, and there exists $1 < r_0 \le d$ such that $1 < r^* \le r_0$. Suppose $\bY$ is in general position, $\by_i \in \cS_k$ for some $1 \le k \le K$, $\delta < \min_{1 \le r < r_0} \bar \sigma_{\bY,r}$, and $M_{i,\delta} \defeq M_i - \delta$. Suppose
%\bal\label{eq:noisy-l0ssc-sdp-M}
%&M_{i,\delta}  > \frac{2\delta}{\sigma_{\bX,r_0}},
%\eal%%
%and
%\bal\label{eq:noisy-l0ssc-sdp-mu}
%&\mu_{r_0} < 1-\frac{2\delta}{\sigma_{\bX,r_0}}.
%\eal%%
%Then if
%\bal\label{eq:noisy-l0ssc-sdp-lambda}
%& \lambda_{0} < \lambda < 1,
%\eal%%
%where $\lambda_0 \defeq \max\{\lambda_1,\lambda_2\}$ and
%\bal
%&\lambda_{1} \defeq \inf\{0 < \lambda < 1 \colon \sqrt{1-\lambda} + \frac{2\delta}{\sigma_{\bX,r_0} \sqrt{\lambda}}< M_{i,\delta}\}, \label{eq:noisy-l0ssc-sdp-min-lambda-1} \\
%&\lambda_{2} \defeq \inf\{0 < \lambda < 1 \colon \lambda - \frac{2\delta}{\sigma_{\bX,r_0}} \frac{1}{\sqrt{\lambda}} > \mu_{r_0}\}, \label{eq:noisy-l0ssc-sdp-min-lambda-2}
%\eal%%
%the subspace detection property holds for $\bx_i$ with $\bbeta^*$. %Here $M_i$, $\mu_{r_0}$ and $\sigma_{\bX,r_0}$ are defined in (\ref{eq:Mi}), (\ref{eq:mu-r}) and (\ref{eq:sigma-X-r}) respectively.
%\end{theorem}
\begin{proof}[\textup{Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda}}]
This theorem can be proved by checking that the conditions in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection} are satisfied.
\end{proof}

%\begin{lemma}\label{lemma::point-to-subspace-concentration}
%Under semi-random model, given $1 \le k \le K$ and $\by \in \bY^{(k)}$, suppose $\bH \in \cH_{\by_i, d_k}$ is any external subspace of $\by$. Then for any $t>0$,
%\bal\label{eq:cond-a}
%&\Pr[d(\by, \bH) \ge 1 - 2t\sqrt{d_k-1}-t^2] \ge 1-8\exp(-\frac{d_k t^2}{2}).
%\eal%%
%\end{lemma}


\subsection{Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda-random}}

In order to prove this theorem, the following lemma is presented and it provides the geometric concentration inequality for the distance between a point $\by \in \bY^{(k)}$ and any of its external subspaces. It renders a lower bound for $M_{i}$, namely the minimum distance between $\by_i \in \cS_k$ and its external subspaces.

\begin{lemma}\label{lemma::point-to-subspace-concentration}
Under the semi-random model, given $1 \le k \le K$ and $\by \in \bY^{(k)}$, suppose $\bH \in \cH_{\by_i, d_k}$ is any external subspace of $\by$. Moreover, assume that for any such external subspace $\bH$ of $\by$, ${\rm Tr} ( \bU^{\top}_{\bH} \bU^{(k)} {\bU^{(k)}}^{\top} \bU_{\bH} ) \le d_k - 1$ where $\bU_{\bH}$ is an orthonormal basis of $\bH$.  Then for any $t>0$,
\bal\label{eq:point-to-subspace-concentration}
&\Pr[d(\by, \bH) \ge \frac 1 {d_k} - 2t\sqrt{1 - \frac 1 {d_k}}-t^2] \ge 1-8\exp(-\frac{d_k t^2}{2}).
\eal%%
\end{lemma}
\begin{proof}[\textup{\bf Proof of Lemma~\ref{lemma::point-to-subspace-concentration}}]
Let $\bH$ be a fixed subspace of dimension $d_e \le d_k$ with $\by \notin \bH$. Since $\by \in \cS_k$, we can write $\by = \bU^{(k)} \tilde \by$, where $\Expect{}{\tilde \by {\tilde \by}^{\top}} = \frac{1}{d_k} \bI_{d_k}$.

\iffalse
$\bU_{\cS_k} = \left[ {\begin{array}{*{20}{c}}
{\bI_{d_k}}\\
{\bzero}
\end{array}} \right]\in \RR^{d \times d_k}$ be the orthonormal basis of $\cS_k$ under which the isotropic random vector $\by$ in $\cS_k$ satisfies $\E[\by \by^{\top}] = \left[ {\begin{array}{*{20}{c}}
\bI_{d_k}& \bzero\\
\bzero& \bzero
\end{array}} \right]$. It follows that more columns vectors can be added to $\bU_{\cS_k} $ to form a orthonormal basis $\bU \in \RR^{d \times d'}$ for the minimum subspace that contains $\cS_k$ and $\bH$. It can be verified that $d_k +1 \le d' \le \min\{d_k + d_e, d\}$ because $\bH \neq \cS_k$. Note that $\bU$ can be represented as a block matrix as $\bU = \left[ {\begin{array}{*{20}{c}}
\bI_{d_k}& \bzero\\
\bzero& \bU'
\end{array}} \right]$ where $\bU' \in \RR^{(d-d_k) \times (d'-d_k)}$ has orthonormal columns. It can be verified that the basis of $\bH$ can be represented as $\bU_{\bH} = \left[ {\begin{array}{*{20}{c}}
\bI_{d_e-d'+d_k}& \bzero\\
\bzero& \bU'
\end{array}} \right]$. Note that if $d_e-d'+d_k = 0$, $\bU_{\bH} = \left[ {\begin{array}{*{20}{c}}
{\bzero}\\
{\bU'}
\end{array}} \right]$.
\fi

Then the projection of $\by$ onto $\bH$ is $\mathbb P_{\bH}(\by) = \bU_{\bH} \bU^{\top}_{\bH} \by$, and we have
\begin{small}\bal\label{eq:point-to-subspace-concentration-seg1}
&\E[\|\mathbb P_{\bH}(\by)\|_2^2] = \E[ \by^{\top} \bU_{\bH} \bU^{\top}_{\bH} \bU_{\bH} \bU^{\top}_{\bH} \by] \nonumber \\
&= \E[{\rm Tr}(\by^{\top} \bU_{\bH} \bU^{\top}_{\bH} \by )] \nonumber \\
&= \E[{\rm Tr}( \bU^{\top}_{\bH} \by \by^{\top} \bU_{\bH}  )] \nonumber \\
&={\rm Tr} ( \bU^{\top}_{\bH} \E[\by \by^{\top} ] \bU_{\bH} ) \nonumber \\
&={\rm Tr} ( \bU^{\top}_{\bH} \bU^{(k)} \E[\tilde \by {\tilde \by}^{\top} ] {\bU^{(k)}}^{\top} \bU_{\bH} ) \nonumber \\
%&= {\rm Tr} \Big( \left[ {\begin{array}{*{20}{c}}
%\bI_{d_e-d'+d_k}& \bzero\\
%\bzero& \bU'
%\end{array}} \right]^{\top} \left[ {\begin{array}{*{20}{c}}
%\bI_{d_k}& \bzero\\
%\bzero& \bzero
%\end{array}} \right] \left[ {\begin{array}{*{20}{c}}
%\bI_{d_e-d'+d_k}& \bzero\\
%\bzero& \bU'
%\end{array}} \right] \Big) \nonumber \\
&=\frac{1}{d_k} {\rm Tr} ( \bU^{\top}_{\bH} \bU^{(k)} {\bU^{(k)}}^{\top} \bU_{\bH} ) \le \frac{d_k-1}{d_k} = 1 - \frac 1{d_k}.
\eal\end{small}%%
According to the concentration inequality in section 5.2 of~\citep{aubrun2017alice}, for any $t > 0$,
\bal\label{eq:point-to-subspace-concentration-seg2}
&\Pr[\abth{\ltwonorm{\mathbb P_{\bH}(\by)} - \Expect{}{\ltwonorm{\mathbb P_{\bH}(\by)}}} \ge t] \le 8 \exp(-\frac{d_k t^2}{2}),
\eal%%

and by (\ref{eq:point-to-subspace-concentration-seg1}) $\Expect{}{\ltwonorm{\mathbb P_{\bH}(\by)}} \le \sqrt{1 - \frac 1{d_k}}$.

Now let $\bH$ be spanned by data from $\bY$, i.e. $\bH = \bH_{\{\by_{i_j}\}_{j=1}^{d_e}}$, where $\{\by_{i_j}\}_{j=1}^{d_e}$ are any $d_e$ linearly independent points that do not contain $\by$. For any fixed points $\set{\by_{i_j}}_{j=1}^{d_e}$, (\ref{eq:point-to-subspace-concentration-seg2}) holds. Let $A$ be the event that $\abth{\ltwonorm{\mathbb P_{\bH}(\by)} - \Expect{}{\ltwonorm{\mathbb P_{\bH}(\by)}}} \ge t$. We aim to integrate the indicator function $\1_{A}$ with respect to the random vectors, i.e. $\by$ and $\{\by_{i_j}\}_{j=1}^{d_e}$, to obtain the probability that $A$ happens over these random vectors. Let $\by = \by_i$; using Fubini's theorem, we have
\bal\label{eq:point-to-subspace-concentration-seg3}
&\Pr[A] = \int_{\otimes_{j=1}^n \cS^{(j)}} \1_{A} {\otimes}_{j=1}^n d\mu^{(j)} \nonumber \\
&=\int_{\otimes_{j \neq i} \cS^{(j)}} \Pr[A | \{\by_j\}_{j \neq i}] {\otimes}_{j \neq i} d\mu^{(j)} \nonumber \\
&\le \int_{\otimes_{j \neq i} \cS^{(j)}}  8 \exp(-\frac{d_k t^2}{2}) {\otimes}_{j \neq i} d\mu^{(j)} = 8 \exp(-\frac{d_k t^2}{2}),
\eal%%
where $\cS^{(j)} \in \{\cS_k\}_{k=1}^K$ is the subspace that $\by_j$ lies in, and $\mu^{(j)}$ is the probabilistic measure of the distribution in $\cS^{(j)}$. The last inequality is due to (\ref{eq:point-to-subspace-concentration-seg2}).

Note that for any $\by$'s external subspace $\bH = \bH_{\{\by_{i_j}\}_{j=1}^{d_e}}$, $d(\by, \bH) = \sqrt{\|\by\|_2^2 - \|\mathbb P_{\bH}(\by)\|_2^2} = \sqrt{1 - \|\mathbb P_{\bH}(\by)\|_2^2} $. According to (\ref{eq:point-to-subspace-concentration-seg3}), we have
\bals
%\label{eq:point-to-subspace-concentration-seg4}
&\Pr[d(\by, \bH) \ge \frac 1 {d_k} - 2t\sqrt{1 - \frac 1 {d_k}}-t^2] \ge 1-8\exp(-\frac{d_k t^2}{2}).
\eals%%
\end{proof}
The following lemma shows the lower bound for any submatrix of $\bY^{(k)}$.

\begin{lemma}\label{lemma:chi-square-concentration}
{\rm (\cite[Lemma 1]{Laurent2000-chi-square-concentration})} Let $\set{X_i}_{i=1}^k$ be i.i.d. standard Gaussian random variables and $X = \sum\limits_{i=1}^k X_i^2$, then for any $x > 0$,
\bals%\label{eq:chi-square-concentration}
\Prob{X - k \ge 2 \sqrt{kx} + 2x} &\le \exp\pth{-x}, \nonumber \\
\Prob{k - X \ge 2 \sqrt{kx} } &\le \exp\pth{-x}.
\eals%
\end{lemma}

\begin{lemma}\label{lemma::spectrum-bound-random-matrix}
{\rm (Spectrum bound for Gaussian random matrix, \cite[Theorem II.13]{Davidson08-local-operator-random-matrix})}
Suppose $\bA \in \RR^{m \times n}$ ($m \ge n$) is a random matrix whose entries are i.i.d. samples generated from the Gaussian distribution $\cN(0,\frac{1}{m})$. Then
\bsals%\label{eq:spectrum-bound-1}
& 1 - \sqrt {\frac{n}{m}} \le \E[\sigma_{n}(\bA)] \le \E[\sigma_{1}(\bA)] \le 1 + \sqrt {\frac{n}{m}}.
\esals%
Also, for any $t > 0$,
\bal
&\Pr[\sigma_{n}(\bA) \le 1 - \sqrt {\frac{n}{m}} - t] < \exp\pth{-\frac{mt^2}{2}}, \label{eq:least-singular-lower-bound} \\
&\Pr[\sigma_{1}(\bA) \ge 1 + \sqrt {\frac{n}{m}} + t] < \exp\pth{-\frac{mt^2}{2}}. \nonumber
\eal%
\end{lemma}

\begin{lemma}\label{lemma:lower-bound-singular-clean-data}
Let $\bY \in \RR^{d \times r}$ be any submatrix of $\bY^{(k)}$ with ${\rm rank}(\bY) = r$ and $r \le r_0 \le \floor{\frac{1}{\lambda}} \le d_k$, $k \in [K]$. Suppose $c_1 > 0$ is an arbitrarily small constant, $\eps_0, \eps_1 > 0$ are small constants, and $d_k$ is large enough such that $2d^{-0.05}_k + 2d^{-0.1}_k \le \eps_0$ and $\sqrt{\frac{1}{\lambda d_k}} + \sqrt{ \frac{2}{\lambda d_k} \log{\frac{en_k}{r_0}}} \le \eps_1$.  Then with probability at least $1-\exp(-c_1 d_k)-2n_k\exp\pth{-d^{0.9}_k}$, $\sigma_{\min}(\bY) \ge \sigma'_{\min} $, where $\sigma'_{\min}$ is defined by (\ref{eq:sigma-min-clean-data}).
\end{lemma}
\begin{proof}
%
Let $\bY = \bU^{(k)} \balpha \bS$ be a submatrix of size $d_k \times r$ of $\bY^{(k)}$, where $\balpha \in \RR^{d_k \times r}$ and elements of $\balpha$ are i.i.d. standard Gaussians, that is, $\balpha_{ij} \sim \cN(0,1)$, $i \in [d_k], j \in [r]$, and $\bS \in \RR^{r \times r}$ is a diagonal matrix with $\bS_{ii} = 1/\ltwonorm{\balpha^i}$ for $i \in [r]$, $\balpha^i$ being the $i$-th column of $\balpha$. Define $\bC \defeq \balpha \bS$. By the concentration property of the $\chi^2$-distribution (Lemma~\ref{lemma:chi-square-concentration}) applied with $x = d^{0.9}_k$, with probability at least $1-2n_k\exp\pth{-d^{0.9}_k}$, $\ltwonorm{\balpha^i} \in [\sqrt{d_k - 2d^{0.95}_k}, \sqrt{d_k + 2d^{0.95}_k + 2d^{0.9}_k}]$, that is, $\bS_{ii} \in [1/\sqrt{d_k + 2d^{0.95}_k + 2d^{0.9}_k}, 1/\sqrt{d_k - 2d^{0.95}_k}]$, for all $i \in [r]$ and any submatrix $\bY$ of $\bY^{(k)}$.

Now we estimate a lower bound for the least singular value of $\balpha$. By (\ref{eq:least-singular-lower-bound}) of Lemma~\ref{lemma::spectrum-bound-random-matrix}, for a particular submatrix $\bY$ of $\bY^{(k)}$ and the corresponding $\balpha$ and any $t > 0$, we have
\bsal\label{eq:lower-bound-singular-clean-data-seg1}
\Prob{\sigma_{\min}(\balpha) \ge \sqrt{d_k} - \sqrt{r} - \sqrt{d_k} t } \ge 1 - \exp\pth{-\frac{d_k t^2}{2}}. \esal%
Now there are $\binom{n_k}{r}$ ways of choosing the submatrix $\bY$, and $\binom{n_k}{r} \le \pth{\frac{en_k}{r}}^r$. Applying the union bound to (\ref{eq:lower-bound-singular-clean-data-seg1}), we have
\bsal\label{eq:lower-bound-singular-clean-data-seg2}
&\Prob{\sigma_{\min}(\balpha) \ge \sqrt{d_k} - \sqrt{r} - \sqrt{d_k} t } \ge 1 - \binom{n_k}{r} \exp\pth{-\frac{d_k t^2}{2}} \nonumber \\
&\ge 1 - \exp\pth{r \log{\frac{en_k}{r}} - \frac{d_k t^2}{2} }
\ge 1 - \exp\pth{r_0 \log{\frac{en_k}{r_0}} - \frac{d_k t^2}{2} }
\esal
for any submatrix $\bY \in \RR^{d_k \times r}$ of $\bY^{(k)}$.
Let $c_1 > 0$ and $ t = \frac{\sqrt{2 r_0 \log{\frac{en_k}{r_0}}}}{\sqrt{d_k}}  +\sqrt{c_1}$ in (\ref{eq:lower-bound-singular-clean-data-seg2}), then with probability at least $1 - \exp\pth{-\frac{c_1 d_k}{2}}$, $\sigma_{\min}(\balpha) \ge \sqrt{d_k}(1-\sqrt{c_1}) - \sqrt{r} - \sqrt{2 r_0 \log{\frac{en_k}{r_0}}} $. Combined with the bounds for $\bS_{ii}$, we conclude that with probability at least $1-\exp(-c_1d_k)-2n_k\exp\pth{-d^{0.9}_k}$,
\bsals
&\sigma_{\min}(\bY) =  \sigma_{\min}(\balpha \bS) \ge \frac{\sqrt{d_k}(1-\sqrt{c_1}) - \sqrt{r} - \sqrt{2 r_0 \log{\frac{en_k}{r_0}}}}{\sqrt{d_k + 2d^{0.95}_k + 2d^{0.9}_k}} \\
&\ge \frac{1}{1 + 2d^{-0.05}_k + 2d^{-0.1}_k} \pth{1-\sqrt{c_1} - \sqrt{\frac{r}{d_k}} - \sqrt{ \frac{2r_0}{d_k} \log{\frac{en_k}{r_0}}} } \\
&\ge \frac{1}{1+\eps_0} \pth{1-\sqrt{c_1} - \eps_1} = \sigma'_{\min}.
\esals%
\end{proof}


\begin{proof}[\textup{\bf Proof of Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda-random}}]

Consider $\bY_{\bbeta}$ for any $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r_0$. Noting that $\bY_{\bbeta}$ has columns from at most $r_0$ subspaces, let $\bbeta = \sum_{r=1}^{r_0} \bbeta^{(r)}$, where $\set{\bbeta^{(r)}}_{r=1}^{r_0}$ have non-overlapping support, each $\bY_{\bbeta^{(r)}}$ is a submatrix of $\bY_{\bbeta}$ and columns of $\bY_{\bbeta^{(r)}}$ are from the same subspace. For any $\bu \in \RR^{r_0}$ with $\ltwonorm{\bu}=1$, we can write $\bu$ as $\bu = \sum_{r=1}^{r_0} \bu^{(r)}$ where $\set{\bu^{(r)}}_{r=1}^{r_0}$ have non-overlapping support and $\bu^{(r)}$ corresponds to $\bY_{\bbeta^{(r)}}$ for $r \in [r_0]$.
With $d_{\min}$ sufficiently large as specified in the conditions of this theorem, by Lemma~\ref{lemma:lower-bound-singular-clean-data}, $\sigma_{\min}(\bY_{\bbeta^{(r)}}) \ge \sigma'_{\min}$ for $r \in [r_0]$, where $\sigma'_{\min}$ is defined by (\ref{eq:sigma-min-clean-data}). Furthermore, define
\bsals
{\rm aff}_{\max} \defeq \max_{t_1,t_2 \in [K] \colon t_1 \neq t_2} \aff{\cS_{t_1},\cS_{t_2}}.
\esals%

We then have
\bal\label{eq:noisy-l0ssc-subspace-detection-lambda-random-seg1}
&\ltwonorm{\bY_{\bbeta} \bu}^2 \nonumber \\
&= \sum_{r=1}^{r_0} \ltwonorm{ \bY_{\bbeta^{(r)}} \bu^{(r)}}^2 + 2 \sum_{s,t \in [r_0] \colon s < t}  {\bu^{(s)}}^{\top} \bY_{\bbeta^{(s)}}^{\top} \bY_{\bbeta^{(t)}} {\bu^{(t)}}
\nonumber \\
&\ge \sigma'^2_{\min} \ltwonorm{\bu}^2 -  2 \sum_{s,t \in [r_0] \colon s < t} \ltwonorm{\bu^{(s)}} \ltwonorm{\bu^{(t)}} {\rm aff}_{\max} \nonumber \\
&\ge \pth{\sigma'^2_{\min} - (r_0-1){\rm aff}_{\max} } \ltwonorm{\bu}^2 \nonumber \\
&= \sigma'^2_{\min} - (r_0-1){\rm aff}_{\max}.
\eal%
It follows that $\sigma_{\min}(\bY_{\bbeta}) \ge \sigma^2_{\min}(\bY_{\bbeta}) \ge \sigma'^2_{\min} - (r_0-1){\rm aff}_{\max}$, where the first inequality uses $\sigma_{\min}(\bY_{\bbeta}) \le 1$, which holds because the columns of $\bY_{\bbeta}$ have unit $\ell^2$-norm.
By Weyl's inequality~\citep{Weyl1912-perturbation-singular-value}, $\abth{\sigma_{\min}(\bX_{\bbeta}) - \sigma_{\min}(\bY_{\bbeta})}\le \|\bN_{\bbeta}\|_2 \le \delta \sqrt{r_0}$. Therefore, it follows by (\ref{eq:noisy-l0ssc-subspace-detection-lambda-random-seg1}) that
\bals
&\sigma_{\min}(\bX_{\bbeta}) \ge \sigma'^2_{\min} - (r_0-1){\rm aff}_{\max} - \delta \sqrt{r_0} > 0,
\eals%
if $\delta < \frac{\sigma'^2_{\min} - (r_0-1){\rm aff}_{\max}}{\sqrt{r_0}} = c$. It can be verified that (\ref{eq:noisy-l0ssc-sdp-random-cond2}), (\ref{eq:noisy-l0ssc-sdp-random-cond3}) and (\ref{eq:noisy-l0ssc-sdp-lambda-random}) guarantee (\ref{eq:noisy-l0ssc-sdp-M}), (\ref{eq:noisy-l0ssc-sdp-mu}) and (\ref{eq:noisy-l0ssc-sdp-lambda}) in Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda} respectively. Therefore, the conclusion holds.
\end{proof}



\subsection{Proof of Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection}}

We need the following lemmas before presenting the proof of Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection}.
Lemma~\ref{lemma::random-matrix-decomposition} shows that the low rank approximation $\hat \bX$ is close to $\bX$ in terms of the spectral norm~\citep{Halko2011-random-matrix-decomposition}. Lemma~\ref{lemma::perturbation-distance-to-subspace-projection} presents a perturbation bound for the distance between a data point and a subspace before and after the projection $\bP$.

\begin{lemma}\label{lemma::random-matrix-decomposition}
{\rm (Corollary $10.9$ in~\citet{Halko2011-random-matrix-decomposition})}
Let $p_0 \ge 2$ be an integer and $p' = p-p_0 \ge 4$, then with probability at least $1-6e^{-p'}$, the spectral norm of $\bX - \hat \bX$ is bounded by
\bals%\label{eq:X-appro}
&\|\bX - \hat \bX\|_2 \le C_{p,p_0},
\eals%
where
\bals%\label{eq:C-p-p0}
&C_{p,p_0} \defeq \big(1+17\sqrt{1+\frac{p_0}{p'}}\big) \sigma_{p_0+1} + \frac{8\sqrt{p}}{p'+1} (\sum\limits_{j > p_0} \sigma_j^2)^{\frac{1}{2}}
\eals%
and $\sigma_1 \ge \sigma_2 \ge \ldots$ are the singular values of $\bX$.
\end{lemma}

\begin{lemma}\label{lemma::perturbation-distance-to-subspace-projection}
Let $\bbeta \in \RR^n$, $\tilde \by_i = \bP \by_i$, $\bH_{\bY_{\bbeta}}$ is an external subspace of $\by_i$, $\tilde \bY_{\bbeta} = \bP(\bY_{\bbeta})$ and $\tilde \bY_{\bbeta}$ has full column rank. Then
\bals%\label{eq:perturbation-distance-to-subspace-projection}
&|d(\by_i, \bH_{\bY_{\bbeta}}) - d(\tilde \by_i, \bH_{\tilde \bY_{\bbeta}}) |  \nonumber \\
&\le C_{p,p_0} (1+\frac{1}{\min_{1\le r \le \tilde d_k} \sigma_{\bY, r} - C_{p,p_0} - 2\delta \sqrt{\tilde d_{k}}})
\eals%%
for any $1 \le i \le n$ and $\by_i \in \cS_k$.
\end{lemma}
\begin{proof}
This lemma can be proved by applying Lemma~\ref{lemma::perturbation-distance-to-hyperplane}.
\end{proof}

\begin{proof}[\textup{\bf Proof of Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection}}]
For any matrix $\bA \in \RR^{p \times q}$, we first show that multiplying $\bQ$ to the left of $\bA$ would not change its spectrum. To see this, let the singular value decomposition of $\bA$ be $\bA = \bU_{\bA} \bSigma \bV_{\bA}^{\top}$ where $\bU_{\bA}$ and $\bV_{\bA}$ have orthonormal columns with $\bU_{\bA}^{\top}\bU_{\bA} = \bV_{\bA}^{\top}\bV_{\bA} = \bI$. Then $\bQ\bA = \bU_{\bQ\bA} \bSigma \bV_{\bQ\bA}^{\top}$ is the singular value decomposition of $\bQ\bA$ with $\bU_{\bQ\bA} = \bQ\bU_{\bA}$ and $\bV_{\bQ\bA} = \bV_{\bA}$. This is because the columns of $\bU_{\bQ\bA}$ are orthonormal since the columns of $\bQ$ are orthonormal: $\bU_{\bQ\bA}^{\top} \bU_{\bQ\bA} = \bU_{\bA}^{\top}\bQ^{\top} \bQ\bU_{\bA} = \bI$, and $\bSigma$ is a diagonal matrix with nonnegative diagonal elements. It follows that $\sigma_{\min}(\bQ\bA) = \sigma_{\min}(\bA)$ for any $\bA \in \RR^{p \times q}$.

For a point $\bx_i = \by_i + \bn_i$, after projection via $\bP$, we have the projected noise $\tilde \bn_i = \bP \bn_i$. Because
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg1}
&\|\tilde \bn_i\|_2 = \|\bP \bn_i \|_2 = \|\bQ^{\top} \bn_i \|_2 \le \|\bQ\|_2 \|\bn_i\|_2 \le \|\bn_i\|_2 \le \delta,
\eals%%
the magnitude of the noise in the projected data is also bounded by $\delta$. Also,
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg1}
&\|\tilde \bx_i\|_2 = \|\bQ^{\top} \bx_i \|_2 \le  \|\bx_i\|_2 \le 1.
\eals%%
%\le \tilde d_{\max} = \max_{k=1,\ldots, K} d_k and ${\rm rank}(\bY_{\bbeta}) = r$
Let $\bbeta \in \RR^n$, $\tilde \bY_{\bbeta} = \bP \bY_{\bbeta} $ with $\|\bbeta\|_0 = r$. Since $\sigma_{\min}(\bQ \tilde \bY_{\bbeta}) = \sigma_{\min}(\tilde \bY_{\bbeta})$, we have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg2}
&|\sigma_{\min}(\tilde \bY_{\bbeta}) - \sigma_{\min}(\bY_{\bbeta})| = |\sigma_{\min}(\bQ \tilde \bY_{\bbeta}) - \sigma_{\min}(\bY_{\bbeta})| \nonumber \\
&\le \|\bQ \tilde \bY_{\bbeta} - \bY_{\bbeta}\|_2 \nonumber \\
& = \|\bQ \bQ^{\top}\bY_{\bbeta}  - \bY_{\bbeta}\|_2 \nonumber \\
&= \|\bQ \bQ^{\top}\bX_{\bbeta}  - \bX_{\bbeta} + \bN_{\bbeta} - \bQ \bQ^{\top} \bN_{\bbeta} \|_2 \nonumber \\
&\le C_{p,p_0} + \|\bN_{\bbeta} \|_F + \|\bQ \bQ^{\top} \bN_{\bbeta} \|_F \nonumber \\
&\le C_{p,p_0} + 2\delta \sqrt{r}.
\eal%%

It follows from (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg2}) that if
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg3}
&C_{p,p_0} + 2\delta \sqrt{\tilde d_{\max}} < \min_{k = 1,\ldots,K} \sigma^{(k)}_{\bY},
\eals%%
then $\tilde \bY$ is also in general position.

In addition, since $r_0 \le \floor{\frac{1}{\lambda}}$ and $\lambda \|\tilde \bbeta^*\|_0 \le L(\bzero) \le 1$, we have $\norm{\tilde \bbeta^*}{0} \le r_0 \le \floor{\frac{1}{\lambda}} $.

%For $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r < r_0$, define
%\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg4}
%&{\bar \sigma}_{\tilde \bY,r} \defeq \min_{\bbeta: \|\bbeta\|_0 = r, {\rm rank}(\tilde \bY_{\bbeta}) = \|\bbeta\|_0} \sigma_{\min}(\tilde \bY_{\bbeta})
%\eal%%

Based on (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg2}) we have
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg4}
&|{\bar \sigma}_{\tilde \bY,r} - {\bar \sigma}_{\bY,r}| \le C_{p,p_0} + 2\delta \sqrt{r_0},
\eal%%

and it follows by (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg4}) that $\delta < \min_{1 \le r < r_0} {\bar \sigma}_{\tilde \bY,r}$ because $\delta < \min_{1 \le r < r_0} \bar \sigma_{\bY,r} - C_{p,p_0} - 2\delta \sqrt{r_0} $.

Again, for $\bbeta \in \RR^n$ with $\|\bbeta\|_0 = r \le r_0$, we have
\bsal\label{eq:noisy-dr-l0ssc-subspace-detection-seg5}
&|\sigma_{\min}(\tilde \bX_{\bbeta}) - \sigma_{\min}(\bX_{\bbeta})| = |\sigma_{\min}(\bQ \tilde \bX_{\bbeta}) - \sigma_{\min}(\bX_{\bbeta})| \nonumber \\
&\le \|\bQ \tilde \bX_{\bbeta} - \bX_{\bbeta}\|_2 \nonumber \\
& = \|\bQ \bQ^{\top}\bX_{\bbeta}  - \bX_{\bbeta}\|_2  = \|\hat \bX_{\bbeta} - \bX_{\bbeta}\|_2 \nonumber \\
&\le C_{p,p_0}.
\esal%%

%Define
%\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg6}
%&\sigma_{\tilde \bX,r} \defeq \min\{\sigma_{\min}(\tilde \bX_{\bbeta}) \colon 1 \le \|\bbeta\|_0 \le r\}
%\eal%%
It can be verified by (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg5}) that
\bal\label{eq:noisy-dr-l0ssc-subspace-detection-seg7}
&|\sigma_{\tilde \bX,r} - \sigma_{\bX,r}| \le C_{p,p_0}.
\eal%%

Combining (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg7}), Lemma~\ref{lemma::perturbation-distance-to-subspace-projection}, and the known condition that
\bsals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg8}
&M_i - C_{p,p_0} (1+\frac{1}{\min_{1\le r \le \tilde d_k} \sigma_{\bY, r} - C_{p,p_0} - 2\delta \sqrt{\tilde d_{k}} }) \nonumber \\
&> \delta + \frac{2\delta}{\sigma_{\bX,r_0} -C_{p,p_0} },
\esals%%
we have
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg9}
&\tilde M_{i, \delta} \defeq \tilde M_i - \delta  > \frac{2\delta}{\sigma_{\tilde \bX,r_0}},
\eals%%
where $\by_i \in \cS_k$.

%Moreover, define
%\bal\label{eq:tilde-mu-r}
%&\tilde \mu_{r} \defeq \frac{\delta} {\min_{1 \le r < r_0} \tilde {\bar \sigma}_{\bY,r} - \delta}.
%\eal%%

Based on (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg4}) and (\ref{eq:noisy-dr-l0ssc-subspace-detection-seg7}), we have
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg10}
&\tilde \mu_{r_0} < 1-\frac{2\delta}{\sigma_{\tilde \bX,r_0}},
\eals%%
because
\bals%\label{eq:noisy-dr-l0ssc-subspace-detection-seg10}
&\frac{\delta} {\min_{1 \le r < r_0}  \bar \sigma_{\bY,r} - C_{p,p_0} - 2\delta \sqrt{r_0} - \delta} \nonumber \\
&< 1-\frac{2\delta}{\sigma_{\bX,r_0} - C_{p,p_0}}.
\eals%%

\end{proof}

\subsection{Proof of Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection-osnap}}
\label{sec::noisy-dr-l0ssc-subspace-detection-osnap-proof}
\begin{proof}[\textup{Proof of Theorem~\ref{theorem::noisy-dr-l0ssc-subspace-detection-osnap}}]
It can be verified that $\tilde M_i \ge \frac{M_i}{1+\varepsilon}$. Let $\bbeta \in \RR^n$, $\tilde \bY_{\bbeta} = \bP \bY_{\bbeta} $ with $\|\bbeta\|_0 = r$ and ${\rm rank}(\bY_{\bbeta})=r$, then for any $\bu \in \RR^{r}$, $\|\tilde \bY_{\bbeta} \bu\|_2 = \|\bP \bY_{\bbeta} \bu\|_2 \ge (1-\varepsilon) \|\bY_{\bbeta} \bu\|_2 \ge  (1-\varepsilon) \sigma_{\min} (\bY_{\bbeta}) \|\bu\|_2$. It follows that $\sigma_{\min}(\tilde \bY_{\bbeta}) \ge (1-\varepsilon) \sigma_{\min} (\bY_{\bbeta})$, and
${\bar \sigma}_{\tilde \bY,r} \ge (1-\varepsilon){\bar \sigma}_{\bY,r}$. Similarly, $\sigma_{\min} (\tilde \bX_{\bbeta}) \ge (1-\varepsilon)\sigma_{\min} (\bX_{\bbeta})$ for $\bbeta \in \RR^n, \|\bbeta\|_0 = r$ and ${\rm rank}(\bX_{\bbeta})=r$. It follows that $\sigma_{\tilde \bX,r} \ge (1-\varepsilon)\sigma_{\bX,r}$. Since (\ref{eq:noisy-dr-l0ssc-sdp-M})-(\ref{eq:noisy-dr-l0ssc-sdp-min-lambda-2}) hold, the conditions (\ref{eq:noisy-l0ssc-sdp-M})-(\ref{eq:noisy-l0ssc-sdp-min-lambda-2}) required by Theorem~\ref{theorem::noisy-l0ssc-subspace-detection-lambda} on the projected data ($\tilde \bY$ and $\tilde \bX$) also hold. Therefore, the subspace detection property holds with $\tilde \bbeta^*$ for $\tilde \bx_i$ with probability at least $1-K\delta'$ by the union bound when $p \ge \frac{d^2+d}{\delta' (2\varepsilon-\varepsilon^2)^2}$.
\end{proof}

\section{Bound for Suboptimal and Globally Optimal Solutions for Noisy $\ell^{0}$-SSC and Noisy-DR-$\ell^{0}$-SSC}
\label{sec::suboptimal-optimal}

While our theoretical analysis for noisy $\ell^{0}$-SSC and Noisy-DR-$\ell^{0}$-SSC is based on an optimal solution to the $\ell^{0}$ regularized problem (\ref{eq:noisy-l0ssc-i}), in this section we show that the suboptimal solution $\hat \bbeta$ obtained by Algorithm~\ref{alg:PGD-l0ssc} is in fact close to an optimal solution to (\ref{eq:noisy-l0ssc-i}), justifying the theoretical findings of noisy $\ell^{0}$-SSC and Noisy-DR-$\ell^{0}$-SSC.

We further present the bound for the gap between $\hat \bbeta$ and $\bbeta^*$, $\|{\hat \bbeta} - \bbeta^*\|_2$, based on Theorem 5 in~\citet{YangY19-fast-pgd-l0}. Let $g(\bbeta) = \|\bx_i - \bX {\bbeta}\|_2^2$ and $\bbeta^*$ be the globally optimal solution to (\ref{eq:noisy-l0ssc-i}), $\bS^* = {\rm supp}(\bbeta^*)$, $\hat \bbeta$ be the suboptimal solution to (\ref{eq:noisy-l0ssc-i}) obtained by PGD, $\hat \bS = {\rm supp}(\hat \bbeta)$. The following theorem presents the bound for $\|{\hat \bbeta} - \bbeta^*\|_2$.
 \begin{theorem}\label{theorem::suboptimal-optimal}
{\rm (Theorem 5 in~\citet{YangY19-fast-pgd-l0})}
%suppose $\kappa_-(A_i) > 0$ and $\kappa_-(|\hat \bS \cup \bS^*|) > \kappa > 0$ % let $\bE = \hat \bS \cup \bS^*$,
Suppose $\bX_{\bS \cup \bS^*}$ has full column rank with $\kappa_0 \defeq \sigma_{\min}(\bX_{\bS \cup \bS^*}) > 0$ where $\bS$ is the support of the initialization for PGD on problem (\ref{eq:noisy-l0ssc-i}). Let $\kappa > 0$ such that $2\kappa_0^2 > \kappa$ and $b$ is chosen according to (\ref{eq:b-cond}) as below:
\bal\label{eq:b-cond}
&0< b < \min\{\min_{j \in {\hat \bS}} | \hat \bbeta_j|, \frac{\lambda}{ \max_{j \notin {\hat \bS}} |\frac{\partial g}{\partial {\bbeta_j}}|_{\bbeta =  { \hat \bbeta}}|}, \nonumber \\
&\min_{j \in {\bS^*}} |\bbeta_j^*|, \frac{\lambda}{ \max_{j \notin \bS^*} |\frac{\partial g}{\partial {\bbeta_j}}|_{\bbeta = \bbeta^*} |}  \}.
\eal%%
Let $\bF = ({\hat \bS} \setminus \bS^*) \cup (\bS^* \setminus {\hat \bS})$ be the symmetric difference between $\hat \bS$ and $\bS^*$, then
\bals%\label{eq:suboptimal-optimal}
&\|{\hat \bbeta} - \bbeta^*\|_2 \le \frac{1}{2\kappa_0^2-\kappa}\big(\sum\limits_{j \in \bF \cap \hat \bS} (\max\{0,\frac{\lambda}{b} - {\kappa} |{\hat \bbeta_j} - b|\})^2 + \nonumber \\
&\quad \sum\limits_{j \in \bF \setminus \hat \bS} (\max\{0,\frac{\lambda}{b} - {\kappa} b\})^2 \big)^{\frac{1}{2}}.
\eals%%
\end{theorem}
\begin{remark}
It is observed that the gap $\|{\hat \bbeta} - \bbeta^*\|_2$ is small when $\frac{\lambda}{b} - {\kappa} |{\hat \bbeta_j} - b|$ for $j \in \bF \cap \hat \bS$ and $\frac{\lambda}{b} - {\kappa} b$ are small. Based on this observation, Theorem~\ref{theorem::suboptimal-optimal-concrete} establishes the conditions under which $\hat \bbeta$ is also an optimal solution to (\ref{eq:noisy-l0ssc-i}), i.e. ${\hat \bbeta} = \bbeta^*$.
\end{remark}

Define $\bS^* = {\rm supp}(\bbeta^*)$, $H^* = \max_{1 \le j \le n} {\rm dist} (\bx_i, \bH_{\bX_{\bS^* \setminus \{j\}}})$, $\mu = \max\{H^* + \|\bx_i-\bX \bbeta^*\|_2, 2\|\bx_i-\bX \hat \bbeta\|_2, 2\|\bx_i-\bX \bbeta^*\|_2\}$, $\kappa_0 = \sigma_{\min}(\bX_{\bS \cup \bS^*}) > 0$ where $\bS = {\rm supp}({\bbeta}^{(0)})$. The following theorem demonstrates that $\hat \bbeta = \bbeta^*$ if $\lambda$ is bounded from both sides and $\hat \bbeta_{\min} = \min_{t: \hat \bbeta_t \neq 0} |\hat \bbeta_t|$ is sufficiently large.

\begin{theorem}\label{theorem::suboptimal-optimal-concrete}
{\rm (Conditions that the suboptimal solution by PGD is also globally optimal)}
%suppose $\kappa_-(A_i) > 0$ and $\kappa_-(|\hat \bS \cup \bS^*|) > \kappa > 0$ % let $\bE = \hat \bS \cup \bS^*$,
If
\bal\label{eq:hat-bz-min-cond}
&\hat \bbeta_{\min} \ge \frac{\mu}{\kappa_0^2}
\eal%
and
\bal\label{eq:lambda-two-side}
&\frac{\mu^2}{2\kappa_0^2} \le \lambda \le (\hat \bbeta_{\min} - \frac{\mu}{2\kappa_0^2}) \mu,
\eal%
then ${\hat \bbeta} = \bbeta^*$.
\end{theorem}
\begin{proof}[\textup {Sketch of Proof}]
It can be verified that $\max\{0,\frac{\lambda}{b} - {\kappa} |{\hat \bbeta_j} - b|\} = 0$ and $\max\{0,\frac{\lambda}{b} - {\kappa} b\} = 0$ under the conditions (\ref{eq:hat-bz-min-cond}) and (\ref{eq:lambda-two-side}), therefore, $\hat \bbeta = \bbeta^*$ by applying Theorem~\ref{theorem::suboptimal-optimal}.
\end{proof}

\begin{table*}[t]
\centering
\caption{\small Clustering results on various data sets, with different values of $p$ for the linear transformation $\bP$ and the best two results in bold}
\resizebox{\linewidth}{!}{
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
  \hline
  Data Set

                              &Measure &Noisy $\ell^{0}$-SSC   &\multicolumn{3}{|c|}{Noisy-DR-$\ell^{0}$-SSC-LR}               &\multicolumn{3}{|c|}{Noisy-DR-$\ell^{0}$-SSC-CSP} \\\hline
  $p$                           &        &           &$p=\min\{d,n\}/5$ &$p=\min\{d,n\}/10$ &$p=\min\{d,n\}/15$    &$p=\min\{d,n\}/5$ &$p=\min\{d,n\}/10$ &$p=\min\{d,n\}/15$ \\ \hline
  \multirow{2}{*}{COIL-20}    &AC      &0.8472          &{0.8479} &{0.8479} &\textbf{0.8479}                      &\textbf{0.8486} &{0.8472} &{0.8472} \\\cline{2-9}
                              &NMI     &0.9428          &0.9433   &0.9433   &\textbf{0.9433}                 &\textbf{0.9439} &{0.9428} &{0.9428}  \\ \hline
  \multirow{2}{*}{COIL-100}   &AC      &\textbf{0.7683} &{0.6992} &\textbf{0.7276} &{0.7043}                      &\textbf{0.5404} &{0.7046} &{0.7233} \\\cline{2-9}
                              &NMI     &\textbf{0.9182} &0.8626   &\textbf{0.8919} &{0.8636}                      &{0.7819} &{0.8708} &\textbf{0.8726}  \\ \hline
  \multirow{2}{*}{Yale-B}     &AC      &\textbf{0.8480}         &{0.8219} &{0.8231} &{0.8289}                      &\textbf{0.8500} &{0.8318} &{0.8277} \\\cline{2-9}
                              &NMI     &0.8612         &0.8519   &0.8527        &{0.8534}                 &{0.8538} &\textbf{0.8593} &\textbf{0.8594}  \\ \hline

 % \multirow{2}{*}{MNIST}      &AC      &0.5621 &0.4922  &0.4948             &0.5784  &0.5754      &{0.6590} \\ \cline{2-8}
%                              &NMI     &0.5113 &0.4755  &0.5210             &0.6332  &0.5463      &{0.6709} \\ \hline

\end{tabular}
}
\label{table:more-results}
\end{table*}

\begin{figure*}[t]
\includegraphics[width=0.38\textwidth]{yaleb-sensitivity_lambda_acc.pdf}
\includegraphics[width=0.38\textwidth]{yaleb-sensitivity_lambda_nmi.pdf}
\includegraphics[width=0.2\textwidth]{yaleb-sensitivity_lambda_legend.pdf}
\caption{Accuracy (left) and NMI (right) with respect to different values of $\lambda$ on the Extended Yale-B data set}
\label{fig:accuracy-nmi-sensitivity}
\end{figure*}

\begin{figure*}[!htb]
    \centering % <-- added
\begin{subfigure}{0.33\textwidth}
  \includegraphics[width=\linewidth]{yaleb-noise-1.pdf}
  \caption{$\sigma^2=10$}
  \label{fig:yaleb-noisy-level-1}
\end{subfigure}\hfil % <-- added
\begin{subfigure}{0.33\textwidth}
  \includegraphics[width=\linewidth]{yaleb-noise-2.pdf}
 \caption{$\sigma^2=20$}
  \label{fig:yaleb-noisy-level-2}
\end{subfigure}\hfil % <-- added
\begin{subfigure}{0.33\textwidth}
  \includegraphics[width=\linewidth]{yaleb-noise-3.pdf}
  \caption{$\sigma^2=30$}
  \label{fig:yaleb-noisy-level-3}
\end{subfigure}

\medskip
\begin{subfigure}{0.33\textwidth}
  \includegraphics[width=\linewidth]{yaleb-noise-4.pdf}
  \caption{$\sigma^2=40$}
  \label{fig:yaleb-noisy-level-4}
\end{subfigure}\hfil % <-- added
\begin{subfigure}{0.33\textwidth}
  \includegraphics[width=\linewidth]{yaleb-noise-5.pdf}
  \caption{$\sigma^2=50$}
  \label{fig:yaleb-noisy-level-5}
\end{subfigure}\hfil % <-- added
\begin{subfigure}{0.33\textwidth}
  \includegraphics[width=\linewidth]{yaleb-noise-6.pdf}
  \caption{$\sigma^2=60$}
  \label{fig:yaleb-noisy-level-6}
\end{subfigure}
\caption{The SDP violation rate with respect to $\lambda$ for noisy $\ell^0$-SSC, Noisy-DR-$\ell^0$-SSC and Noisy-DR-$\ell^{0}$-SSC-CSP. The SDP violation rate for Noisy-DR-$\ell^0$-SSC and that for Noisy-DR-$\ell^{0}$-SSC-CSP are the same, so their curves overlap each other.}
\label{fig:yaleb-noisy-level}
\end{figure*}


\section{Time Complexity of noisy $\ell^{0}$-SSC, Noisy-DR-$\ell^{0}$-SSC-LR, Noisy-DR-$\ell^{0}$-SSC-CSP}
\label{sec::time-complexity}
The time complexity of running PGD by Algorithm~\ref{alg:PGD-l0ssc} for noisy $\ell^{0}$-SSC is $\cO(Tnd)$, where $T$ is the maximum iteration number. The time complexity of running Algorithm~\ref{alg:noisy-dr-l0ssc-lr}
for Noisy-DR-$\ell^{0}$-SSC-LR is comprised of two parts. The first part is the time complexity of steps 1--3 with matrix multiplication and QR decomposition, which is $\cO(dp^2 + pdn)$. The second part is the time complexity of step 4, which is $\cO(Tnp)$. The overall time complexity of Noisy-DR-$\ell^{0}$-SSC-LR is $\cO(dp^2 + pdn + Tnp)$. In practice, $p$ is much smaller than $\min\set{d,n,T}$, so Noisy-DR-$\ell^{0}$-SSC-LR is more efficient than noisy $\ell^{0}$-SSC. Noisy-DR-$\ell^{0}$-SSC-CSP, whose time complexity is $\cO(pdn + Tnp)$, is even more efficient than both noisy $\ell^{0}$-SSC and Noisy-DR-$\ell^{0}$-SSC-LR. This is because the linear transformation $\bP$ obtained by CSP does not require QR decomposition.


\section{Proximal Gradient Descent (PGD) for Noisy $\ell^{0}$-SSC}
\label{sec::pgd-noisy-l0ssc}
Algorithm~\ref{alg:noisy-dr-l0ssc-lr} describes how to perform Noisy-DR-$\ell^{0}$-SSC-LR for data clustering. Note that Noisy-DR-$\ell^{0}$-SSC performs noisy $\ell^{0}$-SSC on the dimensionality reduced data $\tilde \bX$. Proximal Gradient Descent (PGD) is employed to optimize the objective function of noisy $\ell^{0}$-SSC for every data point $\bx_i$, which is described in Algorithm~\ref{alg:PGD-l0ssc}. In the $k$-th iteration of PGD for problem (\ref{eq:noisy-l0ssc-i}), the variable $\bbeta$ is updated according to
\bals%\label{eq:pgd}
        &{\bbeta}^{(k+1)} = T_{\sqrt{{2\lambda s}}}({\bbeta}^{(k)} - s \nabla g(\bbeta^{(k)})),
\eals%%
where $s$ is a positive step size, $g(\bbeta) = \|\bx_i - \bX {\bbeta}\|_2^2$, and $T_{\theta}$ is an element-wise hard thresholding operator:
\begin{align*}
        [T_{\theta}(\bu)]_j=
        \left\{
        \begin{array}
                {r@{\quad:\quad}l}
                0 & {|\bu_j| \le \theta } \\
                {\bu_j} & {\rm otherwise}
        \end{array}
        \right., \quad 1 \le j \le n.
\end{align*}%
It is proved in~\citet{Yang2017-PGD-l0-sparse-approximation} that the sequence $\{{\bbeta}^{(k)}\}$ generated by PGD converges to a critical point of (\ref{eq:noisy-l0ssc-i}).

\begin{algorithm}[!ht]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand\algorithmicensure {\textbf{Output:} }
\small
\caption{Proximal Gradient Descent (PGD) for noisy $\ell^0$-SSC problem (\ref{eq:noisy-l0ssc-i})}
\label{alg:PGD-l0ssc}
%\allowdisplaybreaks
\begin{algorithmic}[1]
\REQUIRE ~~\\
The initialization ${\bbeta}^{(0)}$, step size $s >0$, parameter $\lambda$, maximum iteration number $T$, stopping threshold $\varepsilon$.
%$\lambda_{\ell^{1}}$ for the initialization of the the A$\ell^{0}$-SSC,
%maximum iteration number $M$, stopping threshold $\varepsilon$\\
\FOR{$1 \le t \le T$}
\STATE{$\tilde \bbeta^{(t)} = {\bbeta}^{(t-1)} - s \nabla g({\bbeta}^{(t-1)})$}
\STATE{${\bbeta}^{(t)} = T_{\sqrt{{2\lambda s}}}(\tilde \bbeta^{(t)})$}
\IF{$|L(\bbeta^{(t)})-L(\bbeta^{(t-1)})| < \varepsilon$}
%\PRINT
\STATE \textbf{break}
\ENDIF
\ENDFOR
%\WHILE{$t \le M$}
%\STATE{Obtain ${\bbeta}^{(t)}$ from ${\bbeta}^{(t-1)}$ by (\ref{eq:l0ssc-lasso-proximal-step1}) and (\ref{eq:l0ssc-lasso-proximal-step2})}
%\IF{$|L(\bbeta^{(t)})-L(\bbeta^{(t-1)})| < \varepsilon$}
%%\PRINT
%\STATE \textbf{break}
%\ELSE
%\STATE{$t=t+1$.}
%\ENDIF
%%and in each step of coordinate descent use the feature-sign search algorithm to solve the optimization problem (\ref{eq:objfunci}).
%\ENDWHILE

\ENSURE $\hat \bbeta$ which is the suboptimal solution to (\ref{eq:noisy-l0ssc-i})
\end{algorithmic}
\end{algorithm}

\section{Additional Experimental Results}
\label{sec::additional-experiments}
We present more results of Noisy-DR-$\ell^{0}$-SSC-LR and Noisy-DR-$\ell^{0}$-SSC-CSP in Table~\ref{table:more-results} with different projection dimensions $p$. Figure~\ref{fig:accuracy-nmi-sensitivity} shows how the accuracy and NMI vary with respect to $\lambda$ on the Extended Yale-B data set.

%\begin{figure*}[!hbt]
%\includegraphics[width=0.38\textwidth]{yaleb-sensitivity_lambda_acc.pdf}
%\includegraphics[width=0.38\textwidth]{yaleb-sensitivity_lambda_nmi.pdf}
%\includegraphics[width=0.2\textwidth]{yaleb-sensitivity_lambda_legend.pdf}
%\caption{Accuracy (left) and NMI (right) with respect to different values of $\lambda$ on the Extended Yale-B data set}
%\label{fig:accuracy-nmi-sensitivity}
%\end{figure*}


Figure~\ref{fig:yaleb-noisy-level-1} to Figure~\ref{fig:yaleb-noisy-level-6} illustrate the SDP violation rate with respect to $\lambda$ for different noise levels, justifying our theoretical finding that a large $\lambda$ tends to preserve the subspace detection property for noisy $\ell^{0}$-SSC, Noisy-DR-$\ell^{0}$-SSC-LR and Noisy-DR-$\ell^{0}$-SSC-CSP.

%\section{Proof of Theorem~\ref{theorem::optimal-rp}}
%\begin{proof}[\textbf{Proof of Theorem~\ref{theorem::optimal-rp}}]
%Let $\bY = \tilde \bX$, it can be verified that%By the proof of Lemma~\ref{lemma::PGD-convergence}, we have
%\begin{align*}
%&\| 2{{{\bY}^{(-i)}}^{\top}}{{\bY}^{(-i)}}{{\tilde \bZ}_{-i}^i} - 2{{{\bY}^{(-i)}}^{\top}}\bx_i  + {\tilde \partial \bR}(\tilde {\bZ^i})\|_2  = 0.
%\end{align*}
%It follows that
%\bal\label{eq:optimal-rp-seg1}
%&\| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}}{{\tilde \bZ}_{-i}^i} -2{{{\bX}^{(-i)}}^{\top}}\bx_i  + {\tilde \partial \bR}(\tilde {\bZ^i})\|_2  \nonumber \\
%&= \| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}}{{\tilde \bZ}_{-i}^i} - 2{{{\bY}^{(-i)}}^{\top}}{{\bY}^{(-i)}}{{\tilde \bZ}_{-i}^i}
%+2{{{\bY}^{(-i)}}^{\top}}{{\bY}^{(-i)}}{{\tilde \bZ}_{-i}^i} \nonumber \\
%&\quad -2{{{\bX}^{(-i)}}^{\top}}\bx_i
%+ 2{{{\bY}^{(-i)}}^{\top}}\bx_i - 2{{{\bY}^{(-i)}}^{\top}}\bx_i
%+ {\tilde \partial \bR}(\tilde {\bZ^i})\|_2  \nonumber \\
%& \le \| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}}{{\tilde \bZ}_{-i}^i} - 2{{{\bY}^{(-i)}}^{\top}}{{\bY}^{(-i)}}{{\tilde \bZ}_{-i}^i}\|_2 \nonumber \\
%&\quad + \|2{{{\bX}^{(-i)}}^{\top}}\bx_i - 2{{{\bY}^{(-i)}}^{\top}}\bx_i\|_2 \nonumber \\
%&\quad + \| 2{{{\bY}^{(-i)}}^{\top}}{{\bY}^{(-i)}}{{\tilde \bZ}_{-i}^i} -2{{{\bY}^{(-i)}}^{\top}}\bx_i + {\tilde \partial \bR}(\tilde {\bZ^i})\|_2 \nonumber \\
%&=\| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}}{{\tilde \bZ}_{-i}^i} - 2{{{\bY}^{(-i)}}^{\top}}{{\bY}^{(-i)}}{{\tilde \bZ}_{-i}^i}\|_2 \nonumber \\
%&\quad + \|2{{{\bX}^{(-i)}}^{\top}}\bx_i - 2{{{\bY}^{(-i)}}^{\top}}\bx_i\|_2 \nonumber \\
%&\le 2\|{{{\bX}^{(-i)}}^{\top}} ({{\bX}^{(-i)}}- {{\bY}^{(-i)}}) {{\tilde \bZ}_{-i}^i}\|_2 \nonumber \\
%&\quad + 2\| ({{\bX}^{(-i)}}-{{\bY}^{(-i)}})^{\top} {{\bY}^{(-i)}} {{\tilde \bZ}_{-i}^i}\|_2
%+ 2\|{{{\bX}^{(-i)}}^{\top}}\bx_i - {{{\bY}^{(-i)}}^{\top}}\bx_i\|_2.
%\eal%
%By ${\tilde L}(\bZ^i) \le {\tilde L}({\bZ^i}^{(0)})$, we have $\|{{\tilde \bZ}_{-i}^i}\|_2 \le M$. By Proposition~\ref{proposition::random-matrix-decomposition}, with probability at least $1-6e^{-p}$, $\|\bX - \bY\|_2 \le C_{k,k_0}$. It follows from (\ref{eq:optimal-rp-seg1}) that
%\begin{align*}
%&\| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}}{{\tilde \bZ}_{-i}^i} -2{{{\bX}^{(-i)}}^{\top}}\bx_i + {\tilde \partial \bR}(\tilde {\bZ^i})\|_2 \nonumber \\
%& \le 2\sigma_{\max}(\bX) C_{k,k_0}M + 2C_{k,k_0} (\sigma_{\max}(\bX) + C_{k,k_0})M + 2C_{k,k_0}\|\bx_i\|_2\nonumber \\
%&= 2C_{k,k_0}M (2\sigma_{\max}(\bX) + C_{k,k_0}) + 2C_{k,k_0}\|\bx_i\|_2.
%\end{align*}
%It can also be verified that %Also, by the proof of Lemma~\ref{lemma::PGD-convergence},
%\begin{align*}
%&\| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}} {{\bZ^i}_{-i}^*} -2{{{\bX}^{(-i)}}^{\top}}\bx_i + {\tilde \partial \bR}({{\bZ^*}^i})\|_2  = 0.
%\end{align*}
%
%Let $\Delta = {{\bZ^i}_{-i}^*} - {{\tilde \bZ}_{-i}^i}$, $\tilde \Delta = {\tilde \partial \bR}({\bZ^*}^i) - {\tilde \partial \bR}(\tilde {\bZ^i})$,
%\begin{align*}
%&\| 2{{{\bX}^{(-i)}}^{\top}}{{\bX}^{(-i)}} \Delta  + \tilde \Delta\|_2  \le 2C_{k,k_0}M (2\sigma_{\max}(\bX) + C_{k,k_0}) \nonumber \\
%&+ 2C_{k,k_0}\|\bx_i\|_2.
%\end{align*}
%Now following Theorem~\ref{theorem::suboptimal-optimal}, we have
%\bal\label{eq:optimal-rp-seg2}
%&\|{\bZ^*}^i-{\tilde \bZ}^i\|_2 = \|\Delta\|_2 \nonumber \\
%&\le \frac{1}{2\tau_0^2-\tau}\bigg(\big(\sum\limits_{j \in {\bG_i} \cap \tilde \bS_i} (\max\{0,\frac{\lambda}{b} - {\kappa} |{\tilde \bZ}_j^i - b| \})^2 + \nonumber \\
%&\sum\limits_{j \in {\bG_i} \setminus \tilde \bS_i} (\max\{0, \frac{\lambda}{b} - {\kappa} b\})^2 \big)^{\frac{1}{2}} \nonumber \\
%&+ 2C_{k,k_0}M (2\sigma_{\max}(\bX) + C_{k,k_0}) + 2C_{k,k_0}\|\bx_i\|_2 \bigg)
%\eal%
%\end{proof}

%\appendix
%% NOTE: necessary when ptmx or no mathfont class option is given
%\providecommand{\upGamma}{\Gamma}
%\providecommand{\uppi}{\pi}
%\section{Math font exposition}
%How math looks in equations is important:
%\begin{equation*}
%  F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
%\end{equation*}
%However, one should not ignore how well math mixes with text:
%The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
%It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


%\newpage%\clearpage

\clearpage

\balance

\bibliography{Yang_517}

\end{document}
