\documentclass[accepted]{uai2023} % for initial submission

% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[american]{babel}


\usepackage{algcompatible}
\usepackage{algorithm}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{bbm}
\usepackage{amsthm}

\usepackage{pdfrender}

\usepackage{bbm}
\usepackage{amsthm}

\usepackage{thmtools} 
\usepackage{thm-restate}

\usepackage{makecell}

\usepackage{amssymb,amsmath,balance}

\usepackage{amsfonts}
\usepackage{arydshln}

\usepackage{mathtools} 
\usepackage{nicefrac}

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\RequirePackage{mathrsfs}
\RequirePackage{mdframed}

\def\ii{\mathbbm{j}}
\def\hsamp#1#2{\hvarphi^{(#1)}_{#2}}
\def\tsamp#1#2{\tvarphi^{(#1)}_{#2}}
\def\samp#1#2{\varphi^{(#1)}_{#2}}
\def\hftr#1#2{\wh{F}^{\textup{tr}}(\wh{X}^{(#1)}_{#2})}
\def\ftr#1#2{\wh{F}^{\textup{tr}}({X}^{(#1)}_{#2})}

\def\covmatparam{\bSigma^{\natural}}
\def\covmatnonparam{\bSigma^{\natural}}
\def\precmatparam{\bTheta^{\natural}}
\def\precmatnonparam{\bTheta^{\natural}}
\newcommand{\red}[1]{\textcolor{red}{#1}}



% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2020} with \usepackage[nohyperref]{icml2020} above.
\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

\usepackage{enumitem}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}


% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}


\allowdisplaybreaks


% Typeset #1 inside a circle at \small size; used for the step markers
% \cirone, \cirtwo, ... (e.g. \stackrel{\cirone}{\leq}).
% \small is a declaration, not a one-argument command: writing `\small{...}`
% leaves the size switch unscoped and, when the marker is used inside $...$,
% raises "Command \small invalid in math mode". Wrapping everything in \mbox
% scopes the size change and moves it into text mode.
\newcommand{\circled}[1]{\mbox{\small\raisebox{.6pt}{\textcircled{\raisebox{-.8pt}{#1}}}}}

\def\cirone{\circled{1}}
\def\cirtwo{\circled{2}}
\def\cirthree{\circled{3}}
\def\cirfour{\circled{4}}
\def\cirfive{\circled{5}}
\def\cirsix{\circled{6}}
\def\cirseven{\circled{7}}
\def\cireight{\circled{8}}
\def\cirnine{\circled{9}}
\def\cira{\circled{A}}
\def\cirb{\circled{B}}
\def\circc{\circled{C}}
\def\cird{\circled{D}}
\def\cire{\circled{E}}
\def\cirf{\circled{F}}
\def\cirg{\circled{G}}
\def\cirh{\circled{H}}
\def\ciri{\circled{I}}
\def\cirj{\circled{J}}
\def\cirk{\circled{K}}
\def\cirl{\circled{L}}
\def\cirm{\circled{M}}
\def\cirn{\circled{N}}
\def\ciro{\circled{O}}
\def\cirp{\circled{P}}
\def\cirq{\circled{Q}}
\def\cirr{\circled{R}}
\def\cirs{\circled{S}}
\def\cirt{\circled{T}}
\def\ciru{\circled{U}}
\def\cirv{\circled{V}}
\def\cirw{\circled{W}}
\def\cirx{\circled{X}}
\def\ciry{\circled{Y}}
\def\cirz{\circled{Z}}


% Heavier variant of \checkmark: \textpdfrender (pdfrender package) both fills
% and strokes the glyph outline, here with a .5pt stroke.
\newcommand*{\boldcheckmark}{%
  \textpdfrender{
    TextRenderingMode=FillStroke,
    LineWidth=.5pt, % half of the line width is outside the normal glyph
  }{\checkmark}%
}



% =====================================
% Numbering 
% =====================================
%\setcounter{section}{0}
%\def\thesection{\arabic{section}}
%\setcounter{page}{1}
%
%\renewcommand{\thefootnote}{\fnsymbol{footnote}}

% ====================================
% Formula Environment
% ====================================
%\renewcommand{\theequation}{\thesection.\arabic{equation}}
%\numberwithin{equation}{section}
%\numberwithin{equation}{section}

% Redefine the display-math delimiters: everything between \[ and \] is grabbed
% as a delimited macro argument and typeset inside align*, so alignment points
% (&) and line breaks (\\) can be used directly within \[ ... \].
\def\[#1\]{\begin{align*}#1\end{align*}}


% ========================
% Letter
% ========================
\renewcommand{\hat}{\widehat}


\newcommand{\bfm}[1]{\ensuremath{\boldsymbol{#1}}}
\newcommand{\bfmcap}[1]{\ensuremath{\mathbf{#1}}}
\newcommand{\bfsym}[1]{\ensuremath{\boldsymbol{#1}}}


% Bold digit shorthands.
\def\bone{\bfm 1}
% NOTE(review): despite its name, \btwo expands to a bold 0 (the same as \bZero) -- confirm this is intended.
\def\btwo{\bfm 0}
%\def\ba{\bfm a}   \def\bA{\bfm A}  \def\AA{\mathbb{A}}
%\def\bb{\bfm b}   \def\bB{\bfm B}  \def\BB{\mathbb{B}}
%\def\bc{\bfm c}   \def\bC{\bfm C}  \def\CC{\mathbb{C}}
%\def\bd{\bfm d}   \def\bD{\bfm D}  \def\DD{\mathbb{D}}
%\def\be{\bfm e}   \def\bE{\bfm E}  \def\EE{\mathbb{E}}
%\def\bff{\bfm f}  \def\bF{\bfm F}  \def\FF{\mathbb{F}}
%\def\bg{\bfm g}   \def\bG{\bfm G}  \def\GG{\mathbb{G}}
%\def\bh{\bfm h}   \def\bH{\bfm H}  \def\HH{\mathbb{H}}
%\def\bi{\bfm i}   \def\bI{\bfm I}  \def\II{\mathbb{I}}
%\def\bj{\bfm j}   \def\bJ{\bfm J}  \def\JJ{\mathbb{J}}
%\def\bk{\bfm k}   \def\bK{\bfm K}  \def\KK{\mathbb{K}}
%\def\bl{\bfm l}   \def\bL{\bfm L}  \def\LL{\mathbb{L}}
%\def\bm{\bfm m}   \def\bM{\bfm M}  \def\MM{\mathbb{M}}
%\def\bn{\bfm n}   \def\bN{\bfm N}  \def\NN{\mathbb{N}}
%\def\bo{\bfm o}   \def\bO{\bfm O}  \def\OO{\mathbb{O}}
%\def\bp{\bfm p}   \def\bP{\bfm P}  \def\PP{\mathbb{P}}
%\def\bq{\bfm q}   \def\bQ{\bfm Q}  \def\QQ{\mathbb{Q}}
%\def\brr{\bfm r}  \def\bR{\bfm R} \def\RR{\mathbb{R}}
%\def\bs{\bfm s}   \def\bS{\bfm S}  \def\SS{\mathbb{S}}
%\def\bt{\bfm t}   \def\bT{\bfm T}  \def\TT{\mathbb{T}}
%\def\bu{\bfm u}   \def\bU{\bfm U}  \def\UU{\mathbb{U}}
%\def\bv{\bfm v}   \def\bV{\bfm V}  \def\VV{\mathbb{V}}
%\def\bw{\bfm w}   \def\bW{\bfm W}  \def\WW{\mathbb{W}}
%\def\bx{\bfm x}   \def\bX{\bfm X}  \def\XX{\mathbb{X}}
%\def\by{\bfm y}   \def\bY{\bfm Y}  \def\YY{\mathbb{Y}}
%\def\bz{\bfm z}   \def\bZ{\bfm Z}  \def\ZZ{\mathbb{Z}}

\def\ba{\bfm a}   \def\bA{\bfmcap A}  \def\AA{\mathbb{A}}
\def\bb{\bfm b}   \def\bB{\bfmcap B}  \def\BB{\mathbb{B}}
\def\bc{\bfm c}   \def\bC{\bfmcap C}  \def\CC{\mathbb{C}}
\def\bd{\bfm d}   \def\bD{\bfmcap D}  \def\DD{\mathbb{D}}
\def\be{\bfm e}   \def\bE{\bfmcap E}  \def\EE{\mathbb{E}}
\def\bff{\bfm f}  \def\bF{\bfmcap F}  \def\FF{\mathbb{F}}
\def\bg{\bfm g}   \def\bG{\bfmcap G}  \def\GG{\mathbb{G}}
\def\bh{\bfm h}   \def\bH{\bfmcap H}  \def\HH{\mathbb{H}}
\def\bi{\bfm i}   \def\bI{\bfmcap I}  \def\II{\mathbb{I}}
\def\bj{\bfm j}   \def\bJ{\bfmcap J}  \def\JJ{\mathbb{J}}
\def\bk{\bfm k}   \def\bK{\bfmcap K}  \def\KK{\mathbb{K}}
\def\bl{\bfm l}   \def\bL{\bfmcap L}  \def\LL{\mathbb{L}}
\def\bm{\bfm m}   \def\bM{\bfmcap M}  \def\MM{\mathbb{M}}
\def\bn{\bfm n}   \def\bN{\bfmcap N}  \def\NN{\mathbb{N}}
\def\bo{\bfm o}   \def\bO{\bfmcap O}  \def\OO{\mathbb{O}}
\def\bp{\bfm p}   \def\bP{\bfmcap P}  \def\PP{\mathbb{P}}
\def\bq{\bfm q}   \def\bQ{\bfmcap Q}  \def\QQ{\mathbb{Q}}
\def\brr{\bfm r}  \def\bR{\bfmcap R} \def\RR{\mathbb{R}}
\def\bs{\bfm s}   \def\bS{\bfmcap S}  \def\SS{\mathbb{S}}
\def\bt{\bfm t}   \def\bT{\bfmcap T}  \def\TT{\mathbb{T}}
\def\bu{\bfm u}   \def\bU{\bfmcap U}  \def\UU{\mathbb{U}}
\def\bv{\bfm v}   \def\bV{\bfmcap V}  \def\VV{\mathbb{V}}
\def\bw{\bfm w}   \def\bW{\bfmcap W}  \def\WW{\mathbb{W}}
\def\bx{\bfm x}   \def\bX{\bfmcap X}  \def\XX{\mathbb{X}}
\def\by{\bfm y}   \def\bY{\bfmcap Y}  \def\YY{\mathbb{Y}}
\def\bz{\bfm z}   \def\bZ{\bfmcap Z}  \def\ZZ{\mathbb{Z}}


\def\ha{\wh{a}}   \def\hba{\wh{\bfm a}} 
\def\hA{\wh{A}}   \def\hbA{\wh{\bfm A}}

\def\hb{\wh{b}}   \def\hbb{\wh{\bfm b}} 
\def\hB{\wh{B}}   \def\hbB{\wh{\bfm B}}
   
\def\hc{\wh{c}}   \def\hbc{\wh{\bfm c}} 
\def\hC{\wh{C}}   \def\hbC{\wh{\bfm C}}
  
\def\hd{\wh{d}}   \def\hbd{\wh{\bfm d}} 
\def\hD{\wh{D}}   \def\hbD{\wh{\bfm D}}
  
\def\he{\wh{e}}   \def\hbe{\wh{\bfm e}} 
\def\hE{\wh{E}}   \def\hbE{\wh{\bfm E}}
 
\def\hff{\wh{f}}  \def\hbff{\wh{\bfm f}} 
\def\hFF{\wh{F}}  \def\hbFF{\wh{\bfm F}}
 
\def\hg{\wh{g}}   \def\hbg{\wh{\bfm g}} 
\def\hG{\wh{G}}   \def\hbG{\wh{\bfm G}}
 
\def\hh{\wh{h}}   \def\hbh{\wh{\bfm h}} 
\def\hH{\wh{H}}   \def\hbH{\wh{\bfm H}}

\def\hii{\wh{i}}   \def\hbii{\wh{\bfm i}} 
\def\hII{\wh{I}}   \def\hbII{\wh{\bfm I}}

\def\hj{\wh{j}}   \def\hbj{\wh{\bfm j}} 
\def\hJ{\wh{J}}   \def\hbJ{\wh{\bfm J}}
%
\def\hk{\wh{k}}   \def\hbk{\wh{\bfm k}} 
\def\hK{\wh{K}}   \def\hbK{\wh{\bfm K}}

\def\hll{\wh{l}}   \def\hbll{\wh{\bfm l}} 
\def\hLL{\wh{L}}   \def\hbLL{\wh{\bfm L}}

\def\hm{\wh{m}}   \def\hbm{\wh{\bfm m}} 
\def\hM{\wh{M}}   \def\hbM{\wh{\bfm M}}

\def\hn{\wh{n}}   \def\hbn{\wh{\bfm n}} 
\def\hN{\wh{N}}   \def\hbN{\wh{\bfm N}}

\def\ho{\wh{o}}   \def\hbo{\wh{\bfm o}} 
\def\hO{\wh{O}}   \def\hbO{\wh{\bfm O}}

\def\hp{\wh{p}}   \def\hbp{\wh{\bfm p}} 
\def\hP{\wh{P}}   \def\hbP{\wh{\bfm P}}
 
\def\hq{\wh{q}}   \def\hbq{\wh{\bfm q}} 
\def\hQ{\wh{Q}}   \def\hbQ{\wh{\bfm Q}}

\def\hrr{\wh{r}}   \def\hbrr{\wh{\bfm r}} 
\def\hRR{\wh{R}}   \def\hbRR{\wh{\bfm R}}
 
\def\hs{\wh{s}}   \def\hbs{\wh{\bfm s}} 
\def\hS{\wh{S}}   \def\hbS{\wh{\bfm S}}
  
% Hatted t / T, using doubled-letter names like \hff/\hFF and \hrr/\hRR.
\def\htt{\wh{t}}   \def\hbtt{\wh{\bfm t}}
\def\hTT{\wh{T}}   \def\hbTT{\wh{\bfm T}}
% Backward-compatible alias: the bold capital was originally (mis)named \hbtT.
\def\hbtT{\hbTT}
    
\def\hu{\wh{u}}   \def\hbu{\wh{\bfm u}} 
\def\hU{\wh{U}}   \def\hbU{\wh{\bfm U}}
 
\def\hv{\wh{v}}   \def\hbv{\wh{\bfm v}} 
\def\hV{\wh{V}}   \def\hbV{\wh{\bfm V}}

\def\hw{\wh{w}}   \def\hbw{\wh{\bfm w}} 
\def\hW{\wh{W}}   \def\hbW{\wh{\bfm W}}
 
\def\hx{\wh{x}}   \def\hbx{\wh{\bfm x}} 
\def\hX{\wh{X}}   \def\hbX{\wh{\bfm X}}

\def\hy{\wh{y}}   \def\hby{\wh{\bfm y}} 
\def\hY{\wh{Y}}   \def\hbY{\wh{\bfm Y}}
 
\def\hz{\wh{z}}   \def\hbz{\wh{\bfm z}} 
\def\hZ{\wh{Z}}   \def\hbZ{\wh{\bfm Z}}  


\def\calA{{\cal  A}} \def\cA{{\cal  A}}
\def\calB{{\cal  B}} \def\cB{{\cal  B}}
\def\calC{{\cal  C}} \def\cC{{\cal  C}}
\def\calD{{\cal  D}} \def\cD{{\cal  D}}
\def\calE{{\cal  E}} \def\cE{{\cal  E}}
\def\calF{{\cal  F}} \def\cF{{\cal  F}}
\def\calG{{\cal  G}} \def\cG{{\cal  G}}
\def\calH{{\cal  H}} \def\cH{{\cal  H}}
\def\calI{{\cal  I}} \def\cI{{\cal  I}}
\def\calJ{{\cal  J}} \def\cJ{{\cal  J}}
\def\calK{{\cal  K}} \def\cK{{\cal  K}}
\def\calL{{\cal  L}} \def\cL{{\cal  L}}
\def\calM{{\cal  M}} \def\cM{{\cal  M}}
\def\calN{{\cal  N}} \def\cN{{\cal  N}}
\def\calO{{\cal  O}} \def\cO{{\cal  O}}
\def\calP{{\cal  P}} \def\cP{{\cal  P}}
\def\calQ{{\cal  Q}} \def\cQ{{\cal  Q}}
\def\calR{{\cal  R}} \def\cR{{\cal  R}}
\def\calS{{\cal  S}} \def\cS{{\cal  S}}
\def\calT{{\cal  T}} \def\cT{{\cal  T}}
\def\calU{{\cal  U}} \def\cU{{\cal  U}}
\def\calV{{\cal  V}} \def\cV{{\cal  V}}
\def\calW{{\cal  W}} \def\cW{{\cal  W}}
\def\calX{{\cal  X}} \def\cX{{\cal  X}}
\def\calY{{\cal  Y}} \def\cY{{\cal  Y}}
\def\calZ{{\cal  Z}} \def\cZ{{\cal  Z}}
\def\bZero{\bfm 0}
\def\bOne{\bfm 1}

% =========================
% Greek Letters
% =========================
\def\balpha{\bfsym \alpha}
\def\halpha{\hat{\alpha}}              
\def\hbalpha{\hat{\bfsym \alpha}}

\def\bbeta{\bfsym \beta}
\def\hbeta{\hat{\beta}}                 
\def\hbbeta{\hat{\bfsym \beta}}

\def\bgamma{\bfsym \gamma}             
\def\hgamma{\hat{\gamma}}              
\def\hbgamma{\hat{\bfsym \gamma}}

\def\bGamma{\bfsym \Gamma}
\def\hGamma{\hat{ \Gamma}}             
\def\hbGamma{\hat{\bfsym \Gamma}}

\def\bdelta{\bfsym {\delta}}
\def\hdelta{\hat{\delta}}              
\def\hbdelta{\hat{\bfsym {\delta}}}
          
\def\bDelta {\bfsym {\Delta}}
\def\hDelta {\hat{\Delta}}             
\def\hbDelta{\hat{\bfsym {\Delta}}}

% Bold / hatted \eta.
\def\bfeta{\bfsym {\eta}}
\def\heta{\hat {\eta}}
\def\hbfeta {\hat{\bfsym {\eta}}}

% Standard LaTeX has no \Eta (capital eta looks like a Latin H). Provide a
% fallback, unless a loaded package already defines it, so that \bfEta does not
% stop with "Undefined control sequence" when it is used.
\providecommand{\Eta}{\mathrm{H}}
\def\bfEta {\bfsym {\Eta}}

\def\bkappa{\bfsym \kappa}

% Bold / hatted \mu.
\def\bmu{\bfsym {\mu}}
\def\hmu{\hat{\mu}}
\def\hbmu {\hat{\bfsym {\mu}}}

% Standard LaTeX has no \Mu (capital mu looks like a Latin M). Provide a
% fallback, unless a loaded package already defines it, so that \bMu does not
% stop with "Undefined control sequence" when it is used.
\providecommand{\Mu}{\mathrm{M}}
\def\bMu {\bfsym {\Mu}}

\def\bnu{\bfsym {\nu}}
\def\hnu{\hat{\nu}}                    
\def\hbnu {\hat{\bfsym {\nu}}}

\def\bpi{\bfsym {\pi}}
\def\bPi{\bfsym {\Pi}}

% Bold / hatted / tilded \phi and \varphi.
% The bold variants previously called the undefined macro \bysym (a typo for
% \bfsym), so any use of \bphi or \bvarphi failed with "Undefined control sequence".
\def\bphi{\bfsym{\phi}}
\def\hphi{\hat{\phi}}
\def\hbphi{\hat{\bfsym{\phi}}}

\def\bvarphi{\bfsym{\varphi}}
\def\hvarphi{\hat{\varphi}}
\def\tvarphi{\wt{\varphi}}
\def\hbvarphi{\hat{\bfsym{\varphi}}}


\def\btheta{\bfsym {\theta}} 
\def\htheta{\hat {\theta}}             
\def\hbtheta {\hat{\bfsym {\theta}}}
          
\def\bTheta {\bfsym {\Theta}}
\def\hTheta{\hat {\Theta}}             
\def\hbTheta {\hat{\bfsym {\Theta}}}


\def\bpsi{\bfsym{\psi}}
\def\hpsi{\hat{\psi}}
\def\hbpsi{\hat{\bfsym {\psi}}}

% Bold / hatted \Psi, named like \bTheta, \bSigma, ...
\def\bPsi{\bfsym{\Psi}}
% Backward-compatible alias: the bold capital was originally (mis)named \bPsid.
\def\bPsid{\bPsi}
\def\hPsi{\hat{\Psi}}
\def\hbPsi{\hat{\bfsym{\Psi}}}

% Bold (and hatted bold) \epsilon / \varepsilon.
% Each bold macro used to be defined twice in a row with identical bodies; the
% redundant copies are dropped.
\def\beps{\bfsym \epsilon}
\def\hbeps{\hat{\bfsym \epsilon}}

\def\bvareps{\bfsym \varepsilon}
\def\hbvareps{\hat{\bfsym \varepsilon}}

\def\bsigma{\bfsym \sigma} 
\def\hsigma{\hat{\sigma}}              
\def\hbsigma{\hat{\bfsym \sigma}}
            
\def\bSigma{\bfsym \Sigma}
\def\hSigma{\hat{\Sigma}}              
\def\hbSigma{\hat{\bfsym \Sigma}}

\def\blambda {\bfsym {\lambda}} 
\def\hlambda{\hat{\lambda}}            
\def\hblambda{\hat{\bfsym \lambda}}
       
\def\bLambda {\bfsym {\Lambda}}
\def\hLambda{\hat{\Lambda}}            
\def\hbLambda{\hat{\bfsym \Lambda}}

\def\bomega {\bfsym {\omega}} 
\def\homega {\hat {\omega}}            
\def\hbomega {\hat{\bfsym {\omega}}}

\def\bOmega {\bfsym {\Omega}}
\def\hOmega {\hat {\Omega}}            
\def\hbOmega {\hat{\bfsym {\Omega}}}

\def\brho   {\bfsym {\rho}}
\def\hrho   {\hat {\rho}}             
\def\hbrho {\hat{\bfsym {\rho}}}
\def\bvarrho {\bfsym {\varrho}}
\def\hvarrho   {\hat {\varrho}}             
\def\hbvarrho {\hat{\bfsym {\varrho}}}


\def\btau{\bfsym {\tau}} 
\def\htau   {\hat {\tau}}               
\def\hbtau {\hat{\bfsym {\tau}}}

\def\bxi{\bfsym {\xi}}
\def\hxi{\hat{\xi}}                   
\def\hbxi{\hat{\bfsym {\xi}}}

\def\bXi{\bfsym {\Xi}}
\def\hXi{\hat{\Xi}}                   
\def\hbXi{\hat{\bfsym {\Xi}}}

% Bold / hatted \zeta.
\def\bzeta{\bfsym {\zeta}}
\def\hzeta{\hat{\zeta}}
\def\hbzeta{\hat{\bfsym {\zeta}}}

% Standard LaTeX has no \Zeta (capital zeta looks like a Latin Z). Provide a
% fallback, unless a loaded package already defines it, so that the three
% macros below do not stop with "Undefined control sequence" when they are used.
\providecommand{\Zeta}{\mathrm{Z}}
\def\bZeta{\bfsym {\Zeta}}
\def\hZeta{\hat{\Zeta}}
\def\hbZeta{\hat{\bfsym {\Zeta}}}

% ================
% General
% ================
\def\andd{\mbox{and}}

\def\+#1{\mathcal{#1}}
\def\-#1{\textup{#1}}

\def\br#1{\overline{#1}}

\def\bracket#1{\left(#1\right)}
\def\Bracket#1{\left[#1\right]}

\def\const{\mbox{const.}\quad}

\def\defequal {\triangleq}

\def \lr {\left}
\def \rt {\right}
\def \lra {\Longrightarrow}
\def \lla {\Longleftarrow}
\def \era {\Longleftrightarrow}

\def\nfrac#1#2{\nicefrac{#1}{#2}}

\def\say{\mbox{(say)}}
\def\sgn{\mbox{sgn}}

\def\wh{\widehat}
\def\wt{\widetilde}

\newcommand{\vect}[1]{{\textup{vec}\bracket{#1}}}
\newcommand{\V}[1]{{\boldsymbol{#1}}}

% =================
% Algebra
% =================
\newcommand{\diag}{\textup{diag}}
\def\dim{\textup{dim}}

\def\eigmax{\gamma_{\textup{max}}}
\def\eigmin{\gamma_{\textup{min}}}

\def\mleq{\preccurlyeq}
\def\mgeq{\succcurlyeq}

\newcommand{\la}{\left \langle}
\newcommand{\ra}{\right \rangle}

\newcommand{\La}{\left\langle\kern-0.64ex\left\langle}
\newcommand{\Ra}{\right\rangle\kern-0.64ex\right\rangle}

\providecommand{\abs}[1]{\left\lvert#1\right\rvert}

    
\def\norm#1#2{{\left\|#1\right\|}_{#2}}

\def\Norm#1#2{{\left\vert\kern-0.4ex\left\vert\kern-0.4ex\left\vert #1 
    \right\vert\kern-0.4ex\right\vert\kern-0.4ex\right\vert}_{#2}}


\def\psitwonorm#1{\norm{#1}{\psi_2}}

\def\infnorm#1{\norm{#1}{\infty}}
\def\Infnorm#1{\Norm{#1}{\infty}}

\def\fnorm#1{\norm{#1}{\textup{F}}}
\def\Fnorm#1{\Norm{#1}{\textup{F}}}

\def\opnorm#1{\norm{#1}{\textup{OP}}}
\def\Opnorm#1{\Norm{#1}{\textup{OP}}}

\def\nucnorm#1{\Norm{#1}{\textup{nuc}}}
\def\offdiagnorm#1{\Norm{#1}{1, \textup{off}}}

\def \Null  {\textup{null}}
\def \Proj  {\mathbb{P}}

\newcommand{\rank}{\textup{rank}}
\def \range {\textup{range}}
\def \rmt   {\top}
\def \rmh   {\mathrm{H}}

\def \sigmax {\sigma_{\textup{max}}}
\def \sigmin {\sigma_{\textup{min}}}
\newcommand{\sign}{\textup{sign}}
\def \supp  {\textup{supp}}

%\newcommand{\trace}{\textup{tr}}
\newcommand{\trace}{\operatorname{Tr}}
\def \tracemgf {\br{\operatorname{Tr}}}

\def\svd#1{\mathsf{SVD}\bracket{#1}}

\def \Indmat {\bI\bd}


% =================
% Analysis 
% ================
\def \card#1{\textup{card}\bracket{#1}}
\def \Fourier {\mathcal{F}}
\def \dist    {\textup{dist}}
\def \interior {\textup{int}}
\def \real {\textup{Re}}
\def \imag {\textup{Im}}
\def \eps  {\epsilon}
\def \veps {\varepsilon}
\def \mps {\mapsto}

% =================
% Geometry
% =================

\def \Vol {\textup{Vol}}

\def\set#1{\left\{#1\right\}}

\def\ball#1#2#3{\BB^{#1}\left(#2; #3\right)}
\def\sphere#1#2#3{\SS^{#1}\left(#2; #3\right)}

\def\sinang#1#2{\sin\angle\left(#1; #2\right)}
\def\cosang#1#2{\cos\angle\left(#1; #2\right)}
\def\tanang#1#2{\tan\angle\left(#1; #2\right)}


% =================
% Optimization
% ================
\newcommand{\argmax}{\textup{argmax}}
\newcommand{\argmin}{\textup{argmin}}

\def \Ind {\mathbbm{1}}
%\def \Ind {\mathbbm{1}}

\def \St  {\textup{~s.t.~}}
\def \Epi {\textup{Epi}}
\def \dom {\textbf{dom}}
\def \prox {\mathsf{prox}}
\def \Prox {\mathsf{prox}}
\def \opt {\textup{opt}}

% =================
% Probability 
% =================

\def \Expc {\mathbb{E}}
\def \Cov  {\textup{Cov}}
\def \cov  {\textup{cov}}


\def \logdet {\log\det}
\def \normdist {\mathsf{N}}
\def \Bino {\textup{Binomial}}
%\def \Prob {\textup{Pr}}
%\def \prob {\textup{Pr}}
\def \Prob {\mathbb{P}}
\def \prob {\mathbb{P}}
\def \prior {\textup{prior}}

\def \Var  {\textup{Var}}
\def \var  {\textup{var}}
\def \mean {\textup{mean}}
\def \Mean {\textup{Mean}}
\def \median  {\textup{med}}
\def \Median {\textup{Med}}
\def \med  {\textbf{med}}
\def \Med  {\textbf{Med}}
\def \as   {\textup{a.s.}}
\def \Unif {\textup{Unif}}
\def \Bern {\textup{Bern}}
\def \Poi  {\textup{Poi}}
\def \Ent  {\textup{Ent}}
\def \LogMnt#1{\log \Expc e^{#1}}
\def \lsim {\lesssim} 
\def \gsim {\gtrsim}

%\def \lsim {\precsim}
%\def \gsim {\succsim}

\def\kl#1#2{\mathbb{KL}\left(#1||#2\right)}
\newcommand{\kull}[2]{\ensuremath{D_{\text{KL}}(#1 \| #2)}}
\def\dtv#1#2{\norm{{#1}-{#2}}{\textup{TV}}}

\def \iid {\stackrel{\textup{i.i.d}}{\sim}}
\def \samedist {\stackrel{(\mathsf{d})}{=}}

\def\dh{\mathsf{d_H}}
\def\dH{\mathsf{d_H}}


% =================
% Statistics 
% =================
\def \natu {\natural}
\def \vcap {\wedge}
\def \vcup {\vee}
\def \Me {\mathfrak{M}}
\def \energy {\textup{Energy}}
\def\OPrate#1{\OO_{\textup{P}}\left(#1\right)}
\def\oprate#1{o_{\textup{P}}\left(#1\right)}

\def \ch {\textup{ch}}
\def \sh {\textup{sh}}
\def \th {\textup{th}}
\def \bias {\textup{bias}}
\def \cconverg {\stackrel{\textup{c}}{\longrightarrow}}
\def \dconverg {\rightsquigarrow}
\def \pconverg {\stackrel{\textup{p}}{\longrightarrow}}
\def \asconverg {\stackrel{\textup{a.s.}}{\longrightarrow}}
\def \se {\mathsf{se}}
\def \mse {\mathsf{MSE}}
\def \mmse {\mathsf{MMSE}}
\def \mmo {\mathsf{MMO}}

% =======================
% Signal Processing
% =======================
\def \snr {\mathsf{SNR}}
\def\quant#1#2{\mathsf{Quant}_{#1}\bracket{#2}}

% =========================
% Online Learn
% =========================
\def \regret {\textup{Regret}}
\def \risk  {\textup{Risk}}

% =========================
% Neural network
% =========================

\def \relu {\textup{ReLU}}

% Proof-outline environment: runs the `proof` environment with the heading
% "Proof outline" after redefining \qedsymbol to be empty.
\newenvironment{proofoutline}
 {\renewcommand\qedsymbol{}\proof[Proof outline]}
 {\endproof}


\def \dh {\mathsf{d_H}}
\def \dH {\mathsf{d_H}}

\def \term {\mathsf{Term}}
\def\srank#1{\textup{srank}(#1)}

% Leave-one-out notation: \wtminus{B}{s} typesets \wt{B} with the subscript
% "\setminus (s)"; \sampminus{X}{s} is the same without the tilde.
\def\wtminus#1#2{\wt{#1}_{\setminus (#2)}}
\def\sampminus#1#2{#1_{\setminus (#2)}}
% Red-coloured inline markers built on \red.
\def\why{\textup{\red{WHY?}}}
\def\valtoset#1{\textup{\red{Value-to-set-#1}}}

% NOTE(review): this \def silently replaces the \textcolor-based \red that was
% declared with \newcommand earlier in the preamble -- confirm which one is wanted.
\def\red#1{{\color{red}#1}}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Greed is good: correspondence recovery for unlabeled linear regression}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors


\author{Hang Zhang, Ping Li\\
 Cognitive Computing Lab\\
Baidu Research\\
 10900 NE 8th St, Bellevue, WA 98004, USA \\
\texttt{\{zhanghanghitomi, pingli98\}@gmail.com}
}
  
  \begin{document}
  
  \onecolumn
\maketitle


\section{Notations}
\label{subsec:greed_proof_notations}
We start our discussion by 
defining $\wh{\bB}$ and $\wt{\bB}$ respectively as 
\begin{align*}
\wt{\bB} &= \bracket{n-h}^{-1}\bX^{\rmt}\bPi^{*}\bX\bB^{*}, \\
\wh{\bB} &= \bracket{n-h}^{-1}\bX^{\rmt}\bY = \wt{\bB} + \
\bracket{n-h}^{-1}\bX^{\rmt}\bW,
\end{align*}
where $h$ denotes the Hamming distance between the identity matrix $\bI$ and 
the ground truth selection matrix $\bPi^{*}$, 
i.e., $h = \dH(\bI, \bPi^{*})$. 


Here we modify the \emph{leave-one-out} 
trick, which was previously used by 
\citet{karoui2013asymptotic, el2013robust, karoui2018impact, chen2020noisy, sur2019likelihood}. 
First, we construct an independent copy $\bX_{s, :}^{'}$ for each row 
$\bX_{s, :}$ ($s$th row of the sensing matrix $\bX$). Building on these
independent copies, we construct the leave-one-out sample $\sampminus{\bX}{s}$
by replacing the $s$th row in the sensing matrix $\bX$ with its independent 
copy $\bX_{s, :}^{'}$. 
The leave-one-out counterparts 
$\{\wtminus{\bB}{s}\}_{s = 1}^n$ of $\wt{\bB}$ are then constructed as 
\[
\wtminus{\bB}{s} =  
(n-h)^{-1}\bigg( 
\sum_{\substack{k \neq s \\ \pi^{*}(k) \neq s}} \bX_{\pi(k), :}\bX_{k, :}^{\rmt}
+ \sum_{\substack{k = s \textup{ or} \\  \pi^{*}(k) = s}} {\bX}^{'}_{\pi(k), :}{\bX}^{'\rmt}_{k, :}\bigg)\bB^{*}.
\]
One can easily verify that $\wtminus{\bB}{s}$ is independent of 
the $s$th row $\bX_{s, :}$. 
Similarly, we construct the matrices 
$\{\wtminus{\bB}{s, t}\}_{1\leq s\neq t \leq n}$ as 
\[
\wtminus{\bB}{s, t} = 
(n-h)^{-1}\bigg( 
\sum_{\substack{k \neq s, t\\ \pi^{*}(k) \neq s,t }} \bX_{\pi(k), :}\bX_{k, :}^{\rmt}
+ \sum_{\substack{k = s \textup{ or } k = t \textup{ or} \\ \pi^{*}(k) =s
  \textup{ or }\pi^{*}(k) = t}} {\bX}^{'}_{\pi(k), :}{\bX}^{'\rmt}_{k, :}\bigg)\bB^{*},
\]
and verify the independence between $\wtminus{\bB}{s, t}$ and
the rows  
$\bX_{s, :}, \bX_{t, :}$. 

Moreover, we define the events $\calE_{i}$ as 
\[
\calE_1(\bM) &\defequal 
\set{\norm{\bM^{\rmt}\bX_{i, :}}{2} \lsim \sqrt{\log n} \Fnorm{\bM}
\textup{ and }\norm{\bM^{\rmt}\bX^{'}_{i, :}}{2} \lsim \sqrt{\log n} \Fnorm{\bM}~~\forall~1\leq i \leq n}; \\
\calE_{2, 1} & \defequal  
\set{\big\langle \bX_{i, :}, \bX_{j, :}^{'}\big\rangle \lsim \sqrt{p\log n},~~1\leq i, j \leq n}; \\
\calE_{2,2} & \defequal 
\set{\la \bX_{i, :}, \bX_{j, :}\ra \lsim \sqrt{p\log n}, ~~1\leq i \neq j \leq n}; \\ 
\calE_{2, 3} & \defequal 
\set{\big\langle \bX^{'}_{i, :}, \bX^{'}_{j, :}\big\rangle \lsim \sqrt{p\log n},~~1\leq i, j \leq n}; \\
\calE_2 &= \calE_{2,1}\bigcap \calE_{2, 2}\bigcap \calE_{2, 3};\\
\calE_3 &= \set{\norm{\bX_{s, :}}{2} \leq \sqrt{p\log n} \textup{ and }
\|\bX^{'}_{s, :}\|_{2} \leq \sqrt{p\log n},~~\forall~1\leq s \leq n }; \\
\calE_4 &= \set{
\Fnorm{\bX} \leq \sqrt{2np}~\textup{ and }\Fnorm{\sampminus{\bX}{s}} 
\leq \sqrt{2np},~~\forall~1\leq s \leq n}; \\
\calE_5 &= \set{
\norm{\bX\bX_{s, :}}{2} \lsim (\log n)\sqrt{np},
~~\forall~1\leq s \leq n }; \\
\calE_{6, 1} &= 
\set{\Fnorm{\bB^{*} - \wtminus{\bB}{s}} \lsim\frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}},~~\forall~1\leq s \leq n}; \\ 
\calE_{6, 2} &= 
\set{\Fnorm{\bB^{*} - \wtminus{\bB}{s, t}} \lsim \frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}},~~\forall~1\leq s \neq t \leq n}; \\
\calE_6 &= \calE_{6,1}\bigcap \calE_{6, 2}; \\
\calE_7 &= \
\set{\norm{ (\wt{\bB} - \wtminus{\bB}{s})^{\rmt} \bX_{s, :} }{2} 
\lsim \frac{p\log n}{n}\Fnorm{\bB^{*}},~\forall~1\leq s \leq n};  \\
\calE_8 &= \set{
\norm{(\wt{\bB} - \wtminus{\bB}{s, t})^{\rmt}\bX_{s, :} }{2} 
\lsim \frac{p\log n}{n}\Fnorm{\bB^{*}},~\forall~1\leq s\neq t \leq n}; \\
\calE_9 &= \set{\norm{( \wt{\bB} - \bB^{*})^{\rmt} \bX_{s, :}}{2} \lsim 
\frac{(\log n)^{\nfrac{3}{2}}(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}},~\forall~1\leq s\leq n}. 
\]
In addition, we define the quantities $\Delta_1$, 
$\Delta_2$, and $\Delta_3$ as 
\begin{align}
\Delta_1 &=
c_0 \sigma(\log n)^{\nfrac{5}{2}}\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}}; \label{eq:Delta1_def}\\
\Delta_2 &= c_1\sigma (\log^2 n)\Fnorm{\bB^{*}};\label{eq:Delta2_def} \\
\Delta_3 &= c_2\Bracket{\frac{mp(\log n)^2\sigma^2}{n}
+ \sigma^2(\log n)^{2}\sqrt{\frac{mp}{n}}}, \label{eq:Delta3_def}
\end{align}
respectively. Besides, we define their sum $\Delta$ as 
$\Delta \defequal \Delta_1 + \Delta_2 + \Delta_3$.




%\section{Appendix: Proof of Theorem~\ref{thm:oracle_single_index_recover_MMV}}

\section{Appendix: Proof of Theorem~$2$}

\begin{proof}
We define the error event $\calE$ as 
\[
\calE \defequal  
\bigg\{ &\norm{\bB^{*\rmt}\bX_{\pi^{*}(i)}}{2}^2
+ \la \bW_i, \bB^{*\rmt}\bX_{\pi^{*}(i)}\ra \leq
\la \bB^{*\rmt}\bX_{\pi^{*}(i)}, 
\bB^{*\rmt}\bX_{j}  \ra  
+ \la \bW_i, \bB^{*\rmt}\bX_{j}\ra,~\forall~j\neq \pi^{*}(i)\bigg\}, 
\]
and complete the proof by showing $\Prob(\calE) \lsim n^{-c}$. 
To start with, we define three events $\calE_1, \calE_2$ and $\calE_3$ as 
\[
\calE_1 \defequal &\set{\norm{\bB^{*\rmt}\bX_{\pi^{*}(i)}}{2}
\leq \frac{1}{2}\Fnorm{\bB^{*}}}; \\
\calE_2 \defequal & 
\set{\langle \bB^{*\rmt}\bX_{\pi^{*}(i)}, 
\bB^{*\rmt}\bX_{j}  \rangle  \gsim \log n \Fnorm{\bB^{*}\bB^{*\rmt}} ,\forall j\neq \pi^{*}(i)}; \\
\calE_3 \defequal & 
\set{\langle \bW_i, \bB^{*\rmt}\bracket{\bX_j - \bX_{\pi^{*}(i)}}\rangle 
\gsim \sigma \log n\Fnorm{\bB^{*}},\forall j\neq \pi^{*}(i)},
\]
\noindent 
respectively. The proof begins with the following bound, which reads as
\[
\Expc \Ind\bracket{\calE} \leq 
\Expc\Ind\bigg(\calE \bigcap \bigcap_{i=1}^3 \br{\calE}_i\bigg)
+ \Expc\Ind\bigg(\bigcup_{i=1}^3 \calE_i \bigg).
\]
The subsequent proof can be divided into two parts. 


\textbf{Part I.}
We prove that $\Expc\Ind\bigg(\calE \bigcap \bigcap_{i=1}^3 \br{\calE}_i\bigg)$
is zero provided that $\srank{\bB^{*}}\gsim \log^2 n$ and 
$\snr\geq c$. 
The underlying reason is as follows. 
To begin with, we obtain 
\[
\norm{\bB^{*\rmt}\bX_{\pi^{*}(i)}}{2}^2 
\stackrel{\cirone}{\gsim}~& \Fnorm{\bB^{*}}^2  \stackrel{\cirtwo}{\gsim} \frac{\log n}{\sqrt{\srank{\bB^{*}}}}\Fnorm{\bB^{*}}^2
+ \sigma \log n\Fnorm{\bB^{*}} \\
\stackrel{\cirthree}{\geq}~& \log n \Fnorm{\bB^{*}\bB^{*\rmt}} 
+ \sigma \log n\Fnorm{\bB^{*}} 
\] 
where $\cirone$ is due to 
$\br{\calE}_1$, $\cirtwo$ is because of the 
assumption $\srank{\bB^{*}}\gsim \log^2 n$ and 
$\snr\geq c$, and $\cirthree$ results from the relation 
\[
\Fnorm{\bB^{*}\bB^{*\rmt}} \leq \Opnorm{\bB^{*}}\Fnorm{\bB^{*}}
= \frac{\Fnorm{\bB^{*}}^2}{\sqrt{\srank{\bB^{*}}}}. 
\]
Conditioning on the event $\br{\calE}_2 \bigcap \br{\calE}_3$, we 
conclude
\[
\norm{\bB^{*\rmt}\bX_{\pi^{*}(i)}}{2}^2
\gsim~& \langle \bB^{*\rmt}\bX_{\pi^{*}(i)}, 
\bB^{*\rmt}\bX_{j} \rangle + \la  \bW_i, \bB^{*\rmt}\bracket{\bX_j - \bX_{\pi^{*}(i)}}\ra, 
\]
which contradicts the definition of $\calE$ and hence 
leads to $\Expc\Ind(\calE \bigcap \bigcap_{i=1}^3 \br{\calE}_i) = 0$.
Therefore 
we can invoke the union bound and 
upper-bound the error probability $\Expc \Ind\bracket{\calE}$
as $\sum_{i=1}^3 \Expc \Ind(\calE_i)$. 

\textbf{Part II.} In what follows, we separately bound the three terms $\Expc \Ind\bracket{\calE_i}$,~$1\leq i \leq 3$. 
For $\Expc \Ind(\calE_1)$, we can simply invoke Lemma~\ref{lemma:small_ball_log_concave}
and bound it as 
\[
\Expc \Ind(\calE_1) \lsim e^{-\srank{\bB^{*}}} \stackrel{\cirfour}{\lsim} n^{-c},
\] 
where $\cirfour$ is due to the assumption $\srank{\bB^{*}} \gg \log^2 n$. 

Then we turn to bounding $\Expc \Ind(\calE_2)$, which proceeds as  
\begin{align}
\label{eq:oracle_T1_E2}
& \Expc\Ind\bracket{\calE_2} \leq
\Prob\bracket{\norm{\bB^{*}\bB^{*\rmt}\bX_{\pi^{*}(i)} }{2}
\gsim \sqrt{\log n}\Fnorm{\bB^{*}\bB^{*\rmt}}} \notag \\
+~& n \Expc_{\bX_{\pi^{*}(i)}}\Ind\bigg(
\norm{\bB^{*}\bB^{*\rmt}\bX_{\pi^{*}(i)} }{2}
\lsim \sqrt{\log n}\Fnorm{\bB^{*}\bB^{*\rmt}},  \langle \bB^{*\rmt}\bX_{\pi^{*}(i)}, \bB^{*\rmt}\bX_{j}  \rangle  \gsim \log n \Fnorm{\bB^{*}\bB^{*\rmt}} \bigg).
\end{align}
For the first term in~\eqref{eq:oracle_T1_E2}, we have
\[
\Prob\bracket{\norm{\bB^{*}\bB^{*\rmt}\bX_{\pi^{*}(i)} }{2}
\gsim \sqrt{\log n}\Fnorm{\bB^{*}\bB^{*\rmt}}}
\lsim n^{-c_0}.
\] 
For the second term in~\eqref{eq:oracle_T1_E2}, we exploit the 
independence between $\bX_{\pi^{*}(i)}$ and $\bX_j$, which 
yields 
\[
& \Expc_{\bX_{\pi^{*}(i)}}\Ind\bigg(
\norm{\bB^{*}\bB^{*\rmt}\bX_{\pi^{*}(i)} }{2}
\lsim \sqrt{\log n}\Fnorm{\bB^{*}\bB^{*\rmt}}, \langle \bB^{*\rmt}\bX_{\pi^{*}(i)}, 
\bB^{*\rmt}\bX_{j}  \rangle   \gsim \log n \Fnorm{\bB^{*}\bB^{*\rmt}} \bigg) \\
\lsim~& \exp\bracket{-\frac{c_1\log^2 n \Fnorm{\bB^{*}\bB^{*\rmt}}^2}{\log n \Fnorm{\bB^{*}\bB^{*\rmt}}^2}} \leq n^{-c_1}.
\]
Hence we conclude $\Expc\Ind\bracket{\calE_2}  \lsim  n^{-c_0} + n\cdot n^{-c_1}
\lsim n^{-c_2}$. 
Finally, we consider $\Expc \Ind(\calE_3)$, which is bounded as 
\begin{align}
&\Expc \Ind(\calE_3) \leq
\Prob\big(\norm{\bB^{*\rmt}\bracket{\bX_j - \bX_{\pi^{*}(i)}}}{2} \leq \frac{\Fnorm{\bB^{*}}}{2},~\exists ~ j\big)
+ \Prob\big(\calE_3, \norm{\bB^{*\rmt}\bracket{\bX_j - \bX_{\pi^{*}(i)}}}{2} \geq \frac{\Fnorm{\bB^{*}}}{2},~\forall~j\big).
\label{eq:oracle_E3_sum}
\end{align}
For the first term in \eqref{eq:oracle_E3_sum}, we invoke Lemma~\ref{lemma:small_ball_log_concave} and 
have 
\[
\Prob\bracket{\norm{\bB^{*\rmt}\bracket{\bX_j - \bX_{\pi^{*}(i)}}}{2} \leq \frac{\Fnorm{\bB^{*}}}{2},~\exists ~ j} 
\stackrel{\cirfive}{\leq} n \exp\bracket{-c \cdot \srank{\bB^{*}}} \stackrel{\cirsix}{\lsim} n^{-c}, 
\]
where $\cirfive$ is due to the union bound and 
$\cirsix$ is due to the assumption that $\srank{\bB^{*}}\gg \log^2 n$. 

For the second term in \eqref{eq:oracle_E3_sum}, we exploit the 
independence across $\bX$ and $\bW$ and have 
\[
\Prob\bracket{\calE_3, \norm{\bB^{*\rmt}\bracket{\bX_j - \bX_{\pi^{*}(i)}}}{2} \geq \frac{\Fnorm{\bB^{*}}}{2},~\forall~j} 
\leq n \exp\bracket{-\frac{c \log^2 n \Fnorm{\bB^{*}}^2}{\Fnorm{\bB^{*}}^2}}
\lsim n^{-c}.
\]
Summarizing the above discussion then completes the proof. 

\end{proof}


% =====================================================
%\section{Proof of Theorem~\ref{thm:nonoracle_single_index_recover_MMV}}
\section{Proof of Theorem~$3$}


Notice that a reconstruction error, i.e., $\pi^{*}(i) \neq \wh{\pi}(i)$, will 
occur as long as there exists
$j\neq \pi^{*}(i)$ such that
\begin{align}
\label{eq:select_optim}
\la \bY_{i, :},  \wh{\bB}^{\rmt}\bX_{\pi^{*}(i), :} \ra 
\leq \la \bY_{i, :}, \wh{\bB}^{\rmt} \bX_{j, :} \ra. 
\end{align}


With the relation 
$\bY_{i, :} =\bB^{*\rmt}  \bX_{\pi^{*}(i), :}+ \bW_{i, :}$
and $\wh{\bB} = \wt{\bB} + (n-h)^{-1}\bX^{\rmt}\bW$, we can rewrite~\eqref{eq:select_optim} as 
\begin{align}
\label{eq:greed_optim_condition}
& \la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}+ \bW_{i, :}, 
\bracket{\wt{\bB} +(n-h)^{-1}\bX^{\rmt}\bW}^{\rmt}
\bX_{\pi^{*}(i), :}
\ra \notag \\
\leq~&
\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}+ \bW_{i, :}, 
\bracket{\wt{\bB} + (n-h)^{-1}\bX^{\rmt}\bW}^{\rmt}
\bX_{j, :}\ra. 
\end{align}
For notational conciseness, we define the terms 
$\term_{\textup{tot}}$ and $\term_i$ ($1\leq i \leq 3$) as 
\begin{align}
\term_{\textup{tot}}&= \la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \wt{\bB}^{\rmt}
\bracket{\bX_{\pi^{*}(i), :} -  \bX_{j, :}}\ra; \label{eq:term1_def}\\
\term_{1} &= (n-h)^{-1}\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \bW^{\rmt}\bX\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} \ra; \label{eq:term2_def}\\
\term_{2} &= \la \bW_{i, :}, \wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} \ra;\label{eq:term3_def} \\
\term_{3} &= (n-h)^{-1}\la \bW_{i, :}, \bW^{\rmt}\bX\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} \ra. \label{eq:term4_def}
\end{align}
Then ~\eqref{eq:greed_optim_condition} is equivalent to 
$\term_{\textup{tot}} \leq \term_{1} + \term_{2} + \term_{3}$. With the union bound, 
we conclude 
\begin{align}
\label{eq:main_theorem_tot}
\Prob\bracket{\pi^{*}(i) \neq \wh{\pi}(i), \exists~i}
\leq~& \Expc\Bracket{\Ind\bracket{\term_{\textup{tot}} \leq \term_{1} + \term_{2} + \term_{3},~\exists~i, j}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}} + 
\sum_{a=1}^9 \Prob\bracket{\br{\calE}_a} \notag \\
\stackrel{\cirone}{\leq}~& n^2 \Expc\Bracket{\Ind\bracket{\term_{\textup{tot}} \leq \term_{1} + \term_{2} + \term_{3}}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}} + 
c_0 p^{-c_1} + c_2 n^{-c_3}, 
\end{align}
where in $\cirone$ we invoke Lemma~\ref{lemma:x_row_norm_ub}, Lemma~\ref{lemma:inner_product}, 
Lemma~\ref{lemma:Xmat_fnorm}, Lemma~\ref{lemma:event5}, 
Lemma~\ref{lemma:Bmat_perturb}, Lemma~\ref{lemma:xb_single_idx}, 
Lemma~\ref{lemma:xb_multi_indices}, and Lemma~\ref{lemma:beta_perturb}. 

Regarding the term $\Expc\Bracket{\Ind\bracket{\term_{\textup{tot}} \leq \term_{1} + \term_{2} + \term_{3}}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}}$ for fixed indices $i$ and $j$, we further upper-bound it by the sum of two terms, which reads as 
\begin{align}
\label{eq:main_theorem_union}	
& \Expc\Bracket{\Ind\bracket{\term_{\textup{tot}} \leq \term_{1} + \term_{2} + \term_{3}}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}} \notag \\
\leq~& \Expc\Bracket{\Ind\bracket{\term_{\textup{tot}} \leq \Delta}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}}\notag  \\
+~& \Expc\Bracket{\Ind\bracket{\term_{1} + \term_{2} + \term_{3}\geq \Delta}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}},\notag  \\
\leq~& \Expc\Bracket{\Ind\bracket{\term_{\textup{tot}} \leq \Delta}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}} + \Expc\Bracket{\Ind\bracket{\term_{1} \geq \Delta_1}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}}\notag  \\
+~& 
\Expc\Bracket{\Ind\bracket{\term_{2} \geq \Delta_2}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}} + 
\Expc\Bracket{\Ind\bracket{\term_{3} \geq \Delta_3}\Ind\bracket{\bigcap_{a=1}^9 \calE_a}}, 
\end{align}
where the definitions of 
$\Delta_1$, $\Delta_2$, $\Delta_3$, and $\Delta$ can be found in 
Section~\ref{subsec:greed_proof_notations}.
The proof is then completed by 
combining ~\eqref{eq:main_theorem_tot} and ~\eqref{eq:main_theorem_union}
and invoking 
Lemma~\ref{lemma:termtot_bound}, Lemma~\ref{lemma:term1_bound}, 
Lemma~\ref{lemma:term2_bound}, and Lemma~\ref{lemma:term3_bound}.



% ===================================
\begin{lemma}
\label{lemma:termtot_bound}
Assume that $\srank{\bB^{*}}\gg \log^4 n$, 
$n\gsim p \log^6 n$, and $\snr \geq c$ and conditional on the intersection of events 
$\calE_1(\bB^{*})\bigcap { \calE_1\big(\bB^{*} \wtminus{\bB}{\pi^{*}(i), j}^{\rmt} \big) }\bigcap \calE_6 \bigcap \calE_7$, 
where indices $\pi^{*}(i)$ and $j$ are fixed, 
we have $\term_{\textup{tot}} \geq \Delta$, which 
holds with probability exceeding 
$1 - n^{-c}$ when $n$ and $p$ are sufficiently large, 
where $\term_{\textup{tot}}$ and $\Delta$ are defined in 
~\eqref{eq:term1_def} and Section~\ref{subsec:greed_proof_notations}, respectively. 
\end{lemma}


\begin{proof}
We start the discussion by decomposing $\term_{\textup{tot}}$ as 
\[
\term_{\textup{tot}} = \norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2}^2 + 
\underbrace{\big\langle \bB^{*\rmt}  \bX_{\pi^{*}(i), :} ,  
\bracket{\wt{\bB} - \bB^{*}}^{\rmt} \bX_{\pi^{*}(i), :}\big\rangle}_{\defequal \term_{\textup{tot}, 1}} 
- \underbrace{\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \wt{\bB}^{\rmt}\bX_{j, :}\ra}_{\defequal \term_{\textup{tot}, 2}}.  
\]
Then we obtain 
\begin{align}
\label{eq:term1_tot}
\Prob \bracket{\term_{\textup{tot}} \leq \Delta} 
=~&\Prob\bracket{\frac{\Delta}{\norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2}^2}
- \frac{\term_{\textup{tot}, 1}}{\norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2}^2}
+ \frac{\term_{\textup{tot}, 2}}{\norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2}^2}
\geq 1} \notag \\
\leq~&
\underbrace{\Prob\bracket{\norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2} \leq 
\delta}}_{\defequal \zeta_1} + 
\underbrace{\Prob\bracket{
\frac{\Delta}{\delta^2} + 
\frac{\abs{\term_{\textup{tot}, 1}}}{\delta^2}
+ \frac{\abs{\term_{\textup{tot}, 2}}}{\delta^2} \geq 1}}_{\defequal \zeta_2}.
\end{align}
We separately bound the probabilities $\zeta_1$ and 
$\zeta_2$ by setting $\delta$ as $1/2 \Fnorm{\bB^{*}}$.
For the term $\zeta_1$, we invoke the small ball probability (Lemma~\ref{lemma:small_ball_log_concave}) and conclude 
\begin{align}
\label{eq:term1_zeta1}
\Prob\bracket{\norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2} \leq 
\frac{1}{2}\Fnorm{\bB^{*}}}\leq e^{-c\srank{\bB^{*}}}.
\end{align} 
For probability $\zeta_2$, we will prove it to be zero 
provided $\snr \geq c$.
The proof is completed by showing 
\[
\frac{\Delta}{\delta^2} + 
\frac{\abs{\term_{\textup{tot}, 1}}}{\delta^2}
+ \frac{\abs{\term_{\textup{tot}, 2}}}{\delta^2}
< 1
\]
holds with probability $1-n^{-c}$. 
Detailed calculation proceeds as follows. 
\par \noindent
\textbf{Phase I.} 
First, we consider term $\term_{\textup{tot}, 1}$. Conditional on the intersection of 
events $\calE_1(\bB^{*})\bigcap \calE_7\bigcap \calE_9$, we have  
\[
\abs{\term_{\textup{tot}, 1}} \leq ~& \norm{\bB^{*\rmt} \bX_{\pi^{*}(i), :}}{2} \norm{\bracket{\wt{\bB} - \bB^{*}}^{\rmt} \bX_{\pi^{*}(i), :}}{2} \lsim 
\sqrt{\log n}\Fnorm{\bB^{*}}
\frac{(\log n)^{\nfrac{3}{2}} (\log n^2 p^3) \sqrt{p}}{\sqrt{n}}
\Fnorm{\bB^{*}} \\
=~& 
(\log^2 n)(\log n^2 p^3)\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}}^2.
\]
\par \noindent
\textbf{Phase II.}
Then we turn to term $\term_{\textup{tot}, 2}$.  Adopting the leave-one-out trick, we can 
expand it as 
\[
\term_{\textup{tot}, 2} =~
\underbrace{\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, 
\bracket{\wt{\bB} - \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt} \bX_{j, :}\ra}_{\term_{\textup{tot}, 2, 1}}
+ \underbrace{\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \wtminus{\bB}{\pi^{*}(i), j}^{\rmt} \bX_{j, :}\ra}_{\term_{\textup{tot}, 2, 2}}.
\]
For term $\term_{\textup{tot}, 2, 1}$, we have 
\[
\term_{\textup{tot}, 2, 1} \leq~& 
 \norm{\bB^{*\rmt}  \bX_{\pi^{*}(i), :}}{2}
\norm{\bracket{\wt{\bB} - \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt} \bX_{j, :}}{2}
\stackrel{\cirone}{\lsim} \sqrt{\log n}\Fnorm{\bB^{*}}
\frac{p\log n}{n}\Fnorm{\bB^{*}} \\
=~& \frac{p(\log n)^{\nfrac{3}{2}}}{n} \Fnorm{\bB^{*}}^2, 
\]
where in $\cirone$ we condition on event $\calE_7$.
Regarding the term $\term_{\textup{tot}, 2, 2}$, we notice that $\wtminus{\bB}{\pi^{*}(i), j}$ is independent of the rows $\bX_{\pi^{*}(i), :}$ and 
$\bX_{j, :}$ due to its construction method. 
Then we can bound the term $\term_{\textup{tot}, 2, 2}$ by 
fixing the rows $\{\bX_{s, :}\}_{s\neq \pi^{*}(i)}$
and viewing $\bX_{\pi^{*}(i), :}$ as the RV, which 
yields 
\begin{align}
\label{eq:term122}
\term_{\textup{tot}, 2, 2} \lsim \sqrt{\log n}\norm{\bB^{*} \wtminus{\bB}{\pi^{*}(i), j}^{\rmt} \bX_{j, :}}{2}
\end{align}
holds with probability $1-n^{-c}$. Conditional on 
event $\calE_1(\bB^{*} \wtminus{\bB}{\pi^{*}(i), j}^{\rmt})$, 
we have 
\[
\term_{\textup{tot}, 2, 2} \lsim~& (\log n)\Fnorm{\bB^{*} \wtminus{\bB}{\pi^{*}(i), j}^{\rmt} }
\lsim (\log n) \Opnorm{\bB^{*}}\Fnorm{\wtminus{\bB}{\pi^{*}(i), j}^{\rmt} } \\
\stackrel{\cirtwo}{\leq}~& (\log n)\Opnorm{\bB^{*}}
\Bracket{\Fnorm{\wtminus{\bB}{\pi^{*}(i), j} - \bB^{*}} + \Fnorm{\bB^{*}}}
\stackrel{\cirthree}{\lsim} \frac{(\log n)\Fnorm{\bB^{*}}^2}{\sqrt{\srank{\bB^{*}}}}, 
\]
where $\cirtwo$ follows from the triangle inequality, 
and in $\cirthree$ we use the definition of the stable rank and condition on event $\calE_6$, $n\geq p$, 
and $n\gsim p\log^6 n$.
\par \noindent
\textbf{Phase III.}
Conditional on ~\eqref{eq:term122}, we can expand the sum $\nfrac{\Delta}{\delta^2} + \term_{\textup{tot}, 1}/\delta^2 + 
\term_{\textup{tot}, 2}/\delta^2$ as 
\[
\frac{\Delta}{\delta^2} + \frac{\term_{\textup{tot}, 1}}{\delta^2} + 
\frac{\term_{\textup{tot}, 2}}{\delta^2} = ~& 
c_0 \sigma(\log n)^{\nfrac{5}{2}}\sqrt{\frac{p}{n}}\frac{1}{\Fnorm{\bB^{*}}}
+ \frac{ c_1\sigma (\log^2 n)}{\Fnorm{\bB^{*}}}
+ c_2
\bracket{\frac{pm}{n} + \sqrt{\frac{mp}{n}}} \frac{(\log n)^2\sigma^2}{\Fnorm{\bB^{*}}^2} \\
+~& \frac{c_3 (\log^2 n)(\log n^2 p^3)\sqrt{p} }{\sqrt{n}} +
\frac{c_4 p(\log n)^{\nfrac{3}{2}}}{n}
+ \frac{c_5 \log n}{\sqrt{\srank{\bB^{*}}}}
 \\
\asymp~&
c_0 \sqrt{\frac{p}{nm}}\frac{\bracket{\log n}^{\nfrac{5}{2}}}{\sqrt{\snr}}
+ \frac{c_1 \log^2 n}{\sqrt{m \cdot \snr}}
+ \frac{c_2 p\bracket{\log n}^2}{n\cdot  \snr}
+c_2 \sqrt{\frac{p}{mn}}\frac{\bracket{\log n}^2 }{\snr} \\
+~&\frac{c_3 (\log^2 n)(\log n^2 p^3)\sqrt{p} }{\sqrt{n}} +
\frac{c_4 p(\log n)^{\nfrac{3}{2}}}{n}
+ \frac{c_5 \log n}{\sqrt{\srank{\bB^{*}}}}.
\]
Provided that $\snr \geq c$, $\srank{\bB^{*}} \gg \log^4 n$ and 
$n\gsim p \log^6 n$, 
we can verify the sum 
$\nfrac{\Delta}{\delta^2} + \term_{\textup{tot}, 1}/\delta^2 + 
\term_{\textup{tot}, 2}/\delta^2$ to be significantly smaller than $1$ when 
$n$ and $p$ are sufficiently large, which suggests 
\[
\zeta_2 \leq \Prob\bracket{\term_{\textup{tot}, 2, 2} \gsim \sqrt{\log n}
\norm{\bB^{*} \wtminus{\bB}{\pi^{*}(i), j}^{\rmt} \bX_{j, :}}{2}
} \leq n^{-c}.
\]
Hence the proof is completed by combining 
~\eqref{eq:term1_tot} and ~\eqref{eq:term1_zeta1}.
\end{proof}

\begin{remark}
If we strengthen the requirement on $\snr$ from $\snr\geq c$ to 
$\snr \gsim \log^2 n$, we can relax the requirement 
on the stable rank $\srank{\bB^{*}}$ from $\srank{\bB^{*}}\gg \log^4 n$ to 
$\srank{\bB^{*}}\gg \log^2 n$.
\end{remark}


% =====================================
\begin{lemma}
\label{lemma:term1_bound}
Conditional on the intersection of events $\calE_3\bigcap \calE_4\bigcap \calE_5$ and fixing the indices $\pi^{*}(i)$ and $j$, 
we have 
\[
\term_{1} \lsim \sigma(\log n)^{\nfrac{5}{2}}\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}}
\]
holds with probability at least $1-n^{-c}$.
\end{lemma}

\begin{proof}
Define vectors $\bu_{\bX}$ and $\bv_{\bX}$ as 
\[
\bu_{\bX} &= \bX\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}, \\
\bv_{\bX} &= \bB^{*\rmt}\bX_{\pi^{*}(i), :}, 
\] 
respectively. We can rewrite $\term_{1}$ as 
\[
\term_{1} =~&  
%\bX^{\rmt}_{\pi^{*}(i), :}
%\bB^{*} \wt{\bW}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} = 
(n-h)^{-1}\trace\Bracket{\bX\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}
\bX^{\rmt}_{\pi^{*}(i), :}
\bB^{*} \bW^{\rmt}} 
= (n-h)^{-1}\bu_{\bX}^{\rmt}\bW \bv_{\bX}. 
\]
Invoking the union bound, we conclude 
\begin{align}
\label{eq:term2_tot}
& \Prob\bracket{\term_{1} \gsim \sigma(\log n)^{\nfrac{5}{2}}\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}}}  \notag \\
\leq~& 
\Prob\bracket{\term_{1} \gsim \sigma(\log n)^{\nfrac{5}{2}}\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}},~\norm{\bu_{\bX}}{2}\norm{\bv_{\bX}}{2} \lsim 
(\log n)^{\nfrac{3}{2}}\sqrt{np} \Fnorm{\bB^{*}}} \notag \\
+~& \Prob\bracket{\norm{\bu_{\bX}}{2}\norm{\bv_{\bX}}{2} \gsim 
(\log n)^{\nfrac{3}{2}}\sqrt{np} \Fnorm{\bB^{*}} }\notag  \\
\leq~& 
\underbrace{\Prob\bracket{\term_{1} \gsim  \frac{\sigma (\log n) \norm{\bu_{\bX}}{2} \norm{\bv_{\bX}}{2}}{n-h} }}_{\defequal \zeta_1}
+ \underbrace{\Prob\bracket{ \norm{\bu_{\bX}}{2}\norm{\bv_{\bX}}{2} \gsim 
(\log n)^{\nfrac{3}{2}}\sqrt{np} \Fnorm{\bB^{*}}}}_{\defequal \zeta_2}.
\end{align}
Then we separately bound the probabilities $\zeta_1$ and $\zeta_2$. 
\par \noindent
\textbf{Phase I.}
For probability $\zeta_1$, we exploit the independence between $\bX$ and $\bW$
and can view $\term_{1}$ as a Gaussian RV conditional on $\bX$, since 
it is a linear combination of Gaussian RVs $\set{\bW_{i,j}}_{1\leq i \leq n, 1\leq j \leq m}$.
Easily we can calculate its mean to be zero and its variance as 
\[
\Expc_{\bW}(\term_{1})^2 = \frac{\sigma^2}{(n-h)^2} \norm{\bu_{\bX}}{2}^2 \norm{\bv_{\bX}}{2}^2.
\]
Thus we can upper-bound $\zeta_1$ as 
\begin{align}
\label{eq:term2_zeta1}
\zeta_1= \Expc_{\bX}\Expc_{\bW}\Ind\bracket{\term_{1} \gsim \frac{\sigma (\log n) \norm{\bu_{\bX}}{2} \norm{\bv_{\bX}}{2}}{n-h}} \stackrel{\cirone}{\leq} \Expc_{\bX}\exp\bracket{-c_0 \log n} = n^{-c}, 
\end{align}
where $\cirone$ is due to the bound on the tail-probability of Gaussian RV.
\par \noindent
\textbf{Phase II.}
As for $\zeta_2$, easily we can verify it to be zero conditional on the intersection 
of events $\calE_3\bigcap \calE_4 \bigcap \calE_5$ as 
\[
\norm{\bu_{\bX}}{2}\norm{\bv_{\bX}}{2}\lsim 
\sqrt{\log n}\Fnorm{\bB^{*}}  \cdot 
\bracket{\norm{\bX\bX_{j, :}}{2} + \norm{\bX\bX_{\pi^{*}(i), :}}{2}}
\lsim \bracket{\log n}^{\nfrac{3}{2}}\sqrt{np}\Fnorm{\bB^{*}}.
\]
The proof is then completed by combining 
~\eqref{eq:term2_tot} and ~\eqref{eq:term2_zeta1}. 
\end{proof}


\begin{lemma}
\label{lemma:term2_bound}
Conditional on the intersection of events $\calE_2\bigcap \calE_3\bigcap \calE_4 \bigcap \calE_6$ and fixing the indices 
$\pi^{*}(i)$ and $j$, we have 
$\term_{2} \lsim \sigma (\log n)^2 \Fnorm{\bB^{*}}$ holds 
with probability at least $1 - n^{-c}$. 
\end{lemma}

\begin{proof}
Following a similar proof strategy as in Lemma~\ref{lemma:term1_bound}, 
we first invoke the union bound and obtain
\begin{align}
\label{eq:term3_tot}
& \Prob\bracket{\term_{2} \gsim \sigma(\log n)^2\Fnorm{\bB^{*}}} \notag \\
\leq~& \Prob\bracket{\term_{2} \gsim \sigma(\log n)^2\Fnorm{\bB^{*}},~ 
\norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2} \lsim (\log n)\Fnorm{\bB^{*}} } \notag \\
+~& \Prob\bracket{\norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2} \gsim (\log n)\Fnorm{\bB^{*}} } \notag \\
\leq~& \underbrace{\Prob\bracket{\term_{2} \gsim \sigma(\log n) \norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2}}}_{\zeta_1} 
+ \underbrace{\Prob\bracket{\norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2} \gsim (\log n)\Fnorm{\bB^{*}}}}_{\zeta_2}.
\end{align}
The following analysis separately investigates the two probabilities 
$\zeta_1$ and $\zeta_2$. 
\par\noindent
\textbf{Phase I.}
Exploiting the independence between $\bX$ and $\bW$, we can bound $\zeta_1$ as 
\begin{align}
\label{eq:term3_zeta1}
\zeta_1 = \Expc_{\bX}\Expc_{\bW}\Ind\bracket{\term_{2} \gsim \sigma(\log n) \norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2}} \stackrel{\cirone}{\leq} 
\Expc_{\bX} \exp\bracket{-c_0 \log n} = n^{-c_0},
\end{align}
where in $\cirone$ we use the fact that $\term_{2}$ is a Gaussian RV with zero mean 
and variance $\sigma^2\norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2}^2$ conditional on $\bX$. 
\par \noindent
\textbf{Phase II.}
Then we bound term $\zeta_2$. Notice
\[
\norm{\wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2}
\leq~& \norm{\bracket{\wt{\bB}-  \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2}
+ \norm{\wtminus{\bB}{\pi^{*}(i), j}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2} \\
\leq~& \norm{\bracket{\wt{\bB}-  \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt} \bX_{j, :}}{2}
+ \norm{\bracket{\wt{\bB}-  \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt} \bX_{\pi^{*}(i), :}}{2} \\
+~& \norm{\wtminus{\bB}{\pi^{*}(i), j}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2}, 
\]
we conclude 
\begin{align}
\label{eq:term3_zeta2_tot}
\zeta_2 \stackrel{\cirtwo}{\leq}~& 
\underbrace{\Prob\bracket{\norm{\bracket{\wt{\bB}-  \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt} \bX_{j, :}}{2}
+ \norm{\bracket{\wt{\bB}-  \wtminus{\bB}{\pi^{*}(i), j}}^{\rmt} \bX_{\pi^{*}(i), :}}{2} \gsim \frac{p\log n}{n}\Fnorm{\bB^{*}} }}_{\zeta_{2, 1}} \notag \\
+~& \underbrace{\Prob\bracket{\norm{\wtminus{\bB}{\pi^{*}(i), j}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}}{2} \gsim (\log n)\Fnorm{\bB^{*}}}}_{\zeta_{2, 2}}, 
\end{align}
where in $\cirtwo$ we use the fact $n\gsim p$. Invoking Lemma~\ref{lemma:xb_multi_indices} then yields $\zeta_{2, 1} = 0$. 
For term $\zeta_{2, 2}$, we exploit the independence between 
$\wtminus{\bB}{\pi^{*}(i), j}$ and $\bX_{j, :}$, $\bX_{\pi^{*}(i), :}$.
Via the Hanson--Wright inequality~\citep{vershynin2018high}, 
we have 
\begin{align}
\label{eq:term3_zeta22}
\zeta_{2,2} \leq \exp\Bracket{-c_0 \bracket{\frac{(\log n)^2\Fnorm{\bB^{*}}^2 }{\Opnorm{\wtminus{\bB}{\pi^{*}(i), j}^{\rmt} \wtminus{\bB}{\pi^{*}(i), j}}} \vcap \frac{(\log n)^4 \Fnorm{\bB^{*}}^4 }{\Fnorm{\wtminus{\bB}{\pi^{*}(i), j}^{\rmt} \wtminus{\bB}{\pi^{*}(i), j}}^2} } }
\stackrel{\cirthree}{\leq} n^{-c}, 
\end{align}
where $\cirthree$ is due to the fact 
\[
\Fnorm{\wtminus{\bB}{\pi^{*}(i), j}} 
\leq \Fnorm{\bB^{*}}
+ \Fnorm{\wtminus{\bB}{\pi^{*}(i), j} - \bB^{*}}
\stackrel{
\cirfour}{\lsim} \Fnorm{\bB^{*}}, 
\]
and in $\cirfour$ we condition on event $\calE_6$. 
Combining ~\eqref{eq:term3_tot}, ~\eqref{eq:term3_zeta1}, 
~\eqref{eq:term3_zeta2_tot}, and ~\eqref{eq:term3_zeta22} then completes 
the proof.
\end{proof}

% ====================================
 
\begin{lemma}
\label{lemma:term3_bound}
Conditional on event $\calE_2$ and fixing the 
indices $\pi^{*}(i)$ and $j$, we have 
$\term_{3} \lsim  \frac{mp(\log n)^2\sigma^2}{n}
+ \sigma^2(\log n)^{2}\sqrt{\frac{mp}{n}}$ hold 
with probability exceeding $1 - c_0 n^{-c_1}$. 
\end{lemma}

\begin{proof}
For ease of presentation, we first define $\bXi^{\pi^{*}(i), j}$ as $\bXi^{\pi^{*}(i), j} = \bX \bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}}$ and denote its $k$th entry by $\Xi^{\pi^{*}(i), j}_{k}$.
Then we can rewrite $\term_{3}$ as 
$(n-h)^{-1}\bW_{i, :}^{\rmt} \bW^{\rmt} \bXi^{\pi^{*}(i), j}$
and expand it as 
\[
\abs{\term_{3}} =~&
\bracket{n-h}^{-1}\abs{\Xi^{\pi^{*}(i), j}_{i}\bW_{i, :}^{\rmt} \bW_{i, :}
+  \bW_{i, :}^{\rmt}\bigg(\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\bigg)} \\
\leq~& 
\frac{1}{n-h}
\abs{\Xi^{\pi^{*}(i), j}_{i}}\cdot \norm{\bW_{i, :}}{2}^2
+ \frac{1}{n-h}\abs{\la \bW_{i, :}, \sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\ra} \\
\stackrel{\cirone}{\leq}~&
\frac{p\log n}{n-h}\norm{\bW_{i, :}}{2}^2
+ \frac{1}{n-h}\abs{\la \bW_{i, :}, \sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\ra},
\]
where in $\cirone$ we condition on event $\calE_2$ and have
$\abs{\Xi^{\pi^{*}(i), j}_{i}} \leq \norm{\bX_{\pi^{*}(i), :}}{2}^2+ \norm{\bX_{j, :}}{2}^2 \lsim p\log n$.
With the union bound, we obtain 
\begin{align}
\label{eq:term4_tot}
& \Prob\bracket{\term_{3} \gsim 
 \frac{mp(\log n)^2\sigma^2}{n}
+ \sigma^2(\log n)^{2}\sqrt{\frac{mp}{n}} 
} \notag \\ 
\stackrel{\cirtwo}{\leq}~&
\underbrace{\Prob\bracket{
\frac{p\log n}{n-h}\norm{\bW_{i, :}}{2}^2
\gsim \frac{mp(\log n)^2\sigma^2}{n} }}_{\defequal \zeta_1} + \underbrace{\Prob\bracket{
\frac{1}{n-h}\abs{\la \bW_{i, :}, \sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\ra} \gsim \sigma^2(\log n)^{2}\sqrt{\frac{mp}{n}} }}_{\defequal \zeta_2}.  
\end{align}
Then we separately bound the 
two terms $\zeta_1$ and $\zeta_2$. 
\par \noindent
\textbf{Phase I.}
For term $\zeta_1$, we have 
\begin{align}
\label{eq:term4_zeta1}
\zeta_1 \leq 
\Prob\bracket{\norm{\bW_{i, :}}{2}^2 \gsim m(\log n)\sigma^2} 
\stackrel{\cirthree}{\leq} e^{-c_0 \log n} = n^{-c_0}, 
\end{align}
where in $\cirthree$ we use the fact that $\norm{\bW_{i, :}}{2}^2/\sigma^2$
is a $\chi^2$-RV with freedom $m$ and invoke Lemma~\ref{lemma:chi_square}.
\par \noindent
\textbf{Phase II.} 
Then we upper-bound $\zeta_2$ as  
\begin{align}
\label{eq:term4_zeta2}
\zeta_{2}\leq~& 
\underbrace{\Prob\bracket{
\frac{1}{n-h}\abs{\la \bW_{i, :}, \sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\ra} \gsim \frac{\sigma\sqrt{\log n}}{n} \bigg\|\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\bigg\|_{2}}}_{\defequal \zeta_{2, 1}} \notag \\
+~& \underbrace{\Prob\bracket{\bigg\|\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\bigg\|_{2}^2\gsim mnp(\log n)^3 \sigma^2}}_{\defequal \zeta_{2,2}}.
\end{align}
For term $\zeta_{2, 1}$, we exploit the independence across
the rows of the matrix $\bW$. 
Conditional on $\set{\bW_{k, :}}_{k\neq i}$, we conclude 
the inner-product $\la \bW_{i, :}, \sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\ra$ to be a Gaussian RV with zero mean and variance 
$\sigma^2\norm{\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}}{2}^2$, which yields 
$\zeta_{2,1} \leq n^{-c}$. 
For term $\zeta_{2, 2}$, we analyze the variance 
$\norm{\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}}{2}^2$, which reads as 
\begin{align}
\label{eq:term4_zeta22}
\zeta_{2, 2}
\leq~& \
\underbrace{\Prob\bracket{\bigg\|\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\bigg\|_{2}^2\gsim m(\log n) \sigma^2\bigg[\sum_{k\neq i} (\Xi^{\pi^{*}(i), j}_{k})^2\bigg],~\sum_{k\neq i} (\Xi^{\pi^{*}(i), j}_{k})^2 \lsim (\log n)^2 np}}_{\defequal \zeta_{2, 2,1} } \notag \\
+ ~&\underbrace{\Prob\bracket{\sum_{k\neq i} (\Xi^{\pi^{*}(i), j}_{k})^2 \gsim 
(\log n)^2 np}}_{\defequal \zeta_{2,2,2}}. 
\end{align}
Due to the independence across $\bX$ and $\bW$, 
we can verify $\norm{\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}}{2}^2/[\sigma^2\sum_{k\neq i} (\Xi^{\pi^{*}(i), j}_{k})^2]$
to be a $\chi^2$-RV with freedom $m$ conditional on $\bX$. 
Invoking Lemma~\ref{lemma:chi_square}, we can upper-bound $\zeta_{2, 2, 1}$ as 
\begin{align}
\label{eq:term4_zeta221}
\zeta_{2, 2, 1} \leq 
\Prob\bracket{\bigg\|\sum_{k\neq i} \Xi^{\pi^{*}(i), j}_{k}
\bW_{k, :}\bigg\|_{2}^2\gsim m(\log n) \sigma^2\bigg[\sum_{k\neq i} (\Xi^{\pi^{*}(i), j}_{k})^2\bigg]} \leq n^{-c}. 
\end{align}
As for $\zeta_{2, 2, 2}$, we condition on event $\calE_5$ and have 
\begin{align}
\label{eq:term4_zeta222}
\zeta_{2, 2, 2} \leq \Prob\bracket{\norm{\bX \bX_{\pi^{*}(i), :}}{2} + \norm{\bX \bX_{j, :}}{2} \gsim (\log n)\sqrt{np}} = 0.
\end{align}
Then the proof is complete by combining
~\eqref{eq:term4_tot}, ~\eqref{eq:term4_zeta1}, 
~\eqref{eq:term4_zeta2}, ~\eqref{eq:term4_zeta22}, 
~\eqref{eq:term4_zeta221}, and ~\eqref{eq:term4_zeta222}.


\end{proof}



% ==============================================
\section{Supporting Lemmas}
\label{subsec:appendix_proof_multi_observe_support_lemma}

\begin{lemma}
\label{lemma:x_row_norm_ub}
For an arbitrary row $\bX_{i, :}$, we have 
\begin{align*}
\norm{\bB^{*\rmt}\bX_{i, :}}{2} \lsim \sqrt{\log n} \Fnorm{\bB^{*}},  
\end{align*}
with probability exceeding $1 - n^{-c}$. 
\end{lemma}

\begin{proof}
This lemma is a direct consequence of the 
Hanson--Wright inequality~\citep{vershynin2018high}.
Easily we can verify 
$\Expc \norm{\bB^{*\rmt}\bX_{i,:}}{2}^2 = \Fnorm{\bB^{*}}^2$ and   
hence
\[
\Prob\bracket{ \norm{\bB^{*\rmt}\bX_{i,:}}{2}^2 \gsim \log n \Fnorm{\bB^{*}}^2} 
{\leq}~& 
\Prob\bracket{ \abs{\norm{\bB^{*\rmt}\bX_{i,:}}{2}^2 - \Fnorm{\bB^{*}}^2} \gsim (\log n) \Fnorm{\bB^{*}}^2} \\
\leq~& \exp\bracket{-c_0 \min\bracket{\frac{\log n \Fnorm{\bB^{*}}^2}{\Opnorm{\bB^{*}}^2} \vcap 
\frac{(\log^2 n) \Fnorm{\bB^{*}}^4}{\Fnorm{\bB^{*}}^4}}} \leq n^{-1-c}. 
\]
Adopting the union bound, we have 
\[
\Prob\bracket{ \norm{\bB^{*\rmt}\bX_{i,:}}{2}^2 \gsim \log n \Fnorm{\bB^{*}}^2,~\exists~i} 
\leq n\cdot n^{-1-c} = n^{-c}.
\]
\end{proof}


\begin{lemma}
\label{lemma:inner_product}	
For an arbitrary row $\bX_{i, :}$ (or $\bX_{i, :}^{'}$), we have 
\[
\la \bX_{i_1, :}, \bX_{j_1, :}^{'}\ra ~&\lsim \sqrt{p\log n}; \\
\la \bX_{i_2, :}, \bX_{j_2, :}\ra ~&\lsim \sqrt{p\log n},~~i_2\neq j_2; \\ 
\la \bX^{'}_{i_3, :}, \bX^{'}_{j_3, :}\ra ~&\lsim \sqrt{p\log n},~~i_3 \neq j_3, 
\]
hold with probability $1-n^{-c}$.
\end{lemma}

\begin{lemma}
\label{lemma:Xmat_fnorm}
We conclude $\Prob\bracket{\calE_4} \geq 1-n e^{-cnp}$. 
\end{lemma}
This lemma is a direct consequence of Lemma~\ref{lemma:chi_square} and hence its 
proof is omitted.


% ==========================
\begin{lemma}
\label{lemma:event5}
Conditional on the intersection of 
events $\calE_2 \bigcap \calE_3 \bigcap \calE_4$, we have 
$\Prob\bracket{\calE_5}\geq 1 - c_0 n^{-c_1}$.
\end{lemma}

\begin{proof}
For a fixed row index $s$ ($1\leq s\leq n$),
we have 
\[
& \Prob\bracket{\norm{\bX\bX_{s, :}}{2} \gsim (\log n)\sqrt{np}} \\
\stackrel{\cirone}{\leq}~& \Prob\bracket{\norm{\bracket{\bX - \sampminus{\bX}{s}} \bX_{s, :} }{2} \gsim 
p\log n}
+ \Prob\bracket{\norm{\sampminus{\bX}{s} \bX_{s, :}}{2} \gsim (\log n)\sqrt{np}} \\
\stackrel{\cirtwo}{\leq}~& 
\underbrace{\Prob\bracket{\bracket{\norm{\bX_{s, :}}{2} 
+ \|\bX_{s, :}^{'}\|_{2}}\norm{\bX_{s, :}}{2} \gsim p\log n}}_{\defequal \zeta_1}
+ \underbrace{\Prob\bracket{\norm{\sampminus{\bX}{s} \bX_{s, :}}{2} \gsim (\log n)\sqrt{np}}}_{\defequal \zeta_2 }, 
\]  
where in $\cirone$ we use the union bound and 
the fact $n\geq p$; and in 
$\cirtwo$ we use the definition of $\sampminus{\bX}{s}$ such that 
the difference $\bX - \sampminus{\bX}{s}$ only has non-zero elements in the 
$s$th column. 
Conditional on the intersection of 
events $\calE_2 \bigcap \calE_3 \bigcap \calE_4$, we conclude that 
probability $\zeta_1$ is zero and 
probability $\zeta_2$ is upper-bounded as 
\[
\Prob\bracket{\norm{\sampminus{\bX}{s} \bX_{s, :}}{2} \gsim (\log n)\sqrt{np}} 
\leq~&\Prob\bracket{\abs{\norm{\sampminus{\bX}{s} \bX_{s, :} }{2}^2 - \Fnorm{\sampminus{\bX}{s}}^2} \gsim (\log^2 n)np } \\
\leq~& \exp\bracket{-c_0 \bracket{\frac{(\log^2 n)np }{\opnorm{\sampminus{\bX}{s}^{\rmt}\sampminus{\bX}{s}}} \vcap \frac{(\log n)^4 n^2p^2}{ \Fnorm{\sampminus{\bX}{s}^{\rmt}\sampminus{\bX}{s}}^2}}} \leq n^{-c}.
\]
Thus the proof is completed by invoking the union bound 
since 
\[
\Prob\bracket{\norm{\bX\bX_{s, :}}{2} \gsim (\log n)\sqrt{np},~\exists~s} 
\leq n \cdot \Prob\bracket{\norm{\bX\bX_{s, :}}{2} \gsim (\log n)\sqrt{np}} \
\leq n\bracket{\zeta_1 + \zeta_2} \leq n^{1-c} = n^{-c^{'}}.
\]	
\end{proof}

% ===================================== 
\begin{lemma}
\label{lemma:Bmat_perturb}
Conditional on $\calE_4$, 
we have $\Prob(\calE_6) \geq 1- c_0 p^{-2}$.	
\end{lemma}
\begin{proof}
We assume that 
the first $h$ rows of $\bX$ are permuted w.l.o.g.  
Due to the iid distribution of 
$\{\bX_{i, :}\}_{i=1}^n$ and $\{\bX^{'}_{i, :}\}_{i=1}^n$, we 
conclude 
\begin{align}
\label{eq:event6_tot}
\Prob(\br{\calE}_6) \leq 
n^2 \Prob\bracket{\norm{\bB^{*} - \wt{\bB}}{2}\gsim \
\frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}}}. 
\end{align}
First, we expand $\bX^{\rmt}\bPi^{*}\bX$ as 
\begin{align*}
\bX^{\rmt}\bPi^{*}\bX = 
\sum_{i=1}^h \bX_{\pi(i), :}\bX_{i, :}^{\rmt} + \
\sum_{i=h+1}^n \bX_{i, :}\bX_{i, :}^{\rmt}, 
\end{align*}
and obtain 
\begin{align*}
& \Prob\bracket{\norm{\bB^{*} - \wt{\bB}}{2}\gsim \
\frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}} } \\
\leq~& \
\Prob\bracket{\frac{1}{n-h}\Fnorm{\sum_{i=1}^h \bX_{\pi(i), :}\bX_{i, :}^{\rmt} \bB^{*} } + \frac{1}{n-h}\Fnorm{\sum_{i=h+1}^n \bracket{\bX_{i, :}\bX^{\rmt} _{i, :} -\bI}\bB^{*}}\gsim \frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}}  } \\
\stackrel{\cirone}{\leq}~&\
\underbrace{\Prob\bracket{\frac{1}{n-h}\Fnorm{\sum_{i=1}^h \bX_{\pi(i), :}\bX^{\rmt}_{i, :} \bB^{*} } \gsim    \frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}} } }_{\zeta_1} \\
+~& \
\underbrace{\Prob\bracket{\frac{1}{n-h}\Fnorm{\sum_{i=h+1}^n \bracket{\bX_{i, :} \bX_{i, :}^{\rmt} -\bI}\bB^{*}}\gsim  
\frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}}
 }}_{\zeta_2}, 
\end{align*} 
where $\cirone$ is because of the union bound. 
The proof is complete by 
proving $\zeta_1 \leq 6n^{-2}p^{-2}$ and $\zeta_2 \leq 4n^{-2}p^{-2}$. 
The computation details come as follows. 

% ========================
\noindent  
\textbf{Phase I: Bounding $\zeta_1$.}
According to Lemma~$8$ in~\citet{pananjady2016linear} (restated as Lemma~\ref{lemma:permute_decomp}), 
we can decompose the set $\set{j: \pi(j)\neq j}$
into three disjoint sets $\calI_i$, $1\leq i \leq 3$,
such that $j$ and $\pi(j)$ do not lie in the same set. 
Moreover, the cardinality $h_i$ of the set $\calI_i$ satisfies
$\lfloor h/5 \rfloor \leq h_i \leq h/3$. 
Adopting the union bound, we can upper-bound 
$\zeta_1$ as  
\begin{align}
\zeta_1 \leq~& \sum_{i=1}^3 \Prob\bracket{\frac{1}{n-h}\Fnorm{\sum_{j\in \calI_i} 
\bX_{\pi(j), :}\bX_{j, :}^{\rmt}\bB^{*}} \gsim 
\frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}}} \notag \\
\leq ~&
\sum_{i=1}^3 \Prob\bracket{\frac{1}{n-h}\Opnorm{\sum_{j\in \calI_i} 
\bX_{\pi(j), :}\bX_{j, :}^{\rmt}} \gsim
\frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}}.
\label{eq:event6_zeta1}
\end{align}
Defining $\bZ_i$ as  
$\bZ_i = \sum_{j\in \calI_i}\bX_{\pi(j), :}\bX^{\rmt}_{j, :}$,  
we would bound the above probability by invoking the matrix 
Bernstein inequality (Theorem~$7.3.1$ in~\citet{tropp2015introduction}). 
First, we have 
\begin{align*}
\Expc\bracket{\bX_{\pi(j), :}\bX^{\rmt}_{j, :}} = 
\bracket{\Expc\bX_{\pi(j), :}}\bracket{ \Expc \bX_{j, :}}^{\rmt} = \bZero,
\end{align*}
due to the independence between $\bX_{\pi(j), :}$ and $\bX_{j, :}$. 
Then we upper bound $\norm{\bX_{\pi(j), :}\bX_{j, :}^{\rmt}}{2}$ as 
\begin{align*}
\norm{\bX_{\pi(j), :}\bX_{j, :}^{\rmt}}{2} \stackrel{\cirtwo}{=} \
\Fnorm{\bX_{\pi(j), :}\bX_{j, :}^{\rmt}} \stackrel{\cirthree}{=} \
\norm{\bX_{\pi(j), :}}{2} \norm{\bX_{j, :}}{2} \stackrel{\cirfour}{\lsim} 
p\log n, 
\end{align*}
where $\cirtwo$ is because $\bX_{\pi(j), :}\bX_{j, :}^{\rmt}$ is rank-$1$, 
$\cirthree$ is due to the fact $\Fnorm{\bu\bv^{\rmt}}^2 = \trace\bracket{\bu \bv^{\rmt}\bv \bu^{\rmt}} = \norm{\bu}{2}^2 \norm{\bv}{2}^2$ for 
arbitrary vector $\bu, \bv \in \RR^p$, and 
$\cirfour$ is because of event $\calE_3$. 
\par 
In the end, 
we compute $\Expc\bracket{\bZ_i\bZ_i^{\rmt}}$ and $\Expc\bracket{\bZ_i^{\rmt}\bZ_i}$ as
\begin{align*}
& \Expc\bracket{\bZ_i\bZ_i^{\rmt}} = 
\Expc\bracket{\sum_{j_1, j_2\in \calI_i} \bX_{\pi(j_1), :}\bX^{\rmt}_{j_1, :}\bX_{j_2, :} \bX^{\rmt}_{\pi(j_2), :} } \stackrel{\cirfive}{=} \
\Expc\bracket{\sum_{j \in \calI_i} \bX_{\pi(j), :}\bX_{j, :}^{\rmt}\bX_{j, :} \bX^{\rmt}_{\pi(j), :} }
\\ \stackrel{\cirsix}{=} ~& \
\Expc\bracket{\sum_{j\in \calI_i} \bX_{\pi(j), :}\Expc\bracket{\bX^{\rmt}_{j, :}\bX_{j, :} }   \bX^{\rmt}_{\pi(j), :}   } = 
p \bracket{\sum_{j\in\calI_i} \Expc \bX_{\pi(j), :} \bX^{\rmt}_{\pi(j), :}} = 
ph_i \bI_{p\times p} = \Expc\bracket{\bZ_i^{\rmt}\bZ_i}, 
\end{align*}
where $\cirfive$ and $\cirsix$ are due to the 
fact that $j$ and $\pi(j)$ are not within the set $\calI_i$ simultaneously. 
To sum up, we invoke the matrix Bernstein inequality (Theorem~$7.3.1$ in~\citet{tropp2015introduction})
and have 
\begin{align*}
\frac{1}{n-h}\Opnorm{\sum_{j\in \calI_i}\bX_{\pi(j), :}\bX_{j, :}^{\rmt}} 
\leq~&
\frac{p (\log n) \log(n^2 p^3)}{3 (n-h)}+\frac{\sqrt{p^2 (\log^2 n) \log^2\left(n^2 p^3\right) + 18 p h_i \log \left(n^2 p^3\right)}}{(n-h)} \\
\stackrel{\cirseven}{\lsim}~& \frac{p (\log n) \log(n^2 p^3)}{n} + \
\frac{p}{n}\sqrt{(\log^2 n) \log^2\left(n^2 p^3\right)  + \frac{n}{p}(\log n^2 p^3)} \\
\stackrel{\cireight}{\lsim}~& \frac{p (\log n) \log(n^2 p^3)}{n}
+  \frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}} 
\stackrel{\cirnine}{\lsim} \frac{(\log n)(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}
\end{align*} 
holds with probability $1 - 2(np)^{-2}$, 
where in $\cirseven$, $\cireight$, and $\cirnine$ we
use the facts that 
$h\leq n/4$ and $h_i \leq h/3$. Hence we can show 
$\zeta_1$ in~\eqref{eq:event6_zeta1} to be less than 
$6n^{-2}p^{-2}$.
% ============================================  
\par \noindent  
\textbf{Phase II: Bounding $\zeta_2$.}
We upper bound $\zeta_2$ as 
\begin{align*}
\zeta_2 \leq~& \
\Prob\bracket{\frac{1}{n-h}\Fnorm{\sum_{i=h+1}^n \bracket{\bX_{i, :} \bX_{i, :}^{\rmt} -\bI}\bB^{*}}\gsim \frac{(\log n)(\log n^2p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}} } \\
\leq ~&\Prob\bracket{
\Opnorm{\sum_{i=h+1}^n \bracket{\bX_{i, :} \bX_{i, :}^{\rmt} -\bI}}
\gsim (\log n)(\log n^2 p^3)\sqrt{np} }.
\end{align*}
Similar to above, we define 
$\wt{\bZ}_i = \bX_{i, :} \bX_{i, :}^{\rmt} -\bI$. 
First, we verify that 
$\Expc \wt{\bZ}_i = \bZero$ and that the 
$\wt{\bZ}_i$ are independent. Then we bound 
$\Opnorm{\wt{\bZ}_i}$ as 
\[
\Opnorm{\wt{\bZ}_i} \leq \Opnorm{\bX_{i, :} \bX^{\rmt}_{i, :}} + 
\Opnorm{\bI} \stackrel{\cira}{=} \
\norm{\bX_{i, :}}{2}^2 + 1 
\stackrel{\cirb}{\lsim} p\log n + 1 \lsim p\log n, 
\]
where in $\cira$ 
we use $\Opnorm{\bu \bu^{\rmt}} = \norm{\bu}{2}^2$
for arbitrary vector $\bu$, in 
$\cirb$ we condition on event $\calE_4$.
In the end, we compute 
$\Expc(\wt{\bZ}_i\wt{\bZ}_i^{\rmt})$ as 
\[
\Expc\bracket{\wt{\bZ}_i \wt{\bZ}_i^{\rmt}} = 
\Expc \bracket{\norm{\bX_{i, :}}{2}^2\bX_{i, :} \bX^{\rmt}_{i, :}}
- \bI \preceq 
p\log n\bracket{\Expc\bracket{\bX_{i, :}\bX^{\rmt}_{i, :}}} - \bI 
\preceq (p\log n)\bI. 
\]
Invoking the matrix Bernstein inequality 
(Theorem $7.3.1$ in~\citet{tropp2015introduction}),
we conclude 
\[
\zeta_2 \leq 4p
\exp\bracket{-\frac{3 n(\log n) \log ^2\left(n^2 p^3\right) }{\sqrt{n p} (\log n) \log \left(n^2 p^3\right)+6}}
\stackrel{\circc}{\leq}  4n^{-2}p^{-2}, 
\]
where in $\circc$ we use the
fact $n\gsim p$.  
\end{proof}

% ============================================
\begin{lemma}
\label{lemma:xb_single_idx}
Conditional on the intersection of 
events $\calE_2 \bigcap \calE_3$, we conclude 
\[
\norm{\bracket{\wt{\bB} - \wtminus{\bB}{s}}^{\rmt}\bX_{s, :}}{2} 
\lsim \frac{p\log n}{n}\Fnorm{\bB^{*}}.
\] 	
\end{lemma}
\begin{proof}
Here we focus on the case when $\pi(s) = s$. 
The proof of the case when $\pi(s) \neq s$ can be completed 
effortlessly by following a similar strategy. 
First, we notice 
\begin{align*}
\norm{\bracket{\wt{\bB} - \wtminus{\bB}{s}}^{\rmt}\bX_{s,:} }{2} =~& 
\bracket{n-h}^{-1}
\norm{\bB^{*\rmt} 
\bracket{\wt{\bX}_{s, :} \wt{\bX}^{\rmt}_{s, :} - 
\bX_{s, :}\bX^{\rmt}_{s, :} } \bX_{s,:} }{2}\\
\leq~& \bracket{n-h}^{-1} 
\bracket{\abs{\la \bX_{s, :}, \wt{\bX}_{s, :}\ra }\
\norm{ \bB^{*\rmt} \wt{\bX}_{s, :}}{2}   + \norm{\bX_{s, :}}{2}^2 \cdot \norm{\bB^{*\rmt}\bX_{s, :}}{2} }. 
\end{align*}
Conditional on the intersection of events 
$\calE_2 \bigcap \calE_3$, we conclude 
\[
\norm{\bracket{\wtminus{\bB}{s} - \wt{\bB}}^{\rmt} \bX_{s,:} }{2}
\lsim \frac{p\log n}{n-h} \Fnorm{\bB^{*}} \asymp
\frac{p\log n}{n} \Fnorm{\bB^{*}}.
\]
\end{proof}

Following the same strategy, we can prove the following lemma. 
\begin{lemma}
\label{lemma:xb_multi_indices}
Conditional on the intersection of events 
$\calE_2 \bigcap \calE_3$, we conclude
\[
\norm{\bracket{\wt{\bB} - \wtminus{\bB}{s, t}}^{\rmt}\bX_{s, :} }{2} 
\lsim \frac{p\log n}{n}\Fnorm{\bB^{*}}.
\] 	
\end{lemma}

% =================================================== 
\begin{lemma}
\label{lemma:beta_perturb}
Conditional on the intersection of 
events $\calE_6\bigcap \calE_7 \bigcap \calE_8$, we 
conclude $\Prob(\calE_9) \geq 1- c_0 n^{-c_1}$.
\end{lemma}

\begin{proof}
We adopt the leave-one-out trick and construct 
the matrix $\wtminus{\bB}{i}$ as 
\[
\wtminus{\bB}{i} = 
(n-h)^{-1}\bigg( 
\sum_{\substack{k \neq i \\ \pi^{*}(k) \neq i}} \bX_{\pi(k), :}\bX_{k, :}^{\rmt}
+ \sum_{k = i \textup{ or } \pi^{*}(k) = i} \wt{\bX}_{\pi(k), :}\wt{\bX}^{\rmt}_{k, :}\bigg)\bB^{*},
\]
where $\wt{\bX}_{i, :}$ is an independent copy of $\bX_{i, :}$.
Adopting the union bound, we conclude 
\begin{align*}
& \Prob\bracket{
\norm{(\wt{\bB} - \bB^{*})^{\rmt} \bX_{i, :}}{2} 
\gsim \frac{(\log n)^{\nfrac{3}{2}}(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}} } \\
\leq ~&
\Prob\bracket{
\norm{(\bB^{*} - \wtminus{\bB}{i})^{\rmt}\bX_{i, :} }{2}
+ 
\norm{(\wtminus{\bB}{i} - \wt{\bB})^{\rmt} \bX_{i, :}}{2} \gsim  
\frac{(\log n)^{\nfrac{3}{2}}(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}}
} \\
\leq~& \underbrace{\Prob\bracket{
\norm{(\bB^{*} - \wtminus{\bB}{i})^{\rmt}\bX_{i, :} }{2}
\gsim \frac{(\log n)^{\nfrac{3}{2}}(\log n^2 p^3)\sqrt{p}}{\sqrt{n}}\Fnorm{\bB^{*}}}}_{\defequal \zeta_1} \\
+~& 
\underbrace{\Prob\bracket{
\norm{(\wtminus{\bB}{i} - \wt{\bB})^{\rmt} \bX_{i, :}}{2} \gsim 
\frac{p\log n}{n}\Fnorm{\bB^{*}}}}_{\defequal \zeta_2}.
\end{align*}
First, we study the probability $\zeta_1$.
Due to the construction of $\wtminus{\bB}{i}$, we have 
$\bX_{i, :}$ to be independent of 
$\bB^{*} - \wtminus{\bB}{i}$. Conditional 
on $\bB^{*} - \wtminus{\bB}{i}$, we conclude 
\[
\zeta_1 \stackrel{\cirone}{\leq}  
\Prob\bracket{\norm{(\bB^{*} - \wtminus{\bB}{i})^{\rmt}\bX_{i, :} }{2} \geq \sqrt{\log n} \Fnorm{\bB^{*} - \wtminus{\bB}{i}} } 
\leq n^{-c}, 
\]
where in $\cirone$ we condition on event $\calE_6$ 
such that $\Fnorm{\bB^{*} - \wtminus{\bB}{i}} \lsim (\log n)(\log n^2 p^3)\sqrt{\nfrac{p}{n}}\Fnorm{\bB^{*}}$.
As for probability $\zeta_2$, we conclude it to be zero 
conditional on $\calE_7$. 
Thus the proof is completed. 
\end{proof}




% ==========================================
\section{Supplementary Material: Useful Facts}
% ===================================
This section lists some useful  
facts to keep the paper self-contained. 
% ===========================================
\begin{lemma}
\label{lemma:chi_square}
For a $\chi^2$-RV $Z$ with $\ell$ degrees of freedom, we have
\begin{align*}
\Prob\bracket{Z\leq t} \leq \exp\bracket{\frac{\ell}{2}\bracket{\log \frac{t}{\ell} - \frac{t}{\ell} + 1}},~~t < \ell;  \\
\Prob\bracket{Z\geq t} \leq \exp\bracket{\frac{\ell}{2}\bracket{\log \frac{t}{\ell} - \frac{t}{\ell} + 1}},~~ t >  \ell.  
\end{align*}
\end{lemma}

% ================================
\begin{lemma}[Lemma~$8$ in~\citet{pananjady2016linear}]
\label{lemma:permute_decomp}
Consider an arbitrary permutation map $\pi$ 
with Hamming distance $h$ from the 
identity map, i.e., $\dh\bracket{\bpi, \bI} = h$. 
We define the index 
set $\set{i:~i\neq \pi(i)}$ and can 
decompose it into $3$ independent 
sets $\calI_i$ $(1\leq i \leq 3)$ such that 
the cardinality of each set satisfies 
$|\calI_i|\geq \lfloor h/3\rfloor \geq h/5$.
\end{lemma}



\begin{lemma}[Theorem $1.3$ in~\citet{paouris2012small}]
\label{lemma:small_ball_log_concave}
Let $\bg \in \RR^n$ be an isotropic log-concave random vector 
with sub-gaussian constant $K$, and let $\bA$ be a non-zero 
$n\times n$ matrix. For any $\by\in \RR^n$ and 
$\varepsilon\in (0, c_1)$, one has 
\begin{align*}
\Prob\bracket{\norm{\by - \bA\bg}{2} \leq \varepsilon \Fnorm{\bA}}
\leq \exp\bracket{\kappa(K)\srank{\bA}\log \varepsilon }, 
\end{align*}
where $\kappa = c_1/K^2$. 	
\end{lemma}

\bibliography{zhang_696}



\end{document}




