\documentclass[accepted]{uai2023} % for initial submission

% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[american]{babel}


\usepackage{algcompatible}
\usepackage{algorithm}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{bbm}
\usepackage{amsthm}

\usepackage{pdfrender}

\usepackage{bbm}
\usepackage{amsthm}

\usepackage{thmtools} 
\usepackage{thm-restate}

\usepackage{makecell}

\usepackage{amssymb,amsmath,balance}

\usepackage{amsfonts}
\usepackage{arydshln}

\usepackage{mathtools} 
\usepackage{nicefrac}

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\RequirePackage{mathrsfs}
\RequirePackage{mdframed}

% Paper-specific notation. Macro bodies are only expanded at use time, so the
% helpers referenced here (\hvarphi, \tvarphi, \wh, \bSigma, \bTheta) may be
% defined later in the preamble.
\def\ii{\mathbbm{j}} % the letter j in \mathbbm
% \hsamp{a}{b}, \tsamp{a}{b}, \samp{a}{b}: \hvarphi / \tvarphi / \varphi with
% superscript (a) and subscript b.
\def\hsamp#1#2{\hvarphi^{(#1)}_{#2}}
\def\tsamp#1#2{\tvarphi^{(#1)}_{#2}}
\def\samp#1#2{\varphi^{(#1)}_{#2}}
% \hftr{a}{b}, \ftr{a}{b}: \wh{F}^{tr} applied to \wh{X}^{(a)}_{b} and to
% X^{(a)}_{b}, respectively.
\def\hftr#1#2{\wh{F}^{\textup{tr}}(\wh{X}^{(#1)}_{#2})}
\def\ftr#1#2{\wh{F}^{\textup{tr}}({X}^{(#1)}_{#2})}

% "param" and "nonparam" variants currently expand to the same symbol.
\def\covmatparam{\bSigma^{\natural}}
\def\covmatnonparam{\bSigma^{\natural}}
\def\precmatparam{\bTheta^{\natural}}
\def\precmatnonparam{\bTheta^{\natural}}
% \red{text}: text in red. NOTE: \red is redefined with \def near the end of
% the preamble; that later definition is the one in effect in the document.
\newcommand{\red}[1]{\textcolor{red}{#1}}



% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page),
% comment out the following \usepackage line.
\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

\usepackage{enumitem}

% Theorem-like environments. Each one has its own counter, except "remark",
% which is numbered on the counter of "theorem".
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}


% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}


\allowdisplaybreaks


% \circled{x}: x inside a circle, typeset at \small size.
% \small is a declaration, not a one-argument command, so it is confined to
% its own group here; otherwise the size change would stay in effect for
% whatever follows the symbol in the enclosing group.
\newcommand{\circled}[1]{{\small\raisebox{.6pt}{\textcircled{\raisebox{-.8pt}{#1}}}}}

% Shorthands for circled digits 1-9 (\cirone ... \cirnine) and circled capital
% letters A-Z (\cira ... \cirz). The letter C is \circc (double c), not \circ;
% presumably to leave the standard math symbol \circ untouched.
\def\cirone{\circled{1}}
\def\cirtwo{\circled{2}}
\def\cirthree{\circled{3}}
\def\cirfour{\circled{4}}
\def\cirfive{\circled{5}}
\def\cirsix{\circled{6}}
\def\cirseven{\circled{7}}
\def\cireight{\circled{8}}
\def\cirnine{\circled{9}}
\def\cira{\circled{A}}
\def\cirb{\circled{B}}
\def\circc{\circled{C}}
\def\cird{\circled{D}}
\def\cire{\circled{E}}
\def\cirf{\circled{F}}
\def\cirg{\circled{G}}
\def\cirh{\circled{H}}
\def\ciri{\circled{I}}
\def\cirj{\circled{J}}
\def\cirk{\circled{K}}
\def\cirl{\circled{L}}
\def\cirm{\circled{M}}
\def\cirn{\circled{N}}
\def\ciro{\circled{O}}
\def\cirp{\circled{P}}
\def\cirq{\circled{Q}}
\def\cirr{\circled{R}}
\def\cirs{\circled{S}}
\def\cirt{\circled{T}}
\def\ciru{\circled{U}}
\def\cirv{\circled{V}}
\def\cirw{\circled{W}}
\def\cirx{\circled{X}}
\def\ciry{\circled{Y}}
\def\cirz{\circled{Z}}


% \boldcheckmark: \checkmark rendered through pdfrender in fill+stroke mode.
% The stroke is 0.5pt wide; half of that width lies outside the normal glyph
% outline, which is what thickens the mark.
\newcommand*{\boldcheckmark}{%
  \textpdfrender{TextRenderingMode=FillStroke,LineWidth=.5pt}{\checkmark}%
}



% =====================================
% Numbering 
% =====================================
%\setcounter{section}{0}
%\def\thesection{\arabic{section}}
%\setcounter{page}{1}
%
%\renewcommand{\thefootnote}{\fnsymbol{footnote}}

% ====================================
% Formula Environment
% ====================================
%\renewcommand{\theequation}{\thesection.\arabic{equation}}
%\numberwithin{equation}{section}
%\numberwithin{equation}{section}

% Redefines \[ ... \] so that its contents are typeset inside an align*
% environment (this replaces the usual display-math meaning of \[).
% The argument is everything up to the next \]; because this is a plain \def,
% that argument must not contain a blank line (\par).
\def\[#1\]{\begin{align*}#1\end{align*}}


% ========================
% Letter
% ========================
% From here on every \hat is a wide hat.
\renewcommand*{\hat}{\widehat}

% Bold helpers; \ensuremath lets them be used outside math mode as well.
%   \bfm, \bfsym : \boldsymbol (the two are defined identically)
%   \bfmcap      : \mathbf
\newcommand*{\bfm}[1]{\ensuremath{\boldsymbol{#1}}}
\newcommand*{\bfmcap}[1]{\ensuremath{\mathbf{#1}}}
\newcommand*{\bfsym}[1]{\ensuremath{\boldsymbol{#1}}}


% Bold digits: \bone is a bold 1.
% NOTE(review): despite its name, \btwo expands to a bold 0, not a bold 2;
% confirm whether that is intended before relying on the name.
\def\bone{\bfm 1}
\def\btwo{\bfm 0}
%\def\ba{\bfm a}   \def\bA{\bfm A}  \def\AA{\mathbb{A}}
%\def\bb{\bfm b}   \def\bB{\bfm B}  \def\BB{\mathbb{B}}
%\def\bc{\bfm c}   \def\bC{\bfm C}  \def\CC{\mathbb{C}}
%\def\bd{\bfm d}   \def\bD{\bfm D}  \def\DD{\mathbb{D}}
%\def\be{\bfm e}   \def\bE{\bfm E}  \def\EE{\mathbb{E}}
%\def\bff{\bfm f}  \def\bF{\bfm F}  \def\FF{\mathbb{F}}
%\def\bg{\bfm g}   \def\bG{\bfm G}  \def\GG{\mathbb{G}}
%\def\bh{\bfm h}   \def\bH{\bfm H}  \def\HH{\mathbb{H}}
%\def\bi{\bfm i}   \def\bI{\bfm I}  \def\II{\mathbb{I}}
%\def\bj{\bfm j}   \def\bJ{\bfm J}  \def\JJ{\mathbb{J}}
%\def\bk{\bfm k}   \def\bK{\bfm K}  \def\KK{\mathbb{K}}
%\def\bl{\bfm l}   \def\bL{\bfm L}  \def\LL{\mathbb{L}}
%\def\bm{\bfm m}   \def\bM{\bfm M}  \def\MM{\mathbb{M}}
%\def\bn{\bfm n}   \def\bN{\bfm N}  \def\NN{\mathbb{N}}
%\def\bo{\bfm o}   \def\bO{\bfm O}  \def\OO{\mathbb{O}}
%\def\bp{\bfm p}   \def\bP{\bfm P}  \def\PP{\mathbb{P}}
%\def\bq{\bfm q}   \def\bQ{\bfm Q}  \def\QQ{\mathbb{Q}}
%\def\brr{\bfm r}  \def\bR{\bfm R} \def\RR{\mathbb{R}}
%\def\bs{\bfm s}   \def\bS{\bfm S}  \def\SS{\mathbb{S}}
%\def\bt{\bfm t}   \def\bT{\bfm T}  \def\TT{\mathbb{T}}
%\def\bu{\bfm u}   \def\bU{\bfm U}  \def\UU{\mathbb{U}}
%\def\bv{\bfm v}   \def\bV{\bfm V}  \def\VV{\mathbb{V}}
%\def\bw{\bfm w}   \def\bW{\bfm W}  \def\WW{\mathbb{W}}
%\def\bx{\bfm x}   \def\bX{\bfm X}  \def\XX{\mathbb{X}}
%\def\by{\bfm y}   \def\bY{\bfm Y}  \def\YY{\mathbb{Y}}
%\def\bz{\bfm z}   \def\bZ{\bfm Z}  \def\ZZ{\mathbb{Z}}

% Letter shorthands, three per letter x:
%   \bx : lowercase letter via \bfm,
%   \bX : uppercase letter via \bfmcap,
%   \XX : uppercase letter in \mathbb.
% Irregular names: \bff (letter f) and \brr (letter r).
% NOTE(review): \def replaces any existing meaning without warning. This
% affects the LaTeX text commands \AA and \SS, and would affect \bm if the bm
% package were loaded; confirm none of those original meanings are needed.
\def\ba{\bfm a}   \def\bA{\bfmcap A}  \def\AA{\mathbb{A}}
\def\bb{\bfm b}   \def\bB{\bfmcap B}  \def\BB{\mathbb{B}}
\def\bc{\bfm c}   \def\bC{\bfmcap C}  \def\CC{\mathbb{C}}
\def\bd{\bfm d}   \def\bD{\bfmcap D}  \def\DD{\mathbb{D}}
\def\be{\bfm e}   \def\bE{\bfmcap E}  \def\EE{\mathbb{E}}
\def\bff{\bfm f}  \def\bF{\bfmcap F}  \def\FF{\mathbb{F}}
\def\bg{\bfm g}   \def\bG{\bfmcap G}  \def\GG{\mathbb{G}}
\def\bh{\bfm h}   \def\bH{\bfmcap H}  \def\HH{\mathbb{H}}
\def\bi{\bfm i}   \def\bI{\bfmcap I}  \def\II{\mathbb{I}}
\def\bj{\bfm j}   \def\bJ{\bfmcap J}  \def\JJ{\mathbb{J}}
\def\bk{\bfm k}   \def\bK{\bfmcap K}  \def\KK{\mathbb{K}}
\def\bl{\bfm l}   \def\bL{\bfmcap L}  \def\LL{\mathbb{L}}
\def\bm{\bfm m}   \def\bM{\bfmcap M}  \def\MM{\mathbb{M}}
\def\bn{\bfm n}   \def\bN{\bfmcap N}  \def\NN{\mathbb{N}}
\def\bo{\bfm o}   \def\bO{\bfmcap O}  \def\OO{\mathbb{O}}
\def\bp{\bfm p}   \def\bP{\bfmcap P}  \def\PP{\mathbb{P}}
\def\bq{\bfm q}   \def\bQ{\bfmcap Q}  \def\QQ{\mathbb{Q}}
\def\brr{\bfm r}  \def\bR{\bfmcap R} \def\RR{\mathbb{R}}
\def\bs{\bfm s}   \def\bS{\bfmcap S}  \def\SS{\mathbb{S}}
\def\bt{\bfm t}   \def\bT{\bfmcap T}  \def\TT{\mathbb{T}}
\def\bu{\bfm u}   \def\bU{\bfmcap U}  \def\UU{\mathbb{U}}
\def\bv{\bfm v}   \def\bV{\bfmcap V}  \def\VV{\mathbb{V}}
\def\bw{\bfm w}   \def\bW{\bfmcap W}  \def\WW{\mathbb{W}}
\def\bx{\bfm x}   \def\bX{\bfmcap X}  \def\XX{\mathbb{X}}
\def\by{\bfm y}   \def\bY{\bfmcap Y}  \def\YY{\mathbb{Y}}
\def\bz{\bfm z}   \def\bZ{\bfmcap Z}  \def\ZZ{\mathbb{Z}}


% Hatted letters. For each letter x: \hx = \wh{x} and \hbx = \wh{\bfm x},
% and likewise for the capital. \wh is defined further down in the preamble,
% which is fine because \def bodies are only expanded at use time.
% The letters f, i, l, r, t use doubled names: \hff/\hFF, \hii/\hII,
% \hll/\hLL, \hrr/\hRR, \htt/\hTT (bold: \hbff/\hbFF, ..., \hbtt/\hbTT).
\def\ha{\wh{a}}   \def\hba{\wh{\bfm a}}
\def\hA{\wh{A}}   \def\hbA{\wh{\bfm A}}

\def\hb{\wh{b}}   \def\hbb{\wh{\bfm b}}
\def\hB{\wh{B}}   \def\hbB{\wh{\bfm B}}

\def\hc{\wh{c}}   \def\hbc{\wh{\bfm c}}
\def\hC{\wh{C}}   \def\hbC{\wh{\bfm C}}

\def\hd{\wh{d}}   \def\hbd{\wh{\bfm d}}
\def\hD{\wh{D}}   \def\hbD{\wh{\bfm D}}

\def\he{\wh{e}}   \def\hbe{\wh{\bfm e}}
\def\hE{\wh{E}}   \def\hbE{\wh{\bfm E}}

\def\hff{\wh{f}}  \def\hbff{\wh{\bfm f}}
\def\hFF{\wh{F}}  \def\hbFF{\wh{\bfm F}}

\def\hg{\wh{g}}   \def\hbg{\wh{\bfm g}}
\def\hG{\wh{G}}   \def\hbG{\wh{\bfm G}}

\def\hh{\wh{h}}   \def\hbh{\wh{\bfm h}}
\def\hH{\wh{H}}   \def\hbH{\wh{\bfm H}}

\def\hii{\wh{i}}   \def\hbii{\wh{\bfm i}}
\def\hII{\wh{I}}   \def\hbII{\wh{\bfm I}}

\def\hj{\wh{j}}   \def\hbj{\wh{\bfm j}}
\def\hJ{\wh{J}}   \def\hbJ{\wh{\bfm J}}
%
\def\hk{\wh{k}}   \def\hbk{\wh{\bfm k}}
\def\hK{\wh{K}}   \def\hbK{\wh{\bfm K}}

\def\hll{\wh{l}}   \def\hbll{\wh{\bfm l}}
\def\hLL{\wh{L}}   \def\hbLL{\wh{\bfm L}}

\def\hm{\wh{m}}   \def\hbm{\wh{\bfm m}}
\def\hM{\wh{M}}   \def\hbM{\wh{\bfm M}}

\def\hn{\wh{n}}   \def\hbn{\wh{\bfm n}}
\def\hN{\wh{N}}   \def\hbN{\wh{\bfm N}}

\def\ho{\wh{o}}   \def\hbo{\wh{\bfm o}}
\def\hO{\wh{O}}   \def\hbO{\wh{\bfm O}}

\def\hp{\wh{p}}   \def\hbp{\wh{\bfm p}}
\def\hP{\wh{P}}   \def\hbP{\wh{\bfm P}}

\def\hq{\wh{q}}   \def\hbq{\wh{\bfm q}}
\def\hQ{\wh{Q}}   \def\hbQ{\wh{\bfm Q}}

\def\hrr{\wh{r}}   \def\hbrr{\wh{\bfm r}}
\def\hRR{\wh{R}}   \def\hbRR{\wh{\bfm R}}

\def\hs{\wh{s}}   \def\hbs{\wh{\bfm s}}
\def\hS{\wh{S}}   \def\hbS{\wh{\bfm S}}

\def\htt{\wh{t}}   \def\hbtt{\wh{\bfm t}}
\def\hTT{\wh{T}}   \def\hbTT{\wh{\bfm T}}
% \hbtT was the original spelling of \hbTT (it breaks the \hbFF, \hbII,
% \hbLL, \hbRR naming pattern); kept so that existing uses still compile.
\def\hbtT{\wh{\bfm T}}

\def\hu{\wh{u}}   \def\hbu{\wh{\bfm u}}
\def\hU{\wh{U}}   \def\hbU{\wh{\bfm U}}

\def\hv{\wh{v}}   \def\hbv{\wh{\bfm v}}
\def\hV{\wh{V}}   \def\hbV{\wh{\bfm V}}

\def\hw{\wh{w}}   \def\hbw{\wh{\bfm w}}
\def\hW{\wh{W}}   \def\hbW{\wh{\bfm W}}

\def\hx{\wh{x}}   \def\hbx{\wh{\bfm x}}
\def\hX{\wh{X}}   \def\hbX{\wh{\bfm X}}

\def\hy{\wh{y}}   \def\hby{\wh{\bfm y}}
\def\hY{\wh{Y}}   \def\hbY{\wh{\bfm Y}}

\def\hz{\wh{z}}   \def\hbz{\wh{\bfm z}}
\def\hZ{\wh{Z}}   \def\hbZ{\wh{\bfm Z}}


% Calligraphic capitals: \calX and \cX are two names for the same symbol.
% Written with \mathcal{..} instead of the obsolete plain-TeX style switch
% \cal; the outer braces are kept so each macro still expands to one group
% (safe directly after ^ or _).
\def\calA{{\mathcal{A}}} \def\cA{{\mathcal{A}}}
\def\calB{{\mathcal{B}}} \def\cB{{\mathcal{B}}}
\def\calC{{\mathcal{C}}} \def\cC{{\mathcal{C}}}
\def\calD{{\mathcal{D}}} \def\cD{{\mathcal{D}}}
\def\calE{{\mathcal{E}}} \def\cE{{\mathcal{E}}}
\def\calF{{\mathcal{F}}} \def\cF{{\mathcal{F}}}
\def\calG{{\mathcal{G}}} \def\cG{{\mathcal{G}}}
\def\calH{{\mathcal{H}}} \def\cH{{\mathcal{H}}}
\def\calI{{\mathcal{I}}} \def\cI{{\mathcal{I}}}
\def\calJ{{\mathcal{J}}} \def\cJ{{\mathcal{J}}}
\def\calK{{\mathcal{K}}} \def\cK{{\mathcal{K}}}
\def\calL{{\mathcal{L}}} \def\cL{{\mathcal{L}}}
\def\calM{{\mathcal{M}}} \def\cM{{\mathcal{M}}}
\def\calN{{\mathcal{N}}} \def\cN{{\mathcal{N}}}
\def\calO{{\mathcal{O}}} \def\cO{{\mathcal{O}}}
\def\calP{{\mathcal{P}}} \def\cP{{\mathcal{P}}}
\def\calQ{{\mathcal{Q}}} \def\cQ{{\mathcal{Q}}}
\def\calR{{\mathcal{R}}} \def\cR{{\mathcal{R}}}
\def\calS{{\mathcal{S}}} \def\cS{{\mathcal{S}}}
\def\calT{{\mathcal{T}}} \def\cT{{\mathcal{T}}}
\def\calU{{\mathcal{U}}} \def\cU{{\mathcal{U}}}
\def\calV{{\mathcal{V}}} \def\cV{{\mathcal{V}}}
\def\calW{{\mathcal{W}}} \def\cW{{\mathcal{W}}}
\def\calX{{\mathcal{X}}} \def\cX{{\mathcal{X}}}
\def\calY{{\mathcal{Y}}} \def\cY{{\mathcal{Y}}}
\def\calZ{{\mathcal{Z}}} \def\cZ{{\mathcal{Z}}}
% Bold 0 and bold 1.
\def\bZero{\bfm 0}
\def\bOne{\bfm 1}

% =========================
% Greek Letters
% =========================
% Naming scheme for Greek letters: \b<name> is bold (via \bfsym),
% \h<name> is hatted, \hb<name> is hatted bold. \hat is the wide hat
% set up earlier in this preamble.
\def\balpha{\bfsym{\alpha}}
\def\halpha{\hat{\alpha}}
\def\hbalpha{\hat{\bfsym{\alpha}}}

\def\bbeta{\bfsym{\beta}}
\def\hbeta{\hat{\beta}}
\def\hbbeta{\hat{\bfsym{\beta}}}

\def\bgamma{\bfsym{\gamma}}
\def\hgamma{\hat{\gamma}}
\def\hbgamma{\hat{\bfsym{\gamma}}}

\def\bGamma{\bfsym{\Gamma}}
\def\hGamma{\hat{\Gamma}}
\def\hbGamma{\hat{\bfsym{\Gamma}}}

\def\bdelta{\bfsym{\delta}}
\def\hdelta{\hat{\delta}}
\def\hbdelta{\hat{\bfsym{\delta}}}

\def\bDelta{\bfsym{\Delta}}
\def\hDelta{\hat{\Delta}}
\def\hbDelta{\hat{\bfsym{\Delta}}}

% eta (the bold forms are named \bfeta / \hbfeta, not \beta..., to avoid
% clashing with beta).
\def\bfeta{\bfsym{\eta}}
\def\heta{\hat{\eta}}
\def\hbfeta{\hat{\bfsym{\eta}}}

% LaTeX does not define \Eta or \Mu (capital eta and mu look like Latin H and
% M), so \bfEta and \bMu would stop with "Undefined control sequence" when
% used. Provide upright fallbacks; \providecommand leaves any definition
% already supplied by a package untouched.
\providecommand*{\Eta}{\mathrm{H}}
\providecommand*{\Mu}{\mathrm{M}}

\def\bfEta{\bfsym{\Eta}}

\def\bkappa{\bfsym{\kappa}}

\def\bmu{\bfsym{\mu}}
\def\hmu{\hat{\mu}}
\def\hbmu{\hat{\bfsym{\mu}}}

\def\bMu{\bfsym{\Mu}}

% nu, pi, phi, varphi: bold (\b...), hatted (\h...), hatted bold (\hb...);
% \tvarphi is \varphi under a wide tilde (\wt).
\def\bnu{\bfsym{\nu}}
\def\hnu{\hat{\nu}}
\def\hbnu{\hat{\bfsym{\nu}}}

\def\bpi{\bfsym{\pi}}
\def\bPi{\bfsym{\Pi}}

% \bphi and \bvarphi previously called the non-existent \bysym (a typo for
% \bfsym) and would have failed on first use.
\def\bphi{\bfsym{\phi}}
\def\hphi{\hat{\phi}}
\def\hbphi{\hat{\bfsym{\phi}}}

\def\bvarphi{\bfsym{\varphi}}
\def\hvarphi{\hat{\varphi}}
\def\tvarphi{\wt{\varphi}}
\def\hbvarphi{\hat{\bfsym{\varphi}}}


% theta / Theta and psi / Psi: bold (\b...), hatted (\h...), hatted bold
% (\hb...).
\def\btheta{\bfsym{\theta}}
\def\htheta{\hat{\theta}}
\def\hbtheta{\hat{\bfsym{\theta}}}

\def\bTheta{\bfsym{\Theta}}
\def\hTheta{\hat{\Theta}}
\def\hbTheta{\hat{\bfsym{\Theta}}}


\def\bpsi{\bfsym{\psi}}
\def\hpsi{\hat{\psi}}
\def\hbpsi{\hat{\bfsym{\psi}}}

% \bPsi follows the naming used for every other letter. \bPsid is the
% original (apparently misspelled) name, kept so existing uses still compile.
\def\bPsi{\bfsym{\Psi}}
\def\bPsid{\bfsym{\Psi}}
\def\hPsi{\hat{\Psi}}
\def\hbPsi{\hat{\bfsym{\Psi}}}

% Bold epsilon / varepsilon and their hatted bold forms.
% (Each bold macro used to be defined twice in a row with the same body;
% the redundant copies are removed.)
\def\beps{\bfsym{\epsilon}}
\def\hbeps{\hat{\bfsym{\epsilon}}}

\def\bvareps{\bfsym{\varepsilon}}
\def\hbvareps{\hat{\bfsym{\varepsilon}}}

% sigma, lambda, omega, rho, varrho, tau, xi (and capitals where they exist):
% \b... bold, \h... hatted, \hb... hatted bold.
\def\bsigma{\bfsym{\sigma}}
\def\hsigma{\hat{\sigma}}
\def\hbsigma{\hat{\bfsym{\sigma}}}

\def\bSigma{\bfsym{\Sigma}}
\def\hSigma{\hat{\Sigma}}
\def\hbSigma{\hat{\bfsym{\Sigma}}}

\def\blambda{\bfsym{\lambda}}
\def\hlambda{\hat{\lambda}}
\def\hblambda{\hat{\bfsym{\lambda}}}

\def\bLambda{\bfsym{\Lambda}}
\def\hLambda{\hat{\Lambda}}
\def\hbLambda{\hat{\bfsym{\Lambda}}}

\def\bomega{\bfsym{\omega}}
\def\homega{\hat{\omega}}
\def\hbomega{\hat{\bfsym{\omega}}}

\def\bOmega{\bfsym{\Omega}}
\def\hOmega{\hat{\Omega}}
\def\hbOmega{\hat{\bfsym{\Omega}}}

\def\brho{\bfsym{\rho}}
\def\hrho{\hat{\rho}}
\def\hbrho{\hat{\bfsym{\rho}}}
\def\bvarrho{\bfsym{\varrho}}
\def\hvarrho{\hat{\varrho}}
\def\hbvarrho{\hat{\bfsym{\varrho}}}


\def\btau{\bfsym{\tau}}
\def\htau{\hat{\tau}}
\def\hbtau{\hat{\bfsym{\tau}}}

\def\bxi{\bfsym{\xi}}
\def\hxi{\hat{\xi}}
\def\hbxi{\hat{\bfsym{\xi}}}

\def\bXi{\bfsym{\Xi}}
\def\hXi{\hat{\Xi}}
\def\hbXi{\hat{\bfsym{\Xi}}}

% zeta: bold, hatted, hatted bold.
\def\bzeta{\bfsym{\zeta}}
\def\hzeta{\hat{\zeta}}
\def\hbzeta{\hat{\bfsym{\zeta}}}

% LaTeX does not define \Zeta (capital zeta looks like Latin Z), so the three
% macros below would stop with "Undefined control sequence" when used.
% Provide an upright fallback; \providecommand leaves any definition already
% supplied by a package untouched.
\providecommand*{\Zeta}{\mathrm{Z}}

\def\bZeta{\bfsym{\Zeta}}
\def\hZeta{\hat{\Zeta}}
\def\hbZeta{\hat{\bfsym{\Zeta}}}

% ================
% General
% ================
% The word "and" for use inside formulas.
\def\andd{\mbox{and}}

% One-argument shortcuts: \+X gives \mathcal{X}, \-{abc} gives upright text.
% NOTE(review): these replace LaTeX's own \+ (tabbing) and \- (discretionary
% hyphen); confirm that nothing in the document or bibliography relies on the
% original meanings.
\def\+#1{\mathcal{#1}}
\def\-#1{\textup{#1}}

% Overline.
\def\br#1{\overline{#1}}

% Auto-sized parentheses / square brackets around the argument.
\def\bracket#1{\left(#1\right)}
\def\Bracket#1{\left[#1\right]}

\def\const{\mbox{const.}\quad}

\def\defequal{\triangleq}

% Abbreviations for \left / \right and for long double arrows.
\def\lr{\left}
\def\rt{\right}
\def\lra{\Longrightarrow}
\def\lla{\Longleftarrow}
\def\era{\Longleftrightarrow}

% Slanted fraction (nicefrac package).
\def\nfrac#1#2{\nicefrac{#1}{#2}}

\def\say{\mbox{(say)}}
\def\sgn{\mbox{sgn}}

% Wide hat and wide tilde; many macros earlier in the preamble rely on these.
\def\wh{\widehat}
\def\wt{\widetilde}

% \vect{A}: vec(A) with auto-sized parentheses; \V{x}: bold symbol.
\newcommand{\vect}[1]{{\textup{vec}\bracket{#1}}}
\newcommand{\V}[1]{{\boldsymbol{#1}}}

% =================
% Algebra
% =================
\newcommand{\diag}{\textup{diag}}
% NOTE: this replaces LaTeX's built-in \dim operator with upright text.
\def\dim{\textup{dim}}

% Largest / smallest eigenvalue symbols.
\def\eigmax{\gamma_{\textup{max}}}
\def\eigmin{\gamma_{\textup{min}}}

% Curly order relations.
\def\mleq{\preccurlyeq}
\def\mgeq{\succcurlyeq}

% Auto-sized angle brackets: single (\la ... \ra) and double (\La ... \Ra).
\newcommand{\la}{\left\langle}
\newcommand{\ra}{\right\rangle}

\newcommand{\La}{\left\langle\kern-0.64ex\left\langle}
\newcommand{\Ra}{\right\rangle\kern-0.64ex\right\rangle}

% Absolute value; only defined if no package has defined \abs already.
\providecommand{\abs}[1]{\left\lvert#1\right\rvert}

% \norm{x}{s}: double-bar norm of x with subscript s.
\def\norm#1#2{{\left\|#1\right\|}_{#2}}

% \Norm{x}{s}: triple-bar norm of x with subscript s.
\def\Norm#1#2{{%
    \left\vert\kern-0.4ex\left\vert\kern-0.4ex\left\vert #1
    \right\vert\kern-0.4ex\right\vert\kern-0.4ex\right\vert}_{#2}}

% Named norms built on \norm (double bar) and \Norm (triple bar).
\def\psitwonorm#1{\norm{#1}{\psi_2}}

\def\infnorm#1{\norm{#1}{\infty}}
\def\Infnorm#1{\Norm{#1}{\infty}}

\def\fnorm#1{\norm{#1}{\textup{F}}}
\def\Fnorm#1{\Norm{#1}{\textup{F}}}

\def\opnorm#1{\norm{#1}{\textup{OP}}}
\def\Opnorm#1{\Norm{#1}{\textup{OP}}}

\def\nucnorm#1{\Norm{#1}{\textup{nuc}}}
\def\offdiagnorm#1{\Norm{#1}{1, \textup{off}}}

\def\Null{\textup{null}}
\def\Proj{\mathbb{P}}

\newcommand{\rank}{\textup{rank}}
\def\range{\textup{range}}
% Superscript symbols: \rmt is \top, \rmh is an upright H.
\def\rmt{\top}
\def\rmh{\mathrm{H}}

\def\sigmax{\sigma_{\textup{max}}}
\def\sigmin{\sigma_{\textup{min}}}
\newcommand{\sign}{\textup{sign}}
\def\supp{\textup{supp}}

%\newcommand{\trace}{\textup{tr}}
\newcommand{\trace}{\operatorname{Tr}}
% Overlined Tr.
\def\tracemgf{\br{\operatorname{Tr}}}

\def\svd#1{\mathsf{SVD}\bracket{#1}}

% Expands to \bI followed by \bd.
\def\Indmat{\bI\bd}


% =================
% Analysis 
% ================
% \card{S}: card(S) with auto-sized parentheses.
\def\card#1{\textup{card}\bracket{#1}}
\def\Fourier{\mathcal{F}}
\def\dist{\textup{dist}}
\def\interior{\textup{int}}
\def\real{\textup{Re}}
\def\imag{\textup{Im}}
% Short names for \epsilon, \varepsilon and \mapsto.
\def\eps{\epsilon}
\def\veps{\varepsilon}
\def\mps{\mapsto}

% =================
% Geometry
% =================

\def\Vol{\textup{Vol}}

% \set{...}: auto-sized curly braces.
\def\set#1{\left\{#1\right\}}

% \ball{a}{b}{c} and \sphere{a}{b}{c}: \BB or \SS with superscript a, followed
% by (b; c) in auto-sized parentheses.
\def\ball#1#2#3{\BB^{#1}\left(#2; #3\right)}
\def\sphere#1#2#3{\SS^{#1}\left(#2; #3\right)}

% sin / cos / tan of \angle(a; b).
\def\sinang#1#2{\sin\angle\left(#1; #2\right)}
\def\cosang#1#2{\cos\angle\left(#1; #2\right)}
\def\tanang#1#2{\tan\angle\left(#1; #2\right)}


% =================
% Optimization
% ================
% arg max / arg min as proper math operators (amsmath's \operatorname*):
% upright, with operator spacing, a thin space between "arg" and "max"/"min",
% and subscripts set underneath in display style. \ensuremath keeps the
% macros usable outside math mode, as the previous \textup versions were.
\newcommand{\argmax}{\ensuremath{\operatorname*{arg\,max}}}
\newcommand{\argmin}{\ensuremath{\operatorname*{arg\,min}}}

% Indicator symbol: the digit 1 in \mathbbm.
\def \Ind {\mathbbm{1}}
%\def \Ind {\mathbbm{1}}

% "s.t." with a tie (non-breaking space) on each side.
\def \St  {\textup{~s.t.~}}
\def \Epi {\textup{Epi}}
\def \dom {\textbf{dom}}
% \prox and \Prox are identical.
\def \prox {\mathsf{prox}}
\def \Prox {\mathsf{prox}}
\def \opt {\textup{opt}}

% =================
% Probability 
% =================

% Expectation and covariance.
\def \Expc {\mathbb{E}}
\def \Cov  {\textup{Cov}}
\def \cov  {\textup{cov}}


\def \logdet {\log\det}
\def \normdist {\mathsf{N}}
\def \Bino {\textup{Binomial}}
%\def \Prob {\textup{Pr}}
%\def \prob {\textup{Pr}}
% \Prob and \prob are identical.
\def \Prob {\mathbb{P}}
\def \prob {\mathbb{P}}
\def \prior {\textup{prior}}

\def \Var  {\textup{Var}}
\def \var  {\textup{var}}
\def \mean {\textup{mean}}
\def \Mean {\textup{Mean}}
\def \median  {\textup{med}}
\def \Median {\textup{Med}}
\def \med  {\textbf{med}}
\def \Med  {\textbf{Med}}
\def \as   {\textup{a.s.}}
\def \Unif {\textup{Unif}}
\def \Bern {\textup{Bern}}
\def \Poi  {\textup{Poi}}
\def \Ent  {\textup{Ent}}
% \LogMnt{X}: \log \Expc e^{X}.
\def \LogMnt#1{\log \Expc e^{#1}}
\def \lsim {\lesssim}
\def \gsim {\gtrsim}

%\def \lsim {\precsim}
%\def \gsim {\succsim}

% \kl{P}{Q}: KL(P || Q). The separator is the double-bar symbol \| with thin
% spaces around it, rather than two separate | characters.
\def\kl#1#2{\mathbb{KL}\left(#1\,\|\,#2\right)}
\newcommand{\kull}[2]{\ensuremath{D_{\text{KL}}(#1 \| #2)}}
% \dtv{P}{Q}: \norm of P - Q with subscript TV.
\def\dtv#1#2{\norm{{#1}-{#2}}{\textup{TV}}}

% "i.i.d." (with its final period) stacked over \sim.
\def \iid {\stackrel{\textup{i.i.d.}}{\sim}}
\def \samedist {\stackrel{(\mathsf{d})}{=}}

% \dh and \dH are identical. NOTE: \dh replaces LaTeX's text command of the
% same name.
\def\dh{\mathsf{d_H}}
\def\dH{\mathsf{d_H}}


% =================
% Statistics 
% =================
% Short names for \natural, \wedge and \vee.
\def\natu{\natural}
\def\vcap{\wedge}
\def\vcup{\vee}
\def\Me{\mathfrak{M}}
\def\energy{\textup{Energy}}
% \OPrate{r} and \oprate{r}: \OO_P(r) and o_P(r) with auto-sized parentheses.
\def\OPrate#1{\OO_{\textup{P}}\left(#1\right)}
\def\oprate#1{o_{\textup{P}}\left(#1\right)}

\def\ch{\textup{ch}}
\def\sh{\textup{sh}}
% NOTE: \th replaces LaTeX's text command of the same name.
\def\th{\textup{th}}
\def\bias{\textup{bias}}
% Arrows for modes of convergence (labels c, p and a.s. above a long arrow;
% \dconverg is a squiggly arrow).
\def\cconverg{\stackrel{\textup{c}}{\longrightarrow}}
\def\dconverg{\rightsquigarrow}
\def\pconverg{\stackrel{\textup{p}}{\longrightarrow}}
\def\asconverg{\stackrel{\textup{a.s.}}{\longrightarrow}}
\def\se{\mathsf{se}}
\def\mse{\mathsf{MSE}}
\def\mmse{\mathsf{MMSE}}
\def\mmo{\mathsf{MMO}}

% =======================
% Signal Processing
% =======================
\def\snr{\mathsf{SNR}}
% \quant{a}{b}: Quant with subscript a, applied to b.
\def\quant#1#2{\mathsf{Quant}_{#1}\bracket{#2}}

% =========================
% Online Learn
% =========================
\def\regret{\textup{Regret}}
\def\risk{\textup{Risk}}

% =========================
% Neural network
% =========================

\def\relu{\textup{ReLU}}

% proofoutline: a proof environment headed "Proof outline" that prints no
% end-of-proof symbol (\qedsymbol is emptied locally, inside the environment).
\newenvironment{proofoutline}{%
  \renewcommand{\qedsymbol}{}%
  \begin{proof}[Proof outline]%
}{%
  \end{proof}%
}


% \dh and \dH are already defined, with the same body, in the Probability
% section of this preamble; the duplicate definitions that used to sit here
% have been removed.

\def\term{\mathsf{Term}}
% \srank{M}: srank(M).
\def\srank#1{\textup{srank}(#1)}

% \wtminus{x}{i}: \wt{x} with subscript "\setminus (i)";
% \sampminus{x}{i}: x with the same subscript.
\def\wtminus#1#2{\wt{#1}_{\setminus (#2)}}
\def\sampminus#1#2{#1_{\setminus (#2)}}
% Red draft markers.
\def\why{\textup{\red{WHY?}}}
\def\valtoset#1{\textup{\red{Value-to-set-#1}}}

% \red{text}: text in red. This replaces the \newcommand{\red} given near the
% top of the preamble, so it is the definition actually used in the document.
\def\red#1{{\color{red}#1}}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets b, then sep (default "-"), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Greed is good: correspondence recovery for unlabeled linear regression}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors


\author{Hang Zhang, Ping Li\\
 Cognitive Computing Lab\\
Baidu Research\\
 10900 NE 8th St, Bellevue, WA 98004, USA \\
\texttt{\{zhanghanghitomi, pingli98\}@gmail.com}
}
  
  \begin{document}
\maketitle

% =========================

\begin{abstract}
We consider the unlabeled linear regression reading as $\mathbf{Y} = \mathbf{\Pi}^{*}\mathbf{X}\mathbf{B}^* + \mathbf{W}$, where $\mathbf{\Pi}^{*}, \mathbf{B}^*$ and $\mathbf{W}$ represent missing (or incomplete) correspondence information, signals, and additive noise, respectively. Our goal is to perform data alignment between $\mathbf{Y}$ and $\mathbf{X}$, or equivalently, reconstruct the correspondence information encoded by $\mathbf{\Pi}^*$. Based on whether signal $\mathbf{B}^*$ is given a priori, we separately propose two greedy-selection-based estimators, which both reach the mini-max optimality. Compared with previous works, our work $(i)$ supports partial recovery of the correspondence information; and $(ii)$ applies to a general matrix family rather than only the permutation matrices, namely, selection matrices, where multiple rows of $\mathbf{X}$ can correspond to the same row in $\mathbf{Y}$. Moreover, numerical experiments are provided to corroborate our claims.
\end{abstract}


\section{Introduction}
Starting under the name ``broken sample'' problem in the 1970s~\citep{degroot1976matching, degroot1980estimation, goel1975re}, data alignment 
has received increasing attention nowadays due to its 
wide spectrum of applications, 
which span from computer vision to curve registration to natural language processing to data privacy to record linkage~\citep{unnikrishnan2015unlabeled, pananjady2016linear, hsu2017linear, slawski2017linear, domankic2018permutations, zhang2019permutation, slawski2019two,  zhang2020optimal, tsakiris2020algebraic}.
Among the numerous applications, two prominent  
examples are the linkage attack and database merging.
\par  
In linkage attacks, intruders aim at the disclosure of sensitive data by using public data. 
This can be viewed as the inverse problem of 
data de-anonymization. 
Usually, these attacks involve direct comparison between the sensitive data and the public 
data, where their correspondence information 
is formulated 
as an unknown selection matrix to be reconstructed. 
In the task of database merging, the goal is to 
merge multiple databases, which contain data 
of the same identity, into one 
comprehensive database. 
In practice, these databases may not be well aligned 
due to the data formatting and data quality issues. 
How to reconstruct the correspondence information  
across the databases and properly align their
data constitutes a technical challenge.  
For more applications, we refer the interested 
readers to \citet{pananjady2016linear, slawski2017linear, unnikrishnan2015unlabeled,  slawski2019two, zhang2022permutation}.

In this paper, we formulate the above-mentioned problem as an unlabeled linear 
regression reading as 
\[
\bY = \bPi^{*}\mathbf{X}\mathbf{B}^* + \bW, 
\]
where $\bY \in \mathbb{R}^{n_1\times m}$ denotes the sensing results, 
$\bPi^{*} \in \{0, 1\}^{n_1\times n_2}$ is the (unknown) selection matrix, 
$\bX\in \mathbb{R}^{n_2\times p}$ is the sensing matrix, 
$\bB^{*} \in \mathbb{R}^{p \times m}$ represents the signal of interest, 
and $\bW \in \mathbb{R}^{n_1\times m}$ is the additive sensing noise. 
Our goal is to reconstruct the correct correspondence of rows, which 
are sabotaged by the unknown matrix $\bPi^{*}$. 
To start with, we briefly review the previous works. 





% \newpage
\textbf{Related work.}
As mentioned before, 
research on regression without correspondence has 
a long history that 
can be dated back at least to the 1970s, when it was dubbed the ``broken sample problem''~\citep{degroot1976matching, degroot1980estimation, goel1975re, bai2005broken}. Due to the large volume of this literature,
we restrict ourselves to recent work in this area, which 
begins with
\citet{unnikrishnan2015unlabeled}, where  
a noiseless setting ($\bW = \bZero$) with a single observation $(m = 1)$ is adopted. 
Provided that entries $\bX_{ij}$ in the sensing matrix are drawn from 
continuous distributions, they establish the necessary condition $n\geq 2p$
for correct correspondence reconstruction. Similar results can also be found in 
\citet{domankic2018permutations, tsakiris2020algebraic}. Later, 
\citet{pananjady2016linear, slawski2017linear} extend the 
noiseless setting to a noisy setting and discover a phase transition phenomenon 
when the \emph{signal-to-noise-ratio} ($\snr$) exceeds a certain threshold. Apart from the above work, there are other works focusing on the single observation model,  
i.e., $m=1$~\citep{hsu2017linear, haghighatshoar2018signal, tsakiris2019homomorphic, peng2020linear, zhang2021sparse}. 
Apart from the linear sensing model, there is 
also some research on the generalized linear 
sensing relation such as \citet{fang2022regression}. 
Since the single observation model is not 
our focus, we only list these works without further discussion.

Then we discuss the research on the multiple observation model, i.e., $m\gg 1$. 
Albeit there are some previous works, 
we find the theoretical analysis first appears in~\citet{pananjady2017denoising}, whose 
focus is to denoise the product $\bPi^{*}\bX\bB^{*}$. 
Adopting vastly different strategies,~\citet{slawski2019two, zhang2019permutation, zhang2020optimal}  
later propose three estimators for the correspondence recovery. 
\citet{zhang2019permutation} designs the estimator  
to approximately compute the \emph{maximum likelihood} (ML) estimator.  
In~\citet{slawski2019two}, they take the viewpoint of robustness and view the 
permuted rows as outliers. The basic idea of their estimator is to 
first detect the outliers and then reconstruct the correspondence based on these identified outliers. The same idea is also
used in~\citet{SlawskiRahmaniLi2018}. 
Meanwhile,~\citet{zhang2020optimal} tackles the problem from the perspective of 
non-convex optimization. 
In contrast to our paper, all these works need to reconstruct the whole selection 
matrix even if only one row's correspondence is desired. 
Notably, our estimators designed for recovering one single row's correspondence 
can automatically yield estimators to recover the whole selection matrix, 
which are completed by iterative applications to each row. 
We defer a detailed comparison to Table~\ref{tab:compare},
as it inevitably involves technical details yet to be presented.

In addition to the literature on unlabeled linear regression, 
other work on graph matching, generalized linear regression without 
correspondence, linear assignment problem, quadratic assignment problem
 are also related to ours. Since these connections are rather 
 loose, we only mention 
a few of them without further discussion~\citep{mezard1986mean, mezard1985replicas, caracciolo2017finite, malatesta2019fluctuations, chertkov2010inference, semerjian2020recovery, koopmans1957assignment, umeyama1988eigendecomposition}. 


 
\textbf{Our contribution} can be summarized as follows: 
\begin{itemize}[leftmargin=*]
\item 
We propose two optimal estimators for the correspondence 
recovery with selection matrix. Compared with \textbf{the previous 
work focusing on the permutation matrix}, our estimators 
apply to a broader class of matrices and do not enforce  
bijection between $\bY$ and $\bPi^*\bX\bB^*$. 

\item 
We first propose estimators to recover the correspondence  
for one single row. In this paper, we separately consider  
the oracle case, where $\bB^{*}$ is known, and the non-oracle
case, where $\bB^{*}$ is unknown. 
Numerical experiments suggest both estimators can reliably 
reconstruct the correspondence under certain conditions. 

\item 
We provide a theoretical guarantee of our 
estimators' performance. 
In this part, the major technical difficulty comes from the heavy tails 
inherent in the non-oracle estimator. To handle such an issue, we tailor the \emph{leave-one-out} technique~\citep{el2013robust, karoui2018impact, chen2020noisy, sur2019likelihood}.
Focusing~on~the high stable rank regime, i.e., $\srank{\bB^{*}} \gg \log^4 n$, 
we show that reliable correspondence recovery can be
guaranteed for both the oracle and non-oracle cases 
once $\snr$ exceeds certain positive constants.
\end{itemize}
 


\textbf{Notations.}
We denote $c$, $c_0$, and $c^{'}$ as some fixed positive constants. 
We write 
$a\lsim b$ if there exists some positive constant $c$ such that $a\leq c b$. 
Similarly, we define $a\gsim b$. Provided $a\lsim b$ and $b\lsim a$ hold
simultaneously, we write $a\asymp b$ to indicate $a$ and $b$ are of the same
order. 
In addition, we denote the inner product between two vectors (and matrices)
as $\la \cdot, \cdot \ra$. 
For an arbitrary vector $\bv$, we denote
its Euclidean norm as $\norm{\bv}{2}$. 
\par 
For an arbitrary matrix $\bM$, we define its stable rank $\srank{\bM}$ as 
$\nfrac{\Fnorm{\bM}^2}{\Opnorm{\bM}^2}$, where 
$\Fnorm{\cdot}$ and $\Opnorm{\cdot}$ denote the Frobenius norm 
and operator norm, respectively, and their definitions can be 
found in~\citet{horn1990matrix}.
For an arbitrary row index $i$ ($1\leq i \leq n$),
we define $\pi(i)$ as the correspondence index
for $i$ associated with the selection matrix 
$\bPi$. For the
ground-truth selection matrix $\bPi^*$, 
we denote $\pi^*(i)$ as the correct 
correspondence index for $i$.
Moreover, we define the \emph{signal-to-noise-ratio} ($\snr$) as 
$\Fnorm{\bB^{*}}^2/({m\sigma^2})$, where $\bB^{*}\in
\RR^{p\times m}$ denotes the signal and $\sigma^2$ represents the variance of 
the sensing noise. 

%\textbf{Organization.} The structure of this paper is summarized as follows. 
%In Section~\ref{sec:prob_set}, we 
%formally state our problem setting. 
%In Section~\ref{sec:oracle_case}, we propose a 
%greedy-selection-based estimator for the oracle case ($\bB^{*}$ is known); 
%and in Section~\ref{sec:nonoracle_case}, we generalize 
%it to the non-oracle case ($\bB^{*}$ is unknown). 
%Section~\ref{sec:numeric_expr} presents the numerical experiments, Section~\ref{sec:conclude} draws the conclusion, and the proof details are deferred to the
%supplementary material.


\section{Problem Setting}
% ======================================
\label{sec:prob_set}

\begin{table*}[!ht]
\centering
\caption{Comparison with the prior art. 
All results are w.r.t. the exact correspondence recovery in the non-oracle case 
($\bB^{*}$ is unknown). Besides, these results are presented in their best orders, which only hold in certain regimes.  
Notations $\snr_{\textup{min}}$, 
$n_{\textup{min}}$, and $h_{\textup{max}}$ 
denote the minimum required $\snr$, 
minimum sample number, and 
maximum allowed number of mismatched rows, 
respectively. 
Moreover, the logarithmic term is omitted in the notation $\wt{\Omega}(\cdot)$ and 
$\wt{O}(\cdot)$. 
The notation $r(\cdot)$ denotes the rank of the corresponding matrix.}
\label{tab:compare}
\resizebox{6.8in}{!}{%
{\renewcommand{\arraystretch}{1.5}
\begin{tabular}{@{}l|cc|ccc|ccc|c|cc@{}}\hline
 &  \multicolumn{2}{c|}{$\snr_{\textup{min}}~(\geq)$}
 &&  \multicolumn{2}{c|}{$\nfrac{n_{\textup{min}}}{p}~(
 \geq)$} &&  \multicolumn{2}{c|}{$\nfrac{h_{\textup{max}}}{n}~(\leq)$} && \\
\cline{2-9} %\cmidrule{11-12}
&   $m=1$ & $m\gg 1$ && $m=1$ & $m\gg 1$ && $m=1$ & $m\gg 1$ & \textbf{Partial Recover.} && \textbf{Select. Matrix} \\ 
\hline 
\citep{pananjady2016linear} & $\wt{\Omega}(n^c)$ &  && $\wt{\Omega}(1)$  && & $\wt{O}(1)$ & &  \\  \hline 
\citep{hsu2017linear}   & $\infty$ &  && $\wt{\Omega}(1)$  && & $\wt{O}(1)$ & &  \\ \hline
\citep{slawski2017linear}    & $\wt{\Omega}(n^c)$ &  && $\wt{\Omega}(1)$  && & $\wt{O}(\log^{-1}n)$  & &  \\\hline
\citep{zhang2019permutation} & & $\wt{\Omega}(n^{\frac{c}{\srank{\bB^{*}}}})$ && & $\wt{\Omega}(1)$ &&  & $\wt{O}\bracket{\log^{-1} r(\bB^{*})}$ &  \\ \hline
\citep{slawski2019two} & & $\wt{\Omega}(n^{\frac{c}{\srank{\bB^{*}}}})$ && 
& $\wt{\Omega}(p)$ && & $\wt{O}(\log^{-1} n)$ & && $\boldcheckmark$\\  \hline
\citep{zhang2020optimal} & $\wt{\Omega}(n^{c})$  & $\wt{\Omega}(n^{\frac{c}{\srank{\bB^{*}}}})$  && $\wt{\Omega}(1)$ & $\wt{\Omega}(\sqrt{p})$ && $\wt{O}(1)$ & $\wt{O}(1)$ && \\ \hline
\textbf{Our Estimator} & & $\wt{\Omega}(n^{\frac{c}{\srank{\bB^{*}}}})$ && & $\wt{\Omega}(1)$ && & $\wt{O}(1)$ & $\boldcheckmark$ && $\boldcheckmark$\\
\hline
\end{tabular}
}
}
\end{table*}


This section starts with a formal restatement of the 
problem 
\[
\mathbf{Y} = \bPi^{*}\bX\bB^{*} + \bW, 
\]
where $\bY\in \RR^{n_1\times m}$ denotes the sensing result, 
$\bPi^{*}\in \RR^{n_1\times n_2}$ is the (unknown) selection matrix 
such that $\bPi^{*} \in \set{0, 1}^{n_1\times n_2}, \sum_j \bPi^{*}_{ij}= 1$, 
$\bX \in \RR^{n_2\times p}$ is the sensing matrix with each entry 
being an i.i.d. standard normal random variable, $\bB^{*}\in \RR^{p\times m}$ is the signal of interest, and $\bW \in \RR^{n_1\times m}$ denotes the additive 
sensing noise such that its entries are i.i.d. Gaussian distributed 
random variables with zero mean and $\sigma^2$ variance, namely, $\bW_{ij} \iid \normdist(0, \sigma^2)$. 
For the clarity of presentation, we assume 
$n_1 = n_2 = n$. Notice that this condition is not enforced 
by the selection matrix. In fact, our current analysis 
can be generalized to the $n_1\neq n_2$ case effortlessly. 

Compared with previous works~\citep{pananjady2016linear, hsu2017linear, slawski2017linear, zhang2019permutation, slawski2019two, zhang2020optimal, zhang2022permutation},
our work has the following two 
noticeable characteristics: 
\begin{itemize}[leftmargin=*]
\item 
We can support partial correspondence recovery. 
In certain applications, simultaneously recovering 
all correspondence is unnecessary. One particular 
example is merging databases while 
only a small proportion of the correspondence information is desired. 
While previous works  
such as \citep{pananjady2016linear, hsu2017linear, slawski2017linear, zhang2019permutation, slawski2019two, zhang2020optimal} all
focus on reconstructing the whole permutation (selection) matrix, our work is the first 
work that can support partial correspondence recovery. 
This can lead to significant computational savings, especially when the sample number 
$n$ is sufficiently large.
\item 
Our estimators apply to a general family of matrices,
to put it more specifically, selection matrices, where multiple 
rows in $\bPi^*\bX\bB^*$ can correspond to the same row in 
$\bY$, rather than the permutation matrices, where 
the correspondences between $\bPi^*\bX\bB^*$ and $\bY$ are 
bijective. \footnote{
The family of permutation matrices is a subset of the selection matrices. 
Compared with permutation matrices, 
selection matrices $(i)$ are not necessarily a square matrix, or equivalently, 
$n_1\neq n_2$; $(ii)$ allow multiple 
rows to have the same correspondence. 
} The only work focusing on the selection matrix is 
\citet{slawski2019two}; however, it $(i)$ only allows a limited number
of mismatched rows, $(ii)$ requires a much larger sample size, 
and $(iii)$ incurs a much higher computational cost.
\end{itemize}
A detailed comparison between our work and previous works is put in 
Table~\ref{tab:compare}.
\footnote{Notice that an algorithm with a 
larger $h_{\textup{max}}$ covers a broader 
class of selection matrices, as more 
mismatched rows 
are allowed.}

\subsection{Mini-max lower bound}
First, we present the mini-max lower bound 
for the permuted linear regression, which is a special case of our problem, as the baseline for comparison. 

\begin{theorem}[Theorem~$1$ in~\citet{zhang2019permutation}]
\label{thm:minimax}
Provided that $ \logdet\bracket{\bI + \nfrac{\bB^{*\rmt}\bB^{*}}{\sigma^2}}<\
\frac{\log n! - 2}{n}$, we conclude
\begin{align}
\label{eqn:exact_minimax}
&\inf_{\wh{\bPi}}\sup_{\bPi^{*}}\Prob_{\bX, \bW}(\wh{\bPi}\neq \bPi^{*}) \geq 
\frac{1}{2}, 
\end{align}
where the probability $\Prob_{\bX, \bW}(\cdot)$ is w.r.t. $\bX$ and $\bW$,
and the infimum is over all possible permutation estimators $\wh{\bPi}$.   	
\end{theorem}

According to the above theorem, we conclude that correct correspondence recovery requires $\logdet(\bI + \nfrac{\bB^{*\rmt}\bB^{*}}{\sigma^2})$ to be at
least of order $\log n$. With the relation such that  
$\logdet(\bI + \nfrac{\bB^{*\rmt}\bB^{*}}{\sigma^2})\simeq \srank{\bB^{*}}\log(1 + \snr)$, we can approximately write 
the $\snr$ requirement as $\log (1+\snr) \gsim \frac{\log n}{\srank{\bB^{*}}}$, or 
equivalently, $\snr \gsim c_0 n^{\frac{c_1}{\srank{\bB^{*}}}} - 1$.


In the following context, 
we separately present partial correspondence recovery estimators for the 
oracle and non-oracle case, whose behaviors all match the mini-max 
lower bound in 
Theorem~\ref{thm:minimax}.


\section{Oracle Case Estimator}
% =========================== 
\label{sec:oracle_case}
As a warm-up example, 
this section considers the oracle scenario, where $\bB^{*}$ is given 
a priori. In this scenario, it is well known that the \emph{maximum likelihood} (ML) 
estimator can be recast as a \emph{linear assignment problem} (LAP)
\citep{kuhn1955hungarian, bertsekas1992forward}. 
However, we have to solve for the whole selection matrix even if 
we only need the correspondence of one single row. 

\par 
To handle such an issue, we modify the LAP formulation and 
propose a greedy-selection-based estimator.
A formal statement is put 
in Algorithm~\ref{alg:greed_search_oracle}. 
Then we conclude

\begin{algorithm}[h]
\textbf{Input:} observation $\bY$,  sensing matrix $\bX$, and matrix $\bB^*$.
		
\textbf{Output:} Reconstruct the correspondence $\wh{\pi}(i)$ as 
\[
\wh{\pi}(i) = \argmax_{j}
\la \bY_{i, :}, \bB^{*\rmt}\bX_{j, :} \ra, 
%\la  \bB^{*\rmt}\bX_{\pi^{*}(i)},
% \bB^{*\rmt}\bX_{j} \ra  + \la \bW_i,  \bB^{*\rmt}\bX_j \ra   
\]
where $\bY^{\rmt}_{i, :}$ denotes the $i$th row of the matrix 
$\bY$ and $\bX^{\rmt}_{j, :}$ denotes the $j$th row of the matrix 
$\bX$. 
\caption{Oracle greedy estimator for correspondence recovery.}
\label{alg:greed_search_oracle}
\end{algorithm}

 

\begin{theorem}
\label{thm:oracle_single_index_recover_MMV}
Consider the oracle case. 
Assume that $(i)~\srank{\bB^{*}}\gg \log^2 n$, 
$(ii)~n \geq 2p$,  
and $(iii)~\snr \geq c$. 
For an arbitrary row index $i$, 
we conclude Algorithm~\ref{alg:greed_search_oracle} obtains its correct correspondence, i.e.,
$\wh{\pi}(i) = \pi^{*}(i)$, 
with probability at least $1 - c_0 n^{-c_1}$ when
$n$ is sufficiently large.  	
\end{theorem}

Invoking Theorem~\ref{thm:oracle_single_index_recover_MMV}, 
we can prove that the ground-truth selection matrix $\bPi^{*}$  
can be  correctly reconstructed once 
$\snr \geq c$. Comparing our estimator in Algorithm~\ref{alg:greed_search_oracle} with the statistical lower bound in 
Theorem~\ref{thm:minimax}, we conclude that our greedy-selection-based estimator almost reaches the minimax optimality. 
In addition, we can automatically obtain an estimator for the entire selection matrix 
$\bPi^{*}$, which is to iteratively 
apply Algorithm~\ref{alg:greed_search_oracle}
to each row. 



\begin{remark}
Notice that the assumption on the stable rank is quite
  common in 
permutation recovery literature \citep{slawski2019two, zhang2019permutation, zhang2020optimal}. 
Roughly speaking, it is used to describe the 
data diversity. For an arbitrary matrix 
$\bB^*$, we have its stable rank 
$\srank{\bB^*} = \rank(\bB^*)=\min(p, m)$ when its energy is uniformly distributed among all singular values. When its principal singular value dominates the signal strength, in other words, the energy of the remaining singular values is negligible, we have 
$\srank{\bB^*}$ to be approximately one. 
\end{remark}


 
\subsection{Proof outlines}
Denote the correct correspondence index 
associated with index $i$ to be $\pi^*(i)$.
To begin with, we notice that the correct correspondence
$\pi^{*}(i)$ is obtained provided the following inequality 
holds for all indices $j$ except $\pi^{*}(i)$, i.e., 
\begin{align}
\label{eq:oracle_single_index_recover_MMV_summary}
\hspace{-5mm}
\norm{\bB^{*\rmt}\bX_{\pi^{*}(i), :}}{2}^2
>~&
\la \bB^{*\rmt}\bX_{\pi^{*}(i), :}, 
\bB^{*\rmt}\bX_{j, :}  \ra \notag \\
+~& \la \bW_{i, :}, \bB^{*\rmt}\bracket{\bX_{j,:} - \bX_{\pi^{*}(i),:} } \ra,
\end{align}
for all $j\neq \pi^*(i)$.
This inequality is a restatement of 
the condition $\pi^*(i) = \argmax_j \langle \bY_{i, :}, \bB^{*\rmt}\bX_{j, :}\rangle$.

Then, we separately prove the following relations hold with a high probability
\begin{itemize}
\item 
	$\|\bB^{*\rmt}\bX_{\pi^{*}(i), :}\|_{2}\gsim \Fnorm{\bB^{*}}$;
\item 
$\langle \bB^{*\rmt}\bX_{\pi^{*}(i), :}, 
\bB^{*\rmt}\bX_{j, :}  \rangle   \lsim \log n \Fnorm{\bB^{*}\bB^{*\rmt}}$; 
\item 
$\langle \bW_{i, :}, \bB^{*\rmt}(\bX_{j, :} - \bX_{\pi^{*}(i), :})\rangle 
\lsim \sigma \log n\Fnorm{\bB^{*}}$.
\end{itemize} 
%First, we invoke the  \emph{small-ball probability} in \cite{paouris2012small}
%and have $\|\bB^{*\rmt}\bX_{\pi^{*}(i), :}\|_{2}\gsim \Fnorm{\bB^{*}}$ holds with higher 
%probability with increasing $\srank{\bB^*}$. 
Afterward, we complete the proof
by verifying~\eqref{eq:oracle_single_index_recover_MMV_summary}, which proceeds as 
\[
\Fnorm{\bB^*}^2 \stackrel{\cirone}{\gsim}~& \frac{\log n}{\sqrt{\srank{\bB^*}}} \Fnorm{\bB^*}^2
+ \sigma(\log n)\Fnorm{\bB^*} \\
\stackrel{\cirtwo}{=}~& (\log n)\Fnorm{\bB^*} \Opnorm{\bB^*} + \
\sigma(\log n)\Fnorm{\bB^*} \\
\stackrel{\cirthree}{\geq}~& (\log n)\Fnorm{\bB^*\bB^{*\rmt}} + \sigma( 
\log n)\Fnorm{\bB^*}.
\]
In $\cirone$, we use the assumptions 
$\snr \geq c$ and $\srank{\bB^*} \gg \log^2 n$, in $\cirtwo$ we use the definition 
of $\srank{\bB^*}$, and in $\cirthree$ we use the relation $\Fnorm{\bB^{*} \bB^{*\rmt}} \leq \Opnorm{\bB^*}\Fnorm{\bB^*}$.
The technical details are left in  
the supplementary material. 


Having presented the algorithm for the oracle case, 
in the subsequent section, we will move on to the 
non-oracle case. 



\section{Non-oracle Case Estimator}
% ===================================
\label{sec:nonoracle_case}
This section designs an estimator for the
non-oracle case, where information $\bB^{*}$ is unavailable. As explained in 
\cite{pananjady2016linear}, 
even for the unlabeled linear regression,
\footnote{$\bPi^*$ is a permutation matrix rather than a selection matrix.}
reconstructing 
the correspondence is NP-hard for a general sensing matrix $\bX$.

To reconstruct the correspondence information in polynomial time, we need to exploit the statistical properties
of $\bX$. Our design insight starts 
with the fact such that $\Expc \bX^{\rmt}\bY = 
\Expc \bX^{\rmt}\bPi^* \bX \bB^*= (n-h)\bB^*$,
where $h$ is the number of mismatched rows. 
This implies that we can obtain the direction 
of $\bB^*$, i.e., $\nfrac{\bB^*}{\fnorm{\bB^*}}$, from the 
product $\Expc \bX^{\rmt}\bY$.  
Believing that $\bX^{\rmt}\bY$ should be close 
to $\Expc \bX^{\rmt}\bY$, we 
would like to approximate the 
value of $\bB^*$, up to scaling, by $\bX^{\rmt}\bY$. 
The proposed algorithm is summarized in Algorithm~\ref{alg:greed_search_nonoracle}. Numerical results show that 
this greedy selection estimator can 
restore the correct correspondence with 
high probability, even when $\snr$ 
takes a modest value. 
\par 

% The same idea is previously adopted in~\citet{zhang2020optimal}. 

\begin{algorithm}[h]
\textbf{Input:} observation $\bY$ and sensing matrix $\bX$. 
		
\textbf{Output:} Reconstruct the correspondence $\wh{\pi}(i)$ as 
\[
\wh{\pi}(i) = \argmax_j \
\la \bY_{i, :},  \bY^{\rmt}\bX \bX_{j, :}\ra,  
\]
where $\bY^{\rmt}_{i, :}$ denotes the $i$th row of the matrix 
$\bY$ and $\bX^{\rmt}_{j, :}$ denotes the $j$th row of the matrix 
$\bX$. 
\caption{Non-Oracle greedy estimator for correspondence recovery.}
\label{alg:greed_search_nonoracle}
\end{algorithm}

\subsection{Main results for non-oracle estimator}
Regarding its theoretical performance, we have that
 Algorithm~\ref{alg:greed_search_nonoracle} can 
yield the ground-truth correspondence $\pi^{*}(i)$ when 
$\snr \geq c$ in a certain regime. A formal statement is given as follows.
\begin{theorem}
\label{thm:nonoracle_single_index_recover_MMV}
Consider the non-oracle case. 
Assume that $(i)~\srank{\bB^{*}}\gg \log^4 n$, 
$(ii)~n\gsim p \log^6 n$, $(iii)$ $h \leq c_0 \cdot n$, and $(iv)~\snr \geq c$. 
Then for an arbitrary row index $i$, 
we conclude Algorithm~\ref{alg:greed_search_nonoracle} obtains its correct correspondence, i.e.,
$\wh{\pi}(i) = \pi^{*}(i)$, 
with probability at least $1-c_0 \cdot p^{-c_1} - c_2 \cdot n^{-c_3}$ when
$n$ and $p$ are sufficiently large.  	
\end{theorem}

Similar to the oracle case, we can design the algorithm for the 
whole selection matrix recovery by iteratively applying  Algorithm~\ref{alg:greed_search_nonoracle} to each row. 

Comparing with Theorem~\ref{thm:minimax}, 
we conclude Algorithm~\ref{alg:greed_search_nonoracle} reaches the minimax optimal 
convergence rate as the lower bound in Theorem~\ref{thm:minimax} becomes  
\[
\Omega\bracket{n^{\frac{c}{\srank{\bB^{*}}}}}
= \Omega\Bracket{\exp\bracket{\frac{c\log n}{\srank{\bB^{*}}}}}
\stackrel{\cirone}{=} \Omega(1), 
\]
where in $\cirone$ we use the assumption $\srank{\bB^{*}} \gg \log^4 n$.

\begin{remark} 
Notice that we do not require most rows to be 
matched. In fact, 
we allow the number of mismatched 
rows to be of the same order as $n$ (the optimal order), 
i.e., $h_{\textup{max}} \asymp n$. 
A numerical experiment (cf.\ the bottom right of Figure~\ref{fig:permute_row}) suggests our estimator can 
reconstruct the correspondence even when half of the
rows are permuted. 	
\end{remark}


\noindent 
\textbf{Comparison with prior work.} In addition, we would like to compare our results with the
previous works.
We put a detailed comparison in Table~\ref{tab:compare} and briefly discuss our advantages over 
\citet{zhang2020optimal, slawski2019two}, whose 
settings are of the most similarity to ours. 
\par 	
The most noticeable characteristic of our work is its unique ability to 
perform partial correspondence recovery. 
Moreover, we notice certain improvements in the number of required samples 
$n$ and that of maximum allowed mismatched rows $h_{\textup{max}}$. 
For the minimum sample number,
\citet{slawski2019two} require $n \geq \wt{\Omega}(p^2)$ and \citet{zhang2020optimal} require $n \geq \wt{\Omega}(p^{1.5})$. Meanwhile, 
our work improves it to $\wt{\Omega}(p)$. 
Turning to the maximum allowed mismatched rows, 
\citet{slawski2019two} limits it to  
$O(\nfrac{n}{\log n})$, while our work reaches the 
optimal order, i.e., $\wt{O}(n)$. 
In addition, we have a slight advantage over \citet{zhang2020optimal}
in terms of the $\snr$ requirement. While they 
require $\snr \gsim \log n$ even when 
the stable rank $\srank{\bB^*}$ is sufficiently large, we reduce the requirement to 
$\snr \geq \Omega(1)$. 

The only loss is its more stringent requirement 
on $\srank{\bB^{*}}$: we require the stable rank to satisfy $\srank{\bB^{*}} \gg \log^4 n$ while most of the prior works only require $\srank{\bB^{*}} \gg \log n$. 
We conjecture the stringent requirement on 
$\srank{\bB^{*}}$ is due to the proof artifacts rather than inherent 
in the estimator, which hopefully will be fixed with a more delicate analysis. 



 
\subsection{Proof outlines}
In addition, we would like to discuss the proof technique, which is based on 
a modified version of the \emph{leave-one-out} technique~\citep{el2013robust, karoui2018impact, chen2020noisy, sur2019likelihood} and may be of independent
technical interest. 

Denote matrix $\wt{\bB}$ as 
$\bracket{n-h}^{-1}\bX^{\rmt}\bPi^{*}\bX\bB^{*}$.
The proof of Theorem~\ref{thm:nonoracle_single_index_recover_MMV} lies in 
showing 
\begin{align}
\label{eq:greed_optim_condition_context}
\hspace{-4mm}& \langle \bB^{*\rmt}  \bX_{\pi^{*}(i), :}+ \bW_{i, :}, 
(\wt{\bB} +(n-h)^{-1}\bX^{\rmt}\bW)^{\rmt}
\bX_{\pi^{*}(i), :}
\rangle \notag \\
\geq~&
\langle \bB^{*\rmt}  \bX_{\pi^{*}(i), :}+ \bW_{i, :}, 
(\wt{\bB} + (n-h)^{-1}\bX^{\rmt}\bW)^{\rmt}
\bX_{j, :}\rangle.
\end{align} 
For notational conciseness, we define
\[
\term_{1} &= \la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \bW^{\rmt}\bX\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} \ra/(n-h);\\
\term_{2} &= \la \bW_{i, :}, \wt{\bB}^{\rmt}\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} \ra; \\
\term_{3} &= (n-h)^{-1}\la \bW_{i, :}, \bW^{\rmt}\bX\bracket{\bX_{j, :} - \bX_{\pi^{*}(i), :}} \ra; \\
\term_{\textup{tot}}&= \la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \wt{\bB}^{\rmt}
\bracket{\bX_{\pi^{*}(i), :} -  \bX_{j, :}}\ra. 
\]
\noindent	
Then \eqref{eq:greed_optim_condition_context} is equivalent to 
$\term_{\textup{tot}} \geq \term_{1} + \term_{2} + \term_{3}$. The  technical challenge 
comes from the correlation between 
$\wt{\bB}$ and the rows of $\bX$. 
Take $\term_{\textup{tot}}$ as an example. With the definition of 
$\wt{\bB}$, we conclude this term involves Gaussian random variables of form $(\cdot)^4$, whose
behavior is difficult to capture. Similar problems exist in the other three terms too.   

To decouple the correlation thereof, we propose a tailored version of the leave-one-out technique~\citep{el2013robust, karoui2018impact, chen2020noisy, sur2019likelihood}. 
Compared with the prior works using the \textbf{leave-one-out technique},
which creates independence by replacing 
a fixed number of rows/columns with their i.i.d.\ substitutes, our method is 
rather adaptive and requires simultaneous replacement of two to four rows (the specific number is determined by the relations among 
$i, j, \pi^*(i)$, and $\pi^*(j)$).


The analysis can be  divided into
the following three stages. 

\begin{itemize}[leftmargin = *]
\item 
\textbf{Stage I.} 
First, we create i.i.d. copies 
$\bX^{'}_{i, :} \in \RR^p$ of the rows in 
$\bX$. Then, for each row $\pi^{*}(i)$,
we construct $\{\wtminus{\bB}{\pi^*(i)}\}_{\pi^*(i) = 1}^n$~as 
\[
\wtminus{\bB}{\pi^*(i)} =  
(n-h)^{-1}\bigg(& 
\sum_{\substack{k \neq \pi^*(i) \\ k\neq i }} \bX_{\pi(k), :}\bX_{k, :}^{\rmt} \\
+~& \sum_{\substack{k = i \textup{ or} \\  k = \pi^*(i)}} {\bX}^{'}_{\pi(k), :}{\bX}^{'\rmt}_{k, :}\bigg)\bB^{*}.
\]
Easily, we can verify that $\wtminus{\bB}{\pi^*(i)}$ is independent of 
the $\pi^*(i)$th row $\bX_{\pi^*(i), :}$ 
as $\bX_{\pi^{*}(i), :}$ is not contained
in the perturbed sample $\wtminus{\bB}{\pi^*(i)}$.
\par 
Then, we construct perturbed 
copies $\{\wtminus{\bB}{\pi^*(i), j}\}_{\pi^*(i)\neq j}$ for 
every possible pair $(\pi^*(i), j)$. 
In formulae:
\[
\wtminus{\bB}{\pi^*(i), j} = 
(n-h)^{-1}&\bigg( 
\sum_{\substack{k \neq \pi^*(i), j\\ \pi^{*}(k) \neq \pi^*(i),j }} \bX_{\pi(k), :}\bX_{k, :}^{\rmt} \\
+~& \sum_{\substack{k = \pi^*(i) \textup{ or } k = j \textup{ or} \\ k =i
  \textup{ or }\pi^{*}(k) = j}} {\bX}^{'}_{\pi(k), :}{\bX}^{'\rmt}_{k, :}\bigg)\bB^{*}.
\]
As above, we can verify that  
$\wtminus{\bB}{\pi^*(i), j}$ 
is independent of the rows  
$\bX_{\pi^*(i), :}$ and $\bX_{j, :}$, 
$1\leq \pi^{*}(i)\neq j \leq n$.
\item 
\textbf{Stage II.}
We analyze the separate behavior of 
$\term_{\textup{tot}}$, $\term_1$, 
$\term_2$, and $\term_3$. 
The difficulties  
incurred by the correlations between $\wt{\bB}$ and the rows of $\bX$ are tackled via the 
perturbed samples created above. 
To illustrate the procedure, we 
take $\term_{\textup{tot}}$ as an example. First, we rewrite it
as
\[
&\term_{\textup{tot}}
= \underbrace{\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, \wtminus{\bB}{\pi^*(i), j}^{\rmt}
\bracket{\bX_{\pi^{*}(i), :} -  \bX_{j, :}}\ra}_{\term_{\textup{tot}, 1}} \\ 
+~& \underbrace{\la \bB^{*\rmt}  \bX_{\pi^{*}(i), :}, 
\bracket{\wt{\bB} - \wtminus{\bB}{\pi^*(i), j}}^{\rmt}
\bracket{\bX_{\pi^{*}(i), :} -  \bX_{j, :}}\ra}_{\term_{\textup{tot}, 2}}.
\]
For $\term_{\textup{tot}, 1}$, we exploit 
the independence across the rows in 
$\bX$. We first condition on the rows $\bX_{k,:}$ ($k\neq \pi^*(i), j$)
and can view it as a random variable determined 
by $\bX_{\pi^*(i), :}$ and $\bX_{j, :}$, 
which can be analyzed by the standard 
results such as the Hanson--Wright inequality~\citep{vershynin2018high}.  
\par 
For $\term_{\textup{tot}, 2}$, we notice
that 
$\wtminus{\bB}{\pi^*(i), j}$ only differs 
from $\wt{\bB}$ in a finite number of terms, 
which are all related to 
$\bX_{\pi^*(i), :}, \bX_{j, :}$. 
This suggests that almost all terms in the difference  
$(\wtminus{\bB}{\pi^*(i), j} - \wt{\bB})$ 
have been crossed out. 
Thus, we analyze $\term_{\textup{tot}, 2}$
by separately considering 
each non-zero term in $(\wtminus{\bB}{\pi^*(i), j} - \wt{\bB})$ and complete 
the analysis of $\term_{\textup{tot}}$.
For other terms, we follow a similar approach 
and can show
\begin{align}
\label{eq:greed_optim_condition_seperate}
\term_1 \leq~& c_1\sigma(\log n)^{\nfrac{5}{2}}\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}}
\defequal \Delta_1; \notag \\
\term_2 \leq~& c_2\sigma (\log^2 n)\Fnorm{\bB^{*}} \defequal \Delta_2; \notag \\
\term_3 \leq~& c_3\Bracket{\frac{mp(\log n)^2\sigma^2}{n}
+ \sigma^2(\log n)^{2}\sqrt{\frac{mp}{n}}} \defequal \Delta_3; \notag \\
\term_{\textup{tot}} \geq~& 
c_4 \Fnorm{\bB^*}^2 
- (\log^2 n)(\log n^2 p^3)\sqrt{\frac{p}{n}}\Fnorm{\bB^{*}}^2
 \notag \\
 -~& \frac{p(\log n)^{\nfrac{3}{2}}}{n} \Fnorm{\bB^{*}}^2
- \frac{(\log n)\Fnorm{\bB^{*}}^2}{\sqrt{\srank{\bB^{*}}}}.
\end{align}

\item 
\textbf{Stage III.} Under the
settings of Theorem~\ref{thm:nonoracle_single_index_recover_MMV}, 
we complete the proof by showing the 
right-hand side of $\term_{\textup{tot}}$ in 
\eqref{eq:greed_optim_condition_seperate} is 
no less than $\Delta_1 + \Delta_2 + \Delta_3$, which further leads to the relation such that
\[
\term_{\textup{tot}}
\geq \Delta_1  + \Delta_2 + \Delta_3,
\] 
holds with high probability. 
\end{itemize}


Due to the 
space limit, we omit the technical details and defer them to 
the supplementary material. 
In the next section, we will present
numerical 
experiments to verify our theorems. 




\begin{figure*}[!ht]
\centering
\mbox{
\includegraphics[width = 2.1in]{figs/oracle_n500p100_results}
\includegraphics[width = 2.1in]{figs/oracle_n750p150_results}
\includegraphics[width = 2.1in]{figs/oracle_n1000p200_results}
}
\mbox{
\includegraphics[width = 2.1in]{figs/nonoracle_n500p100_results}
\includegraphics[width = 2.1in]{figs/nonoracle_n750p150_results}
\includegraphics[width = 2.1in]{figs/nonoracle_n1000p200_results}
}
\caption{Impact of length $p$: the number of mismatched rows is fixed as $h = \nfrac{n}{4}$.}
\label{fig:len_impact}
\end{figure*}

 \begin{figure*}[!ht]
\centering
\mbox{
\includegraphics[width = 2.1in]{figs/oracle_gauss_n800p100_nprate}
\includegraphics[width = 2.1in]{figs/oracle_gauss_n800p200_nprate}
\includegraphics[width = 2.1in]{figs/oracle_gauss_n800p300_nprate}
}

\mbox{
\includegraphics[width = 2.1in]{figs/nonoracle_gauss_n800p100_nprate}
\includegraphics[width = 2.1in]{figs/nonoracle_gauss_n800p200_nprate}
\includegraphics[width = 2.1in]{figs/nonoracle_gauss_n800p300_nprate}
}
\caption{Impact of $\nfrac{n}{p}$ ratio: the number of mismatched rows is fixed as $h = \nfrac{n}{4}$.}
\label{fig:nprate_impact}
\end{figure*}





\section{Numerical Experiments}
% =================================
\label{sec:numeric_expr}

\begin{figure*}[!ht]
\centering
\mbox{
\includegraphics[width = 2.1in]{figs/oracle_h150}
\includegraphics[width = 2.1in]{figs/oracle_h225}
\includegraphics[width = 2.1in]{figs/oracle_h300}
}

\mbox{
\includegraphics[width = 2.1in]{figs/nonoracle_h150}
\includegraphics[width = 2.1in]{figs/nonoracle_h225}
\includegraphics[width = 2.1in]{figs/nonoracle_h300}
%\includegraphics[width = 1.65in]{figs/oracle_gauss_n800p400_nprate}
}
\caption{Impact of mismatched rows: the sample number $n$ is fixed as $600$ while the signal length $p$ is fixed as $100$.}
\label{fig:permute_row}
\end{figure*}

This section presents the numerical results to validate our 
claims. To evaluate the correspondence recovery, we adopt the 
recovery rate of the whole 
selection matrix, namely, $\Prob(\wh{\bPi} = \bPi^{*})$, rather 
than that of the rows, i.e., $\Prob(\pi^{*}(i) = \wh{\pi}(i))$.
The underlying reason is that one single row's correspondence may 
still be recovered correctly % with non-negligible probability 
even when $\snr$ is zero since numerous rows remain matched. However, due to the lack of signal strength, or equivalently, 
small $\snr$, the whole selection matrix 
cannot be reconstructed in such a case. This suggests $\Prob(\wh{\bPi} = \bPi^{*})$ 
may be a better quantity to measure the performance. 

In the following context, we separately study the impact of signal length 
$p$, the ratio $\nfrac{n}{p}$ between sample number and
 signal length, and the number of mismatched rows $h$ on the 
 correspondence recovery. Note that the plots have different $x$-axis ranges, as the phase transitions
 happen at different points.  
 
\subsection{Impact of signal length $p$}
We separately investigate the impact of signal length on the 
oracle estimator and non-oracle estimator
when $p$ increases.   

\textbf{Experiment setup.} We set the signal length $p$ to be  
$\{100, 150, 200\}$ and let $n = 5p$. The number of 
mismatched rows $h$ is set to be $n/4$ and the 
stable rank $\srank{\bB^{*}}$ is set to be 
$\set{0.09n, 0.12n, 0.16n, 0.2n}$.

\textbf{Results discussion.} The  
numerical results are put in Figure~\ref{fig:len_impact}, from 
which we observe a sharp 
transition of the correspondence recovery once $\snr$ 
exceeds a certain threshold.
Compared with the thresholds 
for the oracle case, we find the thresholds for the non-oracle case 
are much larger. 

In addition, we notice the threshold 
shrinks with the increasing $n$, $p$, and $\srank{\bB^{*}}$.
Take the oracle estimator for example.
When $n$ increases from $500$ to $1000$, the corresponding
phase transition threshold reduces from $1$ to $0.7$. 
A similar phenomenon also appears in the non-oracle case. 

\subsection{Impact of ratio $n/p$}
In this subsection, we investigate the impact of the ratio 
$n/p$ on the reconstruction performance. 

\textbf{Experiment setup}. We fix the sample number $n$ to be $800$ and vary $p$ within 
$\set{100, 200, 300}$. The stable rank 
$\srank{\bB^{*}}$ is set to be $\set{60, 80, 100}$ 
and the number of mismatched rows $h$ is fixed as $n/4$. 


\textbf{Results discussion}. Corresponding numerical 
results are put in Figure~\ref{fig:nprate_impact}. 
For the oracle case, we find the ratio $\nfrac{n}{p}$ hardly 
has any impact on the reconstruction performance: in all cases,
the selection matrix can be recovered with positive probability when 
$\snr \geq 0.7$ and is reliably reconstructed (with almost $100\%$ correctness)
when $\snr \geq 1.08$. 
Meanwhile, for the non-oracle case, we can see that 
a lower $\nfrac{n}{p}$ ratio makes it harder to reconstruct 
the selection matrix. 
For example, when $\nfrac{n}{p} = 8$, we have that
the correct rate becomes positive once $\snr \geq 0.9$, 
a little larger than the corresponding value for 
the oracle case. When $\nfrac{n}{p}$ decreases to $\nfrac{n}{p} = 4$, this value  
increases to $1.4$. When $\nfrac{n}{p}$ decreases to 
$\nfrac{n}{p} = \nfrac{8}{3}$, 
this value further increases to $2.5$. When $\nfrac{n}{p}$ further 
decreases below
$2$ (not plotted), we find it impossible to reconstruct the selection 
matrix even with infinite $\snr$, i.e., noiseless sensing relation,  
which is rigorously proved in \citet{unnikrishnan2018unlabelled}.

In addition, we notice that a lower $\nfrac{n}{p}$ ratio will 
put more stringent  
conditions on $\srank{\bB^{*}}$. When $\nfrac{n}{p} = 8$, 
we can reliably recover the selection matrix with 
$\srank{\bB^{*}}$ being $60$. When $\nfrac{n}{p}$ decreases 
to $8/3$, we find the selection matrix can hardly be reconstructed. 








\subsection{Impact of mismatched rows}
This subsection studies the impact of mismatched rows. 

\textbf{Experiment setup}. We fix 
the signal length $p$ to be $100$,  the sample number $n$ to be 
$600$, and the stable rank $\srank{\bB^{*}}$ to be $\set{60, 80, 100}$. 
Then we vary the number of mismatched rows $h$ to be $\set{n/4, 3n/8, n/2}$. 

\textbf{Results discussion}. Corresponding results are shown in Figure~\ref{fig:permute_row}. Similar to the discussion w.r.t. 
the ratio $\nfrac{n}{p}$, in the oracle case  we find the performances are almost identical 
for different numbers of mismatched rows. For the
non-oracle case, however, the number of mismatched rows has a negative influence on the 
correspondence recovery: more mismatched rows lead to poorer performance. 
When $\nfrac{h}{n} = \nfrac{1}{4}$, we have the recovery rate become positive
when $\snr\geq 1.1$; when $\nfrac{h}{n} = \nfrac{3}{8}$, we have 
this threshold value increase to $1.2$; and when $\nfrac{h}{n} = \nfrac{1}{2}$, 
we have this threshold value further jump to $1.5$, which exhibits 
a trend similar to that observed 
when $\nfrac{n}{p}$ decreases. 

Moreover, we notice a higher $\nfrac{h}{n}$
ratio puts more stringent requirements on the stable rank $\srank{\bB^{*}}$
for a reliable recovery of permutation. 
%In Figure~\ref{fig:permute_row}, we can 
%obtain the correct permutation matrix when 
%$\srank{\bB^*} = 60$ and $\nfrac{h}{n} = \nfrac{1}{4}$.
%When $\nfrac{h}{n} = \frac{1}{2}$, we may need a higher 
%stable rank $\srank{\bB^*}$. 
A similar phenomenon has also been 
observed in Figure~\ref{fig:nprate_impact}.

In addition, we notice the allowed number of mismatched rows is  
affected by the $\nfrac{n}{p}$ ratio: a larger $\nfrac{n}{p}$ allows 
more mismatched rows, in other words, a larger proportion $\nfrac{h_{\textup{max}}}{n}$. 



\section{Conclusion}  
\label{sec:conclude}
This paper considers the correspondence recovery for the unlabeled linear regression. 
Depending on whether the 
signal $\bB^{*}$ is known or not, we propose 
separate estimators for each case. To the best of our knowledge, 
these are the first estimators that can support partial correspondence recovery, 
where only a proportion of the rows' correspondences rather than the whole selection matrix is to be reconstructed.
Compared with the previous works on permuted linear regression, our estimators apply to a 
broader family of matrices, i.e., selection matrices rather than permutation matrices.   
Moreover, we prove both estimators are mini-max optimal. 
Notably, in analyzing the non-oracle estimator, 
we tailor the leave-one-out technique to
an adaptive
``leave-multiple-out'' technique,  
which involves the simultaneous replacement of 
multiple (non-deterministic) rows and may be of independent technical interest. Moreover, numerical experiments
are presented to confirm our claims. 

\bibliography{zhang_696}



\end{document}




