\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{bm}  
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{wrapfig}
\usepackage{enumitem}
\usepackage{algorithm,algorithmic}
\input{math_command.tex}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\usepackage{natbib}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{tang_88}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example




\title{Low-Rank Matrix Recovery with Unknown Correspondence\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Zhiwei Tang}{}}
\author[1]{Tsung-Hui Chang}
\author[2]{Xiaojing Ye}
\author[1]{Hongyuan Zha}
% Add affiliations after the authors
\affil[1]{%
    The Chinese University of Hong Kong, Shenzhen
}
\affil[2]{%
    Georgia State University
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


\appendix
\section{Proof for the Theoretical Results}
\label{app:proof}
\begin{proof}[Proof of Proposition \ref{prop:MR}]
  We denote by $a_1,..,a_{r_A}$ a basis of the column space of $A$. We can extend it to a basis of the column space of $M$, denoted by $a_1,..,a_{r_A},b_1,...,b_{r-r_A}$. In this way, there must exist a matrix $Q\in \Rbb^{r\times m_B}$ such that $$B={\bm[}a_1,..,a_{r_A},b_1,...,b_{r-r_A}{\bm ]}Q.$$ Hence, we have $$PB={\bm[}Pa_1,..,Pa_{r_A},Pb_1,...,Pb_{r-r_A}{\bm ]}Q.$$
    Similarly, there must exist a matrix $T\in \Rbb^{r_A\times m_A}$ such that $$A={\bm[}a_1,..,a_{r_A}{\bm ]}T.$$
    Hence, we obtain that $${\bm [}A,PB{\bm]}={\bm[}a_1,..,a_{r_A},Pa_1,..,Pa_{r_A},Pb_1,...,Pb_{r-r_A}{\bm ]}\begin{bmatrix}
        T &  0 \\
        0 & Q \\
        \end{bmatrix}.$$

        Now, we have
        \begin{align}
            \label{eq:MR2}
            &{\rm rank}({\bm [}A,PB{\bm]})\leq{\rm rank}({\bm[}a_1,..,a_{r_A},Pa_1,..,Pa_{r_A},Pb_1,...,Pb_{r-r_A}{\bm ]}) \notag \\
            & \leq {\rm rank}({\bm[}a_1,..,a_{r_A},Pa_1,..,Pa_{r_A}{\bm ]})+r-r_A\notag\\
            &= {\rm rank}({\bm [}a_1,..,a_{r_A},Pa_1,..,Pa_{r_A}{\bm]}\begin{bmatrix}
                I_{r_A} & -I_{r_A}  \\
                0 & I_{r_A}  \\
                \end{bmatrix}) +r-r_A\notag\\
            &\leq r_A+r-r_A+{\rm rank}({\bm [}Pa_1-a_1,..,Pa_{r_A}-a_{r_A} {\bm]}).
        \end{align}
    
Now we denote the cycles in $\pi_P$ with length greater than 1 as $C_1,...,C_{\Cc(\pi_P)}$, and $\zeta_1,...,\zeta_{n-H(\pi_P)}$ as the indexes that are not in any one of $C_1,...,C_{\Cc(\pi_P)}$. We construct a matrix $Y\in \Rbb^{(n+\Cc(\pi_P)-H(\pi_P))\times n}$ as: 
\begin{align*}
  &Y(i,j)=1\text{ if }j=\zeta_i \text{ else } Y(i,j)=0,\ \text{for }i=1,...,(n-H(\pi_P));\\
  &Y(i,j)=1\ \forall j\in C_{i-n+H(\pi_P)}, \text{ and } Y(i,j)=0\ \forall j\notin C_{i-n+H(\pi_P)},\\ &\ \ \ \ \ \ \ \ \text{for }i=(n-H(\pi_P)+1),...,(n+\Cc(\pi_P)-H(\pi_P)).\ 
\end{align*} 

It can be verified that 
\begin{align*}
  Y(Pa_i-a_i)=0,\ i=1,...,r_A.
\end{align*}

We denote the null space of $Y$ as Null$(Y)=\{x\in\Rbb^n|Yx=0\}$. From the construction of $Y$ we can see that dim$($Null$(Y))= H(\pi_P)-\Cc(\pi_P)$.  Hence we have 
\begin{align}
  \label{eq:MR3}
  {\rm rank}({\bm [}Pa_1-a_1,..,Pa_{r_A}-a_{r_A} {\bm]})\leq  H(\pi_P)-\Cc(\pi_P).
\end{align}

        On the other hand, we have 
        \begin{align}
            \label{eq:MR4}
            {\rm rank}({\bm [}A,PB {\bm]})\leq {\rm rank}(A)+{\rm rank}(PB)={\rm rank}(A)+{\rm rank}(B)=r_A+r_B.
        \end{align}
        Combining \eqref{eq:MR2}, \eqref{eq:MR3} and \eqref{eq:MR4}, we can obtain \eqref{eq:MR1}.
\end{proof}

Following the proof of Proposition \ref{prop:MR}, it is easy to show a similar result for the case with multiple permutations, which is summarized as Corollary \ref{col:multiple_rank}.

\begin{corollary}
  \label{col:multiple_rank}
  For the matrix $M={\bm [}A,B_1,..,B_d{\bm ]}\in \Rbb^{n\times m} $ with ${\rm rank}(M)=r$, ${\rm rank}(A)=r_A$, and ${\rm rank}(B_i)=r_{B_i}$, $i=1,...,d$, we have $\forall P_1,...,P_d\in\Pc_n$,
  \begin{align}
      {\rm rank}({\bm [}A,P_1B_1,...,P_dB_d{\bm]})\leq \min\Big\{n,m,r_A+\sum_{i=1}^d r_{B_i},r+\sum_{i=1}^d \big(H(\pi_{P_i})-\Cc(\pi_{P_i})\big)\Big\}.
  \end{align} 
\end{corollary} 




\begin{proof}[Proof of Proposition \ref{prop:attained_rank}]
  To prove Proposition \ref{prop:attained_rank}, we need an important lemma on measure theory from \citet{halmos2013measure}.
\begin{lemma}
  \label{lm:measure}
  Let $p(x)$ be a polynomial on $\Rbb^n$. If there exists  a $x_0\in\Rbb^n$ such that $p(x_0)\neq 0$, then the Lebesgue measure of the set $\{x|p(x)=0\}$ is 0.
\end{lemma}

   $\forall P\in \Pc_n$, we define the polynomial on $\Rbb^{n\times r}\otimes \Rbb^{r\times m}$ as $$p^r_P(R,E)=\sum_{S\in \Sc_{r}([A,PB])}{\rm det}(S)^2,$$ where det($\cdot$) is the determinant of a matrix, and $\Sc_{r}(X)$ is the set of all $r\times r$ sub-matrices in $X$. We denote that $r_P=\min\{2r,r+H(\pi_P)-\Cc(\pi_P)\}$. We can see that ${\rm rank}({\bm [}A,PB{\bm]})\geq r_P$ if and only if $p^{r_P}_P(R,E)>0$. Therefore, from Lemma \ref{lm:measure} and Proposition \ref{prop:MR} we can conclude that if there exist two matrices $R_0\in \Rbb^{n\times r}$ and $E_0\in \Rbb^{r\times m}$ such that $p^{r_P}_P([R_0,E_0])>0$, then ${\rm rank}({\bm [}A,PB{\bm]})=r_P$ holds with probability 1. In this way, we only need to construct such $R_0$ and $E_0$ for every $P\in \Pc_n$. For simplicity, we denote that $k=H(\pi_P)-\Cc(\pi_P)$. We will discuss how to construct such $R_0$ and $E_0$ for the two cases $0<k\leq n-r$ and $k> n-r$, respectively.

  (1) If $0<k\leq n-r$:

  We construct the matrix $Y\in \Rbb^{(n+\Cc(\pi_P)-H(\pi_P))\times n}$ in the same way as in the proof of Proposition \ref{prop:MR}. 
  % Now we denote the column space of Y as col$(Y)=$span$\{\phi_1,...,\phi_l\}$, where $\phi_1,..,\phi_l\in\Rbb^n$ are linearly independent, and $l=n-k\geq r$. 
  Firstly, we show that Null$(Y)=$col$(P-I)$.

  col$(P-I)\subseteq$Null$(Y)$: We can verify that $Y(P-I)=0$. 
  
  Null$(Y)\subseteq$col$(P-I)$: Taking orthogonal complements, this is equivalent to proving that Null$(P-I)\subseteq$col$(Y^\top)$ (note that Null$(P^\top-I)=$Null$(P-I)$ for a permutation matrix $P$). Now we have $Px=x$, $\forall x\in$Null$(P-I)$. It can be verified that if $Px=x$, then we must have
    $x(s)=x(q)$  if $s$  and $q$ belong to the same cycle  $C_i$, where $C_i$ is one of the cycles in $C_1,...,C_{\Cc(\pi_P)}$. By the definition of $Y$, we can see that  $x\in \text{col}(Y^\top)$.

    Now we know that rank$(P-I)=$dim$(\text{Null}(Y))=k$. We denote the eigenvectors of $P-I$ with non-zero eigenvalues as $\phi_1,...,\phi_k$, and the eigenvectors with zero eigenvalues as $\phi_{k+1},...,\phi_n$. Now we have $(P-I)\phi_i=\lambda_i \phi_i$ with $\lambda_i\neq 0$ for $i=1,...,k$ and $(P-I)\phi_i=0$ for $i=k+1,...,n$.
  
  We construct the matrices $R_0$ and $E_0$ as  \begin{align*}
    &R_0=[\phi_1+\phi_{k+1},\phi_{\min\{2,k\}}+\phi_{k+2},...,\phi_{\min\{r,k\}}+\phi_{k+r}],\\
    &E_0=[I_r,\mathbf{0}_{r\times (m_A-r)},I_r,\mathbf{0}_{r\times (m_B-r)}].
  \end{align*}
Now we have \begin{align*}
  &A=[\phi_1+\phi_{k+1},\phi_{\min\{2,k\}}+\phi_{k+2},...,\phi_{\min\{r,k\}}+\phi_{k+r},\mathbf{0}_{n\times (m_A-r)}],\\
  &B=[\phi_1+\phi_{k+1},\phi_{\min\{2,k\}}+\phi_{k+2},...,\phi_{\min\{r,k\}}+\phi_{k+r},\mathbf{0}_{n\times (m_B-r)}],
\end{align*} since $[A,B]=R_0E_0$.
Therefore, we have
\begin{align*}
  &{\rm rank}({\bm [}A,PB{\bm]})={\rm rank}({\bm[}\phi_1+\phi_{k+1},...,\phi_{\min\{r,k\}}+\phi_{k+r},\lambda_1 \phi_1,...,\lambda_{\min\{r,k\}} \phi_{\min\{r,k\}}{\bm ]})\\
  &={\rm rank}({\bm[}\phi_{k+1},...,\phi_{k+r},\phi_1,..,\phi_{\min\{r,k\}}{\bm ]})\\
  &=r+\min\{k,r\}=\min\{2r,r+k\}.
\end{align*}

Now ${\rm rank}({\bm [}A,PB{\bm]})=r_P$ by this construction of $R_0$ and $E_0$. Hence $p^{r_P}_P([R_0,E_0])>0$.

(2) If $k> n-r$:

We denote that the length of a cycle $C$ as len$(C)$, and denote the cycle with maximum length among the $C_1,...,C_{\Cc(\pi_P)}$ as $C^*$. Now we have \begin{align*}
  {\rm len}(C^*)\geq \frac{H(\pi_P)}{\Cc(\pi_P)}\geq \frac{n}{n-k}>\frac{n}{r}\geq 2r.
\end{align*}
To simplify the notations, we assume that the cycle $C^*$ permute the first $j$ numbers, i.e., $$C^*=(123...(j-2)(j-1)j),$$ where $j> 2r$. We define the vector $u$ as $u=[1,2,3,...,j-2,j-1,j,0,...,0]^\top \in \Rbb^{n}$, and denote the corresponding permutation matrix to  $C^*$ as $P_{*}\in\Pc_n$. We construct the matrices $R_0$ and $E_0$ as  \begin{align*}
  &R_0=\left[
    \begin{matrix}
      u  & P_{*}^2u & \hdots & P^{2r-2}_{*}u\\
     \end{matrix}
     \right],\\
  &E_0=[I_r,\mathbf{0}_{r\times (m_A-r)},I_r,\mathbf{0}_{r\times (m_B-r)}].
\end{align*}
Now we have \begin{align*}
&A=[u  , P_{*}^2u , \hdots , P^{2r-2}_{*}u,\mathbf{0}_{n\times (m_A-r)}],\\
&B=[u  , P_{*}^2u , \hdots , P^{2r-2}_{*}u,\mathbf{0}_{n\times (m_B-r)}].
\end{align*}
Therefore, we have
\begin{align*}
  &{\rm rank}({\bm [}A,PB{\bm]})={\rm rank}({\bm[}u  , P_{*}u , \hdots , P^{2r-1}_{*}u{\bm ]})=2r,
\end{align*} because now ${\bm[}u  , P_{*}u , \hdots , P^{2r-1}_{*}u{\bm ]}$ is a circulant matrix. Now ${\rm rank}({\bm [}A,PB{\bm]})=r_P=2r$ by this construction of $R_0$ and $E_0$. Hence $p^{r_P}_P([R_0,E_0])>0$.
\end{proof}





\begin{proof}[Proof of Proposition \ref{prop:error_bound}]
  To prove Proposition \ref{prop:error_bound}, we need to derive a series of results. We first start with a very important inequality w.r.t.\ the nuclear norm.

  \begin{proposition}
    Let $P$ be a permutation matrix, then,
      \label{prop:bound_diff_nuclear}
      \begin{align}
        \label{p:bound_diff_nuclear}
        \|A\|_*+\|B\|_*\geq \|[A,PB]\|_* \geq \frac{\|A\|_*+\|B\|_*}{\|[U_AV_A^\top,PU_BV_B^\top]\|}\geq \frac{\|A\|_*+\|B\|_*}{\sqrt 2}.
      \end{align}
    \end{proposition}

    Based on \eqref{p:bound_diff_nuclear}, the general idea is that under the Assumptions \ref{asp:asp1}, we will have $\|M\|_*\approx \frac{\|A\|_*+\|B\|_*}{\sqrt 2}$ and $\|[U_AV_A^\top,PU_BV_B^\top]\|\to 1$ as $H(\pi_P)$ increases.

    Firstly, we show that under the Assumptions \ref{asp:asp1}, the nuclear norm of the original matrix $M$ will reach the lower bound in \eqref{p:bound_diff_nuclear} approximately, which is summarized as Lemma \ref{lm:core1}.
\begin{lemma}
  \label{lm:core1}
  Under the Assumptions \ref{asp:asp1}, we have 
  \begin{align}
    \|M\|_* \leq (\|A\|_*+\|B\|_*)/\sqrt 2 +(\sqrt 2 +1)\epsilon_1r+\epsilon_2\max\{\|A\|_*,\|B\|_*\}.
  \end{align}
\end{lemma}

Then, we  show that under the Assumptions \ref{asp:asp1}, $\|[U_AV_A^\top,PU_BV_B^\top]\|\to 1$ as $H(\pi_P)$ increases, which is summarized as Lemma \ref{lm:core2}.

\begin{lemma}
  \label{lm:core2}
  Under the Assumptions \ref{asp:asp1}, we have 
  \begin{align}
    \|[U_AV_A^\top,PU_BV_B^\top]\|\leq \sqrt{2-H(\pi_P)\epsilon_3^2/2} + \sqrt{T}\epsilon_2.
  \end{align}
\end{lemma}

    Finally, we  need a classical result on the tail bound for the operator norm of Gaussian matrix, whose proof can be found in \citep{wainwright2019high}.

\begin{lemma}
  \label{lm:tail_bound}
Consider the random matrix $W\in\Rbb^{n\times m}$ whose elements follow $\Nc(0,\sigma^2)$ i.i.d. For any $\delta>0$, we have \begin{align}
  \label{p:tail_bound}
  \|W\|\leq \sqrt L(2+\delta)\sigma
\end{align} holds with probability greater than $1-2\exp\{\frac{-L\delta^2}{2}\}$, where $L=\max\{n,m\}$.
\end{lemma}

  Based on Lemma \ref{lm:tail_bound}, we have 
  \begin{align*}
    \|W\|_*\leq L\|W\|\leq \sqrt{DL\sigma}
  \end{align*} holds with probability greater than $1-2\exp\{-\frac{D}{8L\sigma}\}$.

  From Proposition \ref{prop:bound_diff_nuclear}, Lemma \ref{lm:core1} and Lemma \ref{lm:core2} we know that, for any $P\in\Pc_n$ whose $H(\pi_P)$ satisfies
  \begin{align*}
    & \frac{D}{\sqrt{2-\frac{H(\pi_P)\epsilon_3^2}{2}}+\sqrt{T}\epsilon_2}-\|W\|_*
    > \frac{D}{\sqrt{2}}+(\sqrt{2}+1)\epsilon_1r+\epsilon_2N+\|W\|_*,
  \end{align*} we must have 
\begin{align*}
  \|[A_o,PB_o]\|_*&\geq \|[A,PB]\|_* - \|W\|_* \\
    &\geq \frac{D}{\sqrt{2-\frac{H(\pi_P)\epsilon_3^2}{2}}+\sqrt{T}\epsilon_2}-\|W\|_*\\
    & > \frac{D}{\sqrt{2}}+(\sqrt{2}+1)\epsilon_1r+\epsilon_2N+\|W\|_*\\
    &\geq \|[A,B]\|_*+\|W\|_*\geq \|[A_o,B_o]\|_*.
\end{align*}

Therefore, with probability greater than $1-2\exp\{-\frac{D}{8L\sigma}\}$, if $H(\pi_P)$ satisfies that \begin{align*}
  \frac{D}{\sqrt{2-\frac{H(\pi_p)\epsilon_3^2}{2}}+\sqrt{T}\epsilon_2}
    > \frac{D}{\sqrt{2}}+(\sqrt{2}+1)\epsilon_1r+\epsilon_2N+2\sqrt{DL\sigma},\tag{@}\label{p:ax6}
\end{align*} we have $\|[A_o,PB_o]\|_*>\|[A_o,B_o]\|_*$. Now we simplify \eqref{p:ax6} as
\begin{align*}
  &\frac{D}{\sqrt{2-\frac{H(\pi_p)\epsilon_3^2}{2}}+\sqrt{T}\epsilon_2}
    > \frac{D}{\sqrt{2}}+(\sqrt{2}+1)\epsilon_1r+\epsilon_2N+2\sqrt{DL\sigma}\\
    \Leftrightarrow\ & \sqrt{2-\frac{H(\pi_p)\epsilon_3^2}{2}} < \frac{\sqrt{2}D}{D+(\sqrt{2}+2)\epsilon_1r+\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}} - \sqrt{T}\epsilon_2.
\end{align*}
It can be verified that $$\frac{\sqrt{2}D}{D+(\sqrt{2}+2)\epsilon_1r+\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}} - \sqrt{T}\epsilon_2>0$$ from the condition on $\epsilon_1$, $\epsilon_2$ and $\sigma$.

Therefore, we have 
\begin{align*}
 & \sqrt{2-\frac{H(\pi_p)\epsilon_3^2}{2}} < \frac{\sqrt{2}D}{D+(\sqrt{2}+2)\epsilon_1r+\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}} - \sqrt{T}\epsilon_2\\
    \Leftrightarrow\ & H(\pi_P)> \frac{2}{\epsilon_3^2}\bigg(2-(\frac{\sqrt{2}D}{D+(\sqrt{2}+2)\epsilon_1r+\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}} - \sqrt{T}\epsilon_2)^2 \bigg).
\end{align*}

Since $P^*$ is the optimal solution to \eqref{p:nuclear_norm}, we must have  \begin{align*}
  \|[A_o,P^*\tilde P B_o]\|_*\leq \|[A_o,B_o]\|_*.
\end{align*} Besides,  $P^*\tilde P$ is also a permutation matrix, we denote its corresponding permutation as $\hat \pi$. Now we have 
\begin{align*}
  d_H(\pi_*,\tilde \pi)=H(\hat \pi)\leq \frac{2}{\epsilon_3^2}\bigg(2-(\frac{\sqrt{2}D}{D+(\sqrt{2}+2)\epsilon_1r+\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}} - \sqrt{T}\epsilon_2)^2 \bigg).
\end{align*}

\end{proof}

The proofs of the auxiliary results used in the proof of Proposition \ref{prop:error_bound} are provided below.

\begin{proof}[Proof of Proposition \ref{prop:bound_diff_nuclear}]
  Since $\|\cdot\|_*$ is a norm, we have \begin{align*}
    \|[A,PB]\|_*=\|[A,\mathbf{0}]+[\mathbf{0},PB]\|_*\leq \|A\|_*+\|PB\|_*=\|A\|_*+\|B\|_*.
  \end{align*}
  Then since $\|\cdot\|_*$ is the dual norm of $\|\cdot\|$, we have 
  \begin{align*}
    \|[A,PB]\|_*&=\sup_{\|Q\|\leq 1} \langle [A,PB],Q \rangle\\
    &\geq \langle [A,PB],\frac{[U_AV_A^\top,PU_BV_B^\top]}{\|[U_AV_A^\top,PU_BV_B^\top]\|} \rangle\\
    &=\frac{\|A\|_*+\|B\|_*}{\|[U_AV_A^\top,PU_BV_B^\top]\|}.
  \end{align*}
  Finally, we have 
  \begin{align*}
    \|[U_AV_A^\top,PU_BV_B^\top]\|&=\sup_{\substack{x\in\Rbb^m\\\|x\|\leq 1}} \|[U_AV_A^\top,PU_BV_B^\top]x\|\\
    &=\sup_{\substack{x_1\in\Rbb^{m_A},x_2\in\Rbb^{m_B}\\\|[x_1^\top,x_2^\top]\|\leq 1}} \|[U_AV_A^\top x_1,PU_BV_B^\top x_2]\|\\
    &\leq \sup_{\substack{x_1\in\Rbb^{m_A},x_2\in\Rbb^{m_B}\\\|[x_1^\top,x_2^\top]\|\leq 1}} \|U_AV_A^\top x_1\|+\|PU_BV_B^\top x_2\|\\
    &\leq \sup_{\substack{x_1\in\Rbb^{m_A},x_2\in\Rbb^{m_B}\\\|[x_1^\top,x_2^\top]\|\leq 1}} \|x_1\|+\|x_2\|=\sqrt{2}.
  \end{align*}
\end{proof}

\begin{proof}[Proof of Lemma \ref{lm:core1}]
  If $r_A\geq r_B$, we have \begin{align*}
    \|M\|_*&=\|[U_A\Sigma_A V_A^\top,U_B\Sigma_B V_B^\top]\|_*\\
    &=\|[U_A\Sigma_A V_A^\top,[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]\Sigma_B V_B^\top]+\\ &\ \ \ \ \ \ \ \ \ \ [\mathbf{0},[u_B^1-u_A^1,...,u_B^T-u_A^T,u_B^{T+1},...,u_B^r]\Sigma_B V_B^\top]\|_*\\
    &\leq \|[U_A\Sigma_A V_A^\top,[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]\Sigma_B V_B^\top]\|_*+\\ &\ \ \ \ \ \ \ \ \ \ \|[u_B^1-u_A^1,...,u_B^T-u_A^T,u_B^{T+1},...,u_B^r]\Sigma_B V_B^\top\|_*\\
    &\leq \|[U_A\Sigma_A V_A^\top,[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]\Sigma_B V_B^\top]\|_*+\epsilon_2 \|B\|_*\\
    &= \|[U_A\Sigma_A V_A^\top,U_A\Sigma_B V_B^\top]\|_*+\epsilon_2 \|B\|_*.\tag{*} \label{p:ax1}
  \end{align*}
  We denote by ${\rm trace}(\cdot)$ the trace of a matrix. One property of the nuclear norm is $$\|A\|_*={\rm trace}(\sqrt{AA^\top}).$$
  Then we have  \begin{align*}
    \|[U_A\Sigma_A V_A^\top,U_A\Sigma_B V_B^\top]\|_*&={\rm trace}(\sqrt{U_A(\Sigma_A^2+\Sigma_B^2)U_A^\top})\\
&=\sum_{i=1}^r \sqrt{(\sigma_A^{i})^2+(\sigma_B^{i})^2}\\
&\leq \sum_{i=1}^r \frac{\sigma_A^{i}+\sigma_B^{i}}{\sqrt 2}+(\sqrt{(\sigma_A^{i})^2+(\sigma_B^{i})^2}-\frac{\sigma_A^{i}+\sigma_B^{i}}{\sqrt 2})\\
&\leq \sum_{i=1}^r \frac{\sigma_A^{i}+\sigma_B^{i}}{\sqrt 2}+(\sqrt{(\sigma_A^{i})^2+(\sigma_A^{i}+\epsilon_1)^2}-\frac{2\sigma_A^{i}-\epsilon_1}{\sqrt 2})\\
&\leq \frac{\sqrt 2 \epsilon_1 r}{2} + \frac{\|A\|_*+\|B\|_*}{\sqrt 2}+ \\ & \ \ \ \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  \sum_{i=1}^r \frac{2\sigma_A^{i}\epsilon_1+\epsilon_1^2}{\sqrt{2(\sigma_A^{i})^2+2\sigma_A^{i}\epsilon_1+\epsilon_1^2}+\sqrt{2(\sigma_A^{i})^2}}\\
& \leq \frac{\sqrt 2 \epsilon_1 r}{2} + \frac{\|A\|_*+\|B\|_*}{\sqrt 2}+   \sum_{i=1}^r \frac{\sqrt 2 \epsilon_1}{2}+\epsilon_1\\
&=\frac{\|A\|_*+\|B\|_*}{\sqrt 2}+ (\sqrt{2}+1)\epsilon_1 r.\tag{**}
\label{p:ax2}
  \end{align*}

  Combining \eqref{p:ax1} and \eqref{p:ax2}, we have 
  \begin{align*}
    \|[A,B]\|_*\leq \frac{\|A\|_*+\|B\|_*}{\sqrt 2}+ (\sqrt{2}+1)\epsilon_1 r + \epsilon_2 \|B\|_*.
  \end{align*}
Similarly, if $r_B\geq r_A$, we have \begin{align*}
  \|[A,B]\|_*\leq \frac{\|A\|_*+\|B\|_*}{\sqrt 2}+ (\sqrt{2}+1)\epsilon_1 r + \epsilon_2 \|A\|_*.
\end{align*}
Combining them together, we have 
\begin{align*}
  \|[A,B]\|_*\leq \frac{\|A\|_*+\|B\|_*}{\sqrt 2}+ (\sqrt{2}+1)\epsilon_1 r + \epsilon_2 \max \{\|A\|_*,\|B\|_*\}.
\end{align*}
\end{proof}


\begin{proof}[Proof of Lemma \ref{lm:core2}]
  Firstly, if $r_A\geq r_B$ we have 
  \begin{align*}
    \|[U_AV_A^\top,PU_BV_B^\top]\|&\leq\|[U_AV_A^\top,P[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]V_B^\top]\|+\\ &\ \ \ \ \ \ \ \ \ \ \ \ \ \  \|[0,P[u_B^1-u_A^1,...,u_B^T-u_A^T,\mathbf{0},...,\mathbf{0}]V_B^\top]\|\\
    &\leq \|[U_AV_A^\top,P[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]V_B^\top]\| + \sqrt{T}\epsilon_2. \tag{***} \label{p:ax3}
  \end{align*}
  
To simplify the notations, we denote that $k=H(\pi_P)$ and assume that $\pi_P$ permutes the indexes $(1,...,k)$ into $(\zeta_1,...,\zeta_k)$. Now we have
\begin{align*}
  \langle u_A^{i}, Pu_A^i \rangle &= \sum_{i=1}^k u_A^{i}(i)u_A^{i}(\zeta_i) + \sum_{i=k+1}^n (u_A^{i}(i))^2,
\end{align*} and 
\begin{align*}
  |\sum_{i=1}^k u_A^{i}(i)u_A^{i}(\zeta_i)|&\leq \sum_{i=1}^k | u_A^{i}(i)u_A^{i}(\zeta_i)|\\
  &= \sum_{i=1}^k \frac{(u_A^{i}(i))^2+(u_A^{i}(\zeta_i))^2}{2} -(\frac{(u_A^{i}(i))^2+(u_A^{i}(\zeta_i))^2}{2} - |u_A^{i}(i)u_A^{i}(\zeta_i)|)\\
  &\leq \sum_{i=1}^k (u_A^{i}(i))^2 -( \frac{(u_A^{i}(i))^2+(|u_A^{i}(i)|-\epsilon_3)^2}{2} - |u_A^{i}(i)|(|u_A^{i}(i)|+\epsilon_3))\\
  &=\sum_{i=1}^k (u_A^{i}(i))^2 - (\frac{\epsilon_3^2}{2}+2|u_A^{i}(i)|\epsilon_3)\leq \sum_{i=1}^k (u_A^{i}(i))^2- \frac{\epsilon_3^2}{2}.
\end{align*}
Hence we must have $$|\langle u_A^{i}, Pu_A^i \rangle |\leq 1-\frac{k\epsilon_3^2}{2}.$$

Therefore, we have 
\begin{align*}
    \delta (U_A,P) &\stackrel{\text{def.}}= \max_{\substack{x,y\in \Rbb^T,\\\|x\|=1,\|y\|=1}} \langle [u_A^1,...,u_A^T]x, [Pu_A^1,...,Pu_A^T]y \rangle\\
    &=\max_{\substack{x,y\in \Rbb^T,\\\|x\|=1,\|y\|=1}} \sum_{i=1}^T x(i)y(i) \langle u_A^{i}, Pu_A^i \rangle\\
    &\leq \max_{\substack{x,y\in \Rbb^T,\\\|x\|=1,\|y\|=1}} (1-\frac{k\epsilon_3^2}{2})\sum_{i=1}^T x(i)y(i) \\
    &=1-\frac{k\epsilon_3^2}{2}.
\end{align*}

Now we have, 
\begin{align*}
  &\|[U_AV_A^\top,P[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]V_B^\top]\|=\sup_{\substack{x\in\Rbb^n,\\\|x\|=1}} \|[U_AV_A^\top,P[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]V_B^\top]x\|\\
&\leq \sup_{\substack{x_1\in\Rbb^{m_A},x_2\in\Rbb^{m_B}\\\|[x_1^\top,x_2^\top]\|\leq 1}} \sqrt{1+\langle U_AV_A^\top x_1,P[u_A^1,...,u_A^T,\mathbf{0},...,\mathbf{0}]V_B^\top x_2\rangle}\\
&\leq \sup_{\substack{x_1\in\Rbb^{m_A},x_2\in\Rbb^{m_B}\\\|[x_1^\top,x_2^\top]\|\leq 1}} \sqrt{1+\delta(U_A,P)\|x_1\|\|x_2\|}\leq \sqrt{2-\frac{k\epsilon_3^2}{2}}.\tag{****}\label{p:ax4}.
\end{align*}

Combining \eqref{p:ax3} and \eqref{p:ax4}, we have 
\begin{align*}
  \|[U_AV_A^\top,PU_BV_B^\top]\|\leq \sqrt{2-\frac{k\epsilon_3^2}{2}} + \sqrt{T}\epsilon_2.
\end{align*}

The proof is similar for the case $r_B\geq r_A$.
\end{proof}

\section{Discussion on Assumption \ref{asp:asp1}}
\label{app:asp}
\textbf{When $\epsilon_1$ in Assumption 2.5 is sufficiently large:} Consider $A=\sigma_A^1 u, B=\sigma_B^1 u, u\in \mathbb{R}^n$. If $\epsilon_1 > kD$ ($k<1$), according to inequality (6), for any permutation matrix $P$, we have $|\left\|[A,PB]\right\|_*-\left\|[A,B]\right\|_*|\leq \frac{1-k}{1+k} \left\|[A,B]\right\|_*.$ Therefore, the larger the $\epsilon_1$ is, the harder to distinguish $[A,PB]$ and $[A,B]$ through nuclear norm, especially with the perturbation of additive noise.
    
\textbf{When $\epsilon_2$ in Assumption 2.6 is sufficiently large:} Consider $A = u_A\in \mathbb{R}^n$, $B=u_B\in \mathbb{R}^n$ and $\sigma=0$, where $\|u_A\|=\|u_B\|=1$. Let $\epsilon_2 = \|u_A-u_B\|$, we can obtain $
        \|[A,B]\|_*=\sqrt{2+2\sqrt{1-(1-\frac{\epsilon_2^2}{2})^2}}.$
In this case, we can see that $\|[A,B]\|_*$ is in fact an increasing function of $\epsilon_2$. Therefore, for any permutation matrix $P\in \mathcal{P}_n^{\epsilon_2}= \{S\in \mathcal{P}_n\mid \|u_A-Su_B\|\leq\epsilon_2\}$, we have $\|[A,
    PB]\|_*\leq\|[A,
    B]\|_*,$ i.e., it is impossible to recover the original matrix through nuclear norm minimization. Especially, in this case, when $\epsilon_2 = \sqrt{2}$, the set $\mathcal{P}_n^{\epsilon_2} = \mathcal{P}_n$.

    \textbf{When $\epsilon_3=0$ in Assumption 2.7:} Consider $A=B=u\in \mathbb{R}^n$ and $\sigma=0$.
    We first define $n$ sets $S(i) = \{j\mid u(i)=u(j)\}$ for $i=1,...,n.$ We let $S^* = \arg\max_{S(i)} \#|S(i)|$. For any permutation $P$ that only permutes the indexes in $S^*$ and $H(\pi_P)=\#|S^*|>0$, we have $\left\|[A,B]\right\|_* = \left\|[A,PB]\right\|_*$, i.e., it is impossible to distinguish the permuted matrix and the original matrix through nuclear norm. 


\section{Asymptotic Behavior of Proposition 2.8}
\label{app:asymp}
In this section, we discuss the asymptotic behavior ($n\to\infty$) of the error bound in Proposition 2.8. 

We start with a simple observation: Without $\epsilon_1\to 0,\epsilon_2\to 0,\sigma\to 0$, the original matrix will be impossible to recover by minimizing the nuclear norm for sufficiently large $n$. This is also reflected in the error bound of Proposition 2.8, where the right hand side of \eqref{p:error_bound} could become trivial, i.e., larger than $n$, when $n$ is sufficiently large.

We provide a simple example to validate this observation. Suppose that the original matrix is $M=[u,u]+W$, where the elements of $W$ follow $\mathcal{N}(0,\sigma^2)$ and $u\in\mathbb{R}^n$ is a random vector whose elements are i.i.d.  following the uniform distribution on $[0,1]$. Let $u_{(1)}\leq\dots\leq u_{(n)}$ denote the elements of $u$ sorted in increasing order. From  the result in \citep[p.~135]{david2004order}, we know that the largest gap between consecutive sorted elements satisfies \begin{align*}
  \Ebb{[\max_{1\leq i<n}|u_{(i+1)}-u_{(i)}|]}\approx O(n^{-1}\log(n)).
\end{align*} 
Therefore, we can construct a permutation matrix $P\in \Pc_n$ with $H(\pi_P)=n$, such that the following inequality holds with high probability,
$$
|\|[u,Pu]\|_* -\|[u,u]\|_*|\leq \|Pu-u\|_2= O(n^{-\frac{1}{2}}\log(n)).
$$


On the other hand,  from Lemma \ref{lm:tail_bound} we can know that  $\|W\|_* \approx O(\sigma n)$ with high probability. Now if we need that  $\|[u,Pu]+W\|_*>\|[u,u]+W\|_*$, we at least require that $\sigma=o(n^{-\frac{3}{2}}\log(n))$. Otherwise, it will be impossible to distinguish the matrices $[u,Pu]+W$ and $[u,u]+W$ through the value of nuclear norm.

Finally, for this simple example, we have $\epsilon_1=\epsilon_2=0$. Besides, from \citep{david2004order}, we can also know that $\epsilon_3$ is at most $O(n^{-\frac{3}{2}})$ with high probability. With a simple calculation, we can find that the error bound in Proposition 2.8 is at least $O(n^{\frac{5}{2}}\sigma^{\frac{1}{2}})$. Therefore, in this example, we at least require that $\sigma=o(n^{-5})$ to guarantee a constant error bound for arbitrary $n$.





  \section{Dual Problem of (\ref{p:entropyOT})}
  \label{app:dual}
  To simplify the notation, we denote the primal problem as
  \begin{align*}
    \underset{P\in \Pi(\mathbf{1}_n,\mathbf{1}_n)}{\text{minimize }} &\langle C,P\rangle+\epsilon \Hc(P).
  \end{align*}
  We define two dual variables $\alpha,\beta \in \mathbb{R}^{n}$. The Lagrangian function is  
  \begin{align}
    \label{p:lagrange}
    L(P,\alpha,\beta)&=
    \langle C, P\rangle+\epsilon\langle \log{P}-\mathbf{1}_{n\times n},P\rangle+\left\langle\mathbf{1}_{n}-P \mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}-P^T \mathbf{1}_{n}, \beta\right\rangle.
  \end{align}
  Now we minimize the Lagrangian function w.r.t.\ $P$ (We note that $\Hc(P)$ implicitly imposes that $P\in \mathbb{R}^{n\times n}_+$). From the first-order necessary condition of unconstrained optimization, we have
  \begin{align}
    C-&\alpha\oplus\beta+\epsilon \log(P)=0,\notag \\ 
    &\Downarrow \notag\\ 
    P=&\text{exp}\bigg\{\frac{\alpha\oplus\beta-C}{\epsilon}\bigg\}.
    \label{p:reconstruct}
  \end{align}
  Substituting it into the Lagrangian function \eqref{p:lagrange} we have the dual objective
  \begin{align*}
    q(\alpha,\beta)=\underset{P}{\min}\ L(P,\alpha,\beta)=\left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle-\epsilon \bigg\langle \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C}{\epsilon}\bigg\}\bigg\rangle.
  \end{align*}
  Therefore the dual problem is 
  \begin{align}
    \label{p:DualOT}
  \max _{\alpha,\beta \in \mathbb{R}^{n}} \left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle-\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C}{\epsilon}\bigg\}\bigg\rangle.
  \end{align}
  We can recover the primal solution $P$ from the dual solution $\alpha$, $\beta$ via \eqref{p:reconstruct}.

  \section{A Stable Implementation for Sinkhorn Algorithm}
  The Sinkhorn algorithm \citep{peyre2019computational} is often used to solve the dual problem \eqref{p:DualOT}, and its standard form reads 
  \begin{align*}
    p^{(t+1)}\leftarrow \frac{\mathbf{1}_{n}}{Kq^{(t)}}\text{  and  }q^{(t+1)}\leftarrow \frac{\mathbf{1}_{n}}{K^\top p^{(t+1)}},
  \end{align*}
  where $K=\text{exp}\bigg\{-\frac{C}{\epsilon}\bigg\}$, and $p=\exp(\frac{\alpha}{\epsilon})$, $q=\exp(\frac{\beta}{\epsilon})$. If we adopt a small $\epsilon$, the elements of $K$ can overflow to infinity or underflow to zero, which causes a numerical issue. We can remedy this by using the following log-domain implementation from \citet{peyre2019computational}:
  \begin{align*}
    \alpha^{(t+1)}&\leftarrow \text{Min}_\epsilon^{\text{row}}(C-\alpha^{(t)}\oplus\beta^{(t)})+\alpha^{(t)},\\
    \beta^{(t+1)}&\leftarrow \text{Min}_\epsilon^{\text{col}}(C-\alpha^{(t+1)}\oplus\beta^{(t)})+\beta^{(t)},
  \end{align*}
  where for any $A\in\mathbb{R}^{n\times m}$, we define the operator $\text{Min}_\epsilon^{\text{row}}$ and $\text{Min}_\epsilon^{\text{col}}$ as
  \begin{align*}
      \operatorname{Min}_{\varepsilon}^{\text {row }}(\mathbf{A}) \stackrel{\text { def. }}{=}\left(\text{min}_{\varepsilon}\mathbf{A}({i, \cdot})\right)_{i} \in \mathbb{R}^{n}, \\
      \operatorname{Min}_{\varepsilon}^{\text {col }}(\mathbf{A}) \stackrel{\text { def. }}{=}\left(\text{min} _{\varepsilon}\mathbf{A}({\cdot, j})\right)_{j} \in \mathbb{R}^{m},
  \end{align*}
  and for any vector $z=[z_1,...,z_n]^\top\in \mathbb{R}^n$, we denote $$\text{min}_\epsilon z\stackrel{\text{def.}}=\min_i z_i-\epsilon \log\sum_j e^{-(z_j-\min_i z_i)/\epsilon}$$ as the $\epsilon$-soft minimum for the elements of $z$.

  \section{Relationship between $\text{M}^3\text{O}$ and the {Soft-Impute} Algorithm}
  \label{app:link}
  {Soft-Impute} algorithm \citep{mazumder_spectral_2010} is a classical algorithm for matrix completion. Specifically, it tries to solve the nuclear norm regularized problem
  \begin{align}
    \underset{\widehat M}{\text{minimize }} \frac 12\left\|\Pc_\Omega(X)-\Pc_\Omega(\widehat M) \right\|^2_F+\lambda\left\|\widehat M\right\|_*. 
  \end{align} {Soft-Impute} is a simple iterative algorithm with the following two steps:
  \begin{align}
    \widehat X&\leftarrow \Pc_\Omega(X)+\Pc^\perp_\Omega(\widehat M),\\
    \widehat M&\leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*}(\widehat X)=U\Sc_\lambda(D)V^\top,
  \end{align} where $\widehat X=UDV^\top$ denotes the singular value decomposition of $\widehat X$, and $\Pc^\perp_\Omega$ is the operator that selects  entries whose indexes are not belonging to $\Omega$. Here $\Sc_\lambda$ is the soft-thresholding operator that operates element-wise on the diagonal matrix $D$, i.e., replacing $D_{ii}$ with $(D_{ii}-\lambda)_+$.
  
  Consider the partial observation extension. For the $\text{M}^3\text{O}$ algorithm, if an exact permutation matrix is obtained, i.e., $\widehat P=\text{exp}\bigg\{\frac{\alpha^*\oplus\beta^*-C(\widehat M_{B})}{\epsilon}\bigg\}\in \Pc_n$, it is easy to verify that the gradient in Algorithm \ref{alg:McubicO} has the following form,
  \begin{align*}
    \nabla_{\widehat M}F_\epsilon(\widehat M,\alpha^*,\beta^*)=2(\Pc_\Omega(\widehat M)-\Pc_\Omega([A,\widehat P\tilde B])).
  \end{align*}
  In this way, if we adopt $\rho_k=0.5$, the proximal gradient update becomes
  \begin{align*}
    \widehat M^{k+1} \leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*} (\Pc_\Omega([A,\widehat P\tilde B])+\Pc^\perp_\Omega(\widehat M^k)).
  \end{align*}
In practice, $\widehat P$ often becomes very close to an exact permutation matrix and the stepsize often reaches the upper bound 0.5, when the algorithm is close to convergence. In this scenario, our algorithm becomes equivalent to the Soft-Impute algorithm. Therefore, we adopt  the {Soft-Impute} algorithm as a baseline method for matrix completion without correspondence issue.

\section{$\text{M}^3\text{O}$-AS-DE for the d-correspondence Problem}
\label{app:algo}
In this section, we summarize our proposed algorithm {$\text{M}^3\text{O}$-AS-DE} for the  general d-correspondence problem \eqref{p:multiple} in Algorithm \ref{alg:M30general}.
To determine when to stop the Max-Oracle, we find that the criterion $$\frac{1}{\sqrt n}\left\|\mathbf{1}_n^\top\widehat P-\mathbf{1}_n^\top\right\|_2\leq\varepsilon$$ works well in practice, which serves as a good indicator for the $\varepsilon$-good optimality.
% \begin{algorithm}
%   
%   \SetKwFor{DoParallel}{for}{do in parallel}{end}
%   \caption{{$\text{M}^3\text{O}$-AS-DE}}
%   \label{alg:M30general}
%   \KwIn{stepsize parameter $\omega$, number of correspondence $d$, number of iterations $N$, number of tolerance steps $K$, initial entropy coefficient $\epsilon$, tolerance $\varepsilon$, observation matrix $ M_o=[A_o, B_o^1,...,B_o^d]$, initial matrix $\widehat M=[\widehat M_A,\widehat M_{B_1},...,\widehat M_{B_d}]$, nuclear norm coefficient $\lambda$, the set of observable indexes  $\Omega$.}
%   Initialize $\widehat P_\text{new}^l=\mathbf{0}_{n\times n}$ for $l=1,...,d$.
  
  

%   \For{$k=1:N$}{


% \DoParallel{$l=1:d$}{
%   $\widehat P_\text{old}^l=\widehat P_\text{new}^l$.

% $\hat \alpha^l=\hat\beta^l=\mathbf{1}_{n}$.

% Compute the partial pairwise cost matrix $C(\widehat M_{B_l})$.

% \While{$\frac{1}{\sqrt n}\left\|\mathbf{1}_n^\top\widehat P-\mathbf{1}_n^\top\right\|_2>\epsilon$}{
	
% $\hat\alpha^l\leftarrow \text{Min}_\epsilon^{\text{row}}(C(\widehat M_{B_l})-\hat\alpha^l\oplus\hat\beta^l)+\hat\alpha^l$,

% 	$\hat\beta^l\leftarrow \text{Min}_\epsilon^{\text{col}}(C(\widehat M_{B_l})-\hat\alpha^l\oplus\hat\beta^l)+\hat\beta^l$,

%   $\widehat P_\text{new}^l\leftarrow \text{exp}\bigg\{\frac{\hat\alpha^l\oplus\hat\beta^l-C(\widehat M_{B_l})}{\epsilon}\bigg\}$.

  
% }
% Compute the stepsize $\rho_l$ as discussed in Section \ref{sec:algorithm}.

% $\widehat M_{B_l} \leftarrow \widehat M_{B_l}-\rho_l\nabla_{\widehat M} F^l_\epsilon(\widehat M_{B_l},\alpha^l,\beta^l),$ where $$F^l_\epsilon(\widehat M_{B_l},\alpha,\beta)\stackrel{\text{def.}}=\left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle-\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C_\Omega(\widehat M_{B_l})}{\epsilon}\bigg\}\bigg\rangle.$$
% }
% $\widehat M_A\leftarrow \Pc_\Omega(A)+\Pc^\perp_\Omega(\widehat M_A)$.

%   $\widehat M \leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*} ([\widehat M_A,\hat M_{B_1},...,\widehat M_{B_d}])).$
  
%   \If{the objective value is not improved over $K$ steps}{
%     $\epsilon\leftarrow \epsilon/2.$
%   }{}
  
%   }

   
%   \end{algorithm}

\begin{algorithm}[tb]
  \caption{{$\text{M}^3\text{O}$-AS-DE}}
    \label{alg:M30general}
\begin{algorithmic}
  \STATE {\bfseries Input:} stepsize parameter $\omega$, number of correspondence $d$, number of iterations $N$, number of tolerance steps $K$, initial entropy coefficient $\epsilon$, tolerance $\varepsilon$, observation matrix $ M_o=[A_o, B_o^1,...,B_o^d]$, initial matrix $\widehat M=[\widehat M_A,\widehat M_{B_1},...,\widehat M_{B_d}]$, nuclear norm coefficient $\lambda$, the set of observable indexes  $\Omega$.
  \STATE{Initialize $\widehat P_\text{new}^l=\mathbf{0}_{n\times n}$ for $l=1,...,d$.}
  \FOR{$k=1:N$}
  \FOR{$l=1:d$ {\bfseries in parallel}}
 \STATE $\widehat P_\text{old}^l=\widehat P_\text{new}^l$. 
 \STATE $\hat \alpha^l=\hat\beta^l=\mathbf{1}_{n}$.
 \STATE Compute the partial pairwise cost matrix $C(\widehat M_{B_l})$.
 \REPEAT
 \STATE{$\hat\alpha^l\leftarrow \text{Min}_\epsilon^{\text{row}}(C(\widehat M_{B_l})-\hat\alpha^l\oplus\hat\beta^l)+\hat\alpha^l$.}
 \STATE{$\hat\beta^l\leftarrow \text{Min}_\epsilon^{\text{col}}(C(\widehat M_{B_l})-\hat\alpha^l\oplus\hat\beta^l)+\hat\beta^l$.}
 \STATE{$\widehat P_\text{new}^l\leftarrow \text{exp}\bigg\{\frac{\hat\alpha^l\oplus\hat\beta^l-C(\widehat M_{B_l})}{\epsilon}\bigg\}$.}
 \UNTIL{$\frac{1}{\sqrt n}\left\|\mathbf{1}_n^\top\widehat P_\text{new}^l-\mathbf{1}_n^\top\right\|_2\leq\varepsilon$}
 \STATE{Compute the stepsize $\rho_l$ as discussed in Section \ref{sec:algorithm}.}
 \STATE{$\widehat M_{B_l} \leftarrow \widehat M_{B_l}-\rho_l\nabla_{\widehat M} F^l_\epsilon(\widehat M_{B_l},\alpha^l,\beta^l),$ where $$F^l_\epsilon(\widehat M_{B_l},\alpha,\beta)\stackrel{\text{def.}}=\left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle-\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C_\Omega(\widehat M_{B_l})}{\epsilon}\bigg\}\bigg\rangle.$$}
  \ENDFOR
  \STATE{$\widehat M_A\leftarrow \Pc_\Omega(A)+\Pc^\perp_\Omega(\widehat M_A)$.}
  \STATE{$\widehat M \leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*} ([\widehat M_A,\widehat M_{B_1},...,\widehat M_{B_d}]).$}
  \IF{the objective value is not improved over $K$ steps}
 \STATE $\epsilon\leftarrow \epsilon/2.$
  \ENDIF
  \ENDFOR
\end{algorithmic}
\end{algorithm}


  \section{The Baseline Algorithm}
  \label{app:BCD}
  We also extend the Baseline algorithm to a similar d-correspondence problem as \eqref{p:multiple}. Specifically, the extended Baseline algorithm tries to solve the unsmoothed problem  \begin{align}
    \label{p:multiplt_BCD}
    \min_{\widehat M}\min_{P_1,...,P_d}&\left\|\Pc_{\Omega}(A_o) - \Pc_{\Omega}(\widehat M_A)\right\|_F^2+\sum_{l=1}^{d}\langle C(\widehat M_{B_l}), P_l \rangle+\lambda\left\|\widehat M\right\|_*,\\
    &\text{s.t. }P_l \in \Pc_n,\text{ for }l=1,...,d.\notag
  \end{align}  We summarize the  algorithm in Algorithm \ref{alg:BCD}. 

    \begin{algorithm}[tb]
      \caption{Baseline}\label{alg:BCD}
   \begin{algorithmic}
      \STATE {\bfseries Input:} number of iterations $N$, number of Proximal Gradient iterations $N_p$, tolerance $\varepsilon$, observation matrix $ M_o=[A_o, B_o^1,...,B_o^d]$, initial matrix $\widehat M=[\widehat M_A,\widehat M_{B_1},...,\widehat M_{B_d}]$, nuclear norm coefficient $\lambda$, partial observation operator $\Pc_\Omega$.
      \FOR{$k=1:N$}
      \FOR{$l=1:d$ {\bfseries in parallel}}
     \STATE Solving the inner problem of (\ref{p:multiplt_BCD}) for $\hat P^l$ up to tolerance $\varepsilon$ via Hungarian algorithm. 
      \ENDFOR
      \STATE{$X\leftarrow [A_o,\hat P^1 B_o^1,...,\hat P^dB_o^d]$.}
      \FOR{$i=1:N_p$}
     \STATE $\hat X\leftarrow \Pc_\Omega(X)+\Pc^\perp_\Omega(\hat M)$.
     \STATE $\hat M\leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*}(\hat X)$.
      \ENDFOR
      \ENDFOR
   \end{algorithmic}
   \end{algorithm}


    \section{The MUS Algorithm}
    \label{app:adapt}
    In this section, we provide details for the MUS algorithm discussed in Section~\ref{sec:experiment}. Inspired by \citet{yao2021unlabeled}, we first transform the MRUC problem, i.e., recovering $[A,B]$ from $[A,\tilde PB]$, into a MUS problem as follows,
    \begin{align}
      \label{p:MUSadapt}
      \min_{P\in\Pc_n,W\in\Rbb^{m_B\times m_A}} \|A-P\tilde PBW\|_F^2.
    \end{align}
    Then, for the scenario without multiple correspondence and missing values, we adopt the algorithm in \citep{zhang2020optimal} to solve \eqref{p:MUSadapt}.

    To extend it to the d-correspondence problem considered in \eqref{p:multiple}, we adopt two simple procedures. Specifically, to deal with the missing values, we first fill in the missing entries of each submatrix using the Soft-Impute algorithm. As for the multiple correspondence issue, we simply run the MUS algorithm multiple times. For example, if we want to solve the d-correspondence problem, we typically apply the MUS algorithm to the following series of problems in turn,
    \begin{align*}
      \min_{P\in\Pc_n,W\in\Rbb^{m_B\times m_A}} \|A_o-PB_o^lW\|_F^2,\ l=1,...,d.
    \end{align*}

    \section{Discussion on US, MUS and MRUC}
    \label{app:discuss}
    In this section, we discuss the differences and similarities among the US problem, the MUS problem, and our MRUC problem. Specifically, we wish to answer the following question:
    \begin{itemize}
      \item Why are MUS algorithms, like the one in \citep{zhang2020optimal}, more suitable to be adapted for our MRUC problem than US algorithms like AIEM \citep{tsakiris2020algebraic} and CCV-Min \citep{peng2020linear} that are adopted by \citep{yao2021unlabeled}?
    \end{itemize}

For this question, we  note that the MUS problem \eqref{p:MUS} can be solved by US algorithms, because we can treat it as $m_1$ independent US problems just as what \citep{yao2021unlabeled} did. In this way, we can view the key difference between our adapted MUS algorithm and the method proposed by \citep{yao2021unlabeled} as whether to leverage the prior knowledge that multiple response vectors are shuffled by the same permutation, i.e., to recover the permutation for $m_1$ responses jointly or independently. Theoretically, it has been well studied in the works \citep{zhang2020optimal,pananjady2017denoising,slawski2020sparse,slawski2020two} that one can resist stronger noise and estimate the ground-truth permutation better if we know that more columns are shuffled by the same permutation. We remark that this phenomenon is not a contradiction to the experiment results in \citep{yao2021unlabeled}, as they only reported the residual error for vector recovery instead of permutation recovery. 

We also conduct our own experiment to corroborate our previous discussion. We generate the synthetic matrix $M_o=[A,\tilde P B]$ in the same way as in the experiment in Figure~\ref{fig:exp1}. Here we use the full matrix $M_o$, i.e., no missing values, and hence the MRUC problem is now barely distinguishable from the MUS problem. We use the following three kinds of algorithms for comparison:
\begin{enumerate}
  \item MRUC: Our proposed algorithm M$^3$O.
  \item US: CCV-min algorithm\footnote{\url{https://github.com/liangzu/CCVMIN}.} used in \citep{yao2021unlabeled}, which is shown to be the state-of-the-art US algorithm.
  \item MUS: The algorithm in \citep{zhang2020optimal}.
\end{enumerate}

In this experiment, we also propose improved versions of the US algorithm and the MUS algorithm, by replacing their inputs $A$ and $\tilde PB$ with their top five left singular vectors  $U_A$ and $U_{\tilde PB}$. This process can be viewed as a simple version of the first step subspace learning in \citep{yao2021unlabeled}. For the US algorithm, we  run it for each column of $\tilde PB$ independently. We provide the result by varying the sparsity of $\tilde P$, i.e., $H(\pi_{\tilde P})$, and report the permutation recovery statistics $d_H(\hat \pi,\pi^*)$, where $\hat \pi$ is the recovered permutation and $\pi^*$ is the ground-truth permutation, in Figure~\ref{fig:discuss1}. Besides, we also report the residual error for the US algorithm, i.e., $$\text{residual error}=\frac{\|\hat P B-B\|_F^2}{\|B\|_F^2}$$ where $\hat P$ denotes the recovered permutation matrix, in Figure~\ref{fig:discuss2}. Notably, these results verify our discussion that, although the US algorithm can perform well in vector recovery (achieving roughly 0.001 residual error on average), it is extremely inferior when it comes to permutation recovery.

\begin{figure}[htbp]
  \centering
  \subfigure[Permutation error]{
    \label{fig:discuss1}
  \includegraphics[width=7cm]{figures/discuss1.jpg}
  %\caption{fig1}
  }
  \subfigure[Residual error]{
    \label{fig:discuss2}
  \includegraphics[width=7cm]{figures/discuss2.jpg}
  }
  \caption{ Performance of MRUC, MUS and US algorithms on a simulated 1-correspondence problem without missing values.}
  \label{fig:discuss}
  \end{figure}



% We can use an informal example to explain this subtle phenomenon: Given a vector $a\in \Rbb^n$ and the permuted observation $\tilde a=\tilde Pa+\text{noise}$ where $\tilde P\in\Pc_n$, for a constant $\varepsilon>0$, the feasible set $\{P\mid \|P\tilde a-a\|\leq \varepsilon\}$ could have more than one element. Suppose that we know there is another vector $b\in \Rbb^n$ which is permuted by the same $\tilde P$, i.e., $\tilde b=\tilde Pb+\text{noise}$, we can narrow the feasible set by considering the set $\{P\mid \|P\tilde a-a\|\leq \varepsilon,\|P\tilde b-b\|\leq \varepsilon\}$ instead.


\section{Details for the experiments}
\label{app:exp}
We use Matlab 2020b for the numerical experiments. The computer environment consists of Intel i9-10920x for CPU and 32GB RAM.
\subsection{Hyperparameters setting}
\textbf{Simulated data. }
We adopt fixed nuclear norm coefficient $\lambda$ in the experiments on simulated data. Specifically, for each setting, we choose the best $\lambda$ out of three candidate values that are 0.4, 0.5 and 0.6. Since adopting large $\omega$ will preserve the final performance and only degrade the convergence speed, we take $\omega=3$ for all the experiments. For the tolerance of Sinkhorn algorithm, we take $\varepsilon=0.01$ for all the experiments.


\textbf{MovieLens 100K. } For all the algorithms, we adopt a sequence of values for $\lambda$. Specifically, we start the algorithm with $\lambda=300$, and once the algorithm stops improving the objective function for 10 steps, we shrink the value as $\lambda\leftarrow \lambda-10$ until $\lambda$ becomes lower than 10. We take $\omega=0.5$ for all the experiments and also set the tolerance of Sinkhorn algorithm as $\varepsilon=0.01$. 

\subsection{Phase transition with different initializations.} \label{app:initialization}
In this section, we conduct a simple experiment to explore the sensitivity of M$^3$O w.r.t.\ initialization by varying the distance between the initialization and the ground-truth matrix. We could expect that the variance of the performance of M$^3$O  should decrease as the distance decreases.

We generate different initializations in the following way: We first generate two matrices $M$ and $W$ independently following the way described in Section \ref{sec:experiment1}, and we employ $M$ as the ground-truth matrix. Then, we generate the initialization for M$^3$O as  \begin{align*}
  \hat M = \Lambda M+(1-\Lambda)W,
\end{align*} where $\Lambda\in (0,1)$ is a coefficient designed for controlling  the distance between initialization and the ground-truth matrix.

Figure~\ref{fig:distance} shows a phase transition phenomenon for the M$^3$O algorithm w.r.t.\ the coefficient $\Lambda$, which is well aligned with our expectation.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=6cm]{figures/distance.jpg}
  \caption{ A phase transition phenomenon for the M$^3$O algorithm w.r.t.\ the distance between the initialization and the ground-truth matrix. The experiment is conducted on a 1-correspondence problem, with $|\Omega|\cdot 100\%/(n\cdot m)=80\%$, $\eta=0.1$, $n=m=100$, $r=5$, $m_A=60$,  and $m_1=40$. The mean  with minimum and maximum are calculated from 10 different random initializations.}
  \label{fig:distance}
  \end{figure}

\subsection{Numbers of Sinkhorn Iteration}
Typically, the number of Sinkhorn iterations required to retrieve an $\varepsilon$-good solution mainly depends on the entropy coefficient $\epsilon$. This also implies that the decaying entropy regularization strategy can also accelerate the convergence process. Figure~\ref{fig:niter} shows the relationship between the number of Sinkhorn iterations and the entropy coefficient $\epsilon$ under the same simulated data setting as in Figure~\ref{fig:exp1}. The dashed lines and intervals reflect the mean, minimum, and maximum aggregated from 20 independent trials. For a practical implementation, we restrict the maximum number of Sinkhorn iterations to 10000 in the numerical experiments.
\begin{figure}[htbp]
  \centering
  \includegraphics[width=6cm]{figures/niter.jpg}
  \caption{The required number of Sinkhorn iterations vs.\ the entropy coefficient $\epsilon$.}
  \label{fig:niter}
\end{figure}

\subsection{Problem formulation for the face recovery problem}

We show that  M$^3$O  is flexible and can also be used to recover a matrix that is not in the form $[A,PB]$. We can see this from the problem formulation in \eqref{p:rewrite}, where the cost matrix $C(\cdot)$ can be constructed in other ways as long as it is a function of a permutation. Typically, M$^3$O  can be used to solve a challenging face image recovery problem. The original face image with size $180\times 180$ in Figure~\ref{fig:real_face} comes from the Extended Yale B database \citep{GeBeKr01}. The corrupted image is visualized in Figure~\ref{fig:perm_face}, where the  pixel blocks with size $30\times 30$ in the upper left are shuffled randomly, and $30\%$ of the total pixels are removed. 
       This experiment setting is similar to that in  \citep{yao2021unlabeled} but the algorithm in \citep{yao2021unlabeled} cannot be applied since it cannot work with the missing values. The  MUS algorithm is also not applicable since this problem cannot be written in the form of a linear regression problem. From Figures~\ref{fig:base_face} and~\ref{fig:Mcubic_face} we can find that $\text{M}^3\text{O}$ performs better than the Baseline, and can even recover the original order of the pixel blocks. 


In the face recovery experiment, the cost matrix $C$ is constructed as $$C(i,j)=\|P_{\Omega}(B(i)-\widehat{M}(j))\|_F^2,$$ where $B(1),...,B(13)\in\Rbb^{30\times 30}$ are the shuffled pixel blocks from the upper left of the corrupted image shown in Figure \ref{fig:perm_face}, and $\widehat{M}(1),...,\widehat{M}(13)\in\Rbb^{30\times 30}$ are the corresponding recovered pixel blocks from the upper left of the current recovered image. 

We choose a fixed stepsize $\rho_k=0.1$, and choose the initial entropy coefficient as $\epsilon=100$. To obtain the initial matrix $\widehat M$, we first complete each pixel block independently using the Soft-Impute algorithm. We denote the filled matrix as $M_1$, and carry out its singular value decomposition as $M_1=\sum_i \sigma_i u_i v_i^\top$. Then we set the initial matrix as $\widehat M=\sigma_1 u_1 v_1^\top$. 

More results similar to Figure \ref{fig:face} are shown in Figure \ref{fig:moreface}.

\begin{figure}[htbp]
  \centering
  \subfigure[ Original]{
  \includegraphics[width=3cm]{figures/real_face1.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[Corrupted]{
  \includegraphics[width=3cm]{figures/perm_face1.jpg}
  }
  \quad
  \centering
  \subfigure[Baseline]{
  \includegraphics[width=3cm]{figures/base_face1.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[$\text{M}^3\text{O}$]{
  \includegraphics[width=3cm]{figures/Mcubic_face1.jpg}
  }
  \quad
  \centering
  \subfigure[ Original]{
  \includegraphics[width=3cm]{figures/real_face2.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[Corrupted]{
  \includegraphics[width=3cm]{figures/perm_face2.jpg}
  }
  \quad
  \centering
  \subfigure[Baseline]{
  \includegraphics[width=3cm]{figures/base_face2.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[$\text{M}^3\text{O}$]{
  \includegraphics[width=3cm]{figures/Mcubic_face2.jpg}
  }
  \quad
  \centering
  \subfigure[ Original]{
  \includegraphics[width=3cm]{figures/real_face3.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[Corrupted]{
  \includegraphics[width=3cm]{figures/perm_face3.jpg}
  }
  \quad
  \centering
  \subfigure[Baseline]{
  \includegraphics[width=3cm]{figures/base_face3.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[$\text{M}^3\text{O}$]{
  \includegraphics[width=3cm]{figures/Mcubic_face3.jpg}
  }
  \caption{ Performance of {{M}$^3$O} on more face images from Yale B database.}
  \label{fig:moreface}
  \end{figure}

\nocite{*}
\bibliography{example_paper}

\end{document}
