\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{bm}  
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{wrapfig}
\usepackage{enumitem}
\usepackage{algorithm,algorithmic}
\input{math_command.tex}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\usepackage{natbib}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Low-Rank Matrix Recovery with Unknown Correspondence}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Zhiwei Tang}{}}
\author[1]{Tsung-Hui Chang}
\author[2]{Xiaojing Ye}
\author[1]{Hongyuan Zha}
% Add affiliations after the authors
\affil[1]{%
    The Chinese University of Hong Kong, Shenzhen
}
\affil[2]{%
    Georgia State University
}

  \begin{document}
\maketitle

\begin{abstract}
  We study a matrix recovery problem with unknown correspondence: given the observation matrix $M_o=[A,\tilde P B]$, where $\tilde P$ is an unknown permutation matrix, we aim to recover the underlying matrix $M=[A,B]$. Such a problem commonly arises in many applications where heterogeneous data are utilized and the correspondence among them is unknown, e.g., due to data mishandling or privacy concerns. We show that, in some applications, it is possible to recover $M$ via solving a nuclear norm minimization problem. Moreover, under a proper low-rank condition on $M$, we derive a non-asymptotic error bound for the recovery of $M$. We propose an algorithm, $\text{M}^3\text{O}$ (Matrix recovery via Min-Max Optimization), which recasts this combinatorial problem as a continuous minimax optimization problem and solves it by proximal gradient with a Max-Oracle. $\text{M}^3\text{O}$ can also be applied to a more general scenario where we have missing entries in $M_o$ and multiple groups of data with distinct unknown correspondence. Experiments on  simulated data, the MovieLens 100K dataset and Yale B database show that $\text{M}^3\text{O}$ achieves state-of-the-art performance over several baselines and can recover the ground-truth correspondence with high accuracy. The code is provided in \url{https://github.com/TZW1998/MRUC}.
\end{abstract}


\section{Introduction}
\label{sec:intro}
In the era of big data, one usually needs to utilize data gathered from multiple disparate platforms when accomplishing a specific task. However, the correspondence among the data samples from these different sources is often unknown or noisy, due to either missing identity information or privacy reasons 
\citep{unnikrishnan2018unlabeled,gruteser2003privacy,das_sample--sample_2018}.
 Examples include the record linkage problem \citep{chan2001file}, the federated recommender system \citep{yang2020federated} and the vertical federated learning \citep{nock2021impact}. Consider the simplest scenario, we have two data matrices $A=[a_1,...,a_n ]^\top$, $B=[b_1,...,b_n ]^\top$ with $a_i\in \mathbb{R}^{m_A}$ and $b_i\in \mathbb{R}^{m_B}$, which are from two different platforms (data sources). As discussed above, the  correspondence $(a_i,b_i)$ may not be available, and thereby the goal is to recover the underlying correspondence between $a_1,...,a_n$ and $b_{\tilde\pi{(1)}},...,b_{\tilde\pi{(n)}}$, where $\tilde\pi(\cdot)$ denotes an unknown permutation. We can translate such problem described above as a matrix recovery problem, i.e., to recover the matrix $M=[A,B]$ from the permuted observation $ M_o=[A,\tilde P B]$, where $\tilde P\in\Pc_n$ is an unknown permutation matrix and $\Pc_n $ denotes the set of all $n\times n$ permutation matrices. We term this problem as \textbf{M}atrix \textbf{R}ecovery with \textbf{U}nknown \textbf{C}orrespondence (\textbf{MRUC}). Inspired by the classical low-rank model for matrix recovery \citep{Wright-Ma-2021,mazumder_spectral_2010,hastie_matrix_2015}, we especially focus on the scenario where the matrix $M$ features a certain low-rank structure. Such low-rank model has achieved great success in many applications like the recommender system \citep{schafer2007collaborative,mazumder_spectral_2010} and the image recovery and alignment problem \citep{zeng_finding_2012,zhou_multi-image_2015}. By denoting $B_o=\tilde PB$, we want to solve the following rank minimization problem for MRUC, 
\begin{align}
  \label{p:minimize_rank}
  \underset{ P\in \Pc_n}{\text{min }} \text{rank}([A, P B_o]).
\end{align} 

\textbf{Applications.} The major application of MRUC problem is related to Vertical Federated Learning (VFL) \citep{kairouz2021advances}, which aims at learning from feature partitioned data. This work specifically considers Recommender System (RS) in the context of VFL. One classical work on this problem is the multi-domain recommender system considered in \citep{zhang2012multi}. Unfortunately, they neglect a crucial issue that data from these diverse platforms (or domains) are not always well aligned for two primary reasons. The first  is that the correspondence information could be noisy due to mishandle in data processing. The other is that those platforms may  not be allowed to share the true linkage information for preserving privacy. As the first step to address these issues, in this work, we study RS in an extreme setting of VFL, i.e., no correspondence information is provided.  Another application is the Visual Permutation Learning problem \citep{santa2017deeppermnet}, where one needs to recover the original image from the {\it shuffled} pixels. Though less practical, this problem is  still interesting to know under what structure in data one can guarantee a successful recovery.  Both of the two applications give rise to a challenging extension of the MRUC problem, where we not only need to recover multiple correspondence across different data sources, but also face the difficulty of dealing with the missing values in data matrix.


\textbf{Unlabeled Sensing.} One similarly motivated problem is the  \textbf{U}nlabeled \textbf{S}ensing (\textbf{US}) problem considered by \citep{unnikrishnan2018unlabeled,pananjady2017denoising,tsakiris2020algebraic,peng2020linear,tsakiris2019homomorphic,slawski2021pseudo,xie_hypergradient_2020}. Especially, as discussed in Appendix \ref{app:discuss},
the MRUC problem is closely related to the \textbf{M}ultivariate \textbf{U}nlabeled \textbf{S}ensing (\textbf{MUS}) problem, which has been studied in \citep{zhang2019benefits,zhang2019permutation,zhang2020optimal,slawski2020sparse,slawski2020two}. Specifically, the MUS is the multivariate linear {\it regression} problem with unknown correspondence, i.e., it solves \begin{align}
  \label{p:MUS}
  \min_{P\in\Pc_n,W\in \Rbb^{m_2\times m_1}} \|Y-PXW\|_F^2,
\end{align}

where $W\in\Rbb^{m_2\times m_1}$ is the regression coefficient matrix, $Y\in \Rbb^{n\times m_1}$ and $X\in \Rbb^{n\times m_2}$ denote the output and the permuted input respectively, and  $\left\|\cdot\right\|_F$ is the matrix Frobenius norm. When $m_1=1$, the MUS problem reduces to a US problem.  Despite the similarity to the MUS problem, we remark that the MRUC problem  has its own distinct features and, as shown in Section \ref{sec:experiment},  the algorithm for the MUS problem cannot be directly and effectively applied, especially when there are multiple unknown correspondences and missing entries to be considered. 


\textbf{Related works. }
To the best of our knowledge, the concurrent and independent work \citep{yao2021unlabeled} is the only work that also considers the MRUC problem. Theoretically, \citep{yao2021unlabeled} showed that there exists an non-empty open subset $U\subseteq \Rbb^{n\times (m_1+m_2)}$, such that $\forall M\in U$, solving \eqref{p:minimize_rank} is bound to recover the original correspondence. However, such results only prove its existence for the subset $U$ and do not provide a concrete characterization.  Regarding the algorithm design, \citep{yao2021unlabeled} first learn a robust subspace following the idea of \citep{slawski2020sparse,slawski2020two}, and then solves  problem \eqref{p:minimize_rank} heuristically as multiple independent US problems using algorithms from \citep{tsakiris2020algebraic,peng2020linear}. However, there are two main drawbacks in  their algorithm that largely limit its practical value. First, as discussed in Appendix \ref{app:discuss} and Remark 8, it ignores the interaction among the shuffled columns and hence can not recover the permutation correctly. Second, their method can not deal with data with missing values. Another recent paper \citep{nock2021impact} also shares a similar concern with ours on how correspondence information can affect VFL, though in a different context.

\textbf{Contributions of this work. } 
 Our contributions in this work lie in both theoretical and practical aspects. Theoretically, we are the first to rigorously study how the rank of the data matrix is perturbed by the permutation, and show that   problem \eqref{p:minimize_rank} can be used to recover a generic low-rank random matrix almost surely. Besides, we propose a nuclear norm minimization problem as a surrogate for  problem \eqref{p:minimize_rank}, and is also the first to study the property of nuclear norm under permutation. Practically, we propose an efficient algorithm $\text{M}^3\text{O}$ that solves the nuclear norm minimization problem, which overcomes the aforementioned two shortcomings in \citep{yao2021unlabeled}. Notably, $\text{M}^3\text{O}$ works very well even for an extremely difficult task, where we need to recover multiple unknown correspondence from the data that are densely permuted and contain missing values. We remark that this is so far a challenging problem unexplored in the existing literature. Based on these findings, we also reach a novel and important observation for VFL: {\it Even without any data linkage information, it is still possible for each participant/platform to benefit from VFL.} 

\textbf{Outline. }
 We  start with building the theoretical understanding for the problem \eqref{p:minimize_rank} and its convex relaxation in Section \ref{sec:low-rank}. Then, based on the theoretical intuition obtained  from Section \ref{sec:low-rank}, we develop an efficient algorithm in Section \ref{sec:algorithm} for the most complicated scenario. The simulation results are presented in Section \ref{sec:experiment} and the conclusions are drawn in Section \ref{sec:discussion}.

  \textbf{Notations.} 
 Given two matrices $X,Y\in\mathbb{R}^{n\times m}$, we denote $\langle X,Y\rangle=\sum_{i=1}^{n} \sum_{j=1}^{m} X_{ij}Y_{ij}$ as the matrix inner product. We denote $X(i)$ as the $i$th row of the matrix $X$ and $X(i,j)$ as the element at  the $i$th row and the $j$th column. We denote $\mathbf{1}_m\in\mathbb{R}^m$  and $\mathbf{1}_{n\times m}\in\mathbb{R}^{n\times m}$ as the all-one vector and matrix, respectively, and $I_n$ as the $n\times n$ identity matrix. For $\alpha\in\mathbb{R}^m$, $\beta\in\mathbb{R}^n$, we define the operator $\oplus$ as $\alpha\oplus\beta=\alpha\mathbf{1}_n^\top+\mathbf{1}_m\beta^\top\in\mathbb{R}^{m\times n}$. We denote $\|\cdot\|_*$ as the nuclear norm for matrices. For vectors, we denote $\|\cdot\|_0$, $\|\cdot\|_1$ as the zero norm and 1-norm respectively.

% \subsubsection*{Acknowledgements}
% All acknowledgments go at the end of the paper, including thanks to reviewers who gave useful comments, to colleagues who contributed to the ideas, and to funding agencies and corporate sponsors that provided financial support. 
% To preserve the anonymity, please include acknowledgments \emph{only} in the camera-ready papers.

\section{Matrix Recovery via a Low-rank Model} 
\label{sec:low-rank}
In this section, we study the role of low-rank model for recovering row permutation.


\textbf{How is matrix rank  perturbed by row permutation?}
To rigorously answer this question, we first introduce the notion {\it cycle decomposition of a permutation}. 
 

 \begin{definition}[Cycle decomposition of a permutation \citep{dummit1991abstract}] Let $\Sc$ be a finite set, $\pi(\cdot)$ be a permutation on $\Sc$. A cycle $(a_1,...,a_n)$ is a permutation sending $a_j$ to $a_{j+1}$ for $1\leq j\leq n-1$ and $a_n$ to $a_1$. Then a cycle decomposition of $\pi(\cdot)$ is an expression of $\pi(\cdot)$ as a union of several disjoint cycles\footnote{Two cycles are disjoint if they do not have common elements}.
 \end{definition}
 

 It can be verified that any permutation on a finite set has a unique cycle decomposition \citep{dummit1991abstract}. Therefore, we can define the {\it cycle number} of a permutation $\pi(\cdot)$ as the number of disjoint cycles with length greater than 1, which is denoted as $\Cc(\pi)$. We also define the non-sparsity of a permutation as the Hamming distance between it and the original sequence, i.e., $H(\pi)=\sum_{s\in \Sc}\mathbb{I}[\pi(s)\neq s]$. It is obvious that $H(\pi)>\Cc(\pi)$ if $\pi$ is not the identity permutation. As a simple example, we consider the permutation $\pi(\cdot)$ that maps the sequence $(1,2,3,4,5,6)$ to $(3,1,2,5,4,6)$. Now the cycle decomposition for it is $\pi(\cdot)=(132)(45)(6)$, and $\Cc(\pi)=2$, $H(\pi)=5$. 
 

 We denote the original matrix as $M={\bm[}A,B{\bm ]}\in \Rbb^{n\times m} $  with $A\in \Rbb^{n\times m_A}$, $B\in \Rbb^{n\times m_B}$, and $r={\rm rank}(M)$, $r_A={\rm rank}(A)$, $r_B={\rm rank}(B)$. We denote the corresponding permutation as $\pi_P(\cdot)$ for any permutation matrix $P\in\Pc_n$. The following proposition says that the  perturbation effect of a permutation $\pi$ on the rank of $M$ could become stronger, if  $\pi$ permutes more rows and contains fewer cycles.
 
 \begin{proposition} \label{prop:MR} 
  For all $P\in\Pc_n$, we have
  \begin{align}
      \label{eq:MR1}
      {\rm rank}({\bm [}A,PB{\bm]})\leq \min\{n,m,r_A+r_B,r+H(\pi_P)\notag\\-\Cc(\pi_P)\}.
  \end{align} 
\end{proposition}

 A similar result for the case with multiple permutations is summarized in  Corollary \ref{col:multiple_rank} in Appendix \ref{app:proof}. It turns out that, without any further assumption on $M$, \eqref{eq:MR1} is sharp and cannot be improved. Notably, the upper bound in \eqref{eq:MR1} is attained with probability 1 for a generic low-rank random matrix.
 \begin{definition} A probability distribution on $\Rbb$ is called a proper distribution if its density function $p(\cdot)$ is absolutely continuous with respect to the Lebesgue measure on $\Rbb$.
 \end{definition}
 \begin{proposition}
   \label{prop:attained_rank}
   If the original matrix  $M$ is a random matrix with $M=RE$ where $R\in \Rbb^{n\times r}$ and $E\in\Rbb^{r\times m}$ are two random matrices whose entries are i.i.d and follow a proper distribution on $\Rbb$ , and  $r\leq \min\{\sqrt{\frac n 2},m_A,m_B\}$, then $\forall P\in\Pc_n$, the equality below holds  with probability 1.
   \begin{align}
     \label{p:attained_rank}
       {\rm rank}({\bm [}A,PB{\bm]})= \min\{2r,r+H(\pi_P)-\Cc(\pi_P)\}
   \end{align}
 \end{proposition}

 \textbf{Discussion on Proposition \ref{prop:attained_rank}.} It is worthwhile to mention that our Proposition \ref{prop:attained_rank} strengthens the Theorem 1 in \citep{yao2021unlabeled} to some extent. Specifically, \citep{yao2021unlabeled}  shows that, with probability 1, the rank of the perturbed matrix will never be lower than that of the original matrix. Compared to them, our result precisely predicts how much the rank will increase after row perturbation. Besides, Proposition \ref{prop:attained_rank} is especially favorable from the optimization perspective, as now the rank is a monotone function w.r.t the degree of perturbation.

 \textbf{Convex relaxation for the rank function.}
 Despite the previous theoretical justification for problem \eqref{p:minimize_rank}, it is non-convex and non-smooth. Another crucial issue is that we often have a noisy observation matrix and it is well known that the rank function is extremely sensitive to the additive noise. In this paper, we assume that the observation matrix is corrupted by i.i.d Gaussian additive noise, i.e.,  $$M_o=[A_o,B_o]=[A,\tilde P B]+W,\ W(i,j)\sim \Nc(0,\sigma^2),$$
 where $\sigma^2$ denotes the variance of the noise. We denote the singular values of a matrix $X\in\Rbb^{n\times m}$ as $\sigma_X^1,...,\sigma_X^k$ where $k=\min \{n,m\}$. Since  $\text{rank}(X)=\|[\sigma_X^1,...,\sigma_X^k]\|_0$, from Proposition \ref{prop:attained_rank} we can view the perturbation effect  of a permutation to a low-rank matrix as breaking the sparsity of its singular values, which leads naturally to the nuclear norm minimization problem that has been shown to be robust to additive noise and favor low-rank solution \citep{Wright-Ma-2021}, i.e., 
 \begin{align}
   \label{p:nuclear_norm}
   \min_{P\in \Pc_n} \|[A_o,PB_o]\|_*= \|[\sigma_{[A_o,PB_o]}^1,...,\sigma_{[A_o,PB_o]}^k]\|_1.
 \end{align}
 

 \textbf{Theoretical justification for the nuclear norm. }
 Nuclear norm has a long history of being used as a convex surrogate for the rank, and it has been theoretically justified for applications like low-rank matrix completion \citep{candes2010power,Wright-Ma-2021}. It is also important to see whether the nuclear norm is still a good surrogate for the rank minimization problem \eqref{p:minimize_rank}. In this work, we establish a sufficient condition on $A$ and $B$ under which  problem \eqref{p:nuclear_norm} is provably justified for correspondence recovery.  We denote  $A=\sum_{i=1}^{r_A}\sigma_A^i u_A^i v_A^{i\top},\ B=\sum_{i=1}^{r_B}\sigma_B^i u_B^i v_B^{i\top}$ as the singular value decompositions of $A$ and $B$,  where the $\sigma_A^i$ and $\sigma_B^i$ are the non-zero singular values. To derive the worst-case error bound of nuclear norm minimization,  we propose the following assumption on $M$.
 
 \begin{assumption}\label{asp:asp1} There exist constants $\epsilon_1\geq 0,\epsilon_2\geq 0,\epsilon_3\geq 0$ such that
  \begin{align}
    \label{p:cond1}&
  |\sigma_A^i-\sigma_B^i| \leq \epsilon_1,\ \forall i=1,...,r,\\
  \label{p:cond2}
  &\|u_A^i-u_B^i\|\leq \epsilon_2, \forall i=1,...,T,\\
  \label{p:cond3}
   & \min_{u\in U} \min_{i\neq j} |u(i)-u(j)|\geq \epsilon_3>0,  
  \end{align} where we denote  $\sigma_A^i=0$ if $i>r_A$, and similarly for $\sigma_B^i$, $T=\min\{r_A,r_B\}$ and $U=\{u_A^1,...,u_A^T,u_B^1,...,u_B^T\}$.
\end{assumption} 


 Here we provide some intuition behind these assumptions. Firstly, from the definition of nuclear norm, it can be simply verified for any $P\in \Pc_n$ that 
 \begin{align}
   \label{p:ineq1}
   -Z/N \leq (\|[A,PB]\|_*-\|M\|_*)/\|M\|_*\leq Z/N,
 \end{align}
 where  $N=\max\{\|A\|_*,\|B\|_*\}$ and $Z=\min\{\|A\|_*,\|B\|_*\}$. The inequality \eqref{p:ineq1} indicates that $A$ and $B$ should have comparable magnitude, i.e., $\|A\|_*\approx \|B\|_*$, otherwise the influence of the permutation will be less significant. With this observation, as depicted by \eqref{p:cond1}, we assume that the singular values of $A$ and $B$ are comparable. As for \eqref{p:cond2}, we propose it with an aim to capture the intuition that if $A$ and $B$ are data from the same group of users, the distance (in SVD sense) between $A$ and $B$ should be small, i.e., the matrix $[A, B]$ should be ``low-rank''. We would like to interpret the constant $\epsilon_2$ as a continuous measure for the low-rankness of a matrix, because it indicates that  the column space of $M$ can be approximated by the column space of one of its submatrices. Lastly, it is easy to verify that if there is a $P\in \Pc_n$ such that $u_B^i=Pu_B^i$ for all $i$, then $[A,PB]=[A,B]$. Therefore, we propose \eqref{p:cond3} to avoid this case. 
 
   
%   \begin{assumption} There exists a constant $\epsilon_2\geq 0$ such that
%    \label{asp:asp2}
%    \begin{align}
%      \label{p:cond2}
%    \|u_A^i-u_B^i\|\leq \epsilon_2, \forall i=1,...,T,
%    \end{align}
%    where we denote $T=\min\{r_A,r_B\}$.
%  \end{assumption}
 

%  \begin{assumption} There exists a constant $\epsilon_3\geq 0$ such that
%    \label{asp:asp3}
%    \begin{align}
%      \label{p:cond3}
%      \min_{u\in U} \min_{i\neq j} |u(i)-u(j)|\geq \epsilon_3>0,  
%    \end{align}where $U=\{u_A^1,...,u_A^T,u_B^1,...,u_B^T\}$.
%  \end{assumption}

\textbf{Remark 1. } Though these assumptions could be  refined, we remark that they are almost sharp. In Appendix \ref{app:asp}, we construct a few concrete counterexamples which do not satisfy these assumptions and are impossible to be recovered within meaningful accuracy by nuclear norm minimization problem.

 With these assumptions, we derive the following  result, which  provides high probability bound for the approximation error of \eqref{p:nuclear_norm}. We denote the solution to \eqref{p:nuclear_norm} as $P^*$, and let $\pi^*$ and $\tilde \pi$ be the corresponding permutation to the  permutation matrices $P^{*\top}$ and $\tilde P$, respectively.  We define the difference between the two permutations $\pi^*$ and $\tilde \pi$ as the {\it Hamming} distance 
 
 
 $$d_H(\pi^*,\tilde \pi)\stackrel{\text{def.}}=\sum_{i=1}^n\mathbb{I}(\pi^*(i)\neq \tilde \pi(i)).$$
 
 \begin{proposition}
   \label{prop:error_bound}
   Under  Assumption \ref{asp:asp1}, if additionally  $\epsilon_1\leq \frac{D}{4r}$, $\epsilon_2\leq \min\{\frac{1}{2\sqrt{2T}},\frac{\sqrt{2}D}{2N}\}$, and $\sigma \leq \frac{D}{16L^2}$, then the following bound 
   \begin{align}
    \label{p:error_bound}
    d_{H}(\pi^*,\tilde \pi)\leq & \frac{2}{\epsilon_3^2}\bigg(2-\Big({\sqrt{2}D}/\big(D+(\sqrt{2}+2)\epsilon_1r+\notag\\ &\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}\big) - \sqrt{T}\epsilon_2^2\Big)^2 \bigg)
  \end{align} holds with probability at least $1-2\exp\{-\frac{D}{8L\sigma}\}$, where $L=\max\{n,m\}$, $D=\|A\|_*+\|B\|_*$.
 \end{proposition}
    The proofs of all the aforementioned theoretical results are provided in Appendix \ref{app:proof}. 
   
   
    \begin{figure}[htbp]
      \centering 
        \includegraphics[width=8cm]{figures/monoticity_nuclear.jpg}
        \caption{The relationship between the nuclear norm and the number of permuted rows under different percentages of observable entries.}
        \label{fig:nuclearincrease}
      \end{figure}  
    \textbf{Remark 2. }From  Proposition \ref{prop:error_bound} we can see that when $\epsilon_3>0$, and $\epsilon_1\to 0$, $\epsilon_2\to 0$, $\ {\sigma}\to 0$, the error  $d_{H}(\pi^*,\tilde \pi)$ will converge to zero with probability 1. We can also discover that the correspondence can be difficult to recover when: The rank of original matrix $M$ is high; The magnitude  of $A$ and $B$ w.r.t rank or nuclear norm are not comparable; The strength of noise is high.
    Notably, the numerical experiments in Section \ref{sec:experiment1} corroborate these findings as well. Due to page limit, we refer detailed discussion and analysis on Proposition \ref{prop:error_bound} to Appendix \ref{app:asymp}.
    

   \textbf{Remark 3. }In many applications, we can only observe part of the full data. Therefore, it is worthwhile to investigate  whether nuclear norm minimization could work when we can only access a small subset of the entries in $M_o$. Notably, Figure \ref{fig:nuclearincrease} empirically gives a positive answer and shows that the ``{\it monotone relationship of nuclear norm w.r.t numbers of permuted rows}''  is gracefully degraded when the percentage of observable entries is decreasing. This phenomenon is remarkable since it indicates the original correspondence can be recovered from only part of the full data. The matrices used to generate  Figure \ref{fig:nuclearincrease} are the same as those in  Section \ref{sec:experiment1}, and the nuclear norm is computed approximately by first filling  the missing entries using the Soft-Impute algorithm \citep{mazumder_spectral_2010}.
    

     \section{Algorithm}
     \label{sec:algorithm}

     In this section, we develop an algorithm for MRUC based on the intuition obtained from Section \ref{sec:low-rank}. Moreover, we require that the algorithm can deal with the scenario with missing values, i.e.,  our observed data is $\Pc_\Omega (M_o)=\Pc_\Omega([A_o,B_o])$, where $\Pc_\Omega$ is an operator that selects  entries that are in the set of observable indices  $\Omega$.
     In this scenario,  problem \eqref{p:nuclear_norm} can not be directly used since the evaluation of the  nuclear norm and  optimization of the permutation are coupled together. Inspired by the matrix completion method \citep{hastie_matrix_2015,mazumder_spectral_2010}, we propose to solve an alternative form of \eqref{p:nuclear_norm} as follows,  
     \begin{align}
       \label{p:unrelaxed}
       \min_{\widehat M\in \Rbb^{n\times m}} \min_{P\in \Pc_n}  &\left\|\Pc_\Omega([A_o, P B_o])-\Pc_\Omega(\widehat M)\right\|_F^2+\lambda\left\|\widehat M\right\|_*,
     \end{align} 

     where $\lambda>0$ is the penalty coefficient. We denote that $\widehat M=[\widehat M_A,\widehat M_B]$ and $\widehat M_A,\widehat M_B$  are the two submatrices  with the same dimension as $A_o$ and $B_o$ respectively. We can write \eqref{p:unrelaxed} equivalently as
     \begin{align}
      \label{p:rewrite}
      \min_{\widehat M\in \Rbb^{n\times m}}\min_{P\in \Pc_n} \left\|\Pc_\Omega(A_o) -\Pc_\Omega(\widehat M_A)\right\|_F^2\notag\\+\langle C(\widehat M_B), P \rangle+\lambda\left\|\widehat M\right\|_*,
    \end{align}
     where  $C(\widehat M_B)\in\mathbb{R}^{n\times n}$ is the pairing cost matrix with      \begin{align*}
      C(\widehat M_B)(i,j)=\sum_{(j,j'')\in \Omega}\bigg(\widehat M_B(i,j'')-B_o(j,j'')\bigg)^2,\\~\forall i,j=1,...,n.
    \end{align*}

     \textbf{Baseline algorithm.}
     A conventional  strategy to handle an optimization problem like \eqref{p:rewrite} is the alternating minimization or the block coordinate descent algorithm \citep{abid2017linear}. Specifically, it executes the following two updates iteratively until it converges.
     \begin{align}
      \label{p:BCD1}
      &\widehat M^{\text{new}}\leftarrow \underset{\widehat M\in \Rbb^{n\times m}}{\arg\min}\left\|\Pc_\Omega([A_o,\widehat P^{\text{old}} B_o])-\Pc_\Omega(\widehat M)\right\|_F^2\notag\\ &\quad\quad\quad\quad\quad\quad\quad\quad\quad\quad\quad\quad\quad\quad+\lambda\left\|\widehat M\right\|_*,\\
      \label{p:BCD2}
      &\widehat P^{\text{new}}\leftarrow \underset{P\in \Pc_n}{\arg\min}\  \langle C(\widehat M^{\text{new}}_B),P\rangle.
    \end{align}
     The first update step \eqref{p:BCD1} is a convex optimization problem and can be solved by the proximal gradient algorithm \citep{mazumder_spectral_2010}. The second update step \eqref{p:BCD2} is actually a discrete optimal transport problem which can be solved by the classical Hungarian algorithm with time complexity $O(n^3)$ \citep{jonker1986improving}. However, as we will see in Section \ref{sec:experiment}, this algorithm performs poorly, and it is  likely to fall into an undesirable local solution quickly in practice. Specifically, the main reason is that the solution of \eqref{p:BCD2} is often not unique and  a small change in $\widehat M_B$ would lead to a large change of $\widehat P$.  To address this issue, we propose a novel and efficient algorithm, $\text{M}^3\text{O}$, based on the entropic optimal transport \citep{peyre2019computational} and min-max optimization \citep{jin2020local}. 
     
   
     \textbf{Smoothing the permutation with entropy regularization.}
     For any $a\in\mathbb{R}^n,b\in\mathbb{R}^m$, we define \begin{align*}
      \Pi(a,b)=\{S\in \mathbb{R}^{n\times m}:S\mathbf{1}_m=a,S^\top \mathbf{1}_n=b,\\S(i,j)\geq 0,~\forall i,j\},
    \end{align*} 
     which is known as the transportation polytope; its special case $\Pi(\mathbf{1}_n,\mathbf{1}_n)$ is the Birkhoff polytope. The famous Birkhoff-von Neumann theorem \citep{birkhoff1946three} states that the set of extremal points of $\Pi(\mathbf{1}_n,\mathbf{1}_n)$ is equal to $\Pc_n$. Inspired by \citep{xie_hypergradient_2020} and the interior point method for linear programming \citep{bertsekas1997nonlinear}, in order to smooth the optimization process of the baseline algorithm, we relax $P$ from being an exact permutation matrix, i.e., we only require $P$ to stay inside the Birkhoff polytope $\Pi(\mathbf{1}_n,\mathbf{1}_n)$. That is, we propose to replace the combinatorial problem \eqref{p:BCD2} with the following continuous optimization problem 
     \begin{align}
       \label{p:entropyOT}
       \underset{P\in \Pi(\mathbf{1}_n,\mathbf{1}_n)}{\text{min }} &\langle C(\widehat M_B),P\rangle+\epsilon \Hc(P),
     \end{align}
     where  $\Hc(P)\stackrel{\text{def.}}=\sum_{i,j}P(i,j)(\log(P(i,j))-1)$ is the matrix negative entropy and $\epsilon>0$ is the regularization coefficient. Notably, \eqref{p:entropyOT} is also known as the Entropic Optimal Transport (EOT) problem \citep{peyre2019computational}, which is a strongly convex optimization problem and can be solved  roughly in the $O(n^2)$ complexity per iteration by the Sinkhorn algorithm. Specifically, the Sinkhorn algorithm solves the dual problem of \eqref{p:entropyOT}, \begin{align}\label{EOT dual}
      \max _{\alpha,\beta \in \mathbb{R}^{n}}  W_\epsilon(\widehat M_B,\alpha,\beta)\stackrel{\text{def.}}=\left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle\notag-\\\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C(\widehat M_B)}{\epsilon}\bigg\}\bigg\rangle,
  \end{align} 
     which reduces the number of variables from $n^2$ to $2n$ and is thus highly favorable in high-dimensional settings. By substituting the inner minimization problem of \eqref{p:rewrite} with \eqref{p:entropyOT}, we end up solving the following unconstrained min-max optimization problem
     \begin{align}
       \label{p:entropydual}
       \underset{\widehat M}{\min}\underset{\alpha,\beta}{\ \max} \left\|A-\widehat M_A\right\|_F^2+W_\epsilon(\widehat M_B,\alpha,\beta)+\lambda\left\|\widehat M\right\|_*&.
     \end{align}
     Following the idea of \citet{jin2020local}, we adopt a proximal gradient algorithm with a Max-Oracle for \eqref{p:entropydual}.
     Specifically,  we employ the Sinkhorn algorithm \citep{peyre2019computational} as the  Max-Oracle to retrieve an $\varepsilon$-good solution of the inner max problem \eqref{EOT dual}.
     We summarize our proposed algorithm  $\text{M}^3\text{O}$ (\textbf{M}atrix recovery via \textbf{M}in-\textbf{M}ax \textbf{O}ptimization) in Algorithm \ref{alg:McubicO}, where $\text{prox}_{\lambda\left\|\cdot\right\|_*}(\cdot)$ is the proximal operator of nuclear norm, $\rho_k$ is the gradient stepsize and $$F_\epsilon(\widehat M,\alpha,\beta)\stackrel{\text{def.}}=\left\|A-\widehat M_A\right\|_F^2+W_\epsilon(\widehat M_B,\alpha,\beta).$$ The convergence property of $\text{M}^3\text{O}$ can be obtained by following \citep{jin2020local}, which shows that, with a decaying stepsize, $\text{M}^3\text{O}$ is bound to converge to an $\varepsilon$-good Nash equilibrium within $O(\varepsilon^{-2})$ iterations. 
     
  
    %  \begin{algorithm}
    %    
    %    \While{\rm not }{
    %      
     
    %    Perform
    %    $\widehat M^{\text{k+1}} \leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*} (\widehat M^{k}-\rho_k\nabla_{\widehat M} F_\epsilon(\widehat M^{k},\alpha^*,\beta^*)),$ where $$F_\epsilon(\widehat M,\alpha,\beta)\stackrel{\text{def.}}=\left\|A-\widehat M_A\right\|_F^2+W_\epsilon(\widehat M_B,\alpha,\beta);$$
    %     }
    %    \end{algorithm}

       \begin{algorithm}[tb]
        \caption{$\text{M}^3\text{O}$ (Simplified)}\label{alg:McubicO}
     \begin{algorithmic}
        \STATE {\bfseries Input:} tolerance $\varepsilon$, observation $M_o$, initialization $\widehat{ M}$.
        \REPEAT
        \STATE Run the Sinkhorn algorithm to find $\alpha^*$, $\beta^*$ such that  $$W_\epsilon(\widehat M_B^{k},\alpha^*,\beta^*)>\max_{\alpha,\beta}\ W_\epsilon(\widehat M_B^{k},\alpha,\beta)-\varepsilon;$$
        \STATE $\widehat M^{k+1} \leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*} (\widehat M^{k}-\rho_k\nabla_{\widehat M} F_\epsilon(\widehat M^{k},\alpha^*,\beta^*)).$ 
        \UNTIL{converged}
     \end{algorithmic}
     \end{algorithm}
     \textbf{Remark 4. }A recent work \citep{pmlr-v115-xie20b} proposes a decaying strategy for the entropy regularization coefficient $\epsilon$ in \eqref{p:entropyOT} so that the optimal solutions of \eqref{p:BCD2} and \eqref{p:entropyOT} do not deviate too much. Inspired by it, in our practice, we take large $\epsilon$ in the beginning and gradually shrink it by half whenever the objective value stops improving for $K$ steps. 
     
   
     \textbf{Remark 5. }A useful trick is that we should not take large stepsize $\rho_k$ in the early  iterations because the permutation matrix could still be far away from the optimal one. However,  a small stepsize would lead to slow convergence. Heuristically, we propose an adaptive stepsize strategy that performs  well in practice. For the   solution of (\ref{p:entropyOT}) $\widehat P_k$ at the $k$th iteration, we  compute the two statistics $$\delta_k=\left\|\widehat P_{k-1}-\widehat P_{k}\right\|_F^2/2n, \
       c_k=\left\|\text{max}_j \widehat P_k(\cdot,j)-\mathbf{1}_n\right\|_1/n.$$ 
       Here $\delta_k$ represents how fast the permutation matrix $\widehat P_k$ changes over the iterations, while $c_k$ measures how far the current $\widehat P_k$ is from an exact permutation matrix. Both $\delta_k$ and $c_k$ reflect the confidence in the currently found correspondence. Based on them, we set the stepsize as $\rho_{k+1}=(1-\delta_k)(1-c_k)^\omega,$ where $\omega>0$ is a tunable parameter which is often set to a value between 0.5 and 3. The parameter $\omega$ trades off the convergence speed and the final performance: the smaller $\omega$ is, the faster the convergence. Therefore, a practical way is to start with a small $\omega$, and gradually increase it until the final performance stops improving.
     
 
     
  
     \textbf{Remark 6. }As discussed in Section~\ref{sec:intro}, in many cases we have to deal with problems that involve multiple correspondences, i.e., we need to recover the matrix $M=[A,B_1,\dots,B_d]$ from the observation data $\Pc_\Omega(M_o)$, where
     $$M_o=[A_o,B_o^1,...,B_o^d]=[A,\tilde P_1B_1,...,\tilde P_dB_d]+W,$$
     where $\tilde P_l\in\Pc_n$ and $W$ is a noise matrix. We refer to such a problem as the $\bm{d}$\textbf{-correspondence} problem. An important observation is that, although the number of possible correspondences increases exponentially as $d$ grows, the per-iteration complexity of M$^3$O increases only linearly with $d$, and the algorithm can be implemented in a fully parallel fashion. Specifically, in this scenario, we solve the problem
     \begin{align}
      \label{p:multiple}
      \min_{\widehat M}\min_{P_1,...,P_d}&\left\|\Pc_{\Omega}(A_o) - \Pc_{\Omega}(\widehat M_A)\right\|_F^2+\sum_{l=1}^{d}\bigg\{\langle C(\widehat M_{B_l}), P_l \rangle\notag\\ &+\epsilon \Hc(P_l)\bigg\}+\lambda\left\|\widehat M\right\|_*,\\
      &\text{s.t. }P_l \in \Pi(\mathbf{1}_n,\mathbf{1}_n),\ l=1,...,d,\notag
    \end{align}
     where we denote $\widehat M=[\widehat M_A,\widehat M_{B_1},...,\widehat M_{B_d}]$. Here $\widehat M_A$ and $\widehat M_{B_l}$ have the same dimension with $A_o$ and $ B_o^l$, respectively. 
       One can find that the inner problems for solving $P_l$ are actually decoupled for each $l$, which guarantees  an efficient parallel implementation. 

     \textbf{Remark 7. }Since problem \eqref{p:unrelaxed} has a form similar to that considered in \citet{mazumder_spectral_2010}, we adopt the same tuning strategy for $\lambda$ as in \citet{mazumder_spectral_2010}, which suggests starting with a large $\lambda$ and gradually decreasing it.
     
     We relegate more details about $\text{M}^3\text{O}$ to Appendix \ref{app:algo}.

     
     \begin{figure}[htbp] 
        \subfigure[Objective value]{
          \label{fig:exp1_Obj}
        \includegraphics[width=7cm]{figures/exp1_Obj.jpg}
        %\caption{fig1}
        }
        \subfigure[Permutation error]{
          \label{fig:exp1_Perr}
        \includegraphics[width=7.5cm]{figures/exp1_Perr.jpg}
        }
        \caption{ Performance of various algorithms on a simulated 1-correspondence problem.}\label{fig:exp1}
        \label{fig:multi_algo}
        \end{figure}
     \section{Experiments}
     \label{sec:experiment}

     In this section, we evaluate our proposed $\text{M}^3\text{O}$ on both synthetic and real-world datasets, including the MovieLens 100K and the Extended Yale B dataset. We also provide an ablation study for the decaying entropy regularization strategy and the  adaptive stepsize strategy proposed in Remarks 4 and 5. In all the experiments, we employ the Soft-Impute algorithm \citep{mazumder_spectral_2010} as a standard algorithm for matrix completion.   Extra experiment details and auxiliary results can be
     found in Appendix \ref{app:exp}.
     

     \textbf{Algorithms.} We denote the following algorithms for comparison in all the experiments:

      \textit{1. Oracle}:  Running the Soft-Impute algorithm  with ground-truth correspondence.
       

        \textit{2. Baseline}: The Baseline algorithm in \eqref{p:BCD1} and \eqref{p:BCD2}.
     

      \textit{3. MUS}: Since there is currently no existing algorithm directly applicable to the scenario considered by \eqref{p:multiple},  we modify and extend the algorithm in \citep{zhang2020optimal}, which is originally proposed for the MUS problem, to deal with the MRUC problem. The details of the adapted algorithm are provided in Appendix \ref{app:adapt}.
     
       
     \textbf{Remark 8. } As discussed in \citep{pananjady2017denoising}, leveraging the prior knowledge that multiple columns are shuffled by the same permutation is generally helpful for permutation recovery. This is why we only adopt the MUS algorithm in \citep{zhang2020optimal} instead of those US algorithms considered by \citep{yao2021unlabeled} for comparison. For a more serious and experimental discussion, we refer readers to Appendix \ref{app:discuss}. 
     \subsection{Synthetic data}
     \label{sec:experiment1}

     We first investigate the property of our proposed $\text{M}^3\text{O}$  algorithm on the synthetic data.

    
     \textbf{Data generation.}
     We generate the original data matrix in the form $M=R E+\eta W$, where $R\in\mathbb{R}^{n\times r}$, $E \in \mathbb{R}^{r\times m}$, $W\in \mathbb{R}^{n\times m}$ and $\eta >0$ indicates the strength of the additive noise. The entries of $R$, $E$, $W$ are all sampled i.i.d.\ from $\Nc (0,1)$. Then we split the data matrix $M$ as $M=[A,B_1,\dots,B_d]$, where $A\in \mathbb{R}^{n\times m_A}$, $B_1\in \mathbb{R}^{n\times m_1}$, \dots, $B_d\in \mathbb{R}^{n\times m_d}$ represent data from $d+1$ data sources. The permuted observation matrix $M_o$ is obtained by first generating $d$ permutation matrices $P_1,\dots,P_d$ randomly and independently, and then computing $ M_o=[A, P_1  B_1,\dots, P_d  B_d]$. Finally, we remove $(1-|\Omega|/(n\cdot m))\cdot 100\%$ of the entries of $M_o$ uniformly at random, where $|\Omega|$ denotes the number of observed entries.
     

       \textbf{Ablation study.} We denote the following variants of $\text{M}^3\text{O}$ for the ablation study.
     
      
       
          \textit{1. $\text{M}^3\text{O}$-AS-DE}:  $\text{M}^3\text{O}$  with both {A}daptive {S}tepsize and {D}ecaying {E}ntropy regularization.

         \textit{2. $\text{M}^3\text{O}$-DE}: $\text{M}^3\text{O}$ with Decaying Entropy regularization only. $\text{M}^3\text{O}$-DE-1 and $\text{M}^3\text{O}$-DE-2 adopt constant stepsize $\rho_k=0.5$ and $\rho_k=0.01$, respectively.


        \textit{3. $\text{M}^3\text{O}$-AS}: $\text{M}^3\text{O}$  with {A}daptive {S}tepsize only. The entropy coefficient $\epsilon$ is fixed to 0.0005.
     
     
       In the following results, we denote $\pi_l$ as the corresponding permutation to $ P_l$.  We initialize $\widehat M$ from Gaussian distribution for the $\text{M}^3\text{O}$ algorithm and its variants. We choose initial $\epsilon$ as 0.1 and $K=100$ as the default for the decaying entropy regularization, and set $\omega=3$ as the default for the adaptive stepsize. We also report  the achieved objective values of \eqref{p:multiple} for the tested algorithms, except for the MUS algorithm since it has a different objective. We denote $\hat \pi$ as the recovered permutation.
     
       
      
       \textbf{Results.}   Figure~\ref{fig:multi_algo} displays the result under the setting  $\eta=0.1$, $|\Omega|\cdot 100\%/(n\cdot m)=80\%$, $n=m=100$, $r=5$, $d=1$, $m_A=60$ and $m_1=40$.  The algorithm {M}$^3$O-AS-DE achieves the best result, and can recover the ground-truth correspondence. {M}$^3$O-AS behaves similarly to Baseline and MUS: they all converge quickly to a poor local solution. {M}$^3$O-DE-1 converges quickly but also falls into a poor local solution due to its large stepsize, while {M}$^3$O-DE-2 adopts a small stepsize and hence suffers from slow convergence. Due to the superiority of {M}$^3$O-AS-DE over the other variants, in the following results, we refer to {M}$^3$O-AS-DE as {M}$^3$O for short.
       
       \begin{figure}[htbp]
        \centering
        \subfigure[ $d_H$ v.s. $|\Omega|$]{
          \label{fig:exp2_Obj}
        \includegraphics[width=6cm]{figures/exp2_Perr.jpg}
        %\caption{fig1}
        }
        \subfigure[ $d_H$ v.s. $\eta$]{
          \label{fig:exp3_Perr}
        \includegraphics[width=6cm]{figures/exp3_Perr.jpg}
        }
        \quad
        \centering
        \subfigure[ $d_H$ v.s. $r$]{
          \label{fig:exp3_r}
        \includegraphics[width=6cm]{figures/exp6_Perr.jpg}
        %\caption{fig1}
        }
        \quad
        \subfigure[ $d_H$ v.s. $m_A/n$]{
          \label{fig:exp3_ma}
        \includegraphics[width=6cm]{figures/exp7_Perr.jpg}
        }
        \caption{ Performance of {{M}$^3$O} on a 1-correspondence problem under different levels of $|\Omega|$, $\eta$, $r$ and $m_A/n$. The default setting is $|\Omega|\cdot 100\%/(n\cdot m)=80\%$, $\eta=0.1$, $n=m=100$, $r=5$, $m_A=60$,  and $m_1=40$. The mean  with minimum and maximum are calculated from 10 different random initializations.}
        \label{fig:various_setting}
        \end{figure}

        \begin{table}[htbp]
          \centering
          \small
          \caption{ Performance of {M}$^3$O for various $d$-correspondence problems. The normalized permutation error $\sum_{l=1}^d d_H(\hat \pi_l,\pi_l)/d$  is reported as mean$\pm$std (min) over 10 different random initializations.}
          \label{tb:multi-perm}
     
          \begin{tabular}{ccccc}
          \toprule
          $(n,m_A,m_1,...,m_d)$  & $d$ & $ \frac{|\Omega|\cdot 100\%}{n m}$  & $\frac{1}{d}\sum_{l=1}^d d_H(\hat \pi_l,\pi_l) $ \\
          \midrule
          (100,40,30,30)& 2 & 40\% & $33.35\pm32.85$ (0.00)\\
          (100,20,40,40)& 2 & 40\% & $58.90\pm27.21$ (2.00)\\
          (100,45,25,25,25)& 3 & 50\% & $61.97\pm15.41$ (37.33)\\
          (100,40,25,25,25,25)& 4 & 60\% & $59.90\pm13.64$ (38.50)\\
          \bottomrule
          \end{tabular}
          %\caption{这是一张三线表}\label{tab:aStrangeTable}  标题放在这里也是可以的
          \end{table}
  
       Figure~\ref{fig:various_setting} examines {M}$^3$O on a 1-correspondence problem under different regimes w.r.t.\ $|\Omega|$, $\eta$, $r$ and $m_A/n$.  Here we use  $m_A/n$ to  control the difference in the magnitude of the submatrices. As we can see, the results are well aligned with our predictions in Remarks 2 and  3. We also find that the performance of {M}$^3$O tends to have high variance. This is mainly because {M}$^3$O is sensitive to random initialization; more details on this phenomenon are given in Appendix~\ref{app:initialization}. In practice, we recommend running {M}$^3$O a few times with different random initializations.
     
      
     
       Finally, we examine {M}$^3$O on a few $d$-correspondence problems. See Table~\ref{tb:multi-perm} for various results, where we set $r=5$ and $\varepsilon=0.1$. Notice that for the 4-correspondence problem in the table, there are $(100!)^4$ possible correspondences. Even for such a difficult problem, {M}$^3$O is able to recover 61.5\% of the ground-truth correspondence with a good initialization. 
  
         

           
           
           
             \subsection{Multi-domain recommender system without correspondence}
           
     
     In this section, we study the performance of $\text{M}^3\text{O}$ on the real-world dataset MovieLens 100K\footnote{\url{https://grouplens.org/datasets/movielens/100k/}}, which is a widely used movie recommendation dataset \citep{harper2015movielens}. In this application, we mainly focus on the Root Mean Squared Error (RMSE) metric, i.e., 
      $$\text{RMSE}\stackrel{\text{def.}}=\sqrt{\frac1N\sum_{i,j}(\widehat M_{ij}-M_{ij})^2}.$$
     \textbf{Data.} MovieLens 100K contains 100,000 ratings on a scale of 1--5. The ratings are given by 943 users on 1,682 movies. Genre information about the movies is also provided. We adopt a setting similar to that of \citet{zhang2012multi}. We extract the five most popular genres, namely Comedy (C), Romance (R), Drama (D), Action (A) and Thriller (T), to define the data from 5 different domains (or platforms). Unlike \citet{zhang2012multi}, we additionally randomly permute the indexes of the users in each of these five domains, so that the correspondence among these data becomes unknown. In this way, the problem is a 4-correspondence problem as discussed before. The ratings are split randomly, with 80\% of them used as the training data and the remaining 20\% as the test data. 
     
 
     \textbf{Algorithms.} We consider the following additional algorithms for comparison. 
     \begin{enumerate}[style=sameline,itemindent=0em,leftmargin=20pt]
       {\item \textit{SIC}: Running the Soft-Impute algorithm independently for the 5 different platforms. }
       {\item \textit{SIR}: Running the Soft-Impute algorithm  with Randomly generated correspondence.}
     \end{enumerate}
 
   
     \textbf{Results.}
     As discussed in the experiments on simulated data, exact recovery of the correspondence becomes impossible when the number of observed entries is small. Therefore, in the following experiment, since exact correspondence is not needed, we fix $\epsilon=0.05$ for $\text{M}^3\text{O}$. Table~\ref{tb:federated} shows the results obtained by averaging the RMSE on the test data over 10 different random seeds.  We can first see that matrix completion with a wrong correspondence, i.e., SIR, can be harmful to the overall performance since it is even worse than SIC. Notably, although the ground-truth correspondence cannot be recovered, each platform can still benefit from $\text{M}^3\text{O}$ since it improves the performance over SIC. This is mainly because $\text{M}^3\text{O}$ is still able to match similar users for inferring missing ratings. In contrast, since both Baseline and MUS can only establish an exact one-to-one correspondence for each user, they fail to improve over SIC significantly. Remarkably, $\text{M}^3\text{O}$ is only slightly inferior to the Oracle method, and even achieves a lower test RMSE than the Oracle method on the Comedy genre. 
   
    
       \begin{table}[htbp]
         \small
         \centering
         \caption{  Test RMSE of various algorithms on MovieLens 100K}\label{tb:federated}
     
         \begin{tabular}{ccccccc}
         \toprule
         Method & C & R & D & A & T & Total\\
         \midrule
         SIR&1.020  &  1.016 &   0.981  &  0.980  &  0.981  &  0.994\\
         
         SIC&0.969  &  0.970 &   0.932 &   0.918  &  0.925 &   0.942\\
         MUS&0.966  &  0.984 &   0.942  &  0.931  &  0.931  &  0.949\\
           Baseline &0.973& 0.956& 0.938& 0.911  & 0.915&
               0.940 \\
         $\text{M}^3\text{O}$& \textbf{0.9399} & \textbf{0.879} & \textbf{0.914} & \textbf{0.856} & \textbf{0.857} & \textbf{0.895}\\
         Oracle& 0.944   & 0.783 &   0.906 &   0.818  &  0.810 &   0.867\\
         \bottomrule
         \end{tabular}
         %\caption{这是一张三线表}\label{tab:aStrangeTable}  标题放在这里也是可以的
      
         \end{table}
 
    
         \begin{figure}[htbp]
          \centering
         \subfigure[ Original]{
           \label{fig:real_face}
         \includegraphics[width=5cm]{figures/real_face.jpg}
         %\caption{fig1}
         }
         \subfigure[Corrupted]{
           \label{fig:perm_face}
         \includegraphics[width=5cm]{figures/perm_face.jpg}
         }
         \centering
         \subfigure[Baseline]{
           \label{fig:base_face}
         \includegraphics[width=5cm]{figures/base_face.jpg}
         %\caption{fig1}
         }
         \subfigure[$\text{M}^3\text{O}$]{
           \label{fig:Mcubic_face}
         \includegraphics[width=5cm]{figures/Mcubic_face.jpg}
         }
       
         \caption{ Performance of {{M}$^3$O} on a face recovery problem.}
         \label{fig:face}
           \end{figure}
       \subsection{Visual permutation recovery}
       
         We also show that M$^3$O is flexible and can be applied to a visual jigsaw puzzle. This kind of problem was recently considered by \citet{santa2017deeppermnet}, who propose to recover the corrupted image in a data-driven way using convolutional neural networks. However, we show that it is possible to recover the image without extra data by merely exploiting the underlying low-rank structure of the image itself. A typical result is shown in Figure~\ref{fig:face}. The experiment details and more results are provided in Appendix~\ref{app:exp}.

      %  We show that  M$^3$O  is flexible and can also be used to recover matrix that is not in the form $[A,PB]$. We can see this from the problem formulation in \eqref{p:rewrite}, where the cost matrix $C(\cdot)$ can be constructed in other ways as long as it is a function of a permutation. Typically, M$^3$O  can be used to solve a challenging face image recovery problem. The original face image with size $180\times 180$ in Figure \ref{fig:real_face} comes from the Extend Yale B database \citep{GeBeKr01}. The corrupted image is visualized in Figure \ref{fig:perm_face}, where the  pixel blocks with size $30\times 30$ in the upper left are shuffled randomly, and $30\%$ of the total pixels are removed. 
      %  This kind of problem is recently considered in \citep{santa2017deeppermnet}, which proposes to recover the corrupted image in a data-driven way using convolutional neural networks. However, we show that it is possible to recover the image without additional data by merely exploiting the underlying low-rank structure of the image itself.
      %  This experiment setting is similar to that in  \citep{yao2021unlabeled} but the algorithm in \citep{yao2021unlabeled} can not be applied since it can not work with the missing values. The  MUS algorithm is also not applicable since this problem can not be written in the form of linear regression problem. From Figure \ref{fig:base_face} and \ref{fig:Mcubic_face} we can find that $\text{M}^3\text{O}$ performs better than the Baseline, and can even recover the original orders of pixel blocks. More results similar to the Figure \ref{fig:face} and experiment details are provided in Appendix \ref{app:exp}.



       \section{Conclusion}
       \label{sec:discussion}
       In this paper, we study the important MRUC problem where part of the observed submatrix is shuffled. Such a problem underlies the record linkage problem in VFL \citep{nock2021impact}, and it has not been well explored in the existing literature. Theoretically, we are the first to rigorously analyze the role of the low-rank model in the MRUC problem, and we also provide an almost sharp sufficient condition under which minimizing the nuclear norm is provably efficient for recovering the permutation. For practical implementations, we propose an efficient algorithm, the {$\text{M}^3\text{O}$} algorithm, which consistently achieves the best performance over several baselines in all the tested scenarios. For future work, it is important to extend the theoretical results to the scenario with missing values, and hopefully derive a theorem that can rigorously quantify the remarkable phenomenon exhibited in Figure~\ref{fig:nuclearincrease}.
      %  It is worthwhile to point out that apart from the two applications we have studied in this paper, this problem could arise in more scenarios like the gnome assembly problem \citep{huang1999cap3}, the video pose tracking problem \citep{ganapathi2012real}  and the privacy-aware sensor networks \citep{gruteser2003privacy}, etc. We believes that our work provides a general framework to deal with unknown correspondence issue in these scenarios.



\nocite{*}
\bibliography{example_paper}



\end{document}
