\documentclass{article} % For LaTeX2e
\usepackage{iclr2022_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{command.tex}

\usepackage{enumitem}
\usepackage{hyperref}
\usepackage{url}
\usepackage{caption}
\captionsetup[figure]{font=small}
\captionsetup[table]{font=small}


\newcommand{\ye}[1]{{\color{blue}{[Ye: #1]}}}

\title{Low-rank Matrix Recovery with Unknown Correspondence}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

\author{Zhiwei Tang\thanks{Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.} \\
School of Science and Engineering \\
Chinese University of Hong Kong (Shenzhen)\\
Shenzhen \\
\texttt{zhiweitang1@link.cuhk.edu.cn} \\
}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

%\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}


\maketitle

\begin{abstract}
   We study a matrix recovery problem with unknown correspondence: given the observation matrix $M_o=[A,\tilde P B]$, where $\tilde P$ is an unknown permutation matrix, we aim to recover the underlying matrix $M=[A,B]$. Such a problem commonly arises in many applications where heterogeneous data are utilized and the correspondence among them is unknown, e.g., due to privacy concerns. We show that it is possible to recover $M$ via solving a nuclear norm minimization problem under a proper low-rank condition on $M$, with a provable non-asymptotic error bound for the recovery of $M$. We propose an algorithm, $\text{M}^3\text{O}$ (Matrix recovery via Min-Max Optimization) which recasts the combinatorial optimization problem as a continuous minimax problem and solves it by proximal gradient with a Max-Oracle. $\text{M}^3\text{O}$ can also be applied to a more general scenario where we have missing entries in $M_o$ and multiple groups of data with distinct unknown correspondence. Experiments on  simulated data, the MovieLens 100K dataset and Yale B database show that $\text{M}^3\text{O}$ achieves state-of-the-art performance over several baselines and can recover the ground-truth correspondence with high accuracy.
\end{abstract}

% \red{Overview:

% The main points:
% \begin{itemize}
%    \item We consider the low-rank matrix recovery problem, which has many applications and contains the multi-variate unlabeled sensing problem.
%    \item Merits to existing works: Can deal with dense permutation, multiple correspondence, missing values.
%    \item We study rigorously how rank is permuted by permutation.
%    \item Our performance is state-of-the-art and easy to implement/understand.
% \end{itemize}

% Paper structures:
% \begin{itemize}
%    \item Introduction: 
%    \begin{enumerate}
%       \item The main inspiration for our work:multi-source matrix completion without correspondence.
%       \item Relationship to unlabeled sensing.
%    \end{enumerate}
   
%    \item Related works: 
   
%    \begin{enumerate}
%       \item Multi-variate unlabeled sensing has been used to solve this problem, though no missing values.
%       \item Can only deal with sparse permutation, single correspondence.
%       \item No satisfying theoretical results.
%       \item Our method can deal with the most general scenario.
%       \item Our method can also be used to solve the multi-variate unlabeled sensing problem.
%    \end{enumerate}
   
    
   
    
   
    
   
   
   
   
%    \item MATRIX RECOVERY VIA A LOW-RANK MODEL: 
%    \begin{enumerate}
%       \item How rank is perturbed by permutation.
%       \item The fundamental reason is to perturb the sparsity of singular values->we should use nuclear norm instead -> another interpretation, we can resist noise -> empirical verification for the condition.
%       \item Another viewpoint: link to the multi-variate unlabeled sensing problem. using the decomposition method.
%       \item Another reason, it is known to deal with missing values, study informally on the concurrent case of missing values and permutation.

%    \end{enumerate}


   

%    \item Algorithm: We define the distance between two permutation as the Hamming distance $d_H(\hat \pi,\pi)\stackrel{\text{def.}}=\sum_{i=1}^n\mathbb{I}(\hat \pi(i)\neq  \pi(i))$.

%    \begin{enumerate}
%       \item The hardness comes from the coupling relationship of nuclear norm minimization and permutation-> we can decouple them, leading to a form that can also deal with multiple permutation.
%       \item Now the inner problem is a optimal assignment problem.
%       \item inner optimization: we should use skinhorn algorithm for the optimal assignment problem.
%       \item outer: proximal gradient.
%       \item convergence analysis: mention related materials.
%    \end{enumerate}
   

%    \item Experiments:
%    \begin{enumerate}
%       \item multi-variate unlabeled sensing(no missing values): synthetic data
%       \item matrix recovery: missing values, add naive baseline.
%       \item MovieLens: add naive baseline.
%    \end{enumerate}
   

% \end{itemize}
% }

\section{Introduction}
\label{sec:intro}
In the era of big data, one usually needs to utilize data gathered from multiple disparate platforms when accomplishing a specific task. However, the correspondence among the data samples from these different sources is often unknown due to either missing identity information or privacy reasons \citep{unnikrishnan2018unlabeled,gruteser2003privacy,das_sample--sample_2018}. Examples include the multi-image matching problem studied in \citep{ji_robust_2014,zeng_finding_2012,zhou_multi-image_2015}, the record linkage problem \citep{chan2001file} and the federated recommender system \citep{yang2020federated}.


\vspace{-0.1cm}
In the simplest scenario, we have two data matrices $A=[a_1,...,a_n ]^\top$, $B=[b_1,...,b_n ]^\top$ with $a_i\in \mathbb{R}^{m_A}$ and $b_i\in \mathbb{R}^{m_B}$, which are from two different platforms (data sources). As discussed above, the  correspondence $(a_i,b_i)$ may not be available, and thereby the goal is to recover the underlying correspondence between $a_1,...,a_n$ and $b_{\tilde\pi{(1)}},...,b_{\tilde\pi{(n)}}$, where $\tilde\pi(\cdot)$ denotes an unknown permutation. We can formulate the problem described above as a matrix recovery problem, i.e., to recover the matrix $M=[A,B]$ based on the permuted observation $ M_o=[A,\tilde P B]$, where $\tilde P\in\Pc_n$ is an unknown permutation matrix and $\Pc_n $ denotes the set of all $n\times n$ permutation matrices. We term this problem as \textbf{M}atrix \textbf{R}ecovery with \textbf{U}nknown \textbf{C}orrespondence (\textbf{MRUC}).

\vspace{-0.1cm}
Inspired by the classical low-rank model for matrix recovery \citep{Wright-Ma-2021,mazumder_spectral_2010,hastie_matrix_2015}, we especially focus on the scenario where the matrix $M$ features a certain low-rank structure. Such a low-rank model has achieved great success in many applications like the recommender system \citep{schafer2007collaborative,mazumder_spectral_2010} and the image recovery and alignment problem \citep{zeng_finding_2012,zhou_multi-image_2015}. By denoting $B_o=\tilde PB$, we want to solve the following rank minimization problem for MRUC,
\vspace{-0.2cm}
\begin{align}
  \label{p:minimize_rank}
  \underset{ P\in \Pc_n}{\text{min }} \text{rank}([A, P B_o]).
\end{align} 
\vspace{-0.6cm}
% In this work, we aim at solving a complicated extension of MRUC, where we have multiple correspondence to recover, and can only observe partial entries of the data matrix. Specifically, the goal becomes to recover the matrix $M=[A,B_1,...,B_d]$ based on only partial observation of the permuted matrix $ \Pc_\Omega (M_o)=\Pc_\Omega([A,\tilde P_1 B_1,...,\tilde P_d We define the distance between two permutation as the Hamming distance $d_H(\hat \pi,\pi)\stackrel{\text{def.}}=\sum_{i=1}^n\mathbb{I}(\hat \pi(i)\neq  \pi(i))$.
%  B_d])$, where $\tilde P_1,...,\tilde P_d$ are d unknown permutation matrices, and $\Pc_\Omega$ is an operator that selects  entries that are in the set observable indices  $\Omega$. For conciseness concern, we will first study the most basic MRUC problem, and then show that the developed algorithm is readily to be extended to the complicated scenario.



% First of all, we realize that finding the correspondence for matrix-like data is essentially equivalent to solving a matrix recovery problem. Consider the case that we need to identify the correspondence among data from two different platforms. We denote the three data matrices 
% $$
%  A=\left[
%  \begin{matrix}
%    a_1^\top  \\
%    \vdots \\
%    a_n^\top 
%   \end{matrix}
%   \right] \in \mathbb{R}^{n\times m_1},\quad
%   B=\left[
%  \begin{matrix}
%    b_1^\top  \\
%    \vdots \\
%    b_n^\top 
%   \end{matrix}
%   \right] \in \mathbb{R}^{n\times m_2},\quad \tilde B = \tilde P  B=\left[
%     \begin{matrix}
%       b_{\pi{(1)}}^\top  \\
%       \vdots \\
%       b_{\pi{(n)}} ^\top
%      \end{matrix}
%      \right] \in \mathbb{R}^{n\times m_2}, 
% $$ 
% where $\pi(\cdot)$ is an unknown permutation and $ \tilde P$ is the corresponding $n\times n $ permutation matrix. For each $i$, the data vectors $a_i\in \mathbb{R}^{m_1}$ and $b_i\in \mathbb{R}^{m_2}$ are from different platforms but generated by the same user. The task is to establish the correspondence among $a_1,...,a_n$ and $b_{\pi{(1)}},...,b_{\pi{(n)}}$. Therefore, this is equivalent to the \textbf{M}atrix \textbf{R}ecovery problem with \textbf{U}nknown \textbf{C}orrespondence (\textbf{MRUC}), i.e., to recover the matrix $M=[A,B]$ based on the permuted observation $ M_o=[A,\tilde P B]$.
% Specifically, the goal becomes to recover the matrix $M=[A,B_1,...,B_d]$ based on only partial observation of the permuted matrix $ \Pc_\Omega (M_o)=\Pc_\Omega([A,\tilde P_1 B_1,...,\tilde P_d B_d])$, where $\tilde P_1,...,\tilde P_d$ are d unknown permutation matrices, and $\Pc_\Omega$ is an operator that selects  entries that are in the set observable indices  $\Omega$. Both the two realistic reasons prevent us from directly combining the data on these platforms for each user. Hence, algorithm that can establish the correspondence among the heterogeneous data needs to be developed. Figure  gives a visualization for our mission in this work. 


\textbf{Practical applications.}  It is known that the recommender system often suffers from data sparsity \citep{zhang2012multi} because users typically provide ratings for only a few items. To enlarge the set of observable ratings for each user, we may harness extra data from multiple platforms (Netflix, Amazon, Youtube, etc.). One classical work on this problem is the multi-domain recommender system considered in \citep{zhang2012multi}. Unfortunately, their work neglects a crucial issue that data from these diverse platforms (or domains) are not always well aligned for two primary reasons. The first is that the same user may use different identities, or even leave nothing about their identities, on these platforms. Another reason is that those platforms are not allowed to share with each other the identity information about their users for preserving privacy. Another application is the visual permutation learning problem \citep{santa2017deeppermnet}, where one needs to recover the original image from only a subset of the {\it shuffled} pixels. Both of the two applications give rise to a challenging extension of the MRUC problem, where we not only need to recover multiple correspondences across different data sources, but also face the difficulty of dealing with the missing values in the data matrix.

\vspace{-0.1cm}
\textbf{Relationship to the multivariate unlabeled sensing problem.} Problem
\eqref{p:minimize_rank} is closely related to the \textbf{M}ultivariate \textbf{U}nlabeled \textbf{S}ensing (\textbf{MUS}) problem, which has been studied in \citep{pananjady2017denoising,zhang2019benefits,zhang2019permutation,zhang2020optimal,slawski2020sparse,slawski2020two}. Specifically, the MUS is the multivariate linear {\it regression} problem with unknown correspondence, i.e., it solves \vspace{-0.2cm} \begin{align}
  \label{p:MUS}
  \min_{P\in\Pc_n,W\in \Rbb^{m_2\times m_1}} \|Y-PXW\|_F^2,
\end{align}\vspace{-0.6cm}

where $W\in\Rbb^{m_2\times m_1}$ is the regression coefficient matrix, $Y\in \Rbb^{n\times m_1}$ and $X\in \Rbb^{n\times m_2}$ denote the output and the permuted input respectively, and  $\left\|\cdot\right\|_F$ is the matrix Frobenius norm. In fact, a concurrent work \citep{yao2021unlabeled} studies the same rank minimization problem as \eqref{p:minimize_rank}, but their approach is to solve it using the algorithm developed for the MUS problem. Despite the similarity to the MUS problem, we remark that the MRUC problem has its own distinct features and, as shown in Section \ref{sec:experiment}, the algorithm for the MUS problem cannot be directly and effectively applied, especially when there are multiple unknown correspondences and missing entries to be considered.
% Therefore, unlike \citep{yao2021unlabeled}, we study the problem structure of \eqref{p:minimize_rank} directly without recasting it as a MUS problem.

\vspace{-0.1cm}
\textbf{Related works. }
To the best of our knowledge, the concurrent and independent \citep{yao2021unlabeled} is the only work that also considers the MRUC problem. Theoretically, \citep{yao2021unlabeled} showed that there exists a non-empty open subset $U\subseteq \Rbb^{n\times (m_A+m_B)}$, such that $\forall M\in U$, solving \eqref{p:minimize_rank} is bound to recover the original correspondence. However, such a result only proves the existence of $U$ and does not provide a concrete characterization of it. Regarding the algorithm design, \citep{yao2021unlabeled} follows the idea of \citep{slawski2020sparse,slawski2020two} and treats problem \eqref{p:minimize_rank} heuristically as a MUS problem. However, there are three main drawbacks in their algorithm that largely limit its practical value. First, it can only work when the data is sparsely permuted, i.e., there are only $k$ data vectors being shuffled with $k\ll n$; Second, they only consider the scenario with a single unknown correspondence; Last but not least, their method cannot deal with data with missing values.

\vspace{-0.1cm}
\textbf{Contributions of this work. }
 Our contributions in this work lie in both theoretical and practical aspects. Theoretically, we are the first to rigorously study how the rank of the data matrix is perturbed by the permutation, and show that   problem \eqref{p:minimize_rank} can be used to recover a generic low-rank random matrix almost surely. Besides, we also propose a nuclear norm minimization problem as a surrogate for  problem \eqref{p:minimize_rank}. The most important theoretical result in this work is that we provide a non-asymptotic analysis to bound the error of the nuclear norm minimization problem under a mild assumption. Practically, we propose an efficient algorithm $\text{M}^3\text{O}$ that solves the nuclear norm minimization problem, which overcomes the aforementioned three shortcomings in \citep{yao2021unlabeled}. Notably, $\text{M}^3\text{O}$ works very well even on an extremely difficult scenario, where we need to recover multiple unknown correspondence from the data that are densely permuted and contain missing values. We remark that this is so far a challenging case unexplored in the existing literature.

 \vspace{-0.1cm}
\textbf{Outline. }
For conciseness, we will first study the MRUC problem with one unknown correspondence, and then show that the theoretical results and the algorithm can be readily extended to the more complicated scenarios. We start with building the theoretical results for \eqref{p:minimize_rank} and its convex relaxation in Section \ref{sec:low-rank}. Then, the algorithm is developed in Section \ref{sec:algorithm}. The simulation results are presented in Section \ref{sec:experiment} and the conclusions are drawn in Section \ref{sec:discussion}.




% In Section \ref{sec:low-rank},  we begin with discussion on why and when such rank minimization idea can work from both theoretical and numerical aspects. In particular, we establish a proper low-rank condition on $M$ under which  the  problem \eqref{p:minimize_rank} can yield the ground-truth permutation. We remark that problem \eqref{p:minimize_rank} is still a challenging combinatorial optimization problem with a non-smooth and non-convex objective. To effectively handle it, we propose to relax the rank func We define the distance between two permutation as the Hamming distance $d_H(\hat \pi,\pi)\stackrel{\text{def.}}=\sum_{i=1}^n\mathbb{I}(\hat \pi(i)\neq  \pi(i))$.
% tion via nuclear norm regularization. Then, we derive an equivalent continuous formulation based on the classical Birkhoff von-Neumann Theorem.  

% \begin{wrapfigure}{r}{5cm}
%    \vspace{-0.3cm}
%    \centering
%    \includegraphics[width=5cm]{figures/application.png}
%    \caption{Finding correspondence among the users' data from different platforms.}
%    \label{fig:application}\vspace{-0.3cm}
%    \end{wrapfigure} 
  %  However, the relaxed problem is still hard to solve and the conventional Block Coordinate Descent (BCD) algorithm can fall into local solution easily due to the combinatorial nature of the permutation matrix. To address this issue, in Section \ref{sec:smooth}, we propose to smooth the problem by adding entropy regularization. Moreover, we recast the resulting problem as a min-max optimization problem whose inner problem is in fact the Entropic Optimal Transport (EOT) problem \citep{peyre2019computational}. To solve such  problem, we develop the $\text{M}^3\text{O}$ (\textbf{M}atrix recovery via \textbf{M}in-\textbf{M}ax \textbf{O}ptimization) algorithm which adopts proximal gradient with a Max-Oracle.  Furthermore,   in  Section \ref{sec:extends} we show that our approach can be adapted to a more general scenario where there are missing entries in $M_o$ and multiple  groups of data with unknown correspondence. In Section \ref{sec:experiment}, we conduct  experiments on simulated data and a real-world dataset on movie recommendation, MovieLens 100K, to demonstrate that $\text{M}^3\text{O}$ achieves state-of-the-art performance over several baselines and can recover the ground-truth correspondence with high accuracy.
 
  % In summary, our major contributions are  {(1)}  We are the first to formulate and study MRUC for addressing a common obstacle in utilizing heterogeneous data; {(2)}  We build a theoretical foundation for using the rank minimization problem to solve the MRUC;   {(3)} We propose a novel polynomial-time algorithm $\text{M}^3\text{O}$ that can recover the ground-truth permutation with high accuracy.

  \textbf{Notations.} 
 Given two matrices $X,Y\in\mathbb{R}^{n\times m}$, we denote $\langle X,Y\rangle=\sum_{i=1}^{n} \sum_{j=1}^{m} X_{ij}Y_{ij}$ as the matrix inner product. We denote $X(i)$ as the $i$th row of the matrix $X$ and $X(i,j)$ as the element at  the $i$th row and the $j$th column. We denote $\mathbf{1}_m\in\mathbb{R}^m$  and $\mathbf{1}_{n\times m}\in\mathbb{R}^{n\times m}$ as the all-one vector and matrix, respectively, and $I_n$ as the $n\times n$ identity matrix. For $\alpha\in\mathbb{R}^m$, $\beta\in\mathbb{R}^n$, we define the operator $\oplus$ as $\alpha\oplus\beta=\alpha\mathbf{1}_n^\top+\mathbf{1}_m\beta^\top\in\mathbb{R}^{m\times n}$. We denote $\|\cdot\|_*$ as the nuclear norm for matrices. For vectors, we denote $\|\cdot\|_0$, $\|\cdot\|_1$ as the zero norm and 1-norm respectively.

% \section{Related Works}
%   To our knowledge, there is no existing work studying the MRUC problem  as formulated in this paper.   The work \citep{zhang2012multi} considers a multi-domain matrix completion problem.  However, they assume that the correspondence among different domains are known, which may not hold in real world scenarios.  The work \citep{yang2020federated} considers a similar problem, federated recommender system.  However, their scenario is that different platforms have data from different users instead of different domains. 
%   The Multi-Image Matching problem studied in \citep{ji_robust_2014,zeng_finding_2012,zhou_multi-image_2015}  shares some similarity with our work where they rely on a low rank structure of images to establish the correspondence between patterns from different images. 
%   Despite of the similarity,   their problem formulation and algorithm design are fundamentally different.
%   %works can only deal with matrices without missing entries. 
  
%   Our problem formulation for MRUC is inspired by recent works on unlabeled sensing \citep{xie_hypergradient_2020,unnikrishnan2018unlabeled}, where the correspondence between the inputs and outputs are missing in the regression problem. Specifically, the task is a regression problem, minimizing the squared residuals with respect to the model parameter $\omega$ and a permutation $\pi(\cdot)$, i.e., \begin{align*}
%     \min_{\omega,\pi} \Lc(\omega,\pi)=\sum_{i=1}^n \left\|y_i-f(x_i,z_{\pi(i)};\omega)\right\|_2^2,
%    \end{align*} where  $(x_i, z_{\pi(i)})$ are the total input features, and $z_{\pi(i)}$ are the permuted features that lose correspondence to the output $y_i$. 
%    Additionally,  previous studies are limited to the problem formulation with a single unknown correspondence.
\vspace{-0.3cm} 
\section{Matrix Recovery via a Low-rank Model}
\label{sec:low-rank}
\vspace{-0.2cm} 
\textbf{How is the matrix rank perturbed by a row permutation?}
To answer this fundamental question, we first introduce the cycle decomposition of a permutation. 

\begin{definition}[Cycle decomposition of a permutation \citep{dummit1991abstract}] Let $S$ be a finite set and $\pi(\cdot)$ be a permutation on $S$. A cycle $(a_1,...,a_n)$ is a permutation sending $a_j$ to $a_{j+1}$ for $1\leq j\leq n-1$ and $a_n$ to $a_1$. Then a cycle decomposition of $\pi(\cdot)$ is an expression of $\pi(\cdot)$ as a union of several disjoint cycles\footnote{Two cycles are disjoint if they do not have common elements.}.
\end{definition}
\vspace{-0.3cm}

It can be verified that any permutation on a finite set has a unique cycle decomposition~\citep{dummit1991abstract}. Therefore, we can define the {\it cycle number} of a permutation $\pi(\cdot)$ as the number of disjoint cycles with length greater than 1 in its cycle decomposition, which is denoted as $C(\pi)$. We also define the non-sparsity of a permutation as the Hamming distance to the original sequence, i.e., $H(\pi)=\sum_{s\in S}\mathbb{I}[\pi(s)\neq s]$. It is obvious that $H(\pi)>C(\pi)$ if $\pi$ is not the identity permutation. As a simple example, we consider the permutation $\pi(\cdot)$ that maps the sequence (1,2,3,4,5,6) to (3,1,2,5,4,6). Now the cycle decomposition for it is $\pi(\cdot)=(132)(45)(6)$, and $C(\pi)=2$, $H(\pi)=5$.

\vspace{-0.1cm}
In all the following theoretical results, we denote the original matrix as $M={\bm[}A,B{\bm ]}\in \Rbb^{n\times m} $  with $A\in \Rbb^{n\times m_A}$, $B\in \Rbb^{n\times m_B}$, and ${\rm rank}(M)=r$, ${\rm rank}(A)=r_A$, ${\rm rank}(B)=r_B$. We denote the corresponding permutation as $\pi_P(\cdot)$ for any permutation matrix $P\in\Pc_n$. The following proposition says that the perturbation effect of a permutation $\pi(\cdot)$ on the rank of $M$ becomes stronger if $\pi(\cdot)$ permutes more rows and contains fewer cycles.

\vspace{-0.1cm}
\begin{proposition} \label{prop:MR} 
  $\forall P\in\Pc_n$, we have
  \vspace{-0.1cm}
  \begin{align}
      \label{eq:MR1}
      {\rm rank}({\bm [}A,PB{\bm]})\leq \min\{n,m,r_A+r_B,r+H(\pi_P)-C(\pi_P)\}.
  \end{align} 
  \vspace{-0.5cm}
\end{proposition}
\vspace{-0.3cm}
We have a similar result for the case with multiple permutations, which is summarized as Corollary \ref{col:multiple_rank} in Appendix \ref{app:proof}. It turns out that, without any assumption on $M$, \eqref{eq:MR1} is the tightest upper bound for the rank of a perturbed matrix. Notably, the following proposition says that the upper bound in \eqref{eq:MR1} is attained with probability 1 for a generic low-rank random matrix.
% \begin{definition}[Kruskal rank \citep{kruskal1977three}] The {\rm Kruskal rank} of a matrix $A$, written as {\rm krank}($A$), is the largest number $r$ such that every subset of $r$ columns of $A$ is linearly independent.
% \end{definition}
\begin{definition} A probability distribution on $\Rbb$ is called a proper distribution if it is absolutely continuous with respect to the Lebesgue measure on $\Rbb$, i.e., it admits a density function $p(\cdot)$.
\end{definition}
\begin{proposition}
  \label{prop:attained_rank}
  If the original matrix  $M$ is a random matrix with $M=RE$ where $R\in \Rbb^{n\times r}$ and $E\in\Rbb^{r\times m}$ are two random matrices whose entries are i.i.d.\ and follow a proper distribution on $\Rbb$, and  $r\leq \min\{\sqrt{\frac n 2},m_A,m_B\}$, then $\forall P\in\Pc_n$, the equality 
  \vspace{-0.1cm}
  \begin{align}
    \label{p:attained_rank}
      {\rm rank}({\bm [}A,PB{\bm]})= \min\{2r,r+H(\pi_P)-C(\pi_P)\}
  \end{align}\vspace{-0.7cm}
  
  holds  with probability 1.
\end{proposition} 
\vspace{-0.3cm}
% \textbf{Identifying correspondence through rank minimization.}
% As mentioned previously, we propose to recover the unknown correspondence through the rank minimization problem \eqref{p:minimize_rank}.  Then,  a fundamental question is: Why and when does minimizing the rank work?  In fact,  exact recovery of the unknown correspondence relies on the following assumption on the original data matrix.
% \begin{assumption}[Minimum-Rank Assumption]
%   For the original data matrix $M=[A, B]\in \mathbb{R}^{n\times m}$ where 
%   $A\in \mathbb{R}^{n\times m_1}$ and $B \in \mathbb{R}^{n\times m_2}$, it holds that $P=I_n$ is the unique solution for
%   %The submatrices $A\in \mathbb{R}^{n\times m_1}$ and $B \in \mathbb{R}^{n\times m_2}$ satisfy that $ P=I$ is the unique optimal solution to the rank minimization problem 
%   \begin{align}\label{low rank assumption}
%   \underset{ P\in \Pc_n}{\text{\rm{min }}} \text{\rm{rank}}([A, P B]).
%   \end{align}
%   \end{assumption}
%   Clearly,  under the Minimum-Rank Assumption, it is legitimate to solve the MRUC problem based on the rank minimization problem \eqref{p:minimize_rank}. Although such assumption is strong and does not hold for an arbitrary matrix, we discover that it holds when $M$ satisfies a proper low-rank condition and the rank of $B$ is sensitive to permutation. A sufficient condition under which the Minimum-Rank Assumption holds is presented in Theorem \ref{thm:sufficient}.
% \begin{theorem}
%   \label{thm:sufficient}
%   Suppose that the matrices $A\in \mathbb{R}^{n\times m_1}$ and $B \in \mathbb{R}^{n\times m_2}$ satisfy that 
%   \begin{align}
%     \label{p:low-rank}
%     \text{\rm{rank}}([A,B])=\text{\rm{rank}}(A)=\text{\rm{rank}}(B),
%   \end{align}
%   and we have 
%   \begin{align}
%     \label{p:core}
%     \text{\rm{rank}}([ P  B, B]) > \text{\rm{rank}}(B),\ \forall  P\in \Pc_n \text{ and } P\neq I_n,
%   \end{align}
%   then the Minimum-Rank Assumption holds for the matrix $M=[A,B]$.
% \end{theorem}
% One can see that \eqref{p:low-rank} characterizes the required low-rank condition on $M$, where the two data submatrices $A$, $B$ share the same underlying information (In a formal way, they share the same bases.).  This is consistent with our intuition since, for example, $A$ and $B$ are generated by the same group of users. Besides, as stated in \eqref{p:core}, it is also required that the rank of $B$ is sensitive to permutation. In fact, \eqref{p:core} holds commonly because such permutation often brings in extra information into the data matrix. Specifically, for random matrix, we prove the following result.
% \begin{proposition}
%   \label{prop:core}
%   Suppose that $m_2<n$, and that $B \in \mathbb{R}^{n\times m_2}$ is a random matrix whose entries %follow some random distribution with $0$ mean and $\sigma^2$ variance i.i.d ($\sigma>0$)
%   are independent and identically distributed with zero mean  and nonzero variance. Then for any permutation matrix $ P\neq I_n$, the inequality \begin{align}
%     \text{\rm{rank}}([ P  B, B]) \geq \text{\rm{rank}}(B)+1
%   \end{align} holds with probability one.
% \end{proposition}
% \begin{wrapfigure}{r}{6.4cm}
%   \vspace{-0.7cm}
%   \centering
%   \subfigure[Rank]{
%     \label{fig:rankincrease}
%   \includegraphics[width=2.8cm]{figures/monoticity_rank.jpg}
%   %\caption{fig1}
%   }
%   \subfigure[Nuclear norm]{
%     \label{fig:nuclearincrease}
%   \includegraphics[width=2.9cm]{figures/monoticity_nuclear.jpg}
%   }
%   \caption{Monotonicity of rank and nuclear norm w.r.t $\left\| P - I_n\right\|_F$. The matrices are generated as $[A,B]=C D^\top+\eta E$, where the dimensions of $A$, $B$, $C$, $D$, $E$ are  $100\times 70$, $100\times 30$, $100\times 40$, $100\times 40$, $100\times 100$  respectively, and the entries of $C$, $D$, $E$ are drawn from $\Nc(0,1)$ i.i.d. We adopt $\eta=0/0.1$ for the case without/with noise. The dash lines and intervals reflect mean, min, maximum aggregated from 100 samples of random permutation.}\vspace{-0.6cm}
%   \end{wrapfigure}
% In a real world problem, $B$ could have repetitive rows and hence the property \eqref{p:core} would not hold. In fact, we only need that $B$ satisfies \eqref{p:core} by excluding those repetitive rows, since now all the optimal solutions to the problem \eqref{p:minimize_rank} are equivalent. The proof of the Theorem \ref{thm:sufficient} and the Proposition \ref{prop:core} refer to Appendix \ref{app:proof}. In practice,  the impact of permutation on the rank could be stronger than that estimated by the proposition.
% In Figure \ref{fig:rankincrease}, we show that the rank of a randomly generated low-rank matrix monotonically increases w.r.t the strength of permutation,  which is quantified by $\left\| P - I_n\right\|_F$. 

\textbf{Convex relaxation for the rank function.}
Despite the previous theoretical justification for problem \eqref{p:minimize_rank}, it is non-convex and non-smooth. Another crucial issue is that we often have a noisy observation matrix and it is well known that the rank function is extremely sensitive to the additive noise. In this paper, we assume that the observation matrix is corrupted by i.i.d Gaussian additive noise, i.e.,  \vspace{-0.2cm}$$M_o=[A_o,B_o]=[A,\tilde P B]+W,\text{ where }W(i,j)\sim \Nc(0,\sigma^2),$$\vspace{-0.7cm}

where $\sigma^2$ reflects the strength of the noise. We first denote the singular values of a matrix $X\in\Rbb^{n\times m}$ as $\sigma_X^1,...,\sigma_X^k$ where $k=\min \{n,m\}$. Since  $\text{rank}(X)=\|[\sigma_X^1,...,\sigma_X^k]\|_0$, from Proposition \ref{prop:attained_rank} we can view the perturbation effect  of a permutation to a low-rank matrix as breaking the sparsity of its singular values. This view leads naturally to the well-known 1-norm minimization problem which has been proven  robust to additive noise and can yield a sparse solution \citep{Wright-Ma-2021}, i.e., 
\vspace{-0.1cm}
\begin{align}
  \label{p:nuclear_norm}
  \min_{P\in \Pc_n} \|[A_o,PB_o]\|_*= \min_{P\in \Pc_n}\|[\sigma_{[A_o,PB_o]}^1,...,\sigma_{[A_o,PB_o]}^k]\|_1.
\end{align}\vspace{-0.5cm}

Since for an arbitrary matrix, the 1-norm of its singular values is equal to its nuclear norm, we also refer to problem \eqref{p:nuclear_norm} as a nuclear norm minimization problem.

\vspace{-0.1cm}
\textbf{Theoretical justification for the nuclear norm. }
The nuclear norm has long been used as a convex surrogate for the rank, and it has been theoretically justified for applications like low-rank matrix completion \citep{candes2010power,Wright-Ma-2021}. It is also important to see whether the nuclear norm is still a good surrogate for the rank minimization problem \eqref{p:minimize_rank}. In this work, we establish a sufficient condition on $A$ and $B$ under which  problem \eqref{p:nuclear_norm} is provably justified for correspondence recovery.  We denote  $A=\sum_{i=1}^{r_A}\sigma_A^i u_A^i v_A^{i\top},\ B=\sum_{i=1}^{r_B}\sigma_B^i u_B^i v_B^{i\top}$ as the singular value decompositions of $A$ and $B$, respectively,  where the $\sigma_A^i$ and $\sigma_B^i$ are the non-zero singular values. 

\vspace{-0.1cm}
Firstly, from the definition of nuclear norm, it can be simply verified for any $P\in \Pc_n$ that 

\vspace{-0.6cm}
\begin{align}
  \label{p:ineq1}
  -Z/N \leq (\|[A,PB]\|_*-\|M\|_*)/\|M\|_*\leq Z/N,
\end{align}
where we denote  $N=\max\{\|A\|_*,\|B\|_*\}$ and $Z=\min\{\|A\|_*,\|B\|_*\}$. The inequality \eqref{p:ineq1} indicates that $A$ and $B$ should have comparable magnitude, i.e., $\|A\|_*\approx \|B\|_*$; otherwise the influence of the permutation will be less significant. Therefore, we typically consider the scenario where the singular values of $A$ and $B$ are comparable, which is summarized in the following Assumption \ref{asp:asp1}.
\begin{assumption} There exists a constant $\epsilon_1\geq 0$ such that
  \label{asp:asp1}
  \begin{align}
    \label{p:cond1}
  |\sigma_A^i-\sigma_B^i| \leq \epsilon_1,\ \forall i=1,..,r,
  \end{align} where we denote that $\sigma_A^i=0$ if $i>r_A$, and $\sigma_B^i=0$ if $i>r_B$.
\end{assumption}
\vspace{-0.2cm}
  Similar to the matrix rank, we also need a proper low-rank assumption on the matrix $M$ for the nuclear norm. In this work, we particularly study the scenario in which the left singular vectors of $A$ and $B$ are similar, which we formally describe as Assumption \ref{asp:asp2}. We refer to Assumption \ref{asp:asp2} as a proper low-rank assumption, because it indicates that  the column space of $M$ can be approximated by the column space of one of its submatrices. 
 \begin{assumption} There exists a constant $\epsilon_2\geq 0$ such that
  \label{asp:asp2}
  \begin{align}
    \label{p:cond2}
  \|u_A^i-u_B^i\|\leq \epsilon_2, \forall i=1,...,T,
  \end{align}
  where we denote $T=\min\{r_A,r_B\}$.
\end{assumption}
\vspace{-0.2cm}

% More importantly, we denote that $u_A^i=\mathbf{0}_n$ if $\sigma_A^i=0$, and $u_B^i=\mathbf{0}_n$ if $\sigma_B^i=0$. \ye{It is unconventional to let $u_A^i=\mathbf{0}_n$. Usually just require them to be orthogonal complements to preceding singular vectors. If this is for (8), then better explain it.}  We first start with an important inequality for the nuclear norm.

Furthermore, we also need that all the column singular vectors $u_A^1,...,u_A^T,u_B^1,...,u_B^T$ are not invariant under any $P\in\Pc_n$ with $P\neq I_n$: we define a vector $u\in \Rbb^n$ to be invariant under $P$ if $Pu=u$. One weak condition for a vector $u$ to satisfy this property is that $u$ does not contain duplicated elements, which leads to the following Assumption \ref{asp:asp3}.
\begin{assumption} There exists a constant $\epsilon_3\geq 0$ such that
  \label{asp:asp3}
  \begin{align}
    \label{p:cond3}
    \min_{u\in U} \min_{i\neq j} |u(i)-u(j)|\geq \epsilon_3>0,  
  \end{align}where $U=\{u_A^1,...,u_A^T,u_B^1,...,u_B^T\}$.
\end{assumption}
\vspace{-0.2cm}
The necessity of Assumption \ref{asp:asp3} lies in that it excludes scenarios like, for example, $r_B\geq r_A$ and there exists a $P_a\in\Pc_n$ with $P_a\neq I_n$ such that $P_aA=A$, where  we have $$\|[A,P_a^\top B]\|_*=\|[P_aA,P_aP_a^\top B]\|_*=\|M\|_*.$$ In this scenario, $P_a^\top$ is also a permutation matrix, and it is not possible to distinguish  $[A,P_a^\top B]$ and $M$ by the value of their nuclear norm. 

% For the second requirement, we show that when the elements every singular vectors in $U_A$ are more diverse, the corresponding column space will be more sensitive to permutation. We difine the proximity of two column space col($U_A$) and col($PU_A$) as 
% \begin{align}
%   \delta(U_A,PU_A)\stackrel{\text{def.}}=\max_{\|x\|=1,\|y\|=1} \langle U_Ax,PU_Ay \rangle.
% \end{align}
% We have the following result.

\vspace{-0.1cm}
In summary, the assumptions mentioned above feature a typical low-rank structure in $M$, and imply that the nuclear norm of $M$ is sensitive to permutation.
With the three assumptions, we have the following important theorem, which provides a high-probability bound for the error of \eqref{p:nuclear_norm}.

\vspace{-0.1cm}
We denote the solution to \eqref{p:nuclear_norm} as $P^*$, and let $\pi^*$ and $\tilde \pi$ be the permutations corresponding to the permutation matrices $P^{*\top}$ and $\tilde P$, respectively.  We define the difference between the two permutations $\pi^*$ and $\tilde \pi$ as the {\it Hamming} distance 
\vspace{-0.3cm}
$$d_H(\pi^*,\tilde \pi)\stackrel{\text{def.}}=\sum_{i=1}^n\mathbb{I}(\pi^*(i)\neq \tilde \pi(i)).$$
\vspace{-0.5cm}

 

\begin{theorem}
  \label{thm:error_bound}
  Under  Assumptions \ref{asp:asp1}, \ref{asp:asp2} and \ref{asp:asp3}, if additionally  $\epsilon_1\leq \frac{M}{4r}$, $\epsilon_2\leq \min\{\frac{1}{2\sqrt{2T}},\frac{\sqrt{2}M}{2N}\}$, and $\sigma \leq \frac{M}{16L^2}$, then the following bound for the Hamming distance
  \begin{align}
    \label{p:error_bound}
    d_{H}(\pi^*,\tilde \pi)\leq \frac{2}{\epsilon_3^2}\left(2-\left(\frac{\sqrt{2}D}{D+(\sqrt{2}+2)\epsilon_1r+\sqrt{2}\epsilon_2N+2\sqrt{2DL\sigma}} - \sqrt{T}\epsilon_2\right)^2 \right)
  \end{align} holds with probability at least $1-2\exp\{-\frac{D}{8L\sigma}\}$, where $L=\max\{n,m\}$, $D=\|A\|_*+\|B\|_*$.
\end{theorem}
\vspace{-0.2cm}
   The proofs of the aforementioned theoretical results are provided in Appendix \ref{app:proof}. 
  
   \textbf{Remark 1. }From  Theorem \ref{thm:error_bound} we can see that when $\epsilon_3>0$, and $\epsilon_1\to 0$, $\epsilon_2\to 0$, $\ {\sigma}\to 0$, the error  $d_{H}(\pi^*,\tilde \pi)$ will converge to zero with probability 1. Furthermore, we can also discover that the correspondence can be difficult to recover when: 
   \vspace{-0.2cm}
   \begin{itemize}[leftmargin=*]
    \setlength\itemsep{0.01em}
     \item The rank of original matrix $M$ is high, which can be seen from \eqref{p:error_bound}.
     \item The magnitudes of $A$ and $B$ w.r.t.\ rank or nuclear norm are not comparable, which can be seen from \eqref{p:ineq1} and \eqref{p:cond1}.
     \item The strength of the noise is high, which can be seen from \eqref{p:error_bound} and the probability in Theorem \ref{thm:error_bound}.
   \end{itemize}
   \vspace{-0.2cm}

   Notably, the numerical experiments in Section \ref{sec:experiment1} corroborate our claim as well. 



  \begin{wrapfigure}{r}{4.5cm}
    \vspace{-0.2cm}
    \centering 
      \includegraphics[width=4.5cm]{figures/monoticity_nuclear.jpg}
      \vspace*{-0.6cm}
      \caption{\small The relationship \eqref{p:relation} under different percentages of observable entries.}
      \label{fig:nuclearincrease}
  \vspace{0.4cm}
    \end{wrapfigure}
  \textbf{Remark 2. }Additionally, from the proof of  Theorem \ref{thm:error_bound} we find that the fundamental reason for the success of \eqref{p:nuclear_norm} is that if $M$ satisfies the previous assumptions, we have \vspace{-0.3cm}  \begin{align}
      \label{p:relation}
      \|[A,P B]\|_*/\|M\|_*\approx O\left(\left(1-H(\pi_P)/2n\right)^{-\frac{1}{2}}\right).
    \end{align} 
    \vspace{-0.8cm} 

    In many applications, we can only observe part of the full data. Therefore, it is also worth investigating whether \eqref{p:relation} still holds when we can only access a small subset of the entries in $M_o$. Notably, Figure \ref{fig:nuclearincrease} gives a positive answer and shows that the relationship \eqref{p:relation} degrades gracefully as the percentage of observable entries decreases. This phenomenon is remarkable since it indicates that the original correspondence can be recovered from only part of the full data. The matrices used to generate  Figure \ref{fig:nuclearincrease} are the same as those in  Section \ref{sec:experiment1}, and the nuclear norm is computed approximately by first filling  the missing entries using the Soft-Impute algorithm \citep{mazumder_spectral_2010}.
    
  %  Monotonicity of rank and nuclear norm w.r.t $\left\| P - I_n\right\|_F$. The matrices are generated as $[A,B]=C D^\top+\eta E$, where the dimensions of $A$, $B$, $C$, $D$, $E$ are  $100\times 70$, $100\times 30$, $100\times 40$, $100\times 40$, $100\times 100$  respectively, and the entries of $C$, $D$, $E$ are drawn from $\Nc(0,1)$ i.i.d. We adopt $\eta=0/0.1$ for the case without/with noise. The dash lines and intervals reflect mean, min, maximum aggregated from 100 samples of random permutation.

% We can see from \eqref{p:bound_diff_nuclear} that if $\|\Pc_{T^\perp}(PB-B)\|_*>\|\Pc_{T}(PB-B)\|_*$ for any $P\neq I_n$, \eqref{p:nuclear_norm} can yield the exact recovery. We show empirically that this property exist common in real world data that has a low-rank structure, and there exist a clear phase transition phenomenon \red{figures}.

% \red{More discussion on the lesson learned from the proposition. Show the intuition behind these propositions.}

% \begin{proposition}
%   \label{prop:sufficient} We denote the right singular vectors as $U=[u_1,...,u_r]$. If 
%   \begin{align}
%     \label{p:sufficient_1}
%     |\mathbf{1}_n^\top u_i|>\sqrt{n-\frac{n}{2r(1+m_2)(n-1)}},\forall i=1,...,r,
%   \end{align}
%   we have 
%   \begin{align}
%     \label{p:sufficient_2}
%     \|\Pc_{T^\perp}(PB-B)\|_*>\|\Pc_{T}(PB-B)\|_*, \forall P \in \Pc_n \text{ with } P\neq I_n.
%   \end{align}
% \end{proposition}
\vspace{-0.5cm}
\section{Algorithm}
\label{sec:algorithm}
\vspace{-0.4cm}
In this section, we consider the scenario with missing values, i.e.,  our observed data is $\Pc_\Omega (M_o)=\Pc_\Omega([A_o,B_o])$, where $\Pc_\Omega$ is an operator that selects  entries that are in the set of observable indices  $\Omega$.
In this scenario,  problem \eqref{p:nuclear_norm} cannot be directly used since the evaluation of the  nuclear norm and the optimization of the permutation are coupled together. Inspired by the matrix completion methods \citep{hastie_matrix_2015,mazumder_spectral_2010}, we propose to solve an alternative form of \eqref{p:nuclear_norm} as follows, 
\vspace{-0.6cm} 
\begin{align}
  \label{p:unrelaxed}
  \min_{\widehat M\in \Rbb^{n\times m}} \min_{P\in \Pc_n}  &\left\|\Pc_\Omega([A_o, P B_o])-\Pc_\Omega(\widehat M)\right\|_F^2+\lambda\left\|\widehat M\right\|_*,
\end{align} \vspace{-0.5cm} 

where $\lambda>0$ is the penalty coefficient. We denote that $\widehat M=[\widehat M_A,\widehat M_B]$ and $\widehat M_A,\widehat M_B$  are the two submatrices  with the same dimension as $A_o$ and $B_o$ respectively. We can write \eqref{p:unrelaxed} equivalently as
\vspace{-0.2cm} 
\begin{align}
  \label{p:rewrite}
  \min_{\widehat M\in \Rbb^{n\times m}}\min_{P\in \Pc_n} \left\|\Pc_\Omega(A_o) -\Pc_\Omega(\widehat M_A)\right\|_F^2+\langle C(\widehat M_B), P \rangle+\lambda\left\|\widehat M\right\|_*,
\end{align}
\vspace{-0.6cm}

where  $C(\widehat M_B)\in\mathbb{R}^{n\times n}$ is the pairing cost matrix with
\vspace{-0.3cm} 
\begin{align*}
  C(\widehat M_B)(i,j)=\sum_{(j,j'')\in \Omega}\bigg(\widehat M_B(i,j'')-B_o(j,j'')\bigg)^2,~\forall i,j=1,...,n.
\end{align*}
\vspace{-0.6cm} 

\textbf{Baseline algorithm.}
A conventional  strategy to handle an optimization problem like \eqref{p:rewrite} is the alternating minimization or the block coordinate descent algorithm \citep{abid2017linear}. Specifically, it executes the following two updates iteratively until convergence.
\vspace{-0.2cm} 
\begin{align}
  \label{p:BCD1}
  &\widehat M^{\text{new}}\leftarrow \underset{\widehat M\in \Rbb^{n\times m}}{\arg\min}\left\|\Pc_\Omega([A_o,\widehat P^{\text{old}} B_o])-\Pc_\Omega(\widehat M)\right\|_F^2+\lambda\left\|\widehat M\right\|_*,\\
  \label{p:BCD2}
  &\widehat P^{\text{new}}\leftarrow \underset{P\in \Pc_n}{\arg\min}\  \langle C(\widehat M^{\text{new}}_B),P\rangle.
\end{align}
\vspace{-0.5cm} 

The first update step \eqref{p:BCD1} is a convex optimization problem and can be solved by the proximal gradient algorithm \citep{mazumder_spectral_2010}. The second update step \eqref{p:BCD2} is actually a discrete optimal transport problem which can be solved by the classical Hungarian algorithm with time complexity $O(n^3)$ \citep{jonker1986improving}. However, as we will see in Section \ref{sec:experiment}, this algorithm performs poorly, and it is  likely to fall into an undesirable local solution quickly in practice. Specifically, the main reason is that the solution of \eqref{p:BCD2} is often not unique and  a small change in $\widehat M_B$ can lead to a large change in $\widehat P$.  To address this issue, we propose a novel and efficient algorithm, $\text{M}^3\text{O}$, based on the entropic optimal transport \citep{peyre2019computational} and min-max optimization \citep{jin2020local}. 

\vspace{-0.2cm}
\textbf{Smoothing the permutation with entropy regularization.}
For any $a\in\mathbb{R}^n,b\in\mathbb{R}^m$, we define \vspace{-0.2cm} \begin{align*}
  \Pi(a,b)=\{S\in \mathbb{R}^{n\times m}:S\mathbf{1}_m=a,S^\top \mathbf{1}_n=b,S(i,j)\geq 0,~\forall i,j\},
\end{align*} \vspace{-0.7cm}

which is known as the transportation polytope; in particular, $\Pi(\mathbf{1}_n,\mathbf{1}_n)$ is the Birkhoff polytope. The famous Birkhoff-von Neumann theorem \citep{birkhoff1946three} states that the set of extremal points of $\Pi(\mathbf{1}_n,\mathbf{1}_n)$ is equal to $\Pc_n$. Inspired by \citep{xie_hypergradient_2020} and the interior point method for linear programming \citep{bertsekas1997nonlinear}, in order to smooth the optimization process of the baseline algorithm, we relax $P$ from being an exact permutation matrix, i.e., we keep $P$ inside the Birkhoff polytope $\Pi(\mathbf{1}_n,\mathbf{1}_n)$. That is, we propose to replace the combinatorial problem \eqref{p:BCD2} with the following continuous optimization problem 
\vspace{-0.2cm}
\begin{align}
  \label{p:entropyOT}
  \underset{P\in \Pi(\mathbf{1}_n,\mathbf{1}_n)}{\text{min }} &\langle C(\widehat M_B),P\rangle+\epsilon H(P),
\end{align}\vspace{-0.6cm}

where  $H(P)\stackrel{\text{def.}}=\sum_{i,j}P(i,j)(\log(P(i,j))-1)$ is the matrix negative entropy and $\epsilon>0$ is the regularization coefficient. Notably, \eqref{p:entropyOT} is also known as the Entropic Optimal Transport (EOT) problem \citep{peyre2019computational}, which is a strongly convex optimization problem and can be solved with roughly $O(n^2)$ complexity by the Sinkhorn algorithm. Specifically, the Sinkhorn algorithm actually solves the dual problem of \eqref{p:entropyOT}, \vspace{-0.3cm}  \begin{align}\label{EOT dual}
	\max _{\alpha,\beta \in \mathbb{R}^{n}}  W_\epsilon(\widehat M_B,\alpha,\beta)\stackrel{\text{def.}}=\left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle-\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C(\widehat M_B)}{\epsilon}\bigg\}\bigg\rangle,
\end{align} \vspace{-0.5cm} 

which reduces the variable dimension from $n^2$ to $2n$ and is thus greatly favorable in the high-dimensional scenario. By substituting the inner minimization problem of \eqref{p:rewrite} with \eqref{p:entropyOT}, we end up solving the following  unconstrained min-max optimization problem 
\vspace{-0.1cm} 
\begin{align}
  \label{p:entropydual}
  \underset{\widehat M}{\min}\underset{\alpha,\beta}{\ \max} \left\|A-\widehat M_A\right\|_F^2+W_\epsilon(\widehat M_B,\alpha,\beta)+\lambda\left\|\widehat M\right\|_*&.
\end{align}
\vspace{-0.5cm} 

% Then, we can relax the permutation matrix set $\Pc_n$ to the continuous polyhedral set $\Pi(\mathbf{1}_n,\mathbf{1}_n)$ without affecting the optimality of the inner problem of \eqref{p:rewrite}.  
% %{\chang This is because the objective of \eqref{p:rewrite} is linear in $P$ and, by the fundamental result of linear programming \citep{bertsimas1997introduction},  the minimum of \eqref{p:rewrite}, if finite,  can be reached by an extreme point of the polyhedron $\Pi(\mathbf{1}_n,\mathbf{1}_n)$. } Besides, the Birkhoff-von Neumann theorem \citep{birkhoff1946three} states that the set of extremal points of  $\Pi(\mathbf{1}_n,\mathbf{1}_n)$ is equal to $\Pc_n$. 
%  This is a direct corollary of .
% Therefore,  we  arrive at the following continuous problem
% \begin{align}
%   \label{p:continuous}
%   \min_{\widehat M}\min_{P\in \Pi(\mathbf{1}_n,\mathbf{1}_n)}\left\|A -\widehat M_A\right\|_F^2+\langle C(\widehat M_B), P \rangle+\lambda\left\|\widehat M\right\|_*.
% \end{align}


% In order to having an objective function with benign landscape, we need to smooth the relationship between $\widehat M$ and $\widehat P$. The idea is to keep $\widehat P$ staying inside the feasible set $\Pi(\mathbf{1}_n,\mathbf{1}_n)$.  Similar to \citep{xie_hypergradient_2020}, one way is to add entropy regularization term to the inner linear programming problem, i.e., to consider
% \begin{align}
%   \label{p:entropyOT}
%   \underset{P\in \Pi(\mathbf{1}_n,\mathbf{1}_n)}{\text{min }} &\langle C(\widehat M_B),P\rangle+\epsilon H(P),
% \end{align}
% %
% where  $H(P)\stackrel{\text{def.}}=\sum_{i,j}P(i,j)(\log(P(i,j))-1)$ is the matrix negative entropy and $\epsilon>0$ is the regularization coefficient.  It has been noted that \eqref{p:entropyOT} is actually the Entropic Optimal Transport (EOT) problem \citep{xie_hypergradient_2020}, and unlike \eqref{p:BCD2}, this problem \eqref{p:entropyOT} is strongly convex and has a unique solution.   In fact,  the EOT problem \eqref{p:entropyOT} can be efficiently solved by the 
% Skinhorn algorithm  which considers the Lagrange dual of \eqref{p:entropyOT}. Here, we adopt a similar idea to consider the dual problem of \eqref{p:entropyOT}, i.e.,
% \begin{align}\label{EOT dual}
% 	\max _{\alpha,\beta \in \mathbb{R}^{n}}  W_\epsilon(\widehat M_B,\alpha,\beta)\stackrel{\text{def.}}=\left\langle\mathbf{1}_{n}, \alpha\right\rangle+\left\langle\mathbf{1}_{n}, \beta\right\rangle-\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha\oplus\beta-C(\widehat M_B)}{\epsilon}\bigg\}\bigg\rangle,
% \end{align}  
% where $\alpha,\beta \in \mathbb{R}^{n}$ are the dual variables.   An obvious benefit of the dual problem \eqref{EOT dual} over \eqref{p:entropyOT} is that we can reduce the variable dimension from $n^2$ to $2n$ and eliminate the constraint. 
% %, which is extremely helpful especially when $n$ is large. In this way,  we are in fact dealing with the unconstrained min-max optimization problem, 
%  
Following the idea of \citep{jin2020local}, we adopt a proximal gradient algorithm with a Max-Oracle for \eqref{p:entropydual}.
Specifically,  we employ the Sinkhorn algorithm \citep{peyre2019computational} as the  Max-Oracle to retrieve an $\varepsilon$-good solution of the inner max problem \eqref{EOT dual}.
We summarize our proposed algorithm  $\text{M}^3\text{O}$ (\textbf{M}atrix recovery via \textbf{M}in-\textbf{M}ax \textbf{O}ptimization) in Algorithm \ref{alg:McubicO}, where $\text{prox}_{\lambda\left\|\cdot\right\|_*}(\cdot)$ is the proximal operator of nuclear norm and $\rho_k$ is the gradient stepsize. The convergence property of $\text{M}^3\text{O}$ can be obtained by following \citep{jin2020local}, which shows that, with a decaying stepsize, $\text{M}^3\text{O}$ is bound to converge to an $\varepsilon$-good Nash equilibrium within $O(\varepsilon^{-2})$ iterations. 

\vspace{-0.4cm}
\begin{algorithm}
  \caption{$\text{M}^3\text{O}$}\label{alg:McubicO}
  \While{\rm not converged}{
    For the tolerance $\varepsilon$, run the Sinkhorn algorithm to find $\alpha^*$, $\beta^*$ such that $$W_\epsilon(\widehat M_B^{k},\alpha^*,\beta^*)>\max_{\alpha,\beta}\ W_\epsilon(\widehat M_B^{k},\alpha,\beta)-\varepsilon;$$

  Perform
  $\widehat M^{k+1} \leftarrow \text{prox}_{\lambda\left\|\cdot\right\|_*} (\widehat M^{k}-\rho_k\nabla_{\widehat M} F_\epsilon(\widehat M^{k},\alpha^*,\beta^*)),$ where $$F_\epsilon(\widehat M,\alpha,\beta)\stackrel{\text{def.}}=\left\|A-\widehat M_A\right\|_F^2+W_\epsilon(\widehat M_B,\alpha,\beta);$$
   }
  \end{algorithm}
 
  \vspace{-0.4cm}
% For the general problem, we just need to replace $W^\epsilon_k(B_k)$ with \begin{align*}
% 	W_k^\epsilon(B_k)=\max _{\alpha_k,\beta_k,\nu_k} &\left\langle\mathbf{1}_{n}, \alpha_k\right\rangle+\left\langle\nu_k, \beta_k\right\rangle-\epsilon\bigg\langle  \mathbf{1}_{n\times n} , \text{exp}\bigg\{\frac{\alpha_k\oplus\beta_k-C_k(B_k)}{\epsilon}\bigg\}\bigg\rangle,\\
%   &\text{s.t. }\nu_k\preceq\mathbf{1}_{\tilde{n}_k},\nu_k^T\mathbf{1}_{\tilde{n}_k}=n, 
% \end{align*} which can be tackled similarly as in \citep{xie_hypergradient_2020}.

\textbf{Remark 3. }A recent work \citep{pmlr-v115-xie20b} proposes a decaying strategy for the entropy regularization coefficient $\epsilon$ in \eqref{p:entropyOT} so that the optimal solutions of \eqref{p:BCD2} and \eqref{p:entropyOT} do not deviate too much. Inspired by it, in our practice, we take large $\epsilon$ in the beginning and gradually shrink it by half until the objective function stops improving for $K$ steps. 

\vspace{-0.15cm}
\textbf{Remark 4. }A useful trick is that we should not take large stepsize in the early  iterations because the permutation matrix could still be far away from the optimal one. However,  a small stepsize would lead to slow convergence. Heuristically, we propose an adaptive stepsize strategy that performs  well in practice. For the   solution of (\ref{p:entropyOT}) $\widehat P_k$ at the $k$th iteration, we  compute the two statistics \vspace{-0.1cm} $$\delta_k=\left\|\widehat P_{k-1}-\widehat P_{k}\right\|_F^2/2n \text{ and }
  c_k=\left\|\text{max}_j \widehat P_k(\cdot,j)-\mathbf{1}_n\right\|_1/n.$$ \vspace{-0.6cm} 

  Here $\delta_k$ represents how fast the permutation matrix $\widehat P_k$ changes over the iterations, while $c_k$ measures how far the current $\widehat P_k$ is from an exact permutation matrix. Both $\delta_k$ and $c_k$  reflect the confidence in the currently found correspondence.  Based on them,  we set the stepsize as $\rho_{k+1}=(1-\delta_k)(1-c_k)^\omega,$  where $\omega>0$ is a tunable parameter which is often set between 0.5 and 3. $\omega$ actually trades off the convergence speed and final performance. The smaller the $\omega$, the faster the convergence. Therefore, a practical way is to start with a small $\omega$, and gradually increase it until the final performance stops improving. 

% In practice, we restrict the value of $\rho_k$ within 0.01 and 0.5 for numerical stability. 


% In this way, the objective function becomes
% \begin{align}
%   \label{partial}
%  \left\|P_{\Omega_1}(A_1)-P_{\Omega_1}(B_1)\right\|_F^2+\sum_{k=2}^d\langle C^\Omega_k(B_k),P_k\rangle+\lambda\left\|B\right\|_*\
% \end{align}

\vspace{-0.15cm}
\textbf{Remark 5. }As discussed in Section \ref{sec:intro}, in many cases we have to deal with problems that involve multiple correspondences, i.e., we need to recover $[A,B_1,...,B_d]$ from $[A,\tilde P_1B_1,...,\tilde P_dB_d]$. An important observation is that, although the number of possible correspondences increases exponentially as $d$ grows, the complexity of   M$^3$O per iteration increases only linearly with $d$ and can be implemented in a fully parallel fashion. Specifically, in this scenario, we solve the  problem
\vspace{-0.2cm} 
\begin{align}
  \label{p:multiple}
  \min_{\widehat M}\min_{P_1,...,P_d}&\left\|\Pc_{\Omega}(A) - \Pc_{\Omega}(\widehat M_A)\right\|_F^2+\sum_{l=1}^{d}\bigg\{\langle C_\Omega(\widehat M_{B_l}), P_l \rangle+\epsilon H(P_l)\bigg\}+\lambda\left\|\widehat M\right\|_*,\\
  &\text{s.t. }P_l \in \Pi(\mathbf{1}_n,\mathbf{1}_n),\ l=1,...,d,\notag
\end{align}
\vspace{-0.6cm} 

where we denote $\widehat M=[\widehat M_A,\widehat M_{B_1},...,\widehat M_{B_d}]$. Here $\widehat M_A$ and $\widehat M_{B_l}$ have the same dimension as $A$ and $\tilde B_l$, respectively. 
We refer to \eqref{p:multiple} as the $\bm{d}$\textbf{-correspondence} problem.  One can find that the inner problems for solving $P_l$ are actually decoupled for each $l$, which guarantees  an efficient parallel implementation. 

\vspace{-0.15cm}
\textbf{Remark 6. }Since the problem \eqref{p:unrelaxed} has a similar form to that considered in \citep{mazumder_spectral_2010}, we adopt the same tuning strategy for $\lambda$ proposed in \citep{mazumder_spectral_2010}, which suggests that we should start with a large $\lambda$ and gradually decrease it.

\vspace{-0.2cm}
We relegate more details about $\text{M}^3\text{O}$ to Appendix \ref{app:algo}.

\vspace{-0.4cm}
\begin{wrapfigure}{r}{8cm}
  \vspace{-1cm}
  \subfigure[Objective value]{
    \label{fig:exp1_Obj}
  \includegraphics[width=3.7cm]{figures/exp1_Obj.jpg}
  %\caption{fig1}
  }
  \subfigure[Permutation error]{
    \label{fig:exp1_Perr}
  \includegraphics[width=3.7cm]{figures/exp1_Perr.jpg}
  }
  \vspace*{-0.3cm}
  \caption{\small Performance of various algorithms on a simulated 1-correspondence problem.}\label{fig:exp1}\vspace{-0.5cm}
  \label{fig:multi_algo}
  \end{wrapfigure}
\section{Experiments}
\label{sec:experiment}
\vspace{-0.3cm}
In this section, we evaluate our proposed $\text{M}^3\text{O}$ on both synthetic and real-world datasets, including the MovieLens 100K and the Extended Yale B dataset. We also provide an ablation study for the decaying entropy regularization strategy and the  adaptive stepsize strategy proposed in Remarks 3 and 4. In all the experiments, we employ the Soft-Impute algorithm \citep{mazumder_spectral_2010} as a standard algorithm for matrix completion.   Extra experiment details and auxiliary results can be
found in Appendix \ref{app:exp}.

\vspace{-0.1cm}
\textbf{Algorithms.} We denote the following algorithms for comparison in all the experiments:
\vspace{-0.2cm}
\begin{enumerate}[style=sameline,itemindent=0em,leftmargin=20pt]
   \item  \textit{Oracle}:  Running the Soft-Impute algorithm  with ground-truth correspondence.
  \item \textit{Baseline}: The Baseline algorithm in \eqref{p:BCD1} and \eqref{p:BCD2}.
  \item \textit{MUS}: Since there is currently no existing algorithm directly applicable to the scenario considered by \eqref{p:multiple}, inspired by \citep{yao2021unlabeled}, we adapt the algorithm in \citep{zhang2020optimal}, which is originally proposed for the MUS problem, to deal with the MRUC problem. The details of the adapted algorithm are provided in Appendix \ref{app:adapt}.
\end{enumerate}
\vspace{-0.3cm}

% We use the Soft-Impute algorithm \citep{mazumder_spectral_2010} for standard matrix completion due to its connection to the $\text{M}^3\text{O}$, see Appendix \ref{app:link} for more discussions.
% We mainly focus on two metrics for the two experiments, respectively. The first one is the permutation error defined by the Hamming distance, and the second one is the Root Mean Squared Error (RMSE), i.e., 
% $$d_H(\hat \pi,\pi)\stackrel{\text{def.}}=\sum_{i=1}^n\mathbb{I}(\hat \pi(i)\neq  \pi(i)),\quad \text{RMSE}\stackrel{\text{def.}}=\sqrt{\frac1N\sum_{i,j}(\widehat M_{ij}-M_{ij})^2}.$$ 
%  For the permutation error, we denote $\mathbb{I}(\cdot)$ as a binary indicator function, while $\hat \pi(\cdot)$ and $\pi(\cdot)$ are the recovered permutation and ground-truth permutation respectively. For the RMSE, we denote $\widehat M_{ij}$ as the estimated rating, $M_{ij}$ as the ground-truth rating, and N as the number of test data.
\vspace{-0.2cm}
\subsection{Synthetic data}
\label{sec:experiment1}
\vspace{-0.2cm}
We first investigate the property of our proposed $\text{M}^3\text{O}$  algorithm on the synthetic data.
\vspace{-0.1cm}

\textbf{Data generation.}
We generate the original data matrix in the form $M=R E+\eta W, $ where $R\in\mathbb{R}^{n\times r}$, $E \in \mathbb{R}^{r\times m}$, $W\in \mathbb{R}^{n\times m}$ and $\eta >0$ indicates the strength of the additive noise. The entries of $R$, $E$, $W$ are all sampled i.i.d.\ from $\Nc (0,1)$. Then we split the data matrix $M$ by  $M=[A,B_1,...,B_d]$ where we denote $A\in \mathbb{R}^{n\times m_A}$, $B_1\in \mathbb{R}^{n\times m_1}$, ..., $B_d\in \mathbb{R}^{n\times m_d}$ to represent data from $d+1$ data sources. The permuted observation matrix $M_o$ is obtained by first generating $d$  permutation matrices $P_1,...,P_d$ randomly and independently, and then computing $ M_o=[A, P_1  B_1,..., P_d  B_d]$. Finally, we remove $(1-|\Omega|\cdot 100\%/(n\cdot m))$ percent of the entries of $M_o$ randomly and uniformly, where  $|\Omega|$ indicates the number of observable entries.  

\vspace{-0.1cm}
  \textbf{Ablation study.} We denote the following variants of $\text{M}^3\text{O}$ for the ablation study.

  \vspace{-0.3cm}
  \begin{enumerate}[style=sameline,itemindent=0em,leftmargin=20pt]
    {\item \textit{$\text{M}^3\text{O}$-AS-DE}:  $\text{M}^3\text{O}$  with both {A}daptive {S}tepsize and {D}ecaying {E}ntropy regularization. }
    {\item \textit{$\text{M}^3\text{O}$-DE}: $\text{M}^3\text{O}$ with Decaying Entropy regularization only. $\text{M}^3\text{O}$-DE-1 and $\text{M}^3\text{O}$-DE-2 adopt constant stepsize $\rho_k=0.5$ and $\rho_k=0.01$, respectively.}
    {\item \textit{$\text{M}^3\text{O}$-AS}: $\text{M}^3\text{O}$  with {A}daptive {S}tepsize only. The entropy coefficient $\epsilon$ is fixed to 0.0005.} 
  \end{enumerate}

  \vspace{-0.3cm}
  In the following results, we denote by $\pi_l$ the permutation corresponding to $ P_l$.  We initialize $\widehat M$ from a Gaussian distribution for the $\text{M}^3\text{O}$ algorithm and its variants. We choose the initial $\epsilon$ as 0.1 and $K=100$ as the default for the decaying entropy regularization, and set $\omega=3$ as the default for the adaptive stepsize. We also report  the achieved objective values of \eqref{p:multiple} for the tested algorithms, except for the MUS algorithm since it has a different objective. 

  \vspace{-0.1cm}
  \textbf{Results.}   Figure \ref{fig:multi_algo} displays the result under the setting  $\eta=0.1$, $|\Omega|\cdot 100\%/(n\cdot m)=80\%$, $n=m=100$, $r=5$, $d=1$, $m_A=60$ and $m_1=40$.  The algorithm {M}$^3$O-AS-DE achieves the best result, and can recover the ground-truth correspondence. {M}$^3$O-AS behaves similarly to Baseline and MUS. They all converge to a poor local solution quickly. {M}$^3$O-DE-1 converges quickly and also falls into a local solution due to the large stepsize, while  {M}$^3$O-DE-2 adopts a small stepsize and hence suffers from slow convergence. Due to the superiority of {M}$^3$O-AS-DE over the other variants, in the following results, we refer to {M}$^3$O-AS-DE as {M}$^3$O for short.
  
  Figure \ref{fig:various_setting} examines {M}$^3$O under different regimes w.r.t.\ $|\Omega|$, $\eta$, $r$ and $m_A/n$. As we can see, the results are well aligned with our prediction in Remark 1 and Remark 2.

  Finally, we examine {M}$^3$O on the general $d$-correspondence problem. See Table \ref{tb:multi-perm} for various results, where we set $r=5$ and $\varepsilon=0.1$. Notice that for the 4-correspondence problem in the table, there are $(100!)^4$ possible correspondences. Even for such a difficult problem, {M}$^3$O is able to recover 61.5\% of the ground-truth correspondence with a good initialization. 
  \vspace{-0.3cm}
    \begin{figure}[htbp]
      \centering
      \subfigure[ $d_H$ v.s. $|\Omega|$]{
        \label{fig:exp2_Obj}
      \includegraphics[width=2.7cm]{figures/exp2_Perr.jpg}
      %\caption{fig1}
      }
      \subfigure[ $d_H$ v.s. $\eta$]{
        \label{fig:exp3_Perr}
      \includegraphics[width=2.7cm]{figures/exp3_Perr.jpg}
      }
      \quad
      \centering
      \subfigure[ $d_H$ v.s. $r$]{
        \label{fig:exp3_r}
      \includegraphics[width=2.7cm]{figures/exp3_Obj.jpg}
      %\caption{fig1}
      }
      \quad
      \subfigure[ $d_H$ vs. $m_A/n$]{
        \label{fig:exp3_ma}
      \includegraphics[width=2.7cm]{figures/exp3_Perr.jpg}
      }\vspace*{-0.4cm}
      \caption{\small Performance of {{M}$^3$O} on a 1-correspondence problem under different $|\Omega|$, $\eta$, $r$ and $m_A/n$. The default setting is $|\Omega|\cdot 100\%/(n\cdot m)=80\%$, $\eta=0.1$, $n=m=100$, $r=5$, $m_A=60$,  and $m_1=40$. The mean and standard deviation are calculated over 10 different random initializations.}
      \label{fig:various_setting}\vspace{-0.3cm}
      \end{figure}
\vspace{-0.2cm}
      \begin{table}[htbp]
        \centering
        \small
        \caption{\small Performance of {M}$^3$O for various $d$-correspondence problems. The normalized permutation error $\sum_{l=1}^d d_H(\hat \pi_l,\pi_l)/d$  is reported as mean$\pm$std (min) over 10 different random initializations.}
        \label{tb:multi-perm}
        \vspace*{-0.2cm}
        \begin{tabular}{cccc}
        \toprule
        $(n,m_A,m_1,\dots,m_d)$  & $d$ & $|\Omega|\cdot 100\%/(n\cdot m)$  & $\sum_{l=1}^d d_H(\hat \pi_l,\pi_l)/d$ \\
        \midrule
        (100,40,30,30)& 2 & 40\% & $33.35\pm32.85$ (0.00)\\
        (100,20,40,40)& 2 & 40\% & $58.90\pm27.21$ (2.00)\\
        (100,45,25,25,25)& 3 & 50\% & $61.97\pm15.41$ (37.33)\\
        (100,40,25,25,25,25)& 4 & 60\% & $59.90\pm13.64$ (38.50)\\
        \bottomrule
        \end{tabular}\vspace{-0.3cm}
        %\caption{This is a three-line (booktabs) table}\label{tab:aStrangeTable}  The caption may also be placed here
        \end{table}

        \vspace{-0.3cm}
        \subsection{Multi-domain recommender system without correspondence}
        \vspace{-0.2cm}

In this section, we study the performance of $\text{M}^3\text{O}$ on a real-world dataset, MovieLens 100K\footnote{\url{https://grouplens.org/datasets/movielens/100k/}}, which is a widely used movie recommendation dataset \citep{harper2015movielens}. In this application, we mainly focus on the Root Mean Squared Error (RMSE) metric, i.e., 

\vspace{-0.45cm}
 $$\text{RMSE}\stackrel{\text{def.}}=\sqrt{\frac1N\sum_{i,j}(\widehat M_{ij}-M_{ij})^2}.$$
 \vspace{-0.45cm}

\textbf{Data.} MovieLens 100K contains 100,000 ratings on a scale of 1--5. The ratings are given by 943 users on 1,682 movies. Genre information about movies is also provided. We adopt a setting similar to that of \citet{zhang2012multi}. We extract the five most popular genres, namely Comedy, Romance, Drama, Action, and Thriller, to define the data from 5 different domains (or platforms). Unlike \citet{zhang2012multi}, we additionally permute the indexes of the users randomly within each of these five domains, so that the correspondence among these data becomes unknown. In this way, the problem becomes a 4-correspondence problem as discussed before. The ratings are split randomly, with 80\% of them as the training data and the other 20\% of them as the test data. 

\vspace{-0.1cm}
\textbf{Algorithms.} We consider the following additional algorithms for comparison. 

\vspace{-0.3cm}
\begin{enumerate}[style=sameline,itemindent=0em,leftmargin=20pt]
  {\item \textit{SIC}: Running the Soft-Impute algorithm independently for the 5 different platforms. }
  {\item \textit{SIR}: Running the Soft-Impute algorithm  with Randomly generated correspondence.}
\end{enumerate}

\vspace{-0.3cm}
\textbf{Results.}
As discussed in the experiments on the simulated data, exact recovery of the correspondence becomes impossible due to the small number of observable entries. Therefore, in the following experiment, since exact correspondence is not needed, we fix $\epsilon=0.05$ for $\text{M}^3\text{O}$. Table~\ref{tb:federated} shows the results by averaging the RMSE on the test data over 10 different random seeds. 

\vspace{-0.2cm}
We can first see that matrix completion with the randomly generated correspondence, i.e., SIR, can be harmful to the overall performance. Besides, although the ground-truth correspondence cannot be recovered, each platform can still benefit from $\text{M}^3\text{O}$ since it improves the performance over SIC. This is mainly because $\text{M}^3\text{O}$ is still able to match similar users for inferring missing ratings. On the contrary, since both Baseline and MUS can only establish an exact one-to-one correspondence for each user, they fail to improve over SIC significantly. Remarkably, $\text{M}^3\text{O}$ is only slightly inferior to the Oracle method, and even achieves lower test RMSE than the Oracle method on the Comedy genre. 
\vspace{-0.2cm}
  \begin{table}[htbp]
    \small
    \centering
    \caption{  Test RMSE of various algorithms on MovieLens 100K}\label{tb:federated}
    \vspace*{-0.3cm}
    \begin{tabular}{ccccccc}
    \toprule
    Method & Comedy & Romance & Drama & Action & Thriller & Total\\
    \midrule
    SIR&1.0202  &  1.0158 &   0.9808  &  0.9803  &  0.9811  &  0.9944\\
    
    SIC&0.9694  &  0.9695 &   0.9317 &   0.9175  &  0.9253 &   0.9418\\
    MUS&0.9659  &  0.9842 &   0.9423  &  0.9305  &  0.9306  &  0.9485\\
      Baseline &0.9728& 0.9562& 0.9379& 0.9105  & 0.9145&
          0.9395 \\
    $\text{M}^3\text{O}$& \textbf{0.9389} & \textbf{0.8787} & \textbf{0.9139} & \textbf{0.8556} & \textbf{0.8567} & \textbf{0.8948}\\
    Oracle& 0.9444   & 0.7825 &   0.9058 &   0.8176  &  0.8098 &   0.8667\\
    \bottomrule
    \end{tabular}
    %\caption{This is a three-line (booktabs) table}\label{tab:aStrangeTable}  The caption may also be placed here
    \vspace{-0.3cm}
    \end{table}
\vspace{-0.0cm}


  \subsection{Visual permutation recovery}
  \vspace{-0.2cm}
  \begin{wrapfigure}{r}{5cm}
    \vspace{-0.6cm}
    \centering
  \subfigure[ Original]{
    \label{fig:real_face}
  \includegraphics[width=2cm]{figures/real_face.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[Corrupted]{
    \label{fig:perm_face}
  \includegraphics[width=2cm]{figures/perm_face.jpg}
  }
  \quad
  \centering
  \subfigure[Baseline]{
    \label{fig:base_face}
  \includegraphics[width=2cm]{figures/base_face.jpg}
  %\caption{fig1}
  }
  \quad
  \subfigure[$\text{M}^3\text{O}$]{
    \label{fig:Mcubic_face}
  \includegraphics[width=2cm]{figures/Mcubic_face.jpg}
  }
  \vspace*{-0.4cm}
  \caption{\small Performance of {{M}$^3$O} on a face recovery problem.}
  \label{fig:face}\vspace{-0.5cm}
    \end{wrapfigure}
  We show that  M$^3$O  is flexible and can also be used to recover a matrix that is not in the form $[A,PB]$. We can see this from the problem formulation in \eqref{p:rewrite}, where the cost matrix $C(\cdot)$ can be constructed in other ways as long as it is a function of a permutation. For example, M$^3$O  can be used to solve a challenging face image recovery problem. The original face image with size $180\times 180$ in Figure~\ref{fig:real_face} comes from the Extended Yale B database \citep{GeBeKr01}. The corrupted image is visualized in Figure~\ref{fig:perm_face}, where the pixel blocks with size $30\times 30$ in the upper left are shuffled randomly, and $30\%$ of the total pixels are removed. 
  This kind of problem was recently considered by \citet{santa2017deeppermnet}, who propose to recover the corrupted image in a data-driven way using convolutional neural networks. However, we show that it is possible to recover the image without additional data by merely exploiting the underlying low-rank structure of the image itself.

  This experimental setting is similar to that in \citet{yao2021unlabeled}, but the algorithm in \citet{yao2021unlabeled} cannot be applied since it cannot handle missing values. The MUS algorithm is also not applicable since this problem cannot be written in the form of a linear regression problem. From Figures~\ref{fig:base_face} and~\ref{fig:Mcubic_face} we can find that $\text{M}^3\text{O}$ performs better than the Baseline, and can even recover the original order of the pixel blocks. More results similar to Figure~\ref{fig:face} and experiment details are provided in Appendix~\ref{app:exp}.
  

  % \begin{figure}[htbp]
  %   \centering
  %   \subfigure[ Original]{
  %     \label{fig:real_face}
  %   \includegraphics[width=2cm]{figures/real_face.jpg}
  %   %\caption{fig1}
  %   }
  %   \quad
  %   \subfigure[Corrupted]{
  %     \label{fig:perm_face}
  %   \includegraphics[width=2cm]{figures/perm_face.jpg}
  %   }
  %   \quad
  %   \centering
  %   \subfigure[Baseline]{
  %     \label{fig:base_face}
  %   \includegraphics[width=2cm]{figures/base_face.jpg}
  %   %\caption{fig1}
  %   }
  %   \quad
  %   \subfigure[$\text{M}^3\text{O}$]{
  %     \label{fig:Mcubic_face}
  %   \includegraphics[width=2cm]{figures/Mcubic_face.jpg}
  %   }
  %   \caption{\small Performance of {{M}$^3$O} on a face recovery problem.}
  %   \label{fig:face}\vspace{-0.5cm}
  %   \end{figure}
  
    

      \vspace{-0.3cm}
\section{Conclusion}
\label{sec:discussion}
\vspace{-0.3cm}
 This paper studies an important matrix recovery problem that is less explored in the existing literature. Unlike the classical matrix completion problem, in this problem setting, part of the observation matrix is shuffled. Apart from the two applications we have studied in this paper, this problem can arise in more scenarios like the genome assembly problem \citep{huang1999cap3}, the video pose tracking problem \citep{ganapathi2012real} and the privacy-aware sensor networks \citep{gruteser2003privacy}, etc. We believe that our work provides a general solution to deal with the correspondence issue in these scenarios.

 Theoretically, this paper is the first to rigorously study the role of the low-rank model in the MRUC problem, and is also the first to show that minimizing the nuclear norm is provably correct for recovering a typical low-rank matrix. In practice, we propose a highly efficient algorithm, the {$\text{M}^3\text{O}$} algorithm, which consistently achieves the best performance over several baselines in all the tested scenarios. 

 As we have shown in Figure~\ref{fig:various_setting}, one major limitation of our algorithm is its sensitivity to the initialization. The phenomenon is exacerbated when the additive noise is high or the number of observable entries is small. We suggest trying a few different initialization strategies when applying {$\text{M}^3\text{O}$} to a specific task. Finding a stable initialization strategy is also an important task for our future work.
%   
% \textbf{Finding good initiation.}
  
%  \textbf{More applications.} We believe that this work have a fundamental step towards utilizing heterogeneous data, and will be applied to more realistic problems where matrix-like data are used. Typically, the correspondence in a real world problem are lost for two reasons. First, the measuring instruments fails to preserve the correspondence \citep{xie_hypergradient_2020}, like the missing identity information we discussed previously. Additional examples includes the multi-view matching problem \citep{schaffalitzky2002multi}, the gnome assembly problem \citep{huang1999cap3} and the video pose tracking problem \citep{ganapathi2012real}. Second, the correspondence information is hided for preserving privacy, which is common in the Federated Learning Framework and the privacy-aware sensor networks \citep{gruteser2003privacy}.   


% \subsubsection*{Author Contributions}
% If you'd like to, you may include  a section for author contributions as is done
% in many journals. This is optional and at the discretion of the authors.

% \subsubsection*{Acknowledgments}
% Use unnumbered third level headings for the acknowledgments. All
% acknowledgments, including those to funding agencies, go at the end of the paper.


\bibliography{iclr2022_conference}
\bibliographystyle{iclr2022_conference}

\newpage
\appendix
\section{Appendix}
\input{supplement.tex}

\end{document} 