% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Packages added by authors
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{bm}
% Theorem-like environments (amsthm). Each one is declared without a shared
% counter argument, so theorems, lemmas, definitions, properties, and
% examples are numbered independently of one another.
\newtheorem{theorem}{Theorem} 
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{property}{Property}
\newtheorem{example}{Example}
\usepackage{threeparttable}
\usepackage{xcolor,colortbl}
\usepackage{subfigure}

\usepackage{multirow}
\usepackage{multicol}
% \tabincell{<column spec>}{<content>}: wraps <content> in a nested tabular
% with the given column spec and no outer column padding (@{}), so that a
% single table cell can hold several lines separated by \\.
\newcommand{\tabincell}[2]{\begin{tabular}{@{}#1@{}}#2\end{tabular}} 
% Number figures and tables within sections (the counters are reset at each
% new section and prefixed with the section number).
\counterwithin{figure}{section}
\counterwithin{table}{section}
\usepackage{setspace}

% Upright operator name `diag' with operator spacing. The starred form makes
% sub-/superscripts attach as limits (below/above) in display style.
\DeclareMathOperator*{\diag}{diag}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr} 
% \externaldocument{yu_34}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to `-'.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Online Estimation of Similarity Matrices with Incomplete Data\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{{Fangchen Yu}}
\author[2]{{Yicheng Zeng}}
\author[1,2]{{Jianfeng Mao}}
\author[1,2]{\href{mailto:wyli@cuhk.edu.cn?Subject=Your UAI 2023 paper}{Wenye Li\thanks{Corresponding author}}{}}
% Add affiliations after the authors
\affil[1]{%
    The Chinese University of Hong Kong, Shenzhen
}
\affil[2]{%
    Shenzhen Research Institute of Big Data 
    \linebreak
    2001 Longxiang Boulevard, Longgang District, Shenzhen, China
    \linebreak
    fangchenyu@link.cuhk.edu.cn, statzyc@sribd.cn, jfmao@cuhk.edu.cn, wyli@cuhk.edu.cn
}
  
\begin{document}

\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle

This document serves as supplementary material for the paper entitled ``Online Estimation of Similarity Matrices with Incomplete Data''. It contains theoretical proofs of the theorem and lemma presented in the main paper, related work, and discussion, along with numerical results. Specifically,

\begin{itemize}
    \item \textbf{Section~\ref{sec:a}} provides the proofs of Theorem 1, Lemma 1, and the theoretical guarantee in Eq.~(10);

    \item \textbf{Section~\ref{sec:b}} discusses related work, including the similarities to and differences from existing matrix calibration methods;

    \item \textbf{Section~\ref{sec:c}} presents implementation details and numerical results.
\end{itemize}

% =============================================

\appendix

\section{Theorems and Proofs} \label{sec:a}

This section contains detailed proofs of the theorem and lemma that are presented in the main paper.

\subsection{Proof of Theorem 1} \label{sec:a.1}

Let $\mathcal{M}_n$ be the set of $n\times n$ symmetric matrices. By Kolmogorov's criterion \cite{deutsch2012best}, the projection of $S^o$ onto the feasible region $\mathcal{T}$ is unique and characterized by
\[ \hat{S} = P_{\mathcal{T}}(S^o) \in \mathcal{T}~\text{and}~\langle S-\hat{S}, S^o-\hat{S} \rangle_{\mathcal{M}_n} \le 0~\text{for all}~S \in \mathcal{T}, \]
where $\langle \cdot, \cdot \rangle_{\mathcal{M}_n}$ is the inner product of matrices defined on $\mathcal{M}_n$ by
\[ \langle X,Y \rangle_{\mathcal{M}_n} = \operatorname{trace}(X^{\top} Y),~\text{for any}~X,Y \in \mathcal{M}_n. \]
Considering that the true similarity matrix $S^* \in \mathcal{T}$, we have \citep{li2015estimating,li2020scalable,li2022calibrating}:
\begin{align*}
    & \|S^*-\hat{S}\|_F^2 \\
    \leq~& \|S^*-\hat{S}\|_{F}^{2} - 2\langle S^*-\hat{S}, S^o-\hat{S} \rangle_{\mathcal{M}_n} \\
    \le~& \|S^*-\hat{S}\|_F^2 + \|S^o-\hat{S}\|_F^2 - 2\langle S^*-\hat{S}, S^o-\hat{S} \rangle_{\mathcal{M}_n} \\
    =~& \|(S^*-\hat{S})-(S^o-\hat{S})\|_{F}^{2} \\
    =~& \|S^*-S^o\|_F^2.
\end{align*}
Thus, $\|S^* - \hat{S}\|_F^2 \le \|S^* - S^o\|_F^2$. The equality holds if and only if $\hat{S} = S^o$, i.e., $S^o\in \mathcal{T}$. 

% =====================================

\subsection{Proof of Lemma 1}

Consider a matrix $S_{n+1} \in \mathbb{R}^{(n+1) \times (n+1)}$ partitioned as 
\[ S_{n+1} = \left[\begin{array}{cc}
    S_n & v \\
    v^{\top} & c
\end{array} \right], \]
where $S_n \in \mathbb{R}^{n\times n}$, $v\in \mathbb{R}^n$, and $c \in \mathbb{R}$. Provided that $\det(S_n) \neq 0$, the scalar $D = c-v^{\top} S_n^{-1} v$ is the Schur complement of $S_n$ in $S_{n+1}$. Suppose $S_n \succ 0$ (positive definite), and consider the minimization problem
\[ \min_{u}~~ u^\top S_n u + 2\gamma^\top v^\top u + \gamma^\top c \gamma\]
over the variable $u \in \mathbb{R}^n$ for a fixed $\gamma \in \mathbb{R}$. The optimal solution is $u^*=-S_n^{-1}v \gamma$ with optimal value \citep[Eq.~A.14]{boyd2004convex}
\begin{equation*}
    \inf_u \left[ \begin{array}{c}
         u  \\
         \gamma 
    \end{array}\right]^\top 
    \left[ \begin{array}{cc}
        S_n & v \\
        v^{\top} & c
    \end{array}\right]
    \left[ \begin{array}{c}
         u  \\
         \gamma 
    \end{array}\right] 
    = 
    \inf_u \left[ \begin{array}{c}
         u  \\
         \gamma 
    \end{array}\right]^\top S_{n+1}
    \left[ \begin{array}{c}
         u  \\
         \gamma 
    \end{array}\right] 
    = 
    \gamma^\top D \gamma.
\end{equation*}
Then we obtain the equivalence between the non-negativity of $D$ and the positive semi-definiteness of $S_{n+1}$:
\begin{align*}
     D \ge 0 & \Leftrightarrow \forall \gamma \in \mathbb{R},~ \gamma^\top D \gamma \ge 0 \\ 
     & \Leftrightarrow \forall \gamma \in \mathbb{R},~ 
     \inf_u \left[ \begin{array}{c}
         u  \\
         \gamma 
     \end{array}\right]^\top S_{n+1}
     \left[ \begin{array}{c}
         u  \\
         \gamma
     \end{array}\right] \ge 0 \\
     & \Leftrightarrow \forall u\in \mathbb{R}^n, \gamma \in \mathbb{R},~ 
     \left[ \begin{array}{c}
         u  \\
         \gamma 
     \end{array}\right]^\top S_{n+1}
     \left[ \begin{array}{c}
         u  \\
         \gamma 
     \end{array}\right] \ge 0 \\
     & \Leftrightarrow S_{n+1} \succeq 0
\end{align*} 
which shows that, given $S_n \succ 0$, the matrix $S_{n+1}$ is positive semi-definite (PSD) if and only if $D \ge 0$, i.e., $v^{\top} S_n^{-1} v \le c$ \citep{boyd2004convex}.


\subsection{Theoretical Guarantee}

$\bullet$~\ For our OffMC model in Section~3.1, we have proved $\|S_n^* - \hat{S}_n\|_F^2 \le \|S_n^* - S_n^o\|_F^2$ in Section~\ref{sec:a.1}.

$\bullet$~\ For our OnMC-S model in Section~3.2.1, denote $S_{n+1}^o = \begin{bmatrix}
  \hat{S}_n & v_o \\
  v_o^{\top} & c
\end{bmatrix}$, $\hat{S}_{n+1} = \begin{bmatrix}
  \hat{S}_n & \hat{v} \\
  \hat{v}^{\top} & c
\end{bmatrix}$ and $S_{n+1}^* = \begin{bmatrix}
  S_n^* & v^* \\
  v^{*\top} & c^*
\end{bmatrix}$, then
\begin{equation*}
\begin{aligned}
    & \|S_{n+1}^* - \hat{S}_{n+1}\|_F^2 = \|S_{n}^* - \hat{S}_{n}\|_F^2 + 2\|v^* - \hat{v}\|^2 + (c^*-c)^2  \\
    \le~ & \|S_{n+1}^* - S_{n+1}^o\|_F^2 = \|S_{n}^* - \hat{S}_{n}\|_F^2 + 2\|v^* - v_o\|^2 + (c^*-c)^2.
\end{aligned}
\end{equation*}
Thus, we obtain $\|v^* - \hat{v}\|^2 \le \|v^* - v_o\|^2$, which is a theoretical guarantee for the One-step Online Similarity Matrix Correction. By sequentially applying the one-step correction, the performance of the OnMC-S method is guaranteed globally by
\[ \|S^*_{n+t} - \hat{S}_{n+t}\|_F^2 \le \|S^*_{n+t} - S^o_{n+t}\|_F^2,~\forall~t=0,1,\dots,m. \]

$\bullet$~\ For our OnMC-B model in Section~3.2.2, we divide $S_{n+m}^o = \left[\begin{array}{cc}
    S_{\text{off}} & S_{\text{par}} \\
    S_{\text{par}}^{\top} & S_{\text{on}}
\end{array} \right]$ into four block matrices. Firstly, $S_{\text{off}}$ and $S_{\text{on}}$ are corrected to $\hat{S}_{\text{off}}$ and $\hat{S}_{\text{on}}$, respectively. We have $\|S^*_{\text{off}} - \hat{S}_{\text{off}}\|_F^2 \le \|S^*_{\text{off}} - S_{\text{off}}\|_F^2$ and $\|S^*_{\text{on}} - \hat{S}_{\text{on}}\|_F^2 \le \|S^*_{\text{on}} - S_{\text{on}}\|_F^2.$
Then, by concurrently applying the one-step correction, we correct $S_{\text{par}}$ to $\hat{S}_{\text{par}}$ with $\|S^*_{\text{par}} - \hat{S}_{\text{par}}\|_F^2 \le \|S^*_{\text{par}} - S_{\text{par}}\|_F^2.$ Finally, we have the performance guarantee on $\hat{S}_{n+m}$ with $\|S^*_{n+m} - \hat{S}_{n+m}\|_F^2 \le \|S^*_{n+m} - S^o_{n+m}\|_F^2$ in Eq.~(10).

$\bullet$~\ For our OnMC-L model in Section~3.2.3, we divide $S_{n+m}^o$ into four types of block matrices of small sizes, i.e., $\{S_{\text{off}}^{(i)}\}_{i=1}^{N_{\text{off}}}$, $\{S_{\text{off\_par}}^{(i)}\}_{i=1}^{N_{\text{off}}}$, $\{S_{\text{on}}^{(j)}\}_{j=1}^{N_{\text{on}}}$, and $\{S_{\text{on\_par}}^{(j)}\}_{j=1}^{N_{\text{on}}}$. For all $i\in\{1,\dots,N_{\text{off}}\}$ and $j\in\{1,\dots,N_{\text{on}}\}$, we have 
$\|S^{*(i)}_{\text{off}} - \hat{S}^{(i)}_{\text{off}}\|_F^2 \le \|S^{*(i)}_{\text{off}} - S_{\text{off}}^{(i)}\|_F^2$,
$\|S^{*(i)}_{\text{off\_par}} - \hat{S}^{(i)}_{\text{off\_par}}\|_F^2 \le \|S^{*(i)}_{\text{off\_par}} - S_{\text{off\_par}}^{(i)}\|_F^2$,
$\|S^{*(j)}_{\text{on}} - \hat{S}^{(j)}_{\text{on}}\|_F^2 \le \|S^{*(j)}_{\text{on}} - S_{\text{on}}^{(j)}\|_F^2$, and
$\|S^{*(j)}_{\text{on\_par}} - \hat{S}^{(j)}_{\text{on\_par}}\|_F^2 \le \|S^{*(j)}_{\text{on\_par}} - S_{\text{on\_par}}^{(j)}\|_F^2$.
Thus, we also have the guarantee on the final performance $\|S^*_{n+m} - \hat{S}_{n+m}\|_F^2 \le \|S^*_{n+m} - S^o_{n+m}\|_F^2$.


% % ================================================

\section{Related Work} \label{sec:b}

Due to the limited space in the main paper, we discuss the related work in this section. 

$\bullet$~\ \textbf{Offline Model}: i) For \textit{inner product}, there are no constraints on the similarity values, i.e., $S_{ij} \in (-\infty, +\infty)$; thus, our offline model is relaxed to finding the nearest positive semi-definite matrix, which is a well-studied problem in the optimization community \citep{halmos1972positive,higham1988computing}, and the unique solution is given by the following property \citep{cheng1998modified}.
\begin{property} \label{thm:inner}
    Let the symmetric matrix $S \in\mathbb{R}^{n \times n}$ have the spectral decomposition $S = U\Sigma U^{\top}$ ($U$ is orthogonal, $\Sigma = \diag(\sigma_i)$). The unique positive semi-definite matrix nearest to $S$ in the Frobenius norm is given by
    \[ \hat{S} = U\hat{\Sigma} U^{\top},~ \hat{\Sigma} = \diag(\max\{\sigma_i,0\}). \]
\end{property}

ii) For \textit{other similarities}, Dykstra's projection \citep{dykstra1983algorithm}, which we choose, is a popular alternating projection approach that is commonly used to solve convex optimization problems on the intersection of convex sets \citep{bauschke1994dykstra,glunt1990alternating}, and has also been applied in two matrix calibration methods, namely Direct Matrix Calibration (DMC) \citep{li2015estimating} and Cyclic Matrix Calibration (CMC) \citep{li2020scalable}.

$\bullet$~\ \textbf{Online Model}: To deal with incomplete online data, we propose new models for different online scenarios, including sequential, batch, and large-scale data, which are our main contributions and rely on convex optimization \citep{boyd2004convex,horn2012matrix}.

$\bullet$~\ \textbf{Similarities with Matrix Calibration Methods}:
The matrix calibration methods developed in \citep{li2015estimating} and \citep{li2020scalable} made closely related contributions, which also focused on the similarity matrix estimation with incomplete observations. Both methods used an offline pattern and aimed to calibrate an initial matrix to the nearest matrix that satisfies PSD properties.

$\bullet$~\ \textbf{Differences with Matrix Calibration Methods}:
Although the optimization objectives of these methods are mathematically unified, our main contribution lies in online models, while matrix calibration methods only address offline models. In terms of data scenarios and optimization procedures, our work is fundamentally different. The OnMC-S method targets sequential data, and at each step, it only needs to correct one similarity vector instead of the entire matrix. Moreover, the OnMC-B method applies a well-designed and flexible scheme to batch data, which can be executed efficiently in parallel through divide-and-conquer. In comparison, the existing matrix calibration methods modify the whole matrix globally and blur the known information, with a higher computational cost and lower performance, and are therefore ill-suited to incomplete online data. For fairness of comparison, we provide the results of matrix calibration methods in Section~\ref{sec:c4}.


% ================================================

\section{Experimental Results} \label{sec:c}

\subsection{Implementation Details}

In the online scenario studied in Section 4, all data is normalized to $[-1,+1]$ before values are randomly removed. Suppose we have an offline dataset $X = [x_1,x_2,\dots,x_n] \in \mathbb{R}^{d\times n}$ with complete information. Then $m$ incomplete data points in the online dataset $Y^o = [y_1^o, y_2^o, \dots, y_m^o] \in \mathbb{R}^{d \times m}$ come into observation sequentially. Following the notation in Section 2.1, denote

\quad $\bullet~~\ I_t \subset \{1,2,\cdots,d\}$ as the index set recording the positions of fully observed features in the online data point $y_t^o$; 

\quad $\bullet~~\ I_t^c = \{1,2,\cdots,d\} \setminus I_t$ as the complement set of $I_t$ recording the positions of unknown/missing features in $y_t^o$;

\quad $\bullet~~\ y_t^o(I_t) \in \mathbb{R}^{|I_t|}$ $\left(y_t^o(I_t^c) \in \mathbb{R}^{|I_t^c|}\right)$ as a vector of selected values in $y_t^o$ on $I_t$ ($I_t^c$);

\quad $\bullet~~\ X(I_t) \in \mathbb{R}^{|I_t|\times n}$ $\left(X(I_t^c) \in \mathbb{R}^{|I_t^c|\times n}\right)$ as a matrix of selected values in $X$ on $I_t$ ($I_t^c$).

Besides, $S^o$ represents the estimated similarity matrix on incomplete data as mentioned in Section 2.1.

To deal with incomplete online data, various representative Missing Value Imputation (MVI) methods and Matrix Calibration approaches are considered, of which the implementation details are specifically listed in Table~\ref{tab:method}.

\begin{table}[bt]
    \caption{The implementation details of missing value imputation (MVI) methods and matrix calibration approaches.}
    \label{tab:method}
    \centering
    \setlength{\tabcolsep}{2.5pt}
    \small
    \begin{tabular}{llp{0.47\textwidth}p{0.23\textwidth}}
    \toprule \toprule
    Method & Category & Technical Description & Key Equation \\ \hline
    
    ZERO & Statistical MVI & Imputes all the missing values by zero. & $y_t^o(I_t^c) = 0$. \\
    
    MEAN & Statistical MVI & Imputes the missing values by the observed mean of the corresponding feature in all offline data. & $y_t^o(I_t^c) = \text{mean}(X(I_t^c))$ \\
    
    $k$NN \cite{kim2004reuse} & Statistical MVI & Imputes the missing values by the weighted mean value of corresponding $k$-nearest neighbors $X_k$ in the offline dataset. The default value of $k$ is set to 10. & $y_t^o(I_t^c) = \text{mean}(X_k(I_t^c))$ \\
    
    \tabincell{l}{LR \cite{seber2012linear} \\ \quad} & \tabincell{l}{Regression MVI \\ \quad} & \tabincell{l}{Imputes the missing values by the multivariate linear regression \\ between observed features and missing features.} & \tabincell{l}{Fit a LR $f: X(I_t) \rightarrow y_t^o(I_t)$ \\ Predict $y_t^o(I_t^c)$ using $f(X(I_t^c))$}  \\
    
    \tabincell{l}{RF \cite{stekhoven2012missforest} \\ \quad \\ \quad} & \tabincell{l}{Regression MVI  \\ \quad \\ \quad} & \tabincell{l}{Imputes the missing values by the random forest (MissForest) \\ between observed features and missing features. The default \\ value of \textit{TreeNum} in a well-known RF package is set to 500.} & \tabincell{l}{Fit a RF $f: X(I_t) \rightarrow y_t^o(I_t)$ \\ Predict $y_t^o(I_t^c)$ using $f(X(I_t^c))$} \\
    
    \tabincell{l}{GROUSE \cite{balzano2010online} \\ \quad} & \tabincell{l}{Matrix Completion  \\ \quad} & \tabincell{l}{Imputes the online data by an online incremental algorithm \\ based on low rank matrix completion.} & \tabincell{l}{$\min_{U,V} \sum_{i,j} (Z - UV)_{ij}^2$ \\ s.t. ~ $Z = [X,Y^o]$} \\
    
    \tabincell{l}{KFMC \cite{fan2019online} \\ \quad} & \tabincell{l}{Matrix Completion \\ \quad} & \tabincell{l}{Imputes the online data by high rank matrix completion and \\ online optimization (default setting: KFMC-on-RBF).} & \tabincell{l}{$\min_{Z,U,V} ~ \frac{1}{2} \| \phi(Z) - UV \|^2$ \\ s.t. ~ $Z = [X,Y^o]$} \\
    
    \tabincell{l}{DMC \cite{li2015estimating} \\ \quad \\ \quad} & \tabincell{l}{Matrix Calibration \\ \quad \\ \quad} & \tabincell{l}{Calibrates the similarity matrix by searching the nearest matrix \\ with positive semi-definiteness based on the Dykstra's alternat-\\ ing projection method. } & \tabincell{l}{$\min_S \|S-S^o\|_F^2$ \\ s.t.~~~$S \succeq 0$} \\
    
    \tabincell{l}{CMC \cite{li2020scalable} \\ \quad} & \tabincell{l}{Matrix Calibration \\ \quad} & \tabincell{l}{Calibrates the similarity matrix by searching the nearest matrix \\ with positive semi-definiteness using cyclic projection method.} & \tabincell{l}{$\min_S \|S-S^o\|_F^2$ \\ s.t.~~~$S \succeq 0$} \\

     \bottomrule  \bottomrule
    \end{tabular}
\end{table}

\subsection{Training Procedure}

In general, there are two different procedures to train the imputation methods on incomplete online data.
\begin{itemize}
    \item[1.] \textbf{Online Update}: the imputation models are trained purely on offline data. The imputed data $\hat{y}_t$ is calculated by
    \[ f: [X, y_t^o] \rightarrow \hat{y}_t. \]
    
    \item[2.] \textbf{Sequential Update}: the imputation models are updated as online data comes into observation. Specifically, each imputed online data will move into the complete offline dataset and the training procedure is defined as
    \[ f: [X, \hat{y}_1, \cdots, \hat{y}_{t-1}, y_t^o] \rightarrow \hat{y}_t. \]
\end{itemize}
In the main paper, we choose the \emph{Online Update} pattern as our training procedure. As shown in Fig.~\ref{fig:seq}, the sequential update pattern may lead to the increase of RMSE, especially for the LR method, because the imputed online data is regarded as new complete data, which may increase the error of the next imputation. For the fairness of data imputation, we adopt the online update pattern and only use $X$ as the training set to train the mapping function $f$.

\begin{figure}[htbp]
    \centering
    \subfigure[MEAN]{
    \includegraphics[width = .24\columnwidth]{Fig-supp/Seq_MNIST_MEAN.pdf}}
    \subfigure[$k$NN]{
    \includegraphics[width = .24\columnwidth]{Fig-supp/Seq_MNIST_KNN.pdf}}
    \subfigure[LR]{
    \includegraphics[width = .24\columnwidth]{Fig-supp/Seq_MNIST_LR.pdf}}
    \subfigure[RF]{
    \includegraphics[width = .24\columnwidth]{Fig-supp/Seq_MNIST_RF.pdf}} 
    \caption{\small Comparison of the Relative-Mean-Square Error (RMSE) under online or sequential update pattern for cosine similarity on MNIST dataset with fixed offline size $n=1000$ and missing ratio $r=20\%$.}
    \label{fig:seq}
\end{figure}

% ================================================

\subsection{Hyper-parameter Analysis} \label{sec:c3}

The hyper-parameter analysis experiments are conducted using an online update pattern for {\it cosine similarity} on MNIST dataset with size $(n,m)=(1000,1000)$ in Fig.~\ref{fig:param}, where three imputation baselines are studied, including $k$NN, RF and KFMC methods. It can be observed that the performance of imputation methods is generally unstable and poor under different hyper-parameters, of which the default settings used in the main paper have been listed in Table~\ref{tab:method}. 

Besides, the hyper-parameter $\gamma$ of the Gaussian kernel in Section 5 is set to $\sigma^{-2}$, where $\sigma$ is the mean of Euclidean distance.

\begin{figure}[htbp]
    \centering
    \subfigure[$k$NN]{
    \includegraphics[width = .25\columnwidth]{Fig-supp/Param_MNIST_KNN.pdf}}
    \subfigure[RF]{
    \includegraphics[width = .25\columnwidth]{Fig-supp/Param_MNIST_RF.pdf}}
    \subfigure[KFMC]{
    \includegraphics[width = .25\columnwidth]{Fig-supp/Param_MNIST_KFMC.pdf}}    
    \caption{\small Hyper-parameter analysis of imputation methods for cosine similarity on MNIST dataset with $(n,m)=(1000,1000)$.}
    \label{fig:param}
\end{figure}

% ==========================================================

\subsection{Numerical Results} \label{sec:c4}

All the numerical results are the average performance of RMSEs on \textit{cosine similarity} matrices for 10 random seeds, where the RMSEs with the standard deviation on the MNIST dataset are listed in Table~\ref{tab:mnist_1000}.

$\bullet$~\ Compared with imputation methods, the OnMC methods exhibit excellent performance with a guarantee of RMSE $<1$.
    
$\bullet$~\ Compared with matrix calibration methods, the OnMC methods also show evident improvement on RMSE due to their well-designed online scheme, which mainly relies on vector optimization, blocking technique, and parallel correction. 

\begin{table}[ht]
    \caption{Numerical results of the Relative-Mean-Square Error (RMSE) on the MNIST dataset with fixed dataset sizes $(n,m)=(1000,1000)$. The best two scores in each column are highlighted in \textbf{bold}. All results are the average for 10 random seeds.}
    \label{tab:mnist_1000}
    \centering
    \begin{spacing}{1.2}
    \setlength{\tabcolsep}{3.5pt}
    \begin{tabular}{ll|ccccccc}
    \toprule \hline
    \multicolumn{2}{l|}{$(n,m) = (1000,1000)$} & \multicolumn{7}{c}{Missing Ratio $r$} \\
    Model & Method & 20\% & 30\% & 40\% & 50\% & 60\% & 70\% & 80\% \\ \hline
    \multirow{14}{*}{Online} & ZERO & 54.01\smaller{$\pm$2.195} & 68.76\smaller{$\pm$1.413} & 76.68\smaller{$\pm$2.099} & 76.74\smaller{$\pm$2.575} & 69.21\smaller{$\pm$1.540} & 54.57\smaller{$\pm$1.310} & 35.00\smaller{$\pm$0.524} \\
    
     & MEAN & 6.278\smaller{$\pm$0.139} & 8.123\smaller{$\pm$0.182} & 8.915\smaller{$\pm$0.228} & 9.127\smaller{$\pm$0.181} & 8.046\smaller{$\pm$0.121} & 6.475\smaller{$\pm$0.083} & 4.189\smaller{$\pm$0.065} \\
     
     & $k$NN-5 & 1.385\smaller{$\pm$0.082} & 1.696\smaller{$\pm$0.113} & 1.873\smaller{$\pm$0.073} & 1.834\smaller{$\pm$0.085} & 1.634\smaller{$\pm$0.063} & 1.298\smaller{$\pm$0.035} & 0.841\smaller{$\pm$0.023} \\
     
     & $k$NN-10 & 1.847\smaller{$\pm$0.093} & 2.300\smaller{$\pm$0.121} & 2.583\smaller{$\pm$0.070} & 2.568\smaller{$\pm$0.110} & 2.296\smaller{$\pm$0.071} & 1.831\smaller{$\pm$0.040} & 1.184\smaller{$\pm$0.019} \\
     
     & $k$NN-20 & 2.393\smaller{$\pm$0.108} & 3.007\smaller{$\pm$0.169} & 3.406\smaller{$\pm$0.113} & 3.390\smaller{$\pm$0.156} & 3.041\smaller{$\pm$0.102} & 2.435\smaller{$\pm$0.061} & 1.572\smaller{$\pm$0.028} \\
     
     & $k$NN-50 & 3.284\smaller{$\pm$0.151} & 4.167\smaller{$\pm$0.228} & 4.750\smaller{$\pm$0.163} & 4.747\smaller{$\pm$0.231} & 4.266\smaller{$\pm$0.148} & 3.407\smaller{$\pm$0.092} & 2.183\smaller{$\pm$0.047} \\
     
     & LR & 3.627\smaller{$\pm$0.555} & 3.789\smaller{$\pm$0.564} & 3.637\smaller{$\pm$0.407} & 2.816\smaller{$\pm$0.374} & 2.080\smaller{$\pm$0.241} & 1.206\smaller{$\pm$0.116} & 0.512\smaller{$\pm$0.034} \\
     
     & RF-10 & 1.015\smaller{$\pm$0.037} & 1.343\smaller{$\pm$0.051} & 1.629\smaller{$\pm$0.033} & 1.763\smaller{$\pm$0.046} & 1.738\smaller{$\pm$0.029} & 1.553\smaller{$\pm$0.015} & 1.134\smaller{$\pm$0.014} \\
     
     & RF-50 & 1.102\smaller{$\pm$0.038} & 1.484\smaller{$\pm$0.061} & 1.809\smaller{$\pm$0.031} & 1.963\smaller{$\pm$0.049} & 1.942\smaller{$\pm$0.032} & 1.736\smaller{$\pm$0.016} & 1.282\smaller{$\pm$0.018} \\
     
     & RF-100 & 1.117\smaller{$\pm$0.040} & 1.503\smaller{$\pm$0.061} & 1.832\smaller{$\pm$0.032} & 1.990\smaller{$\pm$0.051} & 1.970\smaller{$\pm$0.030} & 1.763\smaller{$\pm$0.019} & 1.298\smaller{$\pm$0.017} \\
     
     & RF-500 & 1.125\smaller{$\pm$0.041} & 1.516\smaller{$\pm$0.061} & 1.849\smaller{$\pm$0.033} & 2.007\smaller{$\pm$0.050} & 1.990\smaller{$\pm$0.031} & 1.782\smaller{$\pm$0.018} & 1.314\smaller{$\pm$0.015} \\
     
     & GROUSE & 1.953\smaller{$\pm$0.060} & 2.399\smaller{$\pm$0.049} & 2.741\smaller{$\pm$0.088} & 2.731\smaller{$\pm$0.068} & 2.368\smaller{$\pm$0.072} & 1.820\smaller{$\pm$0.062} & 1.111\smaller{$\pm$0.048} \\
     
     & KFMC-on-Poly & \textbf{0.577}\smaller{$\pm$0.009} & \textbf{0.789}\smaller{$\pm$0.031} & 2.641\smaller{$\pm$0.164} & 13.25\smaller{$\pm$0.608} & 35.21\smaller{$\pm$0.624} & 43.25\smaller{$\pm$0.831} & 32.40\smaller{$\pm$0.766} \\
     
     & KFMC-on-RBF & 1.281\smaller{$\pm$0.054} & 1.637\smaller{$\pm$0.070} & 1.907\smaller{$\pm$0.057} & 1.990\smaller{$\pm$0.049} & 1.920\smaller{$\pm$0.058} & 1.683\smaller{$\pm$0.033} & 1.237\smaller{$\pm$0.027} \\ \hline
 
    \multirow{4}{*}{Offline} & KFMC-off-Poly & 1.273\smaller{$\pm$0.049} & 1.699\smaller{$\pm$0.071} & 2.079\smaller{$\pm$0.026} & 2.247\smaller{$\pm$0.064} & 2.239\smaller{$\pm$0.049} & 1.999\smaller{$\pm$0.017} & 3.393\smaller{$\pm$0.106} \\
    
     & KFMC-off-RBF & 1.478\smaller{$\pm$0.059} & 1.939\smaller{$\pm$0.074} & 2.326\smaller{$\pm$0.035} & 2.496\smaller{$\pm$0.079} & 2.454\smaller{$\pm$0.047} & 2.188\smaller{$\pm$0.028} & 1.599\smaller{$\pm$0.011} \\
     
     & DMC & 0.893\smaller{$\pm$0.004} & 0.850\smaller{$\pm$0.003} & 0.798\smaller{$\pm$0.005} & 0.744\smaller{$\pm$0.006} & 0.679\smaller{$\pm$0.005} & 0.595\smaller{$\pm$0.005} & 0.480\smaller{$\pm$0.005} \\
     
     & CMC & 0.903\smaller{$\pm$0.004} & 0.855\smaller{$\pm$0.003} & 0.797\smaller{$\pm$0.005} & 0.737\smaller{$\pm$0.007} & 0.666\smaller{$\pm$0.005} & 0.576\smaller{$\pm$0.005} & 0.458\smaller{$\pm$0.005} \\ \hline

    \multirow{2}{*}{Ours} & OnMC-S & \textbf{0.869}\smaller{$\pm$0.004} & \textbf{0.815}\smaller{$\pm$0.003} & \textbf{0.750}\smaller{$\pm$0.005} & \textbf{0.683}\smaller{$\pm$0.007} & \textbf{0.605}\smaller{$\pm$0.005} & \textbf{0.504}\smaller{$\pm$0.005} & \textbf{0.370}\smaller{$\pm$0.005} \\
    
    & OnMC-B & 0.877\smaller{$\pm$0.004} & 0.823\smaller{$\pm$0.003} & \textbf{0.759}\smaller{$\pm$0.005} & \textbf{0.692}\smaller{$\pm$0.007} & \textbf{0.614}\smaller{$\pm$0.005} & \textbf{0.512}\smaller{$\pm$0.005} & \textbf{0.378}\smaller{$\pm$0.005} \\
\hline \bottomrule
\end{tabular}
\end{spacing}
\end{table}

\clearpage
\bibliography{yu_34}

\end{document}
