%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\makeatletter
% \addFileDependency{<file>}: announce <file> (name including extension) in
% the log so that latexmk treats it as a dependency of this document.
% Every line inside the definition ends with % so that no stray space tokens
% end up in the macro body.
\newcommand*{\addFileDependency}[1]{%
  % latexmk will find this if $recorder=0; however, in that case it will
  % ignore #1 if it is a .aux or .pdf file etc. and it exists. If it does not
  % exist, it will appear in the list of dependents regardless.
  \typeout{(#1)}%
  % Make the file appear in \listfiles output as well (not really necessary;
  % latexmk does not use this).
  \@addtofilelist{#1}%
  % latexmk will find this message if #1 does not exist (yet).
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}%
\makeatother

% \myexternaldocument{<basename>}: pass <basename> to \externaldocument (xr)
% and register <basename>.tex and <basename>.aux via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
\myexternaldocument{yang_240}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example


%----------------------------------------
% my packages and commands

\usepackage{amsmath}
\usepackage{graphicx}
%\usepackage[title]{appendix}
\usepackage{bm}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{verbatim}
% \usepackage{subfigure}
%\usepackage{subfig}
\usepackage{afterpage}
\usepackage{etoolbox}
%\usepackage{footbib}
\usepackage{float}
\usepackage{rotating}
\usepackage[inline]{enumitem}
\usepackage{diagbox}

\usepackage{multirow}
\usepackage{caption}
\usepackage[skip=0pt]{subcaption}
\newdimen\figrasterwd
\figrasterwd\textwidth

\newtheorem{lemma}{Lemma}
\DeclareMathOperator{\erf}{erf}
\usepackage{algorithmic}
\usepackage{algorithm}% http://ctan.org/pkg/algorithm
\setlength{\marginparwidth}{2cm}
%\usepackage[colorinlistoftodos]{todonotes}
% \myvec{<symbol>}: bold math symbol, typeset as \boldmath math inside an \mbox.
\newcommand {\myvec}[1] {{\mbox{\boldmath $#1$}}}
% Bold shorthands built on \myvec.
\newcommand{\myx}{\myvec{x}}
\newcommand{\myX}{\myvec{X}}
\newcommand{\myy}{\myvec{y}}
\newcommand{\myY}{\myvec{Y}}
\newcommand{\myZ}{\myvec{Z}}
\newcommand{\myz}{\myvec{z}}
\newcommand{\myL}{\myvec{L}}
\newcommand{\myth}{\myvec{\theta}}
\newcommand{\mys}{\myvec{s}}
% Non-bold S with a tilde accent.
\newcommand{\tils}{\tilde{S}}
% Blackboard-bold and calligraphic symbols; \reals and \R expand to the same symbol.
\newcommand{\prob}{\mathbb{P}}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\M}{\mathcal M}
%\usepackage[colorinlistoftodos]{todonotes}
% Revision markup: \rev prints its argument in black, \del prints it in red.
\newcommand{\rev}[1]{{\color{black}{#1}}}
\newcommand{\del}[1]{{\color{red}{#1}}}
% NOTE(review): \snncomment calls \todo, but both todonotes lines in this file
% are commented out -- confirm \todo is defined before using this macro.
\newcommand{\snncomment}[1]{\todo{SNN: #1}}
% Blackboard-bold E.
\newcommand{\exval}{\mathbb{E}}
% \norm{<expr>}: <expr> between auto-sized double vertical bars.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\usepackage{hyperref}

\usepackage{titling}
\renewcommand\maketitlehooka{\null\mbox{}\vfill}
\renewcommand\maketitlehookd{\vfill\null}
%----------------------------------------




\title{Multi-modal Differentiable Unsupervised Feature Selection\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<junchen.yang@yale.edu>?Subject=mmDUFS}{Junchen Yang}{}}
\author[2]{Ofir Lindenbaum}
\author[1,4,5]{Yuval Kluger}
\author[3]{Ariel Jaffe}
% Add affiliations after the authors
\affil[1]{%
    Interdepartmental Program in Computational Biology and Bioinformatics\\ Yale University\\
    New Haven, CT, USA
}
\affil[2]{%
    Faculty of Engineering, Bar-Ilan University, Israel
}
\affil[3]{%
    Department of Statistics and Data Science, Hebrew University of Jerusalem, Israel
  }
  \affil[4]{%
Applied Math Program, Yale University, New
Haven, CT, USA
}
\affil[5]{%
Department of Pathology, School of Medicine, Yale University, New Haven, CT, USA
  }
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
\section{Additional Results}\label{sec:extra_simulations}


\subsection{Points in a 3D cube.}
The data consists of points in a 3D cube $[0,l_s] \times [0,l_a] \times [0,l_b]$. The modality $\myX$ includes the first two coordinates, and modality $\myY$ includes the first and third, as explained in Sec.~\ref{sec:method}. The upper row in Figure~\ref{fig:3d_cube_eigenvectors} shows the eigenvectors of $\myvec{L}_x$. The eigenvectors change in both coordinates. The second row contains the eigenvectors of $\myvec{P}_{\text{shared}}$. The leading eigenvectors change only with the first coordinate, as it is the only shared variable.


\begin{figure}[htp!]
    \centering
    \includegraphics[width = 0.95\linewidth]{Figs/3d_cube_eigenvecs.png}
    \caption{Data consists of points sampled uniformly at random in a 3D cube. The upper row shows a scatter plot of the points, located according to the first two coordinates $a,b$ and colored by the leading eigenvectors of $\myvec{L}_x$, the Laplacian matrix of modality $\myX$. The bottom row shows the leading eigenvectors of $\myvec{P}_{\text{shared}}$, the product of Laplacians as defined in Eq. \ref{eq:composite_op}.}
    \label{fig:3d_cube_eigenvectors}
\end{figure}

\subsection{Rescaled MNIST.}

Here in Table \ref{tab:f1_mnist_triple}, we compare mmDUFS to the baselines on the rescaled MNIST data with $3$ modalities. We can see that mmDUFS outperforms all the baselines in terms of the F1-score, demonstrating its ability to identify informative features in multimodal scenarios accurately.

\begin{table}[!htb]
    \centering
    \begin{tabular}{|c|c|c|c|c|}
         \hline
         Modality &  MC & mmKS & mmKP & mmDUFS\\
         \hline
         $X$& 0.4012  & 0.6163 & 0.6163 & \textbf{0.7035}\\
         \hline
         $Y$ & 0.5672 & 0.7562 & 0.7612 & \textbf{0.8259} \\
         \hline
         $Z$ & 0.5333 & 0.7385 & 0.7385 & \textbf{0.8154} \\
         \hline
    \end{tabular}
    \caption{F1-score of different methods on the rescaled MNIST data with 3 modalities}
    \label{tab:f1_mnist_triple}
\end{table}

\subsection{Rotating Dolls.}
The two modalities include video frames taken simultaneously from two cameras, of three dolls rotating at different angular speeds. The first camera (modality $\myX$) captures the left two dolls while the right camera (modality $\myY$) captures the right two dolls. Thus, the angle of the middle doll constitutes a shared variable $\myvec{\theta}_s$. The angle of the left doll $\myvec{\theta}_x$ is a latent variable specific to modality $\myX$, and the angle of the right doll $\myvec{\theta}_y$ is a latent variable specific to modality $\myY$.

From the left video, we crop the frames so that they include only the middle doll (the shared component). From these images we computed a graph Laplacian matrix and its leading eigenvectors, denoted $\myvec{\phi}^s_i$. As explained in Sec.~\ref{sec:method}, we expect the eigenvectors of the shared operator, denoted $\myvec{v}_i^s$, to be similar to $\myvec{\phi}^s_i$, as both are associated with the latent variable $\myvec{\theta}_s$. Figure~\ref{fig:rotating_dolls_operator} shows $\myvec{v}_i^s$ as a function of $\myvec{\phi}_i^s$
for $i=1,2,3$. In all three cases, the two vectors are clearly highly correlated.

\begin{figure}[htp!]
    \centering
    \includegraphics[width = 0.9\linewidth]{Figs/dolls_shared_operator.png}
    \caption{The figure shows a scatter plot of $\myvec{v}^s_i$, the leading eigenvectors of $\myvec{P}_{\text{shared}}$, as a function of $\myvec{\phi}^s_i$, the estimated leading vectors of the shared component in the rotating doll dataset.}
    \label{fig:rotating_dolls_operator}
\end{figure}

\subsection{CITE-seq Dataset.}

To demonstrate the feature selection performance of mmDUFS on the shared structures, we focus on the CITE-seq data and analyze four cell types: B cells, CD8 T cells, CD16+ Monocytes, and Naive CD4 T cells. This subset has $2,101$ cells for both RNA and protein modalities. We select the top $500$ variable genes as the informative features in the RNA modality and add $1,500$ nuisance features generated according to a Gaussian distribution. Then, we apply different baseline methods to select the informative features in the RNA modality and compare their performance using F1-score. As shown in Table \ref{tab:cite_shared}, mmDUFS outperforms other baseline methods in terms of selecting the correct informative features. 

\begin{table}[!htb]
    \centering
    \begin{adjustbox}{max width=0.4 \textwidth, valign=c}
    \begin{tabular}{|c|c|c|c|c|}
         \hline
          &  MC & mmKS & mmKP & mmDUFS\\
         \hline
         F1-score & 0  & 0.664 & 0.778 & \textbf{0.808}\\
         \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Comparison of F1-score between different methods on the CITE-seq data (RNA modality)}
    \label{tab:cite_shared}
\end{table}

\subsection{Synthetic Gaussian Mixtures.}

\begin{figure*}[htb!]%
    \centering
      \parbox{\figrasterwd}{
      \centering
    \parbox{.2\figrasterwd}{%
      \subcaptionbox{\label{fig:gauss_mat}}{\includegraphics[height=0.4\textwidth,width=0.2\textwidth]{Figs/gaussian_mixture_heatmap_new}}}
    \parbox{.6\figrasterwd}{%
      \subcaptionbox{\label{fig:gauss_shared}}{\includegraphics[height=0.17\textwidth,width=0.6\textwidth]{Figs/gaussian_mixture_scores_shared}}
      
      \subcaptionbox{\label{fig:gauss_diff}}{\includegraphics[height=0.17\textwidth,width=0.6\textwidth]{Figs/gaussian_mixture_scores_diff}}  
    }}
    
    
    \caption{Synthetic Gaussian mixture cluster example. (a): Data matrix of modality $\myvec{X}$ (top) and $\myvec{Y}$ (bottom). Rows are samples, and columns are features. Each modality has $3$ clusters (labeled in red). Clusters $1$ and $2$ are shared between modalities, and clusters $3$ and $4$ are specific to each modality. (b): Change of the Shared Laplacian Scores, regularization loss, and the F1-score of the selected features with respect to the number of epochs (x-axis) for mmDUFS with the shared operator. (c): Change of the Differential Laplacian Scores, regularization loss, and the F1-score of the selected features with respect to the number of epochs (x-axis) for mmDUFS with the differential operator.}%
    \label{fig:supp_gauss_data}%
 
\end{figure*}

Here we apply mmDUFS to uncover the informative features of the shared clusters and the modality-specific clusters. Fig. \ref{fig:gauss_shared} and Fig. \ref{fig:gauss_diff} show the change of the average Shared/Differential Laplacian Scores across features, the regularization loss, and the F1-score of the selected features from mmDUFS with respect to the number of epochs, where we can see that mmDUFS gradually selects the correct features corresponding to high scores while sparsifying the number of features. 




We also apply DUFS to each modality on this data and compare its performance to mmDUFS in terms of F1-score, as shown below in Table \ref{tab:dufs_gaussian}.

\begin{table}[!htb]
    \centering
    \begin{tabular}{|c|c||c|c|}
         \hline
         Dataset & Modality & DUFS & mmDUFS\\
         \hline
         \multirow{2}{*}{Original Gaussian} &  X & 0.300 & \textbf{1} \\
                       &Y & 0 &\textbf{1}\\
          \hline
          \multirow{2}{*}{Gaussian + $10$ Noisy Feats} &  X & 0.2667 & \textbf{1} \\
                       &Y & 0 &\textbf{1}\\
          \hline
          \multirow{2}{*}{Gaussian + $30$ Noisy Feats} &  X & 0.100 & \textbf{1} \\
                       &Y & 0 &\textbf{1}\\
          \hline
         \multirow{2}{*}{Gaussian + $50$ Noisy Feats} &  X & 0.033 & \textbf{0.9667} \\
                       &Y & 0 &\textbf{0.8500}\\
            \hline
    \end{tabular}
    \caption{Comparison of F1-score on the synthetic Gaussian mixture data between DUFS and mmDUFS}
    \label{tab:dufs_gaussian}
\end{table}

DUFS is suboptimal for this task because it recovers the most informative features in a single modality. It does not, however, distinguish between modality-specific and modality-shared features. 

\section{Experiment Details}\label{sec:exp_details}

In the following subsections, we provide additional experimental details required for the reproduction of the experiments provided in the main text. The CPU model used for the experiments is Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz (72 cores total). GPU model is NVIDIA GeForce RTX 2080 Ti. 

In Tables~\ref{tab:shared_param} and~\ref{tab:diff_param} below, we list the parameters we used in each experiment for mmDUFS with the shared operator and the differential operator. Parameter $c$ is a regularization constant for mmDUFS with the differential operator, as mentioned in the main text. Parameter $b$ is a scaling factor applied to the operators to balance the Shared/Differential Laplacian Scores against the regularization term. We used the normalized Laplacian matrix throughout the experiments except for the CITE-seq example, where we found the performance was satisfactory with the un-normalized Laplacian matrix.

\begin{table}[htb!]
    \centering
    \begin{tabular}{|c|c|c|c|c|c|}
      \hline
    Datasets & learning rate & epochs & $\lambda_x$  & $\lambda_y$ & $b$  \\
    \hline
    Rescaled MNIST & $2$ & $10000$ & $1e-1$ & $1e-1$ & $1e2$\\
    \hline
    Synthetic Tree & $2$ & $25000$ & $1e-1$ & $1e-1$ & $1e3$\\
    \hline
    Gaussian Mixture & $2$ & $10000$ & $1e-4$ & $1e-4$ &  $1$\\
    \hline
    Gaussian Mixture ($10$ Noisy Features) & $2$ & $20000$ & $1e-8$ & $1e-6$ &  $1$\\
    \hline
   Gaussian Mixture ($30$ Noisy Features) & $2$ & $40000$ & $1e-4$ & $1e-4$ &  $1$\\
    \hline
    Gaussian Mixture ($50$ Noisy Features) & $2$ & $10000$ & $1e-2$ & $1e-3$ &  $1e2$\\
    \hline
    Rotating Dolls & $2$ & $10000$ & $0.2$  & $0.2$ &  $1e3$\\
    \hline
    \end{tabular}
    \caption{Parameters for mmDUFS with the shared operator across different datasets.}
    \label{tab:shared_param}
\end{table}



\begin{table}[htb!]
    \centering
    \begin{tabular}{|c|c|c|c|c|c|c|}
      \hline
    Datasets & learning rate & epochs & $\lambda_x$  & $\lambda_y$ &$c$ & $b$  \\
    \hline
    Rescaled MNIST & $1$ & $10000$ & $0.5$ & $0.5$ & $1e-3$ & $1e-4$\\
    \hline
    Synthetic Tree & $2$ & $10000$ & $4$ & $2$ & $1e-3$ & $1e-3$\\
    \hline
    Gaussian Mixture & $1$ & $10000$ & $0.4$ & $0.4$ & $1e-1$ &  $1e-1$\\
    \hline
    Rotating Dolls & $2$ & $10000$ & $2$  & $2$ & $3$ &  $1e3$\\
    \hline
    CITE-seq & $2$ & $5000$ & $3$  & \diagbox & $2$ &  $1$\\
    \hline
    \end{tabular}
    \caption{Parameters for mmDUFS with the differential operator across different datasets.}
    \label{tab:diff_param}
\end{table}

For the baseline methods, the $k$ features with the highest Laplacian Scores are selected. When evaluating the F1-score on the synthetic datasets, we set $k$ to be the correct number of informative features. To make a fair comparison, we also let mmDUFS select $k$ features by sorting the raw gates ($\mu_d$ for feature $d$). For other datasets, we define the features selected by mmDUFS as those whose gates converged to $1$ ($z_d = 1$ for feature $d$).

For the image datasets (rescaled MNIST, rotating dolls), we add small Gaussian noise drawn from $N(0,\sigma^2)$ to the pixels to stabilize feature selection of mmDUFS. For the rescaled MNIST dataset, $\sigma 
 = 0.1$ and we add noise to the non-informative pixels before standardizing the pixels via z-scoring. For the rotating dolls data, $\sigma = 5e-3$ and we add noise to all pixels before standardizing the pixels via z-scoring.

\subsection{Tuning of the Regularization Parameter}

mmDUFS has tunable regularization parameters $\lambda_x$ and $\lambda_y$ that control the sparsity of the number of selected features. For synthetic datasets, one can tune these parameters to select features such that the selected number is close to the prescribed number $s$. However, it can still be time and resource consuming to optimize these parameters. Also, for real data, one might not know how many features to select and what $\lambda_x$ and $\lambda_y$ to choose.

To alleviate this issue, we propose a ``warm-up'' procedure similar to that of \citet{lindenbaum2021differentiable} to optimize $\lambda_x$ and $\lambda_y$. Specifically, we evaluate the mean Shared Laplacian Scores $S_{\text{shared}} = \frac{1}{2n} (\text{Tr}[\myvec{\tilde{X}}^T \myvec{\tilde{P}_{\text{shared}}} \myvec{\tilde{X}}]/{m} +  \text{Tr}[\myvec{\tilde{Y}}^T \myvec{\tilde{P}_{\text{shared}}} \myvec{\tilde{Y}}]/{d})$  and the mean Differential Laplacian Scores  $S_{\text{x}} = \text{Tr}[\myvec{\tilde{X}}^T \myvec{Q}_{\tilde{x}} \myvec{\tilde{X}}]/{(d\times n)}$, $S_{\text{y}} = \text{Tr}[\myvec{\tilde{Y}}^T \myvec{Q}_{\tilde{y}} \myvec{\tilde{Y}}]/{(m \times n)}$ over a grid of $\lambda_x$ and $\lambda_y$ at the early stage of training (e.g., the first $1000$ epochs), and pick the parameters that maximize the scores. Here $n$ is the number of samples in the batch, and $m$ and $d$ are the number of selected features on each modality for real data, or the number of pre-specified features for synthetic data.

To demonstrate this procedure, we use the synthetic Gaussian mixture dataset as an example, and we evaluate $\lambda_x$ and $\lambda_y$ over \{$1e-6$,$1e-5$,$1e-4$,$1e-3$,$1e-2$,$1e-1$,$1$,$1e1$,$1e2$\} using mmDUFS with the shared operator. For illustration purposes, we set $\lambda_x = \lambda_y$. Fig.~\ref{fig:gauss_lambda} shows the mean Shared Laplacian Scores over different $\lambda$ values. We can see that \{$1e-6$,$1e-5$,$1e-4$,$1e-3$\} are the best candidates: they give the highest Shared Laplacian Scores, which also correspond to the highest F1-score.

\begin{figure}
    \centering
    \includegraphics[width = 0.8\linewidth]{Figs/gaussian_mixture_lambdas}
    \caption{Evaluation of the mean Shared Laplacian Scores (left) and the corresponding F1-scores (right) over a grid of $\lambda$s on the synthetic Gaussian mixture dataset. y-axis shows the mean Shared Laplacian Scores (left) and F1-scores (right) whereas the x-axis shows the values of $\lambda$.}
    \label{fig:gauss_lambda}
\end{figure}

\subsection{Synthetic Gaussian Mixtures}
We simulate $2$ modalities $\myX$ and $\myY$, where modality $\myX$ has $260$ samples with $130$ features and modality $\myY$ has $260$ samples with $90$ features. Both modalities have $3$ clusters in the data ($\myX$ has clusters $1$, $2$, $3$ and $\myY$ has clusters $1$, $2$, $4$, all labeled in red in Fig.~\ref{fig:gauss_mat}), and each cluster has a set of informative features denoted as $\myvec{f}_{x,i}$ and $\myvec{f}_{y,i}$ ($i$ = $1$, $2$, $3$, $4$) with length $m_i$ ($i$ = $1$, $2$, $3$, $4$). Each set of these informative features is drawn from $N(\myvec{\mu_i},\myvec{I})$ independently for each sample, where $\myvec{\mu_i}$ is a vector of length $m_i$ drawn from $U(2,4)$ and $\myvec{I}$ is an $m_i \times m_i$ identity matrix.

By design, clusters $1$ and $2$ are shared between modalities with $m_1 = 20$ and $m_2 = 10$ in modality $\myX$, and $m_1 = 10$ and $m_2 = 10$ in modality $\myY$. On the other hand, cluster $3$ is specific to modality $\myX$ with $m_3 = 40$, and cluster $4$ is specific to modality $\myY$ with $m_4 = 40$. The remaining features are considered noisy features and are drawn from $N(0,1)$.


\subsection{Synthetic Developmental Tree}

We use the \textit{generate\_data()} function from dyntoy\footnote{\url{https://github.com/dynverse/dyntoy}}, a tree simulator package, to generate a dataset $\myvec{X}_0$ with $1000$ samples and $100$ features. Specifically, the parameter \textit{num\_branchpoints} is set to $1$, \textit{num\_cells} is set to $1000$, \textit{num\_features} is set to $100$, \textit{sample\_mean\_count} is set to $10$, \textit{sample\_dispersion\_count} is set to $50$, \textit{differentially\_expressed\_rate} is set to $4$, and \textit{dropout\_probability\_factor} is set to $0$.

This step yields an initial data matrix $\myvec{X}_0 \in \mathbb{R}^{1000 \times 100}$, and these $1000$ samples are initially partitioned into $4$ groups: $G_1$ and $G_2$, $G_3$ and $G_4$, $G_5$, $G_6$ shown in Fig. \ref{fig:tree_umap}. For $\myvec{X}_0$, we further divide it into two halves, resulting in $2$ data matrices $\myX \in \mathbb{R}^{1000 \times 50}$ and $\myY \in \mathbb{R}^{1000 \times 50}$. We regard $\myX$ and $\myY$ as $2$ data modalities and these features as informative features contributing to the shared tree structure.

We further add $50$ features to each modality that are drawn from negative binomial distributions to construct the differential structures between modalities. Specifically, for modality $\myX$, the $50$ features of $G_1$ are drawn from $NB(\mu=4,\alpha=0.1)$ where $\mu$ and $\alpha$ are the mean and dispersion parameter of the negative binomial distribution, whereas the $50$ features of the other groups of samples are drawn from $NB(\mu=20,\alpha=0.1)$. Similarly, for modality $\myY$, the $50$ features of $G_3$ are drawn from $NB(\mu=4,\alpha=0.1)$ while the $50$ features of the other groups of samples are drawn from $NB(\mu=20,\alpha=0.1)$. Therefore, $G_1$ is bifurcated from $G_2$ and this structure is only observed in $\myX$, and $G_3$ is bifurcated from $G_4$ and this structure is only observed in $\myY$. 

Next, we row normalize each data matrix multiplied by a scaling factor $1e4$, and log1p transform the data. Then we standardize the features by z-scoring. Finally, we add $200$ features drawn from $N(0,1)$ to each modality as the noisy features.

\subsection{CITE-seq}

The human cord blood mononuclear cells (CBMCs) CITE-seq data was generated by \citet{stoeckius2017simultaneous}, where the expression levels of both RNA and protein are measured for the same cells. We analyze $3$ cell types: Erythroid cells, CD34+ cells, and Murine cells. We row normalize each data matrix for both modalities. For the gene expression matrix (RNA), we filter the genes by standard deviation and keep the top $500$ variable genes. Then for both matrices, we standardize the features by z-scoring.

%\bibliography{ref}

\end{document}
