\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
%%%% my packages %%%%
\usepackage{amsmath,amsfonts,bm}
\usepackage[many]{tcolorbox}
\usepackage{empheq}
\usetikzlibrary{shadows}
\usepackage{cancel}
%%%%%% my commands %%%%%
% Statistical-independence symbol: a \models rotated by 90 degrees.
% Wrapped in \mathrel so TeX spaces it as a binary relation (a bare rotated box
% is treated as an ordinary symbol and gets no relation spacing); \ensuremath
% keeps the macro usable outside math mode, as the original definition was.
% \rotatebox is provided by graphicx (presumably pulled in by tikz -- confirm).
\newcommand{\indep}{\ensuremath{\mathrel{\rotatebox[origin=c]{90}{$\models$}}}}
\newcommand*{\matb}[1]{\begin{bmatrix}#1\end{bmatrix}}
\newcommand*{\matp}[1]{\begin{pmatrix}#1\end{pmatrix}}
% Theorem-like environments. Each environment has its own counter, and every
% counter is numbered within (and reset at) the enclosing \section.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{example}{Example}[section]
% Appearance of highlighted-math boxes (tcolorbox's "highlight math style"):
% white background, rounded corners (4pt arc) and a 1pt frame.
\tcbset{
  highlight math style={
    enhanced,
     colback=white,
    arc=4pt,
    boxrule=1pt,
  }
}
%%%% math commands %%%%%%
%%%%% NEW MATH DEFINITIONS %%%%%

% Markers for referring to parts of a figure inside captions.
% Uses \emph instead of the obsolete {\em ...} font switch.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% Highlight a newly defined term (bold); \textbf replaces the obsolete {\bf ...}.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers. Every macro takes label name(s) and prints the
% matching word followed by the number(s); a tie (~) keeps each number on the
% same line as the word or conjunction that precedes it.

% Figure reference, lower-case.
\def\figref#1{figure~\ref{#1}}
% Figure reference, capital. For start of sentence
\def\Figref#1{Figure~\ref{#1}}
% Reference to two figures.
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
% Reference to four figures.
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
% Section reference, lower-case.
\def\secref#1{section~\ref{#1}}
% Section reference, capital.
\def\Secref#1{Section~\ref{#1}}
% Reference to two sections.
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
% Reference to three sections.
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% Reference to an equation, lower-case.
% NOTE: this \def replaces amsmath's \eqref (amsmath is loaded earlier in the
% preamble), so \eqref{...} prints "equation N" rather than "(N)".
\def\eqref#1{equation~\ref{#1}}
% Reference to an equation, upper case
\def\Eqref#1{Equation~\ref{#1}}
% A raw reference to an equation---avoid using if possible
\def\plaineqref#1{\ref{#1}}
% Reference to a chapter, lower-case.
\def\chapref#1{chapter~\ref{#1}}
% Reference to a chapter, upper case.
\def\Chapref#1{Chapter~\ref{#1}}
% Reference to a range of chapters
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
% Reference to an algorithm, lower-case.
\def\algref#1{algorithm~\ref{#1}}
% Reference to an algorithm, upper case.
\def\Algref#1{Algorithm~\ref{#1}}
% Reference to two algorithms, lower-case.
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
% Reference to two algorithms, upper case.
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
% Reference to a part, lower case
\def\partref#1{part~\ref{#1}}
% Reference to a part, upper case
\def\Partref#1{Part~\ref{#1}}
% Reference to two parts.
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
\def\1{\bm{1}}
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}

\def\eps{{\epsilon}}


% Random variables
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
% rm is already a command, just don't name any random variables m
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% Random vectors: upright bold letters, one macro \rv<letter> per letter.
% \rvi was previously mis-named \rvu (and then overwritten by the real \rvu
% further down), which left \rvi undefined.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% Elements of random vectors
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% Random matrices
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% Elements of random matrices
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% Vectors
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% Elements of vectors
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% Matrix
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% Tensor
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% Graph
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% Sets
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
% Don't use a set called E, because this would be the same as our symbol
% for expectation.
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% Entries of a matrix
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% entries of a tensor
% Same font as tensor, without \bm wrapper
\newcommand{\etens}[1]{\mathsfit{#1}}
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Named probability distributions. The subscripts are upright labels, set with
% \mathrm instead of the obsolete \rm font switch (same output in math mode).

% The true underlying data generating distribution
\newcommand{\pdata}{p_{\mathrm{data}}}
% The empirical distribution defined by the training set
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
% The model distribution
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
% Stochastic autoencoder distributions
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution

\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
% But then they seem to use $L^2$ for vectors throughout the site, and so does
% wikipedia.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
\let\ab\allowbreak


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Multi-View Independent Component Analysis\\ with Shared and Individual Sources}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<t.p.pandeva@gmail.com>?Subject=Your UAI 2023 paper}{Teodora Pandeva}{}}
\author[1]{Patrick Forr\'e}
% Add affiliations after the authors
\affil[1]{%
   AI4Science, AMLab\\
   University of Amsterdam\\
    The Netherlands
}
\affil[2]{%
   Swammerdam Institute for Life Sciences\\
   University of Amsterdam\\
    The Netherlands
}
  
  \begin{document}
\maketitle

\begin{abstract}
Independent component analysis (ICA) is a blind source separation method for linear disentanglement of independent latent sources from observed data. We investigate the special setting of noisy linear ICA, referred to as ShIndICA,  where the observations are split among different views, each receiving a mixture of shared and individual sources. We prove that the corresponding linear structure is identifiable and the sources distribution can be recovered. To computationally estimate the sources, we optimize a constrained form of the joint log-likelihood of the observed data among all views. Furthermore, we propose a model selection procedure for recovering the number of shared sources. Finally, we empirically demonstrate the advantages of our model over baselines. We apply  ShIndICA in a challenging real-life task, using three transcriptome datasets provided by three different labs (three different views). The recovered sources were used for a downstream graph inference task, facilitating the discovery of a plausible representation of the data's underlying graph structure.
\end{abstract}
\section{Introduction}\label{sec:intro}

Independent Component Analysis (ICA) is a method used to solve Blind Source Separation (BSS) problems \citep{comon1994independent}. The aim is to separate independent latent sources from mixed observed signals, thus revealing essential structures in various types of data. Historically, linear ICA has proven to be a successful approach in recovering spatially independent sources, such as regions of brain activity from magnetoencephalography (MEG) data \citep{vigario1997independent}, or functional MRI (fMRI) data \citep{mckeown1998independent}. The utility of ICA extends beyond neuroscience, with applications in omics data analysis \citep[e.g.,][]{zheng2008gene,zhou2018data,nazarov2019deconvolution,dubois2019refining,tan2020independent,rusan2020granular,cary2020application,aynaud2020transcriptional,urzua2021improving}. In these works, the interpretation of the latent sources hinges on the assumption that each experimental outcome is a linear mixture of different cell types, disease states, or other independent biological processes. For example, the gene expression data from a tumor biopsy might include signals from cancer cells, immune cells, and other cell types within the tumor microenvironment. Each of these cell types has distinct gene expression profiles that get mixed together in the observed data~\citep{avila2018computational}.

The rapid advancement of technology in the biomedical domain has provided a unique opportunity to find valuable insights from large-scale data integration studies. Many of these applications can be transformed into multiview BSS problems. A significant body of research has been devoted to developing multiview ICA methods focused on unraveling group-level (shared) brain activity patterns in multi-subject fMRI and EEG datasets \citep{salman2019group,huster2015group,congedo2010group,durieux2019partitioning,calhoun2001method}. However, these methods cannot be applied directly to problems where one is interested in retrieving both shared and view-specific signals. A scenario highlighting this limitation is when one wishes to investigate both individual-specific (view-specific) brain functions and shared phenotype patterns in individuals' brain activity in a natural stimuli experiment \citep{dubois2016building,bartolomeo2017botallo}.

Another application where the estimation of both shared and view-specific sources is essential is omics data integration \citep[see, e.g.,][]{smilde2017common,li2018review}. For instance, we often have datasets from different sources (such as gene expression, proteomics, etc.) collected under various conditions and perhaps from different cohorts of individuals. The shared signals across these datasets represent stable patterns, such as consistently expressed genes across different types of cells or universally present metabolic pathways. These shared patterns can help us understand the fundamental biological processes common to all cells. On the other hand, view-specific signals reflect unique patterns under particular conditions or within specific cohorts. For instance, some genes may be expressed only under certain stress conditions, or particular biological patterns may be unique to a specific cohort of individuals with a disease. These view-specific patterns can provide insights into the specific factors that differentiate one condition or cohort from another.
 \tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        
\begin{figure}
\centering
\begin{tikzpicture}[x=0.5pt,y=0.5pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,300); %set diagram left start at 0, and has height of 300

%Shape: Circle [id:dp26705628276222093] 
\draw   (266,63) .. controls (266,49.19) and (277.19,38) .. (291,38) .. controls (304.81,38) and (316,49.19) .. (316,63) .. controls (316,76.81) and (304.81,88) .. (291,88) .. controls (277.19,88) and (266,76.81) .. (266,63) -- cycle ;
%Shape: Circle [id:dp5268731154449309] 
\draw   (192,151) .. controls (192,137.19) and (203.19,126) .. (217,126) .. controls (230.81,126) and (242,137.19) .. (242,151) .. controls (242,164.81) and (230.81,176) .. (217,176) .. controls (203.19,176) and (192,164.81) .. (192,151) -- cycle ;
%Shape: Circle [id:dp9018137182109075] 
\draw   (344,153) .. controls (344,139.19) and (355.19,128) .. (369,128) .. controls (382.81,128) and (394,139.19) .. (394,153) .. controls (394,166.81) and (382.81,178) .. (369,178) .. controls (355.19,178) and (344,166.81) .. (344,153) -- cycle ;
%Shape: Circle [id:dp1311990761525299] 
\draw   (267,204) .. controls (267,190.19) and (278.19,179) .. (292,179) .. controls (305.81,179) and (317,190.19) .. (317,204) .. controls (317,217.81) and (305.81,229) .. (292,229) .. controls (278.19,229) and (267,217.81) .. (267,204) -- cycle ;
%Straight Lines [id:da549974006261108] 
\draw    (291,88) -- (291.98,177) ;
\draw [shift={(292,179)}, rotate = 269.37] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da056450731626277895] 
\draw    (236,169) -- (266.47,194.71) ;
\draw [shift={(268,196)}, rotate = 220.16] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Straight Lines [id:da7132205635632564] 
\draw    (350,170) -- (318.53,196.71) ;
\draw [shift={(317,198)}, rotate = 319.69] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29)   ;
%Shape: Rectangle [id:dp9450710015089508] 
\draw   (135,113) -- (449,113) -- (449,245) -- (135,245) -- cycle ;

% Text Node
\draw (280,56) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle s_0$};
% Text Node
\draw (206,145) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle s_{d}$};
% Text Node
\draw (359,147) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle \epsilon _{d}$};
% Text Node
\draw (280,198) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle x_{d}$};
% Text Node
\draw (340,220) node [anchor=north west][inner sep=0.75pt]   [align=left] {$\displaystyle d=1,\ldots,D$};
\end{tikzpicture}
\caption{A graphical representation of Equation~\ref{eq:model}, where $x_d$ is the observed variable, $s_0$ denotes the shared sources, $s_d$ the view-specific ones, and $\epsilon_d$ is the Gaussian noise.}
\label{fig:model}
\end{figure}

\textbf{Summary.} To address these and similar scientific applications, we formalize the described multi-view BSS problem as a linear noisy generative model for a multi-view data regime, assuming that the mixing matrix and a number of individual sources are view-specific. We call the resulting model ShIndICA. By requiring that the sources are non-Gaussian and mutually independent and that the linear mixing matrices have full column rank, we provide identifiability guarantees for the mixing matrices and latent sources in distribution. We maximize the joint log-likelihood of the observed views to estimate the mixing matrices. Furthermore, we provide a model selection criterion for selecting the correct number of shared sources. Finally, we apply ShIndICA to a data integration  problem of two large transcriptome datasets. We show empirically that our method works well when the estimated components are used for a graph inference task.
 
 \textbf{Contributions.} We summarize our contributions:
\begin{enumerate}
\item We propose a new multi-view generative BSS model with shared and individual sources called ShIndICA.
    \item We provide theoretical guarantees for the identifiability of the recovered linear structure and the source and noise distributions, as well as their dimensions.
    \item We derive the closed-form joint likelihood of ShIndICA, which is used for estimating the mixing matrices.
    \item By leveraging the generative model assumptions, we propose a selection criterion for inferring the correct number of shared sources.
\end{enumerate}

\section{Problem Formalization}
\label{sec:method}
Consider the following $D$-view multivariate linear BSS model (see Figure~\ref{fig:model} for a graphical representation), where for each view $d\in\{1,\ldots, D\}$:
\begin{empheq}[box=\tcbhighmath]{align}
   x_d = A_d(\tilde{s}_d +\epsilon_d)=A_{d0}s_0+A_{d1}s_d+A_d\epsilon_d,
    \label{eq:model}
\end{empheq}
and it holds that
\begin{enumerate}
    \item $x_d\in\mathbb{R}^{k_d}$ is a random vector,
    \item $\tilde{s}_d = (s_0^\top,s_d^\top)^\top $ are latent non-Gaussian  sources  with  $\E[\tilde{s}_d]=0$ and $\Var[\tilde{s}_d] = \mathbb{I}_{k_d},$ and $s_0\in\mathbb{R}^c$ and $s_d\in\mathbb{R}^{k_d-c}$  the shared and  individual sources,
    \item $A_d \in \mathbb{R}^{k_d\times k_d}$ is an invertible mixing matrix, $A_{d0}$ and $A_{d1}$ are the columns corresponding to the shared and individual sources,
    \item $\epsilon_d \sim \mathcal{N}(0,\sigma^2 \mathbb{I}_{k_d})$ is Gaussian noise,
    \item   all latent random variables are mutually independent.
\end{enumerate}

We name the proposed generative model ShIndICA. This model builds upon the MultiViewICA model by \cite{richard2020modeling}, which  assumes the presence of shared sources exclusively.  The Gaussian noise in Equation \ref{eq:model} mirrors a device measurement error with variance $\sigma^2 A_dA_d^\top$, akin to \citep{richard2020modeling,richard2021shared}. We choose this configuration as it yields a closed-form joint data likelihood (see Section~\ref{sec:loss}), which is not available for alternative representations. Assumption~$5$ maintains that noise does not interact with the true signal, a concept typical for measurement error models. Moreover, when $D=1$, ShIndICA reverts to a standard linear ICA model, solved by \citet{comon1994independent,bell1995information,hyvarinen2000independent} for independent non-Gaussian latent sources $z:=\tilde{s}_1+\epsilon_1$.

 \section{Identifiability Results}
 \label{sec:ident}
Due to the absence of labels in unsupervised  learning, the algorithm's reliability cannot be directly confirmed outside of simulations. Therefore, we rely on theoretical guarantees to trust the algorithm's estimation of quantities of interest. In the case of BSS problems, it is necessary for sources and mixing matrices to be uniquely determined (or \textit{identifiable}) by the data, particularly when dealing with large samples.
 
 Identifiability results for noiseless single-view ICA are proved by \cite{comon1994independent}. It turns out that if at most one of the latent sources is normal and the mixing matrix is invertible, then both the mixing matrix and sources can be recovered \emph{almost surely} up to permutation, sign, and scaling. However, this result does not hold in the general additive noise setting.  \cite{davies2004identifiability} shows that if the mixing matrix has full column rank, then the structure is identifiable, but not the latent sources.

By employing the multi-view ($D\geq 2$)  noisy setting defined in Equation~\ref{eq:model}, we extend the results by \cite{kagan1973characterization,comon1994independent,davies2004identifiability,richard2020modeling}. Compared to previous work, we provide identifiability guarantees not only for the mixing matrices up to sign and permutation, but also for the \emph{source and noise distributions} (up to the same sign and permutation), and the dimensions of the latent (both shared and individual) sources.\footnote{Note that the identifiability of the source distributions is a weaker notion of identifiability than the almost sure one (i.e., recovering the exact sources) in the noiseless case~\citep{comon1994independent}.} Moreover, our identifiability statement holds for a more general case than Equation~\ref{eq:model}, where the noise distribution can be view-specific and the mixing matrices can be non-square.  This is stated in Theorem~\ref{cor:identmultiview}, proved in Section~1 of the Supplementary Material:
 \begin{theorem}
\label{cor:identmultiview}
  Let $x_1, \ldots, x_D$ for $D\geq 2$ be random vectors with the following two representations:
 \begin{align*}
  A_{d}^{(1)} \Big(\matb{s_0^{(1)}\\s_d^{(1)}}+\epsilon_d^{(1)}\Big)  = x_d = A_{d}^{(2)}\Big( \matb{s_0^{(2)}\\s_d^{(2)}}+\epsilon_d^{(2)}\Big),
 \end{align*}
 where $d\in\{1,\ldots, D\}$, with the following properties for $i=1,2$
 \begin{enumerate}
     \item $A_d^{(i)} \in \mathbb{R}^{p_d\times {k_d^{(i)}}}$ is a (non-random) matrix with full column rank, i.e. $\operatorname{rank}(A_d^{(i)})=k_d^{(i)},$
     \item $\epsilon_d^{(i)}\in \mathbb{R}^{k_d^{(i)}}$ and $\epsilon_d^{(i)}\sim \mathcal{N}(0,\sigma_d^{(i)2}\mathbb{I}_{k_d^{(i)}})$ is a $k_d^{(i)}$-variate normal random variable,
      \item $\tilde{s}_d^{(i)} = (s_0^{(i)\top}, s_d^{(i)\top})^\top$ with $s_0^{(i)} \in \mathbb{R}^{c^{(i)}}$ and $s_d^{(i)} \in \mathbb{R}^{k_d^{(i)}-c^{(i)}}$ is a random vector such that:
        \begin{enumerate}
            \item the components of $\tilde{s}_d^{(i)}$ are mutually independent and each of them is a.s. a non-constant random variable,
            \item $\tilde{s}_d^{(i)}$ is non-normal with $0$ mean and unit variance.
        \end{enumerate}
        
    \item $\epsilon_d^{(i)}$ is independent from $s_0^{(i)}$ and $s_d^{(i)}$: $\epsilon_d^{(i)} \indep s_0^{(i)}$ and  $\epsilon_d^{(i)} \indep s_d^{(i)}$.
 \end{enumerate}
 
   Then,  the number of shared sources is identifiable, i.e. $c^{(1)}=c^{(2)}=:c$ and for all $d=1, \ldots, D$ we get that $k_d^{(1)}=k_d^{(2)}=:k_d,$  and there exist a sign matrix $\Gamma_d$ and a permutation matrix  $P_d\in \mathbb{R}^{k_d\times k_d}$ such that:
 \begin{align*}
     A^{(2)}_d = A^{(1)}_dP_d\Gamma_d,
 \end{align*}
 and furthermore, the source and noise distributions are identifiable, i.e.
 \begin{align*}
     \matb{s_0^{(2)}\\s_d^{(2)}} &\sim \Gamma_d^{-1} P_d^{-1} \matb{s_0^{(1)}\\s_d^{(1)}}, & 
   \sigma_d^{(2)}  &= \sigma_d^{(1)}.
 \end{align*}

 
 \end{theorem}

 Note that the requirement $D\geq 2$ is essential for the identifiability of the non-Gaussian latent source  and noise distributions. In contrast, in a single-view scenario, \cite{kagan1973characterization} demonstrates that identifying any arbitrary non-Gaussian source distribution is infeasible unless we introduce an additional constraint mandating the latent sources to have non-normal components (refer to Theorem A.2).\footnote{A random variable $x$ is said to have non-normal components if, for every representation $x\sim v +w$ with $v \indep w$, both $v$ and $w$ are non-normal.}
 
 Furthermore, to identify the linear structure, it is necessary to assume the non-normality of the latent sources—a standard presumption in ICA literature \citep{comon1994independent}, as previously mentioned. In a multi-view shared ICA scenario, \cite{richard2021shared} posits that the sources can be Gaussian if we impose additional conditions regarding the diversity of noise distributions. However, this is not relevant in our case as we do not incorporate these assumptions in our model.
  \section{Joint Data Log-Likelihood}
\label{sec:loss}
Here, we derive  the joint log-likelihood of the observed views which we use for estimating the mixing matrices. Following the standard ICA approaches \citep{bell1995information,hyvarinen2000independent}, instead of optimizing directly for the mixing matrices $A_d$, we estimate their inverses $W_d=A_d^{-1}$, called unmixing matrices. 

Let $z_d:=W_dx_d=\tilde{s}_d +\epsilon_d,$ and $z_{d,0}:=s_0 +\epsilon_{d0}\in \mathbb{R}^{c}$ and $z_{d,1}:=s_d +\epsilon_{d1} \in \mathbb{R}^{k_d-c},$ i.e. $z_d = (z_{d,0}^{\top}, z_{d,1}^{\top})^\top$ are the estimated noisy sources of the $d$-th view. Furthermore, let $p_{Z_{d,1}}$ be the probability distribution of $z_{d,1}$ and $|W_d|=|\det W_d|$. Then we can derive the data log-likelihood of  Equation \ref{eq:model} for $N$ observed samples per view (proved  in Section~2 of the Supplementary Material), given by
\begin{align}
\label{eq:unconstrained}
&\mathcal{L}(W_1, \ldots, W_D) =\sum_{i=1}^N\Big(\log f(\bar{z}_0^i) +  \sum_{d=1}^D \log  p_{Z_{d,1}}(z_{d,1}^{i})\Big) \\ \notag
  &  - \frac{1}{2\sigma^2} \sum_{d=1}^D  \Big( \operatorname{trace}(  Z_{d,0}Z_{d,0}^{\top}) -\frac{1}{D} \sum_{l=1}^D \operatorname{trace}(  Z_{d,0}Z_{l,0}^{\top}) \Big)\\ \notag
  &+ N\sum_{d=1}^D \log\vert W_d\vert +C 
\end{align}
where  $Z_{d,0}\in\mathbb{R}^{c\times N}$ for $d=1,\ldots, D$ is the data matrix that stores $N$ observations of $z_{d,0}$.

The first term in Equation~\ref{eq:unconstrained} refers to data log-likelihood of the estimated shared sources $\bar{z}_0^{i}=\sum_{d=1}^D z_{d,0}^{i}/D$ with density  $f(\bar{z}_0) = \int \exp\Big(-\dfrac{D\Vert s_0-\bar{z}_0\Vert^2} {2\sigma^2}\Big)p_{S_0}(s_0)ds_0$; the second term is the  data log-likelihood of the view-specific sources. The second line in the above equation can be further simplified by assuming that the data matrices $X_1\in\mathbb{R}^{k_1\times N}, \ldots, X_D\in\mathbb{R}^{k_D\times N}$ are whitened.

 \paragraph{Joint data log-likelihood after whitening.} Whitening is a data pre-processing procedure  that consists  of linearly transforming the random variables' realizations $x_d$ such that the resulting variable $\tilde{x}_d = K_dx_d$ has  uncorrelated components with unit variance, i.e.\ $\mathbb{E}[\tilde{x}_d\tilde{x}_d^\top ]=\mathbb{I}_{k_d}$, where $K_d$ is the whitening matrix. This step transforms the mixing matrix into an orthogonal one, $\tilde{A}_d$.
 
After whitening, the joint data  log-likelihood has the form
 
 
 \begin{align}
\label{eq:opt}
 \mathcal{L}(\tilde{W}_1, \ldots, \tilde{W}_D) &\propto\sum_{i=1}^N \Big(\log f_{\sigma}(\tilde{\bar{z}}_0^i) +\sum_{d=1}^D \log  p_{\tilde{Z}_{d,1}}(\tilde{z}_{d,1}^{i}) \Big)\\ \notag  
 & + \frac{1+\sigma^2}{2D\sigma^2}\sum_{d=1}^D \sum_{l=1}^D \operatorname{trace}(  \tilde{Z}_{d,0}\tilde{Z}_{l,0}^{\top}  ),
\end{align}

where, analogously to Equation~\ref{eq:unconstrained}, we abbreviate ${\tilde{z}_{d}=(\tilde{z}_{d,0}^{\top}, \tilde{z}_{d,1}^{\top})^\top =\tilde{W}_d \tilde{x}_d}$ and $\tilde{\bar{z}}_0^{i}=\sum_{d=1}^D \tilde{z}_{d,0}^{i}/D$. Note that after whitening $\operatorname{trace}(\tilde{Z}_{d,0}\tilde{Z}_{d,0}^\top)=c$ and $|\tilde{W}_d|=1$, and thus the corresponding terms vanish from Equation~\ref{eq:opt} (see Section~2 of the Supplementary Material for detailed derivations). 

\paragraph{Understanding the loss.} In our work, we use Equation~\ref{eq:opt} for parameter estimation. The first line illustrates the log-likelihoods of the sources, while the second line functions as a regularization term, essential for uncovering the shared information among views. This regularization term maximizes the alignment between the estimated noisy shared signals across each pair of views.

\paragraph{Optimization.}  The densities of both the shared and the individual sources, denoted by $f_{\sigma}(\cdot)$ and $p_{\tilde{Z}_{d,1}}$, are unknown and, thus, approximated by a \textit{prior} nonlinear function $g(s)$. For instance, we  use $g(s)=-\log\cosh(s)$ in our experiments. Importantly, we do not directly optimize for the noise variance $\sigma^2$; instead, we establish it as a Lagrange multiplier through the relationship $\lambda=\frac{1+\sigma^2}{\sigma^2}$.\footnote{Finally, after training we compute the mixing matrices $\hat{A}_d$ by setting $\hat{A}_d=K^{-1}_d\tilde{W}_d^{\top}$ (recall that $\tilde{W}_d$ is orthogonal, so $\tilde{W}_d^{-1}=\tilde{W}_d^{\top}$).  Thus, we recover the true ones $A_d$ up to scaling with $(1+\sigma^2)^{\frac{1}{2}}$, sign and column  permutation.}
\section{Model Selection}
\label{subsec:modsel}

The selection of the number of shared sources $c$ can be accomplished in an unsupervised manner, leveraging the assumptions of the data generation model. To be precise, we consider $k<k_d$ for all $d=1,\ldots,D$ as a potential candidate for the unknown shared number of sources $c$. Recall that $z_d=W_dx_d$ where $(z_{d,0}^\top, z_{d,1}^\top)^\top$ are the estimated sources for the $d$-th view, and $z_{d,0}\in\mathbb{R}^k$. To signify the dependency of $z_{d,0}$ on $k$, we write it as $z_{d,0}^{(k)}$. We introduce a goodness-of-fit measure called the Normalized Reconstruction Error (NRE), designed to evaluate the quality of $z_{d,0}^{(k)}$ for varying values of $k$, defined as:
\begin{align*}
\operatorname{NRE}(k):= \sum_{d=1}^D \frac{\Vert \hat{z}_{d}^{(k)} \Vert^2}{k},
\end{align*}
where  we  make use of the variable ${\hat{z}_{d}^{(k)} = z_{d,0}^{(k)}-\Bar{z}_0^{(k)}=z_{d,0}^{(k)} - \frac{1}{D}\sum_{l=1}^D z_{l,0}^{(k)}}$. Now, if we assume $k^*$ to be the correct guess (i.e. $k^*=c$) and that the true unmixing matrices $W_d$'s have been estimated, the $z_{d,0}^{(k^*)}$ for $d=1,\ldots, D$ become well-aligned. Consequently, $\hat{z}_{d}^{(k^*)}$ follows a normal distribution with a mean $\mathbb{E}[\hat{z}_{d}^{(k^*)}]=0$ and variance $\Var(\hat{z}_{d}^{(k^*)})=\frac{D-1}{D}\sigma^2 \mathbb{I}_{k^*}$ for each $d=1,\ldots, D.$ We can then relate this to the log-likelihood of $\hat{z}_{d}^{(k^*)}$:
\begin{align*}
\operatorname{NRE}(k^*)&:= \sum_{d=1}^D \frac{\Vert \hat{z}_{d}^{(k^*)} \Vert^2}{k^*}\stackrel{+}{\propto}- \sum_{d=1}^D \frac{\log p(\hat{z}_d^{(k^*)})}{k^*}\\
&=\sum_{d=1}^D \frac{D \Vert \hat{z}_{d}^{(k^*)} \Vert^2}{2(D-1)\sigma^2 k^*}-\frac{\cancel{k^*}\log(2\pi\sigma^2)}{2\cancel{k^*}}.
\end{align*}
The two quantities differ by translation and multiplication with constants (indicated with $\stackrel{+}{\propto}$). Thus, $\operatorname{NRE}(k^*)$ approaches $(D-1)\sigma^2$ in the large sample size limit, i.e. ${\operatorname{NRE}(k^*)\approx D\cdot\frac{D-1}{D}\frac{\sigma^2\cdot k^*}{k^*}}$.  Overestimation of shared sources by choosing $k>c$ breaches the generative model assumptions, indicating that the estimated $z_{d,0}^{(k)}$ are misaligned, leading to a $\hat{z}_{d}^{(k)}$ with ${\operatorname{NRE}(k)>(D-1)\sigma^2 \approx \operatorname{NRE}(k^*)}$. For $k<c$, our method outputs well-aligned $\hat{z}_{d}^{(k)}$ leading to ${\operatorname{NRE}(k)\approx \operatorname{NRE}(k^*)}$. Hence, we can apply an elbow method or the following procedure to select the optimal parameter.

\textbf{Calculating NRE.} We repeat the following procedure for various $k$:
\begin{enumerate}
    \item We divide the data (applicable to each view) into a training set and a test set, with sizes $N_0$ and $N_1$, respectively, which do not necessarily have to be equal.
    \item We then estimate the unmixing matrices for a fixed $k$ on the training set and estimate the shared sources on the test data.
    \item  We calculate the mean $\operatorname{NRE}(k)$ on the recovered test shared sources (not on the training set due to potential overfitting).
\end{enumerate}
We choose the largest of all values of $k$ that minimize the NRE:
\begin{align*}
k^*=\max\{\arg\min_k \overline{\operatorname{NRE}}(k)\},
\end{align*}
where
 \begin{align*}\overline{\operatorname{NRE}}(k)=\frac{1}{N_1} \sum_{i\leq N_1} \operatorname{NRE}(k)_i=\frac{1}{N_1} \sum_{i\leq N_1}\sum_d \frac{\Vert \hat{z}_{d}^{(k)i} \Vert^2}{ k}
 \end{align*}
is the average NRE score over all observed test samples. This score serves as a measure of the model's goodness of fit and indicates how well the true shared sources can be reconstructed from the unseen data. Even with $k\ll c$, we can still achieve high-quality shared sources due to model fitting, as we will demonstrate empirically. Consequently, we prefer to select the highest possible $k$ that yields the minimum average shared source reconstruction error.


\section{Related Work}
  The existing body of work on linear multi-view BSS, inspired by the ICA literature, considers mostly \emph{shared} response model applications (i.e., no individual sources), some of them adopting a maximum likelihood approach  \citep{guo2008unified,richard2020modeling,richard2021shared} to model the noisy views of the proposed models. Other methods, such as independent vector analysis (IVA), relax the assumption about the shared sources by assuming that  they have the same first or higher order moments across views \citep{lee2008independent,anderson2011joint,via2011maximum,anderson2014independent,engberg2016independent}. Many of these approaches, such as Group ICA \citep{calhoun2001method},  shared response ICA (SR-ICA) \citep{zhang2016searchlight}, MultiViewICA \citep{richard2020modeling}, and ShICA~\citep{richard2021shared},  incorporate a dimensionality reduction step for every view (CCA \citep{varoquaux2009canica,richard2021shared} or PCA) to extract the mutual signal between the multiple objects before applying an ICA procedure on the reduced data. However, there are no guarantees that the pre-processing procedure will entirely remove the influence of the object-specific sources on the transformed data.   In the ICA literature, there exist three methods for extracting shared and individual sources from data.  \cite{maneshi2016validation} proposes a heuristic way of using FastICA for the given task without discussing the identifiability of the results; \cite{long2020independent} suggests applying ICA on each view separately followed by statistical analysis to separate the individual from the shared sources;   \cite{lukic2002ica} exploits temporal correlations rather than the non-Gaussianity of the sources and thus is not applicable in the context we are considering.

A common tool for analyzing multi-view data is canonical correlation analysis (CCA), initially proposed by \cite{hotelling1992relations}. It finds two datasets' projections that maximize the correlation between the projected variables. Gaussian-CCA \citep{bach2005probabilistic}, its kernelized version \citep{bach2002kernel} and deep learning \citep{andrew2013deep} formulations of the classical CCA problem aim to recover shared latent sources of variations from the multiple views. There are extensions of CCA that model the observed variables as a linear combination of group-specific and dataset-specific latent variables: estimated with Bayesian inference methods  \citep{klami2013bayesian} or exponential families with MCMC inference \citep{seppo2010bayesian}. However, most of them assume that the latent sources are Gaussian or non-linearly related to the observed data \citep{salzmann2010factorized,kang2011restricted,wang2016deep} and thus lack identifiability results. 

Existing non-linear multi-view versions, such as those of \cite{tian2020contrastive,federici2020learning}, cannot recover both shared and individual signals across multiple measurements and do not assure the identifiability of the proposed generative models. There are identifiable deep non-linear versions of ICA (e.g., \citealp{hyvarinen2019nonlinear}) which can be employed for this task. However, their assumptions for achieving identifiability  are often hard to satisfy in real-life applications, especially in biomedical domains with low-data regimes. 

\section{Experiments}
\label{sec:exp}
 
\paragraph{Model Implementation and Training.}  We used the Python library \texttt{pytorch} \citep{paszke2017automatic} to implement our method. We model each view with a separate unmixing matrix. To impose  orthogonality constraints on the unmixing matrices, we made use  of the \texttt{geotorch} library, which is an extension of \texttt{pytorch} \citep{lezcano2019trivializations}. The gradient-based method applied for training is L-BFGS. Before running any of the ICA-based methods we whiten every single view by performing PCA to speed up computation.   We estimate the mixing matrix up to scale (due to the whitening) and permutation (see Sections~\ref{sec:ident} and~\ref{sec:loss}). To ensure that the algorithm consistently outputs the shared sources in the same order across all views, we initialized the unmixing matrices using Canonical Correlation Analysis (CCA).  We made this decision based on the property of CCA weights to constitute orthogonal matrices and the fact that they facilitate pairing and ordering of the transformed components across all views. For all conducted experiments, we fixed the parameter $\lambda=1$, where $\lambda := \frac{1+\sigma^2}{\sigma^2}$.  The code for our method is publicly available at  \url{https://github.com/tpandeva/shindica/}.


 \paragraph{Baselines Implementation.}  We benchmark our ShIndICA against the standard single-view ICA method, Infomax \citep{ablin2018faster}. To adapt Infomax to the multi-view context, we apply it independently to each view. To match the shared components across different views, we use the Hungarian algorithm \citep{Kuhn55thehungarian} on their cross-correlation. For settings that involve a shared response model, we draw comparisons between ShIndICA and related methods such as MultiViewICA \citep{richard2020modeling}, ShICA, ShICA-ML \citep{richard2021shared}, and GroupICA, as proposed by \cite{richard2020modeling}. GroupICA incorporates a two-step preprocessing procedure that initially whitens the data in the single views, followed by a dimensionality reduction step on the joint views.

 \begin{figure*}
    \centering
    \includegraphics[scale=0.09]{ shared_all.png}
    \caption{We generate data with 50 individual and 50 shared sources (annotated by a dashed line) for $D=2,5,10,20$ and noise standard deviation $\sigma=0.1,0.5,1$. We train the model with varying $k$ (x-axis), and compute the average Amari distance (left plot) and the MCC (right plot) of the estimated shared sources and the ground truth ones. While the Amari distance suggests that we get the best mixing matrix estimates  when we ``guessed'' the right number of sources, the shared sources MCC plot shows that we can  estimate the true shared sources with high quality if $D$ is large enough also for overestimated $c$. }
    \label{fig:sharedsyn}
\end{figure*}


\begin{figure}[tbh]
    \centering
    \includegraphics[scale=0.07]{ noiseless.png}
    \caption{ShIndICA's (this paper) performance is compared with ShICA, Infomax, GroupICA, MultiViewICA, and ShICA-ML. Datasets from two views, each with 100 sources and 1000 samples, are used. The known number of shared sources varies from 10 to 100 (x-axis). The Amari distance (y-axis) measures accuracy with lower values indicating better performance. ShIndICA consistently outperforms the other methods.}
    \label{fig:noiseless}
\end{figure}
   \begin{figure*}
    \centering
    \includegraphics[scale=0.08]{ nre_all_new.png}
    \caption{We generate data with 100 sources, of which 50 are shared (dashed line) for different views $D=2,5,10,20$ and noise standard deviations $\sigma=0.1,0.5,1$. We compute the NRE on the test and train data for different candidates of  $c=10,\ldots,100$. If we ``overestimate'' the number of shared sources we see that the NRE score increases.}
    \label{fig:nse}
\end{figure*}
\subsection{Synthetic Experiments}
\label{subsec:syn}
\paragraph{Data Simulation.}  We simulated the data using the Laplace distribution $\exp(-\frac{1}{2}\vert x \vert)$. The mixing matrices are sampled with entries following a normal distribution with a mean of $1$ and a standard deviation of $0.1$. The realizations of the observed views are obtained according to the proposed model. In the different scenarios described below, we vary the noise distribution.  We conducted each experiment $50$ times and based on that we provided  error bars in  all figures where applicable.  Additional experiments are provided in Section~4 of the  Supplementary Material.

\paragraph{Motivational Example: Noiseless Views.} This example illustrates the advantage of our method compared to the other multi-view ICA methods for modeling shared and individual sources. In Figure~\ref{fig:noiseless}, we consider a noiseless view setting, where we fixed the dimension to be 100 and we vary the number of shared sources $c$ from 10 to 100 in a two-view setting. We fit a model for every $c$ which is considered to be known. The quality of the mixing matrix estimation is measured  with the Amari distance \citep{amari1995new}, which vanishes if the estimated matrix differs from the ground truth one only up to scale and permutation.  We can see that as soon as the ratio of shared sources to individual sources gets around 1:1 we can recover the mixing matrices with very high accuracy (the Amari distance is almost 0), compared to the baseline methods which cannot perform well in this setting. Moreover, even in the case when all sources are shared, i.e. the baselines' model assumption is satisfied, our method performs as well as MultiViewICA which is a state-of-the-art model designed for this  task.   More experiments on the noisy views are provided in Section~4 of the Supplementary Material.

\paragraph{Shared Sources Estimation.}
This experiment illustrates ShIndICA's performance when the number of sources is unknown beforehand and user-defined. We generate a dataset comprising 50 shared and 50 individual sources from $D=2,5,10,20$ views with noise standard deviations of $\sigma=0.1,0.5,1$. The number of shared sources given for training is varied from 10 to 100 and a model is trained on each dataset for every choice of this hyperparameter. The results are summarized in Figure \ref{fig:sharedsyn}, where the x-axis indicates the number of shared sources given for the training. The line colors and styles denote the number of views and noise distribution used for the data generation. We first evaluate ShIndICA's overall performance using the Amari distance between the estimated and actual mixing matrices. The left plot of Figure \ref{fig:sharedsyn} shows the Amari distance reaching its lowest when the correct value of $c$ is guessed.
Next, we evaluate the quality of the recovered shared sources (the average shared sources across all views, $\bar{z}_0$) by calculating the mean cross-correlation (MCC) between the estimates and the actual sources. This involves aligning the ground truth components with the estimated ones using the Hungarian algorithm and computing the mean correlations between these matched pairs. The right plot in Figure \ref{fig:sharedsyn} implies that even with high noise variance, high-quality estimates of the shared sources (high MCC scores) are attainable given sufficient views. This holds even when $c$ is overestimated.

\begin{figure}
         \centering
         \includegraphics[scale=0.07]{shared_noisy.png}
         \caption{ The data is generated according to a model where no individual sources are present and the noise per view is uniformly sampled from the interval $[1,2]$. The number of views is set to 10, and the sample size is 1000. We vary the number of  sources from 10 to 50 (x-axis). ShIndICA and MultiViewICA  have the best Amari distance compared to the other methods. }
         \label{fig:noisy}
     \end{figure}
   
\paragraph{Model Selection.}
The preceding experiment indicates the critical role of hyper-parameter $c$ in ShIndICA's training and performance. The NRE score, as introduced in Section~\ref{subsec:modsel}, aids in selecting the correct number of sources by serving as a goodness-of-fit measure. The same data generation models as before are considered. Each model is trained with various shared sources $k$. Figure~\ref{fig:nse} summarizes the results: the x-axis refers to the hyper-parameter $k$ used for training, and the y-axis denotes the corresponding NRE score on both the training and test data (each with a sample size $N=1000$), distinguished by the line style. Notably, in all scenarios, the NRE remains low if the number of sources is underestimated and it rises upon overestimating $c$, particularly when the noise variance is low (left plot). Furthermore, due to overfitting, the NRE score on the training data is increasing with increasing $k$.  Therefore, the NRE on the test data is more suitable for model selection than the NRE on the training data. In contrast, the NRE score on the test data remains stable and attains its minimum at the correct number of sources.

     
\paragraph{Robustness to Model Misspecification in a Shared Response Model Application.} Here we want to investigate the robustness of our model  when the noise has a view-specific variance. To provide a fair comparison to the baseline methods, we apply our method to a shared response setting, i.e. no individual sources are available. For this experiment the view-specific variances are uniformly sampled from $[1,2]$, the number of views is $10$ and the number of sources varies from 10 to 50. Figure \ref{fig:noisy}  shows that  ShIndICA and  MultiViewICA  show consistently the best model performances (lowest Amari distance between estimates and ground truth matrices) compared to the other methods.


\subsection{Data Fusion of Transcriptome Data}
\label{sub:real}
\paragraph{Background and Data Generation Assumption.} Transcriptome datasets are relevant to the field of genomics.  After preprocessing they have the form of  random data matrices, where each row corresponds to a gene and each column refers to an experiment. Based on these datasets, scientists try to infer gene-gene interactions in the genome. Combining as many datasets as possible enables getting better gene regulatory predictions. This is a challenging task due to the batch effects (non-biological noise) in the data. 

Consider the cocktail party problem, a classical application of Independent Component Analysis (ICA). Imagine a room filled with speakers and several microphones placed strategically to record everyone. Each microphone picks up a different combination of all voices. The objective of ICA, in this context, is to isolate the original speakers' speeches from the mixed microphone signals. Analogously, we can interpret a transcriptome dataset  as the microphones in this scenario. Each microphone represents an experimental condition, and similar to how individuals act independently at a party, we posit that the regulators in the cell operate independently. Therefore, each column of the transcriptome data represents a mixed measure of transcriptional regulator effects. This hypothesis has found support in several studies, such as \citep{sastry2019escherichia,sastry2021independent,fraunhoffer2022multi}. Upon applying ICA, we aspire to recover the sources symbolizing these independent regulatory pathways.

To formalize, let $X\in \mathbb{R}^{n\times p}$ denote a transcriptome data matrix with $n$ samples (or experimental outcomes) and $p$ genes. We assume that the transcriptome matrix follows a linear latent model, meaning there exist matrices $A\in\mathbb{R}^{n\times k}$ and $S\in\mathbb{R}^{k\times p}$ such that $X=AS.$ The $k$ components are representations of gene expression levels. If a group of genes appears over or under-expressed in a specific component, they are typically assumed to share a functional property in the genome. Furthermore, if these components operate independently, it implies that they correspond to distinct genetic pathways. In other words, sets of genes that demonstrate overexpression or underexpression in these components are assumed to act independently from each other under the observed experimental conditions.
\begin{figure}
    \centering
    \includegraphics[scale=0.3]{data_int.pdf}
    \caption{ICA for Data Fusion. We are given two views $X_1$ and $X_2$ that share information in terms of gene activity. The common information here is the activation of the orange and blue networks. There are  view-specific signals: the green ($X_1$) and the purple ($X_2$) networks. The goal of the data integration task is to extract the shared signal $S_0$ and the view-specific sources $S_1$ and $S_2$.}
    \label{fig:icadf}
\end{figure}
\paragraph{ICA for Data Fusion.} We combine noisy transcriptome datasets by considering that the different datasets (or views) share information but also have view-specific ones. The shared information can be the activity of regulatory pathways invariant across conditions, such as housekeeping genes. In the example shown in Figure \ref{fig:icadf}, we have two noisy datasets $X_1$ and $X_2$, presumably provided by different labs. The shared information is the orange and blue networks. After successfully combining the two views, we would like to retrieve these shared sources denoted by $S_0$ and the dataset-specific ones $S_1$ and  $S_2$ efficiently. These are colored green and purple in our diagram.


\paragraph{Datasets.} In this example, we consider the bacterium \textit{Bacillus subtilis}, the best-studied Gram-positive bacterium, which can be found in the soil and human intestines. Our goal is to  combine three  publicly available datasets (views) provided by different labs \citep{nicolas2012condition,arrieta2015experimentally,Sastry2021.07.01.450581}. Each of the datasets contains gene expression levels of about 4000 genes measured across more than 250 experimental outcomes. For a detailed description of the datasets refer to Section~3 of the Supplementary Material.

\begin{figure}
    \centering\includegraphics[scale=0.4]{ nre_omics_3v.png}
    \caption{The NRE score computed on test data for the transcriptome data for various $k$. This procedure was repeated $50$ times and the error bars represent the estimated $95\%$ confidence interval.}
    \label{fig:omicsshared}
\end{figure}

\paragraph{Model Selection.}
For the data fusion task, we do not have any prior knowledge about the shared information between the three datasets (three views). Therefore, we utilize the model selection procedure in Section \ref{subsec:modsel} to choose the number of shared sources. In this case, we randomly split the data into train and test sets with proportions 3:1. We estimated the mixing matrices on the train data for different~$k$. We reconstruct the test set shared sources and compute the corresponding NRE scores. This procedure is repeated 50 times for different splits and the results are displayed in Figure \ref{fig:omicsshared}. The NRE score reaches its minimum for $k=4$, indicating the number of shared sources. 


\paragraph{Data Integration for Co-Regulation Inference.} 
Often, the primary objective of transcriptome data fusion studies is to predict novel gene-gene interactions from the data, such as co-regulation patterns. Here, we consider an application for estimating an undirected graph where the nodes represent the genes, and the edges indicate that these genes have a common regulator. Given the high-dimension-low-sample-size nature of transcriptome datasets (i.e., the number of genes exceeds the number of samples), it is often challenging to discern meaningful relationships. A tool like the Graphical Lasso (glasso) \citep{friedman2008sparse} becomes particularly valuable here, as it is well-suited for uncovering the underlying graphical structure of the observed data. The output of glasso is a precision matrix from which the graphical structure can be directly determined: each non-zero entry from the precision matrix indicates an edge between the corresponding variables.

In this scenario, instead of feeding the ``raw'' data samples directly into the  glasso, we utilize the components generated by the data integration algorithm. The rationale behind this is that the combined data, resulting from the integration of multiple datasets, can provide a more comprehensive and accurate picture, which, in turn, can enhance the performance of the glasso in identifying the true co-regulation relationships among genes.
\begin{figure}
         \centering 
         \includegraphics[scale=0.09]{ glasso_3v.png}
    \caption{We compare the top ten models with ShIndICA and Infomax. We order the edges from the  networks according to their strength. We count the true positives (y-axis) and (possibly) false positive edges (x-axis) in the first $100, 200, \ldots$ edges.  ShIndICA combined with glasso identifies more true positive edges than Infomax with glasso. }    
\label{fig:tpfp}
\end{figure}
\paragraph{Use Case: Co-Regulation in \textit{Bacillus subtilis}.}Our analysis focuses on the comparative evaluation of ShIndICA and the naive ICA approach (Infomax as utilized in the previous example) within the defined downstream data integration task. We specify the number of shared sources for ShIndICA to be $5$, and for the naive Infomax approach, we set it to $0$. After performing PCA whitening on the data, the number of sources per view is brought down to 72, 30, and 59, respectively. A comprehensive description of the exact component selection process is provided in Section~3 of the Supplementary Material.

After applying each method (ShIndICA or Infomax), we fit 30 glasso models, each with varying penalization parameters, on the estimated components. Using a statistical goodness-of-fit measure, we select the top 10 models (for a detailed understanding, refer to Algorithm 1 in Section~3 of the Supplementary Material). To evaluate the efficacy of our results, we gather a ground truth network from the online \textit{Subti}Wiki database \citep{pedreira2021}, which comprises 5,952 pairs of regulator and regulated genes. As our method predicts pairs of co-regulated genes, we modify the ground truth network into an undirected graph, establishing connections between genes with a shared regulator. 

Figure \ref{fig:tpfp} compares the ten output graphs generated by the glasso combined with ShIndICA or Infomax. For every estimated graph, we rank the edges according to their strength and count the true positive (y-axis) and false positive (x-axis) edges within the top $100, 200, \ldots$ edges. ShIndICA outperforms Infomax, as demonstrated by all ten selected glasso models with curves above the ones from the glasso models combined with Infomax. In other words, when paired with glasso, ShIndICA identifies more true positive edges than when glasso is combined with Infomax.


\section{Discussion}
We introduced ShIndICA, a novel noisy linear Independent Component Analysis (ICA) approach, designed to leverage shared information across different views to extract both shared and view-specific sources effectively. Our method is backed by theoretical guarantees affirming the identifiability of the model's linear structure, latent source, noise distributions, and the number of shared and individual sources. The unmixing matrices are estimated through the maximization of the joint log-likelihood of the observed views, while a novel goodness-of-fit measure guides the selection of the number of shared sources. Our empirical studies demonstrate the robust performance of ShIndICA on simulated data, even in the case of model misspecification. We introduced a novel strategy for merging transcriptome data, showing that our model can align estimated sources with biologically meaningful signals. ShIndICA not only combines transcriptome data effectively but also boosts the effectiveness of the chosen graphical inference models in extracting biologically relevant insights.

\begin{acknowledgements} 
We sincerely value the contributions made by various individuals to this work. Our profound thanks to Joris Mooij for his insightful discussions and manuscript reviews. We are also grateful to Leendert Hamoen and Martijs Jonker for their valuable perspectives on transcriptome data fusion. Finally, we thank Sara Magliacane for the constructive feedback  that greatly improved our manuscript.
\end{acknowledgements}

% References
\bibliography{pandeva_351}
\end{document}
