\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
%%%% my packages %%%%
\usepackage{amsmath,amsfonts,bm}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsthm}
\usepackage{wrapfig}
%%%%%% my commands %%%%%
\newcommand{\indep}{\rotatebox[origin=c]{90}{$\models$}}
\newcommand*{\matb}[1]{\begin{bmatrix}#1\end{bmatrix}}
\newcommand*{\matp}[1]{\begin{pmatrix}#1\end{pmatrix}}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{example}{Example}[section]
\usepackage{algorithm}
\usepackage{subcaption}

\usepackage[noend]{algpseudocode}
\algnewcommand{\Input}[1]{%
	\State \textbf{Input:}
	\Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\algnewcommand{\Output}[1]{%
	\State \textbf{Output:}
	\Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}\algrenewcommand\alglinenumber[1]{\tiny #1:}
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Multi-View Independent Component Analysis\\ with Shared and Individual Sources\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:t.p.pandeva@gmail.com?Subject=Your UAI 2023 paper}{Teodora Pandeva}{}}
\author[1]{Patrick Forr\'e}
% Add affiliations after the authors
\affil[1]{%
   AI4Science, AMLab\\
   University of Amsterdam\\
    The Netherlands
}
\affil[2]{%
   Swammerdam Institute for Life Sciences\\
   University of Amsterdam\\
    The Netherlands
}
  
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Identifiability Results}
\label{sec:proof}



Here we cite and correct needed results from {\citep[Lemma 10.2.3, Theorem 10.3.1]{kagan1973characterization}}:

\begin{theorem}[Identifiability for independent non-constant sources {\citep[Lemma 10.2.3, Theorem 10.3.1]{kagan1973characterization}}]
\label{theo:kagan}
Let $x \in \mathbb{R}^p$ be a $p$-dimensional random vector with two representations:
\begin{align}
    A^{(1)} y^{(1)} + \mu^{(1)} = x = A^{(2)} y^{(2)} + \mu^{(2)},
\end{align}
with the following properties for $i=1,2$:
\begin{enumerate}
    \item $A^{(i)} \in \mathbb{R}^{p\times {k^{(i)}}}$ is a (non-random) matrix with non-zero columns and for which no two columns are proportional to each other,
    \item $\mu^{(i)} \in \mathbb{R}^{p}$ a (non-random) column vector,
    \item $y^{(i)} \in \mathbb{R}^{k^{(i)}}$ is a random vector such that:
        \begin{enumerate}
            \item its ${k^{(i)}}$ components $\{y^{(i)}_1,\dots,y^{(i)}_{k^{(i)}} \}$ are mutually independent,
            \item each of its components $y^{(i)}_j$ is a non-constant random variable (a.s.), i.e.\ does not have a delta-peak distribution,   $j=1,\dots,{k^{(i)}}$.
        \end{enumerate}
\end{enumerate}
Then we have the following:
\begin{align}
    \mu^{(2)} - \mu^{(1)} &\in A^{(1)} \mathbb{R}^{k^{(1)}} =  A^{(2)} \mathbb{R}^{k^{(2)}}, & \mathrm{rank}(A^{(1)}) = \mathrm{rank}(A^{(2)}).
\end{align}
In particular, there exist $c^{(1)} \in \mathbb{R}^{k^{(1)}}$, $c^{(2)} \in \mathbb{R}^{k^{(2)}}$ such that: $\mu^{(2)} - \mu^{(1)} = A^{(1)} c^{(1)} = A^{(2)} c^{(2)}$.

Furthermore, the following statements hold:
\begin{enumerate}
    \item If the $l$-th column of $A^{(2)}$ is not proportional to any column of $A^{(1)}$, then $y^{(2)}_l$ is a normally distributed random variable.
    \item Assume that the $l$-th column of $A^{(2)}$ is proportional to the $j$-th column of $A^{(1)}$ with proportionality constant\footnote{Note that this proportionality constant was forgotten to be reintroduced in {\citep[Theorem 10.3.1]{kagan1973characterization}} after it was ``w.l.o.g.'' removed in {\citep[Lemmata 10.2.4, 10.2.5.]{kagan1973characterization}}.} $0 \neq \lambda \in \mathbb{R}$, i.e.: $a^{(2)}_l = \lambda \cdot a^{(1)}_j$.
        Then there exists a (complex) polynomial $g$ such that we have the following equation for the characteristic functions of the components $y^{(2)}_l$ and $y^{(1)}_j$ (in a neighborhood of the origin):
    \begin{align}
        \phi_{y^{(2)}_l}(\lambda t) &= \phi_{y^{(1)}_j}(t) \cdot \exp(g(t)).
    \end{align}
    In particular $y^{(2)}_l$ is (non-)normal if and only if $y^{(1)}_j$ is (non-)normal.
\end{enumerate} 
\end{theorem}




The following result is a corollary of the work of \citet{kagan1973characterization} and is used for proving the main result of our paper.
\begin{theorem}[Identifiability of the single view ICA model]
\label{theo:singleview}
Let $x  \in \mathbb{R}^p$ be a random variable. 
Assume that we have the following two representations of $x$:
\begin{align}
    A^{(1)}(y^{(1)}+\epsilon^{(1)}) + b^{(1)} = x = A^{(2)}(y^{(2)}+\epsilon^{(2)}) + b^{(2)},
    \label{eq:repr}
\end{align}
with the following properties for $i=1,2$:
\begin{enumerate}
    \item $A^{(i)} \in \mathbb{R}^{p\times {k^{(i)}}}$ is a (non-random) matrix with full column rank, i.e.\ $\mathrm{rank}(A^{(i)}) ={k^{(i)}} \le p$,
    \item $b^{(i)} \in \mathbb{R}^{p}$ a (non-random) column vector,
    \item $\epsilon^{(i)}\in \mathbb{R}^{k^{(i)}}$ is an uncorrelated $k^{(i)}$-variate normal random variable: $\epsilon^{(i)}\sim \mathcal{N}(\mu^{(i)},\Sigma^{(i)})$, with mean $\mu^{(i)} \in \mathbb{R}^{k^{(i)}}$ and a positive-definite diagonal covariance matrix $\Sigma^{(i)}\in \mathbb{R}^{{k^{(i)}}\times {k^{(i)}}}$,
    \item $y^{(i)} \in \mathbb{R}^{k^{(i)}}$ is a random variable such that:
        \begin{enumerate}
            \item its ${k^{(i)}}$ components $\{y^{(i)}_1,\dots,y^{(i)}_{k^{(i)}} \}$ are mutually independent,
            \item each of its components $y^{(i)}_j$ is a non-constant random variable (a.s.), $j=1,\dots,{k^{(i)}}$,
            \item $y^{(i)}$ has no normal components, i.e.\ if we can write: $y^{(i)} \sim \tilde{y}^{(i)} + \hat{y}^{(i)}$ with $\tilde{y}^{(i)} \indep \hat{y}^{(i)}$, then $\tilde{y}^{(i)}$ and $\hat{y}^{(i)}$ are non-normal,
        \end{enumerate}
    \item $\epsilon^{(i)}$ is independent from $y^{(i)}$: $\epsilon^{(i)} \indep y^{(i)}$.
\end{enumerate}
Then $k^{(1)}=k^{(2)}=:k$ and there exist a permutation matrix $P\in \mathbb{R}^{k\times k}$, an invertible diagonal  matrix $\Lambda \in \mathbb{R}^{k\times k}$ and a column vector $c \in \mathbb{R}^{k}$ such that:
\begin{align*}
    A^{(2)} = A^{(1)}P\Lambda,
\end{align*}
and such that the corresponding random variables
 have the same distributions:
\begin{align*}
   P\Lambda y^{(2)} + c & \sim y^{(1)}, & P\Lambda(\epsilon^{(2)}-\mu^{(2)}) & \sim \epsilon^{(1)}-\mu^{(1)}, & P\Lambda\Sigma^{(2)}\Lambda^\top P^\top &=\Sigma^{(1)}.
\end{align*}
\end{theorem}

\begin{proof}


1. In the first part of our proof we show that $k^{(1)}=k^{(2)}=:k$ and $ A^{(2)} = A^{(1)}P\Lambda$ for some permutation matrix $P\in \mathbb{R}^{k\times k}$, an invertible diagonal  matrix $\Lambda \in \mathbb{R}^{k\times k}$.

First, for $i=1,2$ we state an equivalent formulation of the linear representation of $x$ given in Equation~\eqref{eq:repr}. According to \citet[Lemma 10.2.3]{kagan1973characterization}, there exists a constant column vector $c^{(2)}\in\mathbb{R}^{k^{(2)}}$ such that $b^{(2)} - b^{(1)} =A^{(2)} c^{(2)}$. It follows that $\tilde{x}=x- b^{(1)}=A^{(1)}(y^{(1)}+\epsilon^{(1)}) = A^{(2)}(y^{(2)}+\epsilon^{(2)} + c^{(2)})$.



Furthermore, note that if $y^{(i)}$ is non-normal, then the random variables $g^{(1)} = y^{(1)}+\epsilon^{(1)}$ and $g^{(2)} = y^{(2)}+\epsilon^{(2)}+ c^{(2)}$  are also non-normal. This follows from the fact that if $g^{(i)}$ is normal then both $y^{(i)}$ and $\epsilon^{(i)}$ would be normal according to the Lévy-Cramér theorem. 

Thus, we can apply Theorem \ref{theo:kagan} for the two representations of $\tilde{x}$,  $\tilde{x}=A^{(1)}g^{(1)}$ and $\tilde{x}= A^{(2)}g^{(2)}$. Since every component of $g^{(i)}$ is non-normal, it follows that every column of $A^{(1)}$ is proportional to a column of $A^{(2)}$ and vice versa. 



Now assume w.l.o.g.\ that $k^{(1)}>k^{(2)}$. Then, there exist two columns of $A^{(1)}$ that are proportional to the same column of $A^{(2)}$, and hence to each other. However, this is a contradiction to assumption 1 that the matrix $A^{(1)}$ has full column rank.

Thus, it follows that $k^{(1)}=k^{(2)}=:k$ and $ A^{(2)} = A^{(1)}P\Lambda$ for some permutation matrix $P\in \mathbb{R}^{k\times k}$, an invertible diagonal  matrix $\Lambda \in \mathbb{R}^{k\times k}$. Moreover,
\begin{align*}
    A^{(1)}(y^{(1)}+\epsilon^{(1)}) = A^{(1)} P \Lambda(y^{(2)}+\epsilon^{(2)} + c^{(2)}).
\end{align*} 

Multiplying from the left with $(A^{(1),\top}A^{(1)})^{-1}A^{(1),\top}$ gives:

\begin{align*}
    y^{(1)}+\epsilon^{(1)} = P \Lambda(y^{(2)}+\epsilon^{(2)} + c^{(2)}).
\end{align*}



2. In the remainder of the proof we show that there exists a column vector $c$ such that $y^{(1)} \sim P \Lambda y^{(2)} + c$ and $\epsilon^{(1)}-\mu^{(1)}  \sim P \Lambda( \epsilon^{(2)}-\mu^{(2)})$ (or equivalently $\Sigma^{(1)}= P\Lambda\Sigma^{(2)}\Lambda^\top P^\top$). Now, define $\tilde{y}^{(2)} = P \Lambda y^{(2)}$, $\tilde{c}^{(2)}= P \Lambda c^{(2)}$ and $\tilde{\epsilon}^{(2)}  =  P \Lambda {\epsilon}^{(2)}$, which is normally distributed with mean $\tilde{\mu}^{(2)}=P\Lambda \mu^{(2)}$ and a diagonal covariance matrix $\tilde{\Sigma}^{(2)} = P\Lambda\Sigma^{(2)}\Lambda^\top P^\top$.


Denote the characteristic functions of $y^{(1)},\tilde{y}^{(2)},\epsilon^{(1)},\tilde{\epsilon}^{(2)}$ by $\phi_{y^{(1)}}(\cdot),\phi_{\tilde{y}^{(2)}}(\cdot),\phi_{\epsilon^{(1)}}(\cdot),\phi_{\tilde{\epsilon}^{(2)}}(\cdot)\colon\mathbb{R}^k\rightarrow \mathbb{C}$. From assumption 5 it follows that
\begin{align*}
   \phi_{\epsilon^{(1)}}(t)\phi_{y^{(1)}}(t)&=e^{it^\top\tilde{c}^{(2)}} \phi_{\tilde{\epsilon}^{(2)}}(t)\phi_{\tilde{y}^{(2)}}(t)\\
 \phi_{\epsilon^{(1)}}(t)\prod_{i=1}^k\phi_{y_i^{(1)}}(t_i)&=e^{it^\top\tilde{c}^{(2)}}\phi_{\tilde{\epsilon}^{(2)}}(t)\prod_{i=1}^k\phi_{\tilde{y}_i^{(2)}}(t_i).
\end{align*}

 The last equation follows from assumption $4$a. Now set $t_i=0$ for all $i\neq 1.$ We get for all $t_1$
 
 \[\exp\bigl(it_1\mu_1^{(1)}-\tfrac{1}{2}\Sigma^{(1)}_{11}t_1^2\bigr)\phi_{y_1^{(1)}}(t_1) =\exp\bigl(it_1\tilde{c}_1^{(2)}\bigr)  \exp\bigl(it_1\tilde{\mu}_1^{(2)}-\tfrac{1}{2}\tilde{\Sigma}^{(2)}_{11}t_1^2\bigr)\phi_{\tilde{y}_1^{(2)}}(t_1).\]
 
 Suppose that $\Sigma^{(1)}_{11}\neq\tilde{\Sigma}^{(2)}_{11}$; w.l.o.g.\ we assume $0<\Sigma^{(1)}_{11}<\tilde{\Sigma}^{(2)}_{11}.$ Then $\exp\bigl(-\tfrac{1}{2}(\tilde{\Sigma}^{(2)}_{11}-\Sigma^{(1)}_{11})t_1^2\bigr)$ is a well-defined characteristic function of a normally distributed random variable with mean $0$ and variance $\tilde{\Sigma}^{(2)}_{11}-\Sigma^{(1)}_{11}$.  Hence, the characteristic function of $y_1^{(1)}$ is proportional to a product of the characteristic functions of $\tilde{y}_1^{(2)}$ and a Gaussian random variable. This is a contradiction to the assumption that $y_1^{(1)}$ does not have a normal component (assumption 4c). It follows that $\Sigma^{(1)}_{11}=\tilde{\Sigma}^{(2)}_{11}$ and that for all $t_1\in\mathbb{R}$ we have $\phi_{y_1^{(1)}}(t_1) =\exp\bigl(it_1(\tilde{c}_1^{(2)}+\tilde{\mu}_1^{(2)}-\mu_1^{(1)})\bigr) \phi_{\tilde{y}_1^{(2)}}(t_1),$ i.e.\ $ \tilde{y}_1^{(2)} + c_1  \sim y_1^{(1)}$ where $c_1 = \tilde{c}_1^{(2)}+\tilde{\mu}_1^{(2)}-\mu_1^{(1)}$. The remaining statements can be proven analogously.
 
\end{proof}

\subsection{Proof of Theorem 3.1}

 \begin{proof}
 First, we can directly apply Theorem \ref{theo:singleview} to every single view $d, d\in\{1,\ldots, D\}$ which ensures  the identifiability of the mixing matrices up to permutation and scaling, i.e. there exists a permutation matrix $P_d$ and an invertible diagonal matrix $\Lambda_d$ such that  $A^{(2)}_d = A^{(1)}_dP_d\Lambda_d$ and $\mathrm{rank}(A_d^{(2)})= \mathrm{rank}(A_d^{(1)}) = k_d$. 
 
 W.l.o.g., let $c^{(1)}>c^{(2)}$, i.e.\ representation $(1)$ has more shared sources than representation $(2)$. It follows, according to Theorem \ref{theo:kagan}, that in every view there exist a component of the shared sources from $(1)$ and an individual component from $(2)$ that are proportional to each other. More precisely, for any $d \in\{1,\ldots, D\}$ there exist $k, l\in\{1, \ldots, k_d\}$ such that $s_{0k}^{(1)}$ is a component of the shared sources $s_{0}^{(1)}$ and $s_{dl}^{(2)}$ is a component of the individual sources $s_{d}^{(2)}$ such that $s_{0k}^{(1)} + \epsilon_{d0k}^{(1)} = (\Lambda_{d})_{ll} (s_{dl}^{(2)}+\epsilon_{d1l}^{(2)}).$ Let $r\neq d$ be another view, for which there exists $m\in\{1, \ldots, k_r\}$ with $s_{rm}^{(2)}$ being an individual component and $s_{0k}^{(1)} + \epsilon_{r0k}^{(1)} = (\Lambda_{r})_{mm} (s_{rm}^{(2)}+\epsilon_{r1m}^{(2)}).$  This is a contradiction to the assumption that $s_{rm}^{(2)} \indep s_{dl}^{(2)}$. It follows that $c^{(1)}=c^{(2)}$.
 
 Furthermore, $\mathrm{Var}(x_d)=\sigma_d^{(1)2}A_d^{(1)}A_d^{(1),\top}=\sigma_d^{(2)2}A_d^{(2)}A_d^{(2),\top}=\sigma_d^{(2)2}A_d^{(1)}P_d\Lambda_d^2P_d^\top A_d^{(1),\top}$. Multiplying with $A_d^{(1),\dagger} =(A_d^{(1)\top}A_d^{(1)} )^{-1}A_d^{(1)\top}$  from the left and $(A_d^{(1),\dagger})^\top=A_d^{(1)}(A_d^{(1)\top}A_d^{(1)} )^{-1}$ from the right yields
  $\sigma_d^{(1)2}\mathbb{I}_{k_d} = \sigma_d^{(2)2} P_d\Lambda_d^2P_d^\top$. It follows that $\frac{\sigma_d^{(2)2}}{\sigma_d^{(1)2}}\Lambda_d^2 = \mathbb{I}_{k_d}.$ Computing the covariance between two different views $d, l \in \{1,\ldots, D\}$ gives 
  \begin{align*}
      \mathrm{Cov}(x_d,x_l) = A_{d0}^{(1)}A_{l0}^{(1),\top} = A_{d0}^{(2)}A_{l0}^{(2),\top}=A_{d0}^{(1)} \Lambda_d[c,c]\Lambda_l[c,c]A_{l0}^{(1),\top} 
  \end{align*}
  where $\Lambda_d[c,c]$ is the invertible diagonal matrix composed of the first $c$ columns and rows of the matrix $\Lambda_d.$ By multiplying with the left-inverse of $ A_{d0}^{(1)}$ from the left and the right-inverse of $A_{l0}^{(1),\top}$ from the right, we get $\Lambda_d[c,c]\Lambda_l[c,c] = \mathbb{I}_{c}$ for any two different views $d$ and $l$. It follows that all entries of $\Lambda_d$ equal $1$ or $-1$ and therefore $\frac{\sigma_d^{(2)2}}{\sigma_d^{(1)2}} = 1$ for every $d$. 
  
  In the remainder of the proof, we show that the distribution of the sources is identifiable even in the cases when they have normal components. Let $s_i^{(1)}$  be a component of $\tilde{s}_i^{(1)}$. Then there exists $j\in\{1, \ldots, k_d\}$ such that $s_i^{(1)}+\epsilon_i^{(1)} = s_j^{(2)}+\epsilon_j^{(2)}$. Taking the characteristic functions of both sides yields
\begin{align*}
    \phi_{s_i^{(1)}}(t)\phi_{\epsilon_i^{(1)}}(t) = \phi_{s_j^{(2)}}(t)\phi_{\epsilon_j^{(2)}}(t)
\end{align*}
   Since $\sigma_d^{(1)2} = \sigma_d^{(2)2}$ and the noise and sources have zero mean, the above equation simplifies to $ \phi_{s_i^{(1)}}(t) = \phi_{s_j^{(2)}}(t)$, i.e.\ $s_i^{(1)}\sim s_j^{(2)}$.
\end{proof}

\subsection{Additional Results}
  \begin{theorem}
  \label{cor:gauss}
 Let $x_1, \ldots, x_D$ for $D\geq 3$ be random vectors which are generated according to the model defined in Equation 1. Furthermore, we assume that we have the following two representations of $x_1, \ldots, x_D$ according to Equation 1:
 \begin{align*}
  A_{d0}^{(1)}s_0^{(1)}+A_{d1}^{(1)}s_d^{(1)}+A_d^{(1)}\epsilon_d^{(1)}  = x_d =A_{d0}^{(2)}s_0^{(2)}+A_{d1}^{(2)}s_d^{(2)}+A_d^{(2)}\epsilon_d^{(2)},\qquad d\in\{1,\ldots, D\},
 \end{align*}
 In addition to the assumptions of Equation 1, it holds that
  \begin{enumerate}
     \item each of the components $s_{dj}^{(i)}$ of $s_{d}^{(i)}$ for $j=1, \ldots, k_d^{(i)}-c^{(i)}$ is non-Gaussian.
     \item  $s_{0}^{(i)}$ can have  Gaussian components. Furthermore, if the number of Gaussian components exceeds 2, for all $k,l\in\{1,\ldots, c\}$ with $k\neq l$ it holds that $\gamma^{(i)}_{k}\neq \gamma^{(i)}_{l}$, where $\gamma^{(i)}_{k}$ and $\gamma^{(i)}_{l}$ are the variances of the components $s_{0k}^{(i)}$ and  $s_{0l}^{(i)}$
 \end{enumerate}
 Then, for a fixed number of shared sources $c$ and for all $d=1, \ldots, D$, we have $k_d^{(1)}=k_d^{(2)}=k_d,$ and there exist a permutation matrix $P_d\in \mathbb{R}^{k_d\times k_d}$ and an invertible diagonal matrix $\Lambda_d \in \mathbb{R}^{k_d\times k_d}$ such that
 \begin{align*}
     A_d^{(2)} = A_d^{(1)}P_d\Lambda_d  
 \end{align*}
 \end{theorem}
 
 \begin{proof}
 Theorem \ref{theo:kagan} yields that if the individual components are not normal, then for each column $a_j^{(1)}$ of $A_{d1}^{(1)}$ there is a column  $a_i^{(2)}$ of $A_{d1}^{(2)}$  such that there exists $\lambda\neq 0$ with $a_i^{(2)} = \lambda a_j^{(1)}$.  Since all mixing matrices have full column rank, it follows that there is a one-to-one correspondence between the columns of  $A_{d1}^{(1)}$  and the columns of $A_{d1}^{(2)},$ and thus $k_d^{(1)} = k_d^{(2)}$.
 
 If at most one of the shared components is normal, we refer to \citet{comon1994independent}. Now consider the case when at least two components are normal. First, the number of normal components in both representations is the same since $c$ is fixed and the number of non-normal components is identifiable with the same arguments as above.
 
 Computing the covariance between two different views $d, l\in\{1,\ldots,D\}$ yields
 \begin{align*}
     \mathrm{Cov}(x_d, x_l) = A_{d0}^{(1)}\Gamma^{(1)}A_{l0}^{(1),\top}=A_{d0}^{(2)}\Gamma^{(2)}A_{l0}^{(2),\top}
 \end{align*}
where $\Gamma^{(i)}$ is the covariance matrix of $s_0^{(i)}$ for $i=1,2.$ We define $A_{d0}^{\gamma,(i)} = A_{d0}^{(i)}\Gamma^{(i)\frac{1}{2}}$ for any $d\in\{1,\ldots,D\}$. Let $P_d = (A_{d0}^{\gamma,(1),\top}A_{d0}^{\gamma,(1)})^{-1}A_{d0}^{\gamma,(1),\top}A_{d0}^{\gamma,(2)}.$ Following the proof of Theorem 1 \citep{richard2021shared} we get that $P_dP_l^\top=\mathbb{I}_c = P_dP_k^\top=P_kP_l^\top$ for any $d,k,l \in \{1, \ldots, D\}.$ Thus, $P_l=P_d=P_k=P$ and they are orthogonal. Moreover, for all $d=1,\ldots,D$ it holds $\tilde{s}_0^{(1)}+\tilde{\epsilon}_d^{(1)} = P(\tilde{s}_0^{(2)}+\tilde{\epsilon}_d^{(2)})$ where $\tilde{\epsilon}_d^{(i)} \sim \mathcal{N}(0, \sigma_d^{(i)2}\Gamma^{(i)-1})$ and $\tilde{s}_0^{(i)} = \Gamma^{(i)-\frac{1}{2}}s_0^{(i)}.$ From the last equation it follows that $\sigma_d^{(1)2}\Gamma^{(1)-1} = P(\sigma_d^{(2)2}\Gamma^{(2)-1})P^\top$. Lemma 2 \citep{richard2021shared} implies that $P$ is a sign and permutation matrix. 
 \end{proof}



\newpage

\section{Joint Data Log-Likelihood}
\label{app:opt}
\begin{lemma}
\label{lem:help}
Let $W\in \mathbb{R}^{c\times k}$ such that  $WW^\top=\mathbb{I}_c$ and  $x^{1}, \ldots, x^{N} \in \mathbb{R}^k$ such that for every $j=1,\dots,k$, we have $\sum_{i=1}^N (x_j^i)^2=1$ and for every $j \neq l$ with $j,l\in\{1,\dots,k\}$, we have $\sum_{i=1}^N x_j^i x_l^i =0$. 
Then for every $j =1, \ldots, c$, it also holds that $\sum_{i=1}^N ((Wx^i)_j)^2=1.$
\begin{proof}
Let $W_j$ be the $j$-th row of $W$. Then
\begin{align*}
    \sum_{i=1}^N ((Wx^i)_j)^2&= \sum_{i=1}^N (\sum_{l=1}^kW_{jl}x_l^i)^2
     =\sum_{i=1}^N \sum_{l=1}^k \sum_{r=1}^k W_{jl}x_l^i W_{jr}x_r^i\\
     &= \sum_{l=1}^k \sum_{r=1}^k W_{jl}W_{jr} \sum_{i=1}^N x_l^i x_r^i=\sum_{l=1}^k \sum_{r=1}^kW_{jl}W_{jr}\delta_{lr} = \sum_{r=1}^kW_{jr}^2=1
\end{align*}
where $\delta_{lr}=1$ if $l=r$ and $0$ otherwise. For the fourth equality we used that $\sum_{i=1}^N (x_l^i)^2=1$ and $\sum_{i=1}^N x_l^i x_r^i =0$ for all $l\neq r$; and for the last one we used $WW^\top=\mathbb{I}_c.$
\end{proof}
\end{lemma}
\subsection{Derivation}
Under the generative model assumptions and optimization constraints stated in Section~4, it holds
\begin{align}
\label{eq:proofloss}
\mathcal{L}(W_1, \ldots, W_D) &=\sum_{i=1}^N\log f(\bar{z}_0^i) +  \sum_{i=1}^N \sum_{d=1}^D \log  p_{Z_{d,1}}(z_{d,1}^{i})+ N\sum_{d=1}^D \log\vert W_d\vert\\
    &- \frac{1}{2\sigma^2} \Big(  \sum_{d=1}^D \operatorname{trace}(  Z_{d,0}Z_{d,0}^{\top}) -\frac{1}{D}  \sum_{d=1}^D \sum_{l=1}^D \operatorname{trace}(  Z_{d,0}Z_{l,0}^{\top}) \Big)
\end{align}
\begin{proof}

Let $\mathbf{x} = (x_1^\top,x_2^\top,\ldots,x_D^\top)^\top  \in \mathbb{R}^{K_D},$ $\mathbf{\tilde{s}}=(\tilde{s}_1^\top,\tilde{s}_2^\top,\ldots,\tilde{s}_D^\top)^\top  \in \mathbb{R}^{K_D},$  $\mathbf{\epsilon}=(\epsilon_1^\top,\epsilon_2^\top,\ldots,\epsilon_D^\top)^\top  \in \mathbb{R}^{K_D}$, where $K_D=\sum_{d=1}^D k_d$ and for $W_d = A_d^{-1}$ define
\begin{align*}
    \mathbf{W}=\left( \begin{array}{ccccc}
W_1 & 0 & \ldots & 0 &0  \\
 0& W_2 &  \ldots & 0 & 0 \\
 &  & \ddots & &  \\
 0& 0 &\ldots  & W_{D-1} & 0 \\
 0& 0 & \ldots & 0 & W_D \\
\end{array} \right),\
  \mathbf{A}=\left( \begin{array}{ccccc}
A_1 & 0 & \ldots & 0 &0  \\
 0& A_2 &  \ldots & 0 & 0 \\
 &  & \ddots & &  \\
 0& 0 &\ldots  & A_{D-1} & 0 \\
 0& 0 & \ldots & 0 & A_D \\
\end{array} \right).
\end{align*}

Furthermore, let $z_d:=W_dx_d=\tilde{s}_d +\epsilon_d,$ and $z_{d,0}:=s_0 +\epsilon_{d0}\in \mathbb{R}^{c}$ and $z_{d,1}:=s_d +\epsilon_{d1} \in \mathbb{R}^{k_d-c},$ i.e. $z_d = (z_{d,0}, z_{d,1})^\top.$ Let $p_{\mathbf{X}}$ be the joint distribution of $x_1,\ldots, x_D$, $p_{\mathbf{Z}}$ the joint distribution of $z_1,\ldots, z_D$,  $p_{\mathbf{Z}_0}$ the joint distribution of $z_{1,0},\ldots, z_{D,0}$,  $p_{\mathbf{Z}_1}$ the joint distribution of $z_{1,1},\ldots, z_{D,1}$ and $ p_{Z_{d,1}}$ the probability distribution of $z_{d,1}$.


Note that the model in Equation~1 is equivalent to $\mathbf{x}=\mathbf{A}\mathbf{z}$. By multiplying with the inverse of $\mathbf{A}$ (i.e.\  $\mathbf{W}$) from the left we get $\mathbf{W}\mathbf{x}=\mathbf{z}$. Then for the joint likelihood of $x_1,\ldots,x_D$ we get 
\begin{align*}
p_{\mathbf{X}}(\mathbf{x})&=p_{\mathbf{Z}}(\mathbf{z})\vert \mathbf{W}\vert\\
 &=p_{\mathbf{Z}}(\mathbf{z})\prod_{d=1}^D\vert W_d\vert\\
    &= p_{\mathbf{Z}_0}(z_{1,0},\ldots, z_{D,0})p_{\mathbf{Z}_1}(z_{1,1},\ldots, z_{D,1}) \prod_{d=1}^D\vert W_d\vert\\
    & =p_{\mathbf{Z}_0}(z_{1,0},\ldots, z_{D,0})\prod_{d=1}^D  p_{Z_{d,1}}(z_{d,1}) \prod_{d=1}^D\vert W_d\vert.
\end{align*}
\begin{enumerate}
\item Second equation: $\mathbf{W}$ is a block diagonal matrix with blocks $W_{d}\in\mathbb{R}^{k_d\times k_d}$, $d=1,\ldots ,D$.
    \item Third equation: $z_{1,0},\ldots, z_{D,0} \indep z_{1,1},\ldots, z_{D,1}.$ 
    \item Fourth equation: follows from the fact that $z_{1,1},\ldots, z_{D,1}$ are mutually independent since  $\{s_{1i}\}_{i=1}^{k_1-c}, \ldots,\{s_{Di}\}_{i=1}^{k_D-c},\{\epsilon_{1i}\}_{i=1}^{k_1},\ldots,\{\epsilon_{Di}\}_{i=1}^{k_D}$ are mutually independent.
\end{enumerate}
It follows that
\begin{align*}
    p_{\mathbf{Z}_0}(z_{1,0},\ldots, z_{D,0})& = \int  p_{\mathbf{Z}_0\vert S_0}(z_{1,0},\ldots, z_{D,0}\vert s_0) p_{S_0}(s_0) ds_0 \\
    & = \int \Big(\prod_{d=1}^D \mathcal{N}(z_{d,0};s_0, \sigma^2\mathbb{I}_c)\Big) p_{S_0}(s_0) ds_0\\
  &\propto\int \exp\Big(-\sum_{d=1}^D\frac{\Vert z_{d,0}-s_0 \Vert^2}{2\sigma^2}\Big)p_{S_0}(s_0)ds_0\\
    & = \int \exp\Big(-\dfrac{D\Vert s_0-\bar{z}_0\Vert^2 + \sum_{d=1}^D \Vert z_{d,0}-\bar{z}_0\Vert^2}{2\sigma^2}\Big)p_{S_0}(s_0)ds_0\\
    & = \exp\Big(-\dfrac{ \sum_{d=1}^D \Vert z_{d,0}-\bar{z}_0\Vert^2}{2\sigma^2}\Big )\int \exp\Big(-\dfrac{D\Vert s_0-\bar{z}_0\Vert^2} {2\sigma^2}\Big)p_{S_0}(s_0)ds_0
\end{align*}
where $\bar{z}_0 = \frac{1}{D}\sum_{d=1}^D z_{d,0}$. 
\begin{itemize}
    \item For the second and third equation recall that $z_{d,0}=s_0 +\epsilon_{d0}\in \mathbb{R}^{c}$, where $\epsilon_{d0} \sim \mathcal{N}(0,\sigma^2\mathbb{I}_c)$ and $s_0\indep\epsilon_{d0}$. This means that $z_{d,0}\vert s_0 \sim \mathcal{N}(s_0,\sigma^2\mathbb{I}_c)$. From this, the following equations follow:
    \begin{align*}
        p_{\mathbf{Z}_0\vert S_0}(z_{1,0},\ldots, z_{D,0}\vert s_0)&= \prod_{d=1}^D p_{Z_{d,0}\vert S_0}(z_{d,0}\vert s_0)\\
        &= \prod_{d=1}^D \mathcal{N}(z_{d,0};s_0, \sigma^2\mathbb{I}_c)
    \end{align*}
    \item The fourth equation results from
    \begin{align*}
        \sum_{d=1}^D \Vert z_{d,0}-s_0 \Vert^2 &= \sum_{d=1}^D \Vert z_{d,0}-\bar{z}_0 + \bar{z}_0 -s_0 \Vert^2 =  \sum_{d=1}^D \Big(  \Vert z_{d,0}-\bar{z}_0\Vert ^2 +2\langle  z_{d,0}-\bar{z}_0, \bar{z}_0 -s_0 \rangle + \Vert\bar{z}_0 -s_0 \Vert^2\Big)\\
        & = \sum_{d=1}^D  \Vert z_{d,0}-\bar{z}_0\Vert ^2 +2  \sum_{d=1}^D \langle  z_{d,0}-\bar{z}_0, \bar{z}_0 -s_0 \rangle  +D\Vert\bar{z}_0 -s_0 \Vert^2\\
        & = \sum_{d=1}^D  \Vert z_{d,0}-\bar{z}_0\Vert ^2 +2\Big\langle \sum_{d=1}^D z_{d,0} - D \cdot  \frac{1}{D}\sum_{d=1}^D z_{d,0} , \bar{z}_0 -s_0 \Big\rangle  +D\Vert\bar{z}_0 -s_0 \Vert^2\\
        &=  \sum_{d=1}^D  \Vert z_{d,0}-\bar{z}_0\Vert ^2+D\Vert\bar{z}_0 -s_0 \Vert^2.
    \end{align*}
\end{itemize}

We define $f(\bar{z}_0) = \int \exp\Big(-\dfrac{D\Vert s_0-\bar{z}_0\Vert^2} {2\sigma^2}\Big)p_{S_0}(s_0)ds_0$ similarly to \citep{richard2020modeling}.

Note that 

\begin{align*}
    \Vert z_{d,0}-\bar{z}_0\Vert^2 = \Vert  z_{d,0} \Vert^2 - \frac{2}{D}\sum_{l=1}^D \langle z_{d,0},  z_{l,0} \rangle +\frac{1}{D^2}\sum_{l=1}^D  \sum_{r=1}^D \langle   z_{r,0},   z_{l,0} \rangle.
\end{align*}
Thus, it follows that
\begin{align*}
     \sum_{d=1}^D \Vert  z_{d,0}-\bar{z}_0\Vert^2 &= \sum_{d=1}^D \Big( \Vert   z_{d,0} \Vert^2 - \frac{2}{D}\sum_{l=1}^D \langle   z_{d,0},   z_{l,0} \rangle +\frac{1}{D^2}\sum_{l=1}^D  \sum_{r=1}^D \langle   z_{r,0},   z_{l,0} \rangle\Big)\\
     &=  \sum_{d=1}^D \Vert   z_{d,0} \Vert^2 - \frac{2}{D} \sum_{d=1}^D \sum_{l=1}^D \langle   z_{d,0},   z_{l,0} \rangle +D \frac{1}{D^2}\sum_{l=1}^D  \sum_{r=1}^D \langle   z_{r,0},   z_{l,0} \rangle\\
     & = \sum_{d=1}^D \Vert   z_{d,0} \Vert^2 - \frac{1}{D} \sum_{d=1}^D \sum_{l=1}^D \langle   z_{d,0},   z_{l,0} \rangle
\end{align*}
Collecting all terms together we get
\begin{align*}
     p_{\mathbf{X}}(\mathbf{x})&= \exp \Big(- \dfrac{\sum_{d=1}^D \Vert   z_{d,0} \Vert^2 - \frac{1}{D} \sum_{d=1}^D \sum_{l=1}^D \langle   z_{d,0},   z_{l,0} \rangle}{2\sigma^2}\Big)  f(\bar{z}_0) \prod_{d=1}^D  p_{Z_{d,1}}(z_{d,1}) \prod_{d=1}^D\vert W_d\vert
\end{align*}

The data log-likelihood can be expressed as

\begin{align*}
    \sum_{i=1}^N\log p_{\mathbf{X}}(x_1^i,\ldots, x_D^i)&= \sum_{i=1}^N \Big(- \dfrac{\sum_{d=1}^D \Vert  z_{d,0}^{i} \Vert^2 - \frac{1}{D} \sum_{d=1}^D \sum_{l=1}^D \langle  z_{d,0}^{i},   z_{l,0}^{i} \rangle}{2\sigma^2}\\
    &+\log f(\bar{z}_0^i) +  \sum_{d=1}^D \log  p_{Z_{d,1}}(z_{d,1}^{i}) +\sum_{d=1}^D \log\vert W_d\vert \Big)\\
    &=\sum_{i=1}^N\log f(\bar{z}_0^i) +  \sum_{i=1}^N \sum_{d=1}^D \log  p_{Z_{d,1}}(z_{d,1}^{i})+ N\sum_{d=1}^D \log \vert W_d\vert\\
    &- \frac{1}{2\sigma^2} \Big( \sum_{i=1}^N \sum_{d=1}^D \Vert  z_{d,0}^{i} \Vert^2 -\frac{1}{D} \sum_{i=1}^N \sum_{d=1}^D \sum_{l=1}^D \langle  z_{d,0}^{i},   z_{l,0}^{i} \rangle \Big)\\
    &=\sum_{i=1}^N\log f(\bar{z}_0^i) +  \sum_{i=1}^N \sum_{d=1}^D \log  p_{Z_{d,1}}(z_{d,1}^{i})+ N\sum_{d=1}^D \log \vert W_d\vert\\
    &- \frac{1}{2\sigma^2} \Big(  \sum_{d=1}^D \operatorname{trace}(  Z_{d,0}Z_{d,0}^{\top}) -\frac{1}{D}  \sum_{d=1}^D \sum_{l=1}^D \operatorname{trace}(  Z_{d,0}Z_{l,0}^{\top}) \Big)
\end{align*}
In the case when the data is pre-whitened, it holds that the unknown unmixing matrices are orthogonal, i.e. $W_dW_d^\top=W_d^\top W_d=\mathbb{I}_{k_d}$ and $\vert\det W_d\vert=1,$ and $x_d$ and $z_d$ are uncorrelated. Note that in the main paper, we used a different notation for the mixing matrices and sources to stress the difference before and after whitening. This notation is here omitted for simplicity.

 Making similar observations as before, we get for the joint probability of the multiple views:
\begin{align*}
p_{\mathbf{X}}(\mathbf{x})=p_{\mathbf{Z}_0}(z_{1,0},\ldots, z_{D,0})\prod_{d=1}^D  p_{Z_{d,1}}(z_{d,1})
\end{align*}
Note that after whitening $z_{d,0}=\alpha(\sigma)(s_0 +\epsilon_{d0})$ with $\alpha(\sigma)=(1+\sigma^2)^{-\frac{1}{2}}$. With similar observations as above we get
\begin{align*}
        p_{\mathbf{Z}_0\vert s_0}(z_{1,0},\ldots, z_{D,0}\vert s_0)&=p_{\mathbf{Z}_0\vert s_0}(\alpha(\sigma)(s_0 +\epsilon_{10}),\ldots,\alpha(\sigma)( s_0 +\epsilon_{D0})\vert s_0) = \prod_{d=1}^D p_{Z_{d,0}\vert S_0}(\alpha(\sigma)(s_0 +\epsilon_{d0})\vert s_0)\\
        &= \prod_{d=1}^D \mathcal{N}(\alpha(\sigma)(s_0 +\epsilon_{d0});\alpha(\sigma)s_0, \alpha(\sigma)^2\sigma^2\mathbb{I}_c)= \prod_{d=1}^D \mathcal{N}(z_{d,0};\alpha(\sigma)s_0, \alpha(\sigma)^2\sigma^2\mathbb{I}_c)
    \end{align*}
    
It follows that
\begin{align*}
    p_{\mathbf{Z}_0}(z_{1,0},\ldots, z_{D,0})& = \int  p_{\mathbf{Z}_0\vert s_0}(z_{1,0},\ldots, z_{D,0}\vert s_0) p_{S_0}(s_0) ds_0 \\
    & = \int \Big(\prod_{d=1}^D \mathcal{N}(z_{d,0};\alpha(\sigma)s_0, \alpha(\sigma)^2\sigma^2\mathbb{I}_c)\Big) p_{S_0}(s_0) ds_0\\
  &\propto\int \exp\Big(-\sum_{d=1}^D\frac{\Vert z_{d,0}-\alpha(\sigma)s_0 \Vert^2}{2\alpha(\sigma)^2\sigma^2}\Big)p_{S_0}(s_0)ds_0\\
    & = \int \exp\Big(-\dfrac{D\Vert \alpha(\sigma)s_0-\bar{z}_0\Vert^2 + \sum_{d=1}^D \Vert z_{d,0}-\bar{z}_0\Vert^2}{2\alpha(\sigma)^2\sigma^2}\Big)p_{S_0}(s_0)ds_0\\
    & = \exp\Big(-\dfrac{ \sum_{d=1}^D \Vert z_{d,0}-\bar{z}_0\Vert^2}{2\alpha(\sigma)^2\sigma^2}\Big )\int \exp\Big(-\dfrac{D\Vert \alpha(\sigma)s_0-\bar{z}_0\Vert^2} {2\alpha(\sigma)^2\sigma^2}\Big)p_{S_0}(s_0)ds_0
\end{align*}
where $\bar{z}_0 = \frac{1}{D}\sum_{d=1}^D z_{d,0}$.    We define $f_{\sigma}(\bar{z}_0) = \int \exp\Big(-\dfrac{D\Vert \alpha(\sigma)s_0-\bar{z}_0\Vert^2} {2\alpha(\sigma)^2\sigma^2}\Big)p_{S_0}(s_0)ds_0= \int \exp\Big(-\dfrac{D\Vert s_0-(1+\sigma^2)^{\frac{1}{2}}\bar{z}_0\Vert^2} {2\sigma^2}\Big)p_{S_0}(s_0)ds_0$. For the data log-likelihood we get

\begin{align*}
    \sum_{i=1}^N\log p_{\mathbf{X}}(x_1^i,\ldots, x_D^i)&=\sum_{i=1}^N\log f_{\sigma}(\bar{z}_0^i) +  \sum_{i=1}^N \sum_{d=1}^D \log  p_{Z_{d,1}}(z_{d,1}^{i}) \\
    &- \frac{D\cdot c}{2\alpha(\sigma)^2\sigma^2} + \frac{1}{2D\alpha(\sigma)^2\sigma^2}\sum_{d=1}^D \sum_{l=1}^D \operatorname{trace}(  Z_{d,0}Z_{l,0}^{\top}).
\end{align*}
    
This can be easily derived from Equation~\eqref{eq:proofloss} (with $\sigma^2$ replaced by $\alpha(\sigma)^2\sigma^2$ and $f$ by $f_{\sigma}$) by making the following observations resulting from whitening:
\begin{itemize}
    \item $N\sum_{d=1}^D \log \vert W_d\vert = 0$, since $W_d$ is orthogonal for all $d$ and hence $\vert\det W_d\vert = 1$,
    \item $\operatorname{trace}(  Z_{d,0}Z_{d,0}^{\top}) = c$ due to Lemma \ref{lem:help}.
\end{itemize}
\end{proof}

\newpage
  \section{Real Data Experiment}
\label{app:data}
\subsection{Data Acquisition and Preprocessing}

A transcriptome dataset resembles a random data matrix (see Figure \ref{fig:my_label}). Each column represents an experimental condition (such as knock-out or stress conditions) that cells were subjected to, and each row represents a gene. So each entry of this matrix is an expression value indicating a gene activity under a given condition (typically measured using RNA sequencing or microarrays). 

\begin{wrapfigure}[15]{r}{6cm}
    \centering
    \includegraphics[scale=0.5]{datapdf.pdf}
    \caption{Example of Transcriptome Data}
    \label{fig:my_label}
\end{wrapfigure}

Our analysis is primarily based on three large gene expression datasets, denoted (in our code) by Dataset1\footnote{The dataset is available at \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67023}} \citep{arrieta2015experimentally}, with 265 transcriptome datasets obtained from 38 unique experimental designs; Dataset2 \citep{nicolas2012condition}\footnote{The dataset can be found at \url{https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE27219}}, containing 262 samples from 104 different experimental conditions; and Dataset3\footnote{Dataset link \url{https://imodulondb.org/dataset.html?organism=b_subtilis&dataset=modulome}}, RNA-seq data collected and preprocessed by \citet{Sastry2021.07.01.450581}, with 265 samples from 93 unique conditions. We removed genes with missing values from Dataset1
and selected 3990 genes that are present in all three datasets.





\subsection{Gene-gene Interaction Pipeline}
The main steps of our methodology are presented in Algorithm \ref{alg:method}. First, we select the number of total and shared components. We infer latent components from the data as described in the main paper, Section 7.2.  Afterward, we learn a sparse undirected graph from the estimated independent components (see Section \ref{subsub:glasso}). 
\subsubsection{Selection of Total Number of ICA Components}

To select the total number of components for each single dataset, we utilize the following heuristic:
\begin{enumerate}
\item Estimate the sources $S$ and the mixing matrix $A$ from the observed data $X$ via FastICA or another related method.
\item For each component $S_k$, estimate its relevance by computing $r_k = \sum_{i}(A_{ki})^2$.
\item Order the components' relevance from the highest to the lowest value and scale them to sum to 1, i.e. $r = \operatorname{orderDescending}(r_1,\ldots,r_k)/(\sum_k r_k)$.
\item For $perm=1,\ldots,P$, repeat:
\begin{enumerate}
\item Permute the features of each sample of $X$ to form a permuted dataset $X^{perm} = \operatorname{permuteFeaturesPerSample}(X)$.
\item Estimate the sources $S^{perm}$ and the mixing matrix $A^{perm}$ from the permuted data $X^{perm}$ via FastICA or another related method.
\item For each permuted component $S_k^{perm}$, we estimate its relevance by computing $r_k^{perm} = \sum_{i}(A_{ki}^{perm})^2$.
\item Order the permuted components' relevance $r_k^{perm}$ from the highest to the lowest value and scale them to sum to 1, i.e. $r^{perm} = \operatorname{orderDescending}(r_1^{perm},\ldots,r_k^{perm})/(\sum_k r_k^{perm})$.
\end{enumerate}
\item Apply permutation testing for each value of $r$ with respect to the values of $r^{perm}$ and compute the corresponding $p$-values, i.e. $p_k = |\{r_k^{perm} \mid r_k^{perm}\geq r_k\}|/P$.
\item The number of components is the number of $p_k$'s for which $p_k<0.05$. The p-values indicate how many components have higher relevance than the components from the permuted data.
\end{enumerate}
In our application, we first select the number of total components $k_d$ for each dataset via the proposed procedure. Then we fit a ShIndICA model, for which we select the first $k_d$ components according to their relevance. Thus, the performed dimensionality reduction step happens after training.

\subsubsection{Graphical Lasso}
\label{subsub:glasso}
Graphical lasso (glasso) is a maximum likelihood estimator for inferring graph structure in a high-dimensional setting \citep{friedman2008sparse}. This method uses $l_1$ regularization to estimate the precision matrix (or inverse covariance) of a set of random variables from which a graph structure can be determined. The optimization problem that glasso solves can be formalized as follows
\begin{align}
\label{eq:glasso}
\min_{\Theta\succ 0 } -\log\det(\Theta) + \operatorname{tr}(\hat{\Sigma}\Theta)+\lambda \Vert \Theta\Vert_1,
\end{align}
where $\hat{\Sigma}$ is the empirical covariance or correlation matrix and $\Theta:=\Sigma^{-1}$ denotes  the precision matrix. In our setting, the input for the glasso is the Pearson's correlation matrix of the gene representations retrieved with ICA at the preceding step. We can read graph structure from the estimated matrix $\hat{\Theta}$ as follows: if the $ij$ entry of $\hat{\Theta}$ is not 0 (i.e. $\hat{\Theta}_{ij}\neq0$) there is an edge between the genes $i$ and $j$, i.e. the genes might be co-regulated.  We used the \texttt{huge}\footnote{See \url{https://CRAN.R-project.org/package=huge}.} R package for the implementation of the graphical lasso.

\subsubsection{Extended EBIC}
There are various criteria for model selection and hyperparameter tuning of glasso models. \cite{chen2008extended}  propose an information criterion for Gaussian graphical models called extended BIC (EBIC) that takes the form 
\begin{align}
\label{eq:ebic}
-\log\det(\Theta(E)) + \operatorname{tr}(\hat{\Sigma}\Theta(E)) +\vert E\vert \log n + 4\vert E \vert \gamma \log p,
\end{align}
where $E$ is the edge set of a candidate graph and $\gamma \in [0,1].$ Models that yield low EBIC scores are preferred. Note that positive values for $\gamma$ lead to sparser graphs. \cite{foygel2010extended}  suggest that   $\gamma=0.5$  is a good choice when no prior knowledge is available. In our experiments, we select the $\lambda$ that minimizes the EBIC score with  $\gamma=0.5$.


\subsubsection{Method}
All steps described above are summarized in the following pseudo-code.
\begin{algorithm}[tbh!]
  	\begin{algorithmic}[1]
	\Input{$X_1 \in \mathbb{R}^{n_1\times p}, X_2 \in \mathbb{R}^{n_2\times p}$ are data matrices with $n_1$ and $n_2$ samples and $p$ genes\\$\Lambda$ is a set of regularization parameters\\ $\gamma$ EBIC selection parameter \eqref{eq:ebic}}
	\State Perform a data integration method to obtain $S_1 \in \mathbb{R}^{k_1\times p}, S_2\in \mathbb{R}^{k_2\times p}$
	\State Concatenate $S=(S_1,S_2)^{\top}\in\mathbb{R}^{(k_1+k_2)\times p}$
	\State Compute the Pearson correlation matrix $\hat{\Sigma}\in\mathbb{R}^{p\times p}$ of $S$.
	\State Estimate the precision matrices $\{\hat{\Theta}^{\lambda}\}_{\lambda\in\Lambda}$ that solve \eqref{eq:glasso} for each $\lambda$ from the set $\Lambda$
	\State Select the final $\hat{\Theta}^{out}\in \{\hat{\Theta}^{\lambda}\}_{\lambda\in\Lambda}$ according to EBIC($\gamma$) (see \eqref{eq:ebic})
		\Output{the selected $\hat{\Theta}^{out}$}
\end{algorithmic}
\caption{Algorithmic description of the downstream task for $D=2$.}
\label{alg:method}
\end{algorithm}
\newpage

\section{Synthetic Experiments}
\label{app:exp}

\begin{figure}
     \centering
     \begin{subfigure}[b]{0.45\linewidth}
         \centering
         \includegraphics[width=\linewidth]{mle_vs_trace.png}
         \caption{}
         \label{fig:mlevsours}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.45\linewidth}
         \centering
         \includegraphics[width=\linewidth]{sample_size.png}
         \caption{}
         \label{fig:samples}
     \end{subfigure}
     \caption{Comparison of MultiViewICA and ShIndICA on a two-view shared response model setting. In Figure~\ref{fig:mlevsours}, we fix the sample size and measure the Amari distance for $60, 70, \ldots, 110$ sources. In Figure~\ref{fig:samples}, the number of sources is set to 100 and we conduct the experiments for different sample sizes (x-axis). It seems that ShIndICA outperforms MultiViewICA in both scenarios.}
\end{figure}

\subsection{Amari distance}
The Amari distance \citep{amari1995new} between two invertible matrices $A,B\in \mathbb{R}^{n\times n}$ is defined by
\begin{align*}
    \operatorname{amari}(A, B)&:=\sum_{i=1}^n\Big(\sum_{j=1}^n\dfrac{\vert c_{ij}\vert}{\max_k \vert c_{ik}\vert}-1\Big)+\sum_{j=1}^n\Big(\sum_{i=1}^n\dfrac{\vert c_{ij}\vert}{\max_k \vert c_{kj}\vert}-1\Big), & C&:=A^{-1}B.
\end{align*}
\subsection{Additional Experiments on Synthetic Data}
\label{app:syn}
\begin{wrapfigure}{r}{9cm}
    \centering
    \includegraphics[scale=0.6]{noisy_amari2view.png}
    \caption{We have the two view case with a number of total sources and observed signals $100$ and a number of samples $1000$. We consider three cases of noise standard deviation: $\sigma=0.1,0.5,1.$ As soon as enough shared sources are present (around 60) ShIndICA  reaches its lowest Amari distance value (the lower, the better) in all cases. In the first two cases ($\sigma=0.1$ or $0.5$) the Amari distance gets closer to $0$ when the shared sources are $60.$ The error bars correspond to $95\%$ confidence intervals based on $50$ independent runs of the experiment. }
    \label{fig:2viewnoisy}
\end{wrapfigure}

\textbf{Objective function motivation.} In the following experiment, we compare MultiViewICA and ShIndICA when the observed data is high-dimensional on a two-view shared response model application, i.e., no individual sources. The experimental setup allows comparing standard MLE (MultiViewICA) and MLE after whitening (ShIndICA). Figure~\ref{fig:mlevsours} compares the two methods for a fixed sample size of $1000$. In Figure~\ref{fig:samples}, we set the number of sources to 100 and vary the sample size. For all experiments, the noise standard deviation is $0.01$. It seems that ShIndICA performs better in the case of insufficient data. This could be empirical evidence that the trace has stronger regularization properties than the MMSE term in the MultiViewICA objective.

\textbf{Noisy high-dimensional views.} First, we investigate the effect of noise on the Amari distance in the two-view experiment. We consider three cases in which the noise standard deviation is $\sigma=0.1,0.5,1$. The results are depicted in Figure~\ref{fig:2viewnoisy}. In the first two cases, the results are close to the ones discussed in the main paper. As expected, when noise with high variance ($\sigma=1$) is added, ShIndICA does not converge, which affects the quality of the estimated mixing matrices as measured by the Amari distance. The procedure is repeated 50 times, and the error bars are the $95\%$ confidence intervals based on the independent runs.

 
\textbf{Choice of $\lambda$.} This experiment used data from 2 views with 50 individual and 50 shared sources with varying noise standard deviation $\sigma \in\{0.1, 0.5, 1, 2, 10\}$ (x-axis). Each of the lines in Figure~\ref{fig:hyper} corresponds to a fixed hyperparameter $\lambda \in \{0.1, 0.5, 1, 2, 10\}$. It can be deduced that, for this particular experiment, there is no significant difference in the model performance for $\lambda\geq 0.5$.
\begin{figure}
\centering
 \includegraphics[scale=0.4]{noisy_lambda.png}
\caption{Choice of Hyperparameter $\lambda$. The data comes from a two-view model with 50 shared and 50 individual sources per view. The x-axis represents the noise standard deviation and the y-axis the Amari distance.}
\label{fig:hyper}
\end{figure}


\section{Model Justification}


\paragraph{Multi-view ICA importance in the scientific community.} As mentioned in our introduction, we would like to point out that ICA has proven to be a successful approach for analyzing biomedical data over the years since it solves blind source separation problems common in neuroscience and biomedicine, as stated in the main paper. Furthermore, many biomedical applications can be addressed as multi-view problems due to multiple subjects in a study (e.g., fMRI, EEG data) or data coming from different modalities (e.g., omics data). This led to the development of multi-view methods. Most of those approaches focus on shared response model setting (only shared sources), e.g., Group ICA, ShICA, MultiviewICA, IVA methods, and their corresponding variations. We list some recent scientific applications where multi-view ICA models were used in Table \ref{tab:my_label}. We also interpreted the used views and latent and observed signals.

\begin{table}[]
\tiny
    \centering
    \begin{tabular}{l|l|l|l|l}
      Study   & Application & Observed Signals & Latent Sources & Views \\ \hline
      \citep{salman2019group}   & Identifying biomarkers 
& fMRI data &  brain functional networks & multiple subjects \\
\citep{durieux2019partitioning} & Mental disorders detection & fMRI data &  brain functional networks & multiple subjects \\
\citep{long2020independent} & subgroup detection & fMRI data &  brain functional networks & multiple subjects \\
\citep{huster2015group} & Denoising & EEG data & brain activity patterns & multiple subjects \\
\citep{congedo2010group} & Diagnosis and assessment  & EEG data & eyes-closed resting EEG patterns & multiple subjects \\
&of abnormal brain functioning & & & \\
\citep{sompairac2019independent} & extensive overview & tumoral omics data & gene/protein  profiles & heterogeneous omics data \\
\citep{avila2018computational} & cell type decomposition &  tissue/tumor samples
& cell type-specific expressions & tissue/tumor samples \\
\citep{fraunhoffer2022multi} & prognostic prediction & transcriptomic profiles from PDAC epithelial & gene profile  & three types of transcriptome data\\
& &   and microenvironment cells & &
    \end{tabular}
    \caption{List of recent studies that use ICA as a common data analysis tool. We also provide the application, the data modalities used, and an interpretation of the latent sources and views.}
    \label{tab:my_label}
\end{table}
\paragraph{The shared response models are restrictive.} There is a growing interest in examining individual variability rather than shared signals in the application areas mentioned above \citep{dubois2016building}; see, e.g., \citep{seghier2018interpreting,bartolomeo2017botallo,long2020independent}.  For instance, one can be interested in the effect of individual brain patterns on brain activity to develop more robust biomarkers. Another application where shared response models (GroupICA, MultiviewICA, IVA, etc.) would not be a sensible choice is data integration of omics data. This is an important research direction in computational biology, where we are interested in preserving the shared biological signal between datasets (views) and individual ones, as illustrated in our example. Existing approaches for the tasks mentioned above consist of two steps: applying ICA/IVA on the data followed by statistical analysis (as in \citep{long2020independent}) to separate the individual from the shared sources (or vice versa). Thus, we believe ShIndICA is a valuable addition to this set of tools. 

\paragraph{Linearity assumption in the biomedical domain.} The nature of the data in the targeted domains can explain the linear assumption. More precisely, if we consider the examples from above: the linear mixing of the components in the fMRI data context has been justified by various studies, e.g. \cite{mckeown1998independent}, and in the other applications, the linear assumption can be achieved after data transformation, e.g. log-transforming the transcriptome data. Moreover, the linearity assumption is valid in many real-life applications in the biomedical domain, where we often have a high-dimensional setting (gene activity, experimental measurements, etc.) with a low number of observed samples (participants, experiments). Moreover, in the low-data regime, if we know too little about the underlying problem, the linear approach is often a better option than overparametrizing it with a deep learning model. 
Even though a non-linear multiview version would be a valuable addition to the current active research on non-linear ICA, e.g. \citep{hyvarinen2016unsupervised,hyvarinen2017nonlinear,monti2020causal}, the identifiability justification of the proposed methods relies on assumptions that are hard to satisfy in real-life data scenarios (e.g. the assumption of Variability \citep{hyvarinen2019nonlinear}). In our linear version, we ensure identifiability without any requirements on how distinct the views should be. 
\section{Model Assumptions}

To prove the identifiability of the stated model, we require that four assumptions should be satisfied:
 
\begin{enumerate}
\item The mixing matrices have full-column rank.  This implies that we require that the sources have a minimal representation, i.e. the number of latent sources is minimal, which is a realistic assumption.
    \item The second assumption is additive noise on the sources. It can be interpreted as a measurement error on the device with variance $\sigma^2 A_dA_d^\top.$ We choose this setting compared to the $A_ds_d+\epsilon_d$ because, in our case, we get a likelihood in a closed form which is not available in the latter representation. \cite{richard2020modeling,richard2021shared} make a similar assumption for the shared response model setting.
    \item The sources are mutually independent and non-Gaussian. This is a standard ICA assumption \citep{comon1994independent}. Gaussian random variables, called “white” noise, represent noise variables, which besides location and scale, do not carry real information.  Thus, if all sources are Gaussian, either they cannot be identified (see, for example,  Proposition 3 \citep{richard2020modeling}) or additional assumptions on the variance structure need to be made to assure identifiability \citep{richard2021shared}. The non-Gaussian random variables carry meaning and are identifiable.  This is not a restrictive assumption since the sources in real-life scenarios are often non-Gaussian: fMRI, EEG, and omics data. The fixed mean, and variance are also assumptions often adopted in ICA (e.g. \citep{richard2021shared,hyvarinen2000independent}). 
    \item The measurement error is independent of the latent signal. This is a common assumption in measurement error models known as classical errors. It is a realistic assumption since we usually do not expect the measurement error to influence the true signal and vice versa \citep{richard2020modeling,richard2021shared,gresele2020incomplete}.
\end{enumerate}




\newpage


\bibliography{pandeva_351}

\end{document}
