\documentclass[accepted]{uai2022_supplement} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.
%\documentclass[twoside]{article}
%\usepackage{aistats2022}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    %\bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage[noend]{algpseudocode}

\usepackage[american]{babel}

\usepackage{amsthm}

\usepackage{graphicx}
\usepackage{amsmath, amssymb} % For better math
\usepackage{float}
\usepackage{xcolor}
\usepackage{tikz}


\newcommand{\antti}[1]{{{\color{blue} [Antti: #1]}}}
\newcommand{\vitoria}[1]{{{\color{violet} [Vitoria: #1]}}}
\newcommand{\aapo}[1]{{{\color{magenta} [Aapo: #1]}}}

%\usepackage{amsthm}
%\newtheorem{theorem}

\newcommand{\indep}{\perp \!\!\! \perp}

\newcommand{\w}{\mathbf{w}}
\newcommand{\Rb}{\mathbb{R}}
\newcommand{\db}{\mathbf{d}}
\renewcommand{\a}{\mathbf{a}}
\renewcommand{\b}{\mathbf{b}}
\newcommand{\s}{\mathbf{s}}
\renewcommand{\S}{\mathbf{S}}
\newcommand{\e}{\mathbf{e}}
\newcommand{\xaug}{\tilde{\x}}
\newcommand{\xauggen}{\tilde{\tilde{\x}}}
\newcommand{\cb}{\mathbf{c}}
\newcommand{\qtot}{\tilde{q}}
\newcommand{\qmarg}{\bar{q}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\xlat}{\xi}
\newcommand{\logistic}{\phi}
\newcommand{\xlatb}{{\boldsymbol{\xi}}}
\newcommand{\wmarg}{\bar{\mathbf{w}}}
\newcommand{\xpoint}{\mathbf{\bar{u}}}
\newcommand{\xpointind}{\bar{u}}
\newcommand{\z}{\mathbf{z}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\xx}{\tilde{x}}
\newcommand{\yy}{\tilde{y}}
\renewcommand{\ss}{\tilde{s}}
\newcommand{\xxb}{\mathbf{\xx}}
\newcommand{\yyb}{\mathbf{\yy}}
\newcommand{\ssb}{\mathbf{\ss}}
\newcommand{\sest}{z}
\newcommand{\sestb}{\mathbf{\sest}}
\newcommand{\yplain}{y}
\newcommand{\m}{\mathbf{m}}
\newcommand{\h}{\mathbf{h}}
\newcommand{\uu}{\mathbf{u}}
\newcommand{\kk}{\mathbf{k}}
\newcommand{\vb}{\mathbf{v}}
\newcommand{\cc}{\mathbf{c}}
\renewcommand{\u}{\mathbf{u}}
\newcommand{\f}{\mathbf{f}}
\newcommand{\g}{\mathbf{g}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\J}{\mathbf{J}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\C}{\mathbf{C}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\U}{\mathbf{U}}
\newcommand{\M}{\mathbf{M}}
\newcommand{\B}{\mathbf{B}}
\renewcommand{\L}{\mathbf{L}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\q}{\mathbf{q}}
\newcommand{\Q}{\mathbf{D}_\mathbf{q}}
\newcommand{\mub}{\boldsymbol{\mu}}
\newcommand{\Sigmab}{\boldsymbol{\Sigma}}
\newcommand{\Qb}{\mathbf{Q}}
\newcommand{\n}{\mathbf{n}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\lb}{\mathbf{l}}
\newcommand{\Sb}{\mathbf{S}}

\renewcommand{\texttt}[1]{#1}

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}

\title{Binary Independent Component Analysis: A  Non-stationarity-based Approach\\
(Supplementary Material)}

\author[1,2]{\href{mailto:<antti.hyttinen@helsinki.fi>?Subject=Your UAI 2022 paper}{Antti Hyttinen}{}}
\author[1,2,3]{\href{mailto:<vitoria.barin-pacela@mila.quebec>?Subject=Your UAI 2022 
paper}{Vitória Barin-Pacela}{}}
\author[1]{\href{mailto:<aapo.hyvarinen@helsinki.fi>?Subject=Your UAI 2022 paper}{Aapo~Hyv\"arinen}{}}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
University of Helsinki\\
   Helsinki, Finland
}
\affil[2]{%
    Helsinki Institute for Information Technology, Finland
}
\affil[3]{%
    Mila\\
    Universit\'e de Montr\'eal\\
    Montr\'eal, Canada
  }

\begin{document}
\onecolumn
\maketitle

\iffalse
\begin{abstract}
We consider independent component analysis of binary data. While fundamental in practice, this case has been much less developed than ICA for continuous data. We start by assuming a linear mixing model in a continuous-valued latent space, followed by a binary observation model. Importantly, 
we assume that the sources are non-stationary; 
this is necessary since any non-Gaussianity would essentially be destroyed by the binarization.
Interestingly, the model allows for closed-form likelihood by employing the cumulative distribution function of the multivariate Gaussian distribution. In stark contrast to the continuous-valued case, we prove non-identifiability of the model with few observed variables; our empirical results imply identifiability when the number of observed variables is higher. We present a practical method for binary ICA that uses only pairwise marginals, which are faster to compute than the full multivariate likelihood. 
Experiments give insight into the requirements for the number of observed variables, segments, and latent sources that allow the model to be estimated.
\end{abstract}
\fi

\appendix

\section{Proof of the Row Order Indeterminacy (Theorem 1)}
\setcounter{theorem}{0}
\setcounter{figure}{0}
\setcounter{equation}{0}

\begin{theorem}
If the row order of the 2-by-2 mixing matrix $\A$ of a binary ICA model is reversed, then the source means $\mub^u_\z$ and variances $\Sigmab^u_\z$ can be adjusted such that the implied distributions for the observed binary $\x^u$ remain identical.
\end{theorem}
\begin{proof}
Consider two binary ICA models
$\mathcal{M}=(\A,\{\mub^u_\z\}_u,\{\Sigmab^u_\z\}_u)$ and $\hat{\mathcal{M}}=(\hat{\A},\{\hat{\mub}^u_\z\}_u,\{\hat{\Sigmab}^u_\z\}_u)$ that have $n=2$ observed variables. Let $\hat{\A}$ be $\A$ with rows switched. We define parameters $\{\hat{\mub}^u_\z\}_u$, $\{\hat{\Sigmab}^u_\z\}_u$ and scaling matrices $\{\Qb^u\}_u$ such that Equations~10 and~11 in the main paper are satisfied and therefore the binary distributions implied by both models for each segment are identical. First, let 
$\hat{\Sigmab}^u_\z=\Sigmab^u_\z$. This and the row switching of $\A$ means that the covariance matrix of $\q^u$ has just the order switched:
$\hat{\Sigmab}_{ \q}^u[2,2]= \Sigmab_{ \q}^u[1,1]$,
$\hat{\Sigmab}_{ \q}^u[1,1]= \Sigmab_{ \q}^u[2,2]$, $\hat{\Sigmab}_{ \q}^u[1,2]= \Sigmab_{ \q}^u[1,2]$ (since this matrix is symmetric).
The equations implied by Equation~9 in the main paper for each $u$ are:
\begin{eqnarray*} \Qb^u[1,1]^2 \Sigmab_{ \q}^u[1,1] &=&\Sigmab_{ \q}^u[2,2], \\
\Qb^u[2,2]^2 \Sigmab_{ \q}^u[2,2]&= &\Sigmab_{ \q}^u[1,1],\\ 
\Qb^u[1,1] \cdot \Qb^u[2,2]\cdot \Sigmab_{ \q}^u[1,2]&=& \Sigmab_{ \q}^u[1,2].
\end{eqnarray*}
These can be solved by setting 
\begin{eqnarray*}
\Qb^u[1,1]&=&\sqrt{\Sigmab_{ \q}^u[2,2]/\Sigmab_{ \q}^u[1,1]}, \\
\Qb^u[2,2] &=&\sqrt{\Sigmab_{ \q}^u[1,1]/\Sigmab_{ \q}^u[2,2]}. 
\end{eqnarray*}
Finally, solve for $\hat{\mub}_\z^u$ from Equation~10 in the main paper since $\A,\hat{\A},\Qb^u$ are invertible. 

\end{proof}

\section{Proof of the Correlation Identifiability (Theorem 2)}

\begin{theorem}
Two binary ICA models imply different distributions for binary observations $\x^u$ (in a given segment $u$) if the correlation matrices for $\q^u$ are not equal.
\end{theorem}

\iftrue
We will first present the result assuming zero means for $\q^u$ since it is more approachable to the reader. Appendix Figure~\ref{fig:thm_plot1} explains this case visually. The full technical proof is given afterwards. Appendix Figures~\ref{fig:thm_plot3} and~\ref{fig:thm_plot2} explain the general case visually.

\begin{proof}[Proof assuming zero means]

We can focus here on bivariate models as the multivariate normal for $\q^u$ can be straightforwardly marginalized to the bivariate case.
Suppose the two models respectively imply:
\begin{equation}
\q^u \sim \mathcal{N}( \mathbf{0} , \Sigmab_{ \q}^u),\quad \hat{\q}^u \sim \mathcal{N}( \mathbf{0} , \hat{\Sigmab}_{ \q}^u).\label{q_eq}\end{equation}

Due to Equations~10 and~11 in the main paper we can also assume we are dealing with ``standardized'' models where the diagonal entries of the covariance matrices are equal to one for both models. 

The correlation/covariance matrices for $\q$ and $\hat{\q}$ are:
$$
%cor(\q)=
\Sigmab_{\q}^u=
 \left( \begin{array}{ccc} 
 1 &\alpha \\
\alpha & 1 \end{array}\right),\quad 
\hat{\Sigmab}_{\q}^u=
 \left( \begin{array}{ccc} 
 1 &\beta \\
\beta & 1 \end{array}\right).
$$
We study the difference in the implied binary distribution by the two models by creating the Gaussian distributions for $\q^u$ and $\hat{\q}^u$ from a single standard multivariate Gaussian source. The distributions can be formed from a standard normal $\mathbf{n}\sim \mathcal{N}(\mathbf{0}, \mathbf{I})$, for example by multiplying with matrices
$$
\A= \left( \begin{array}{ccc} 
 1 & 0 \\
%\frac{\alpha}{\sqrt{1-\alpha^2}} 
\alpha
& \sqrt{1-\alpha^2}\end{array}\right), \quad \hat{\A}= \left( \begin{array}{ccc} 
 1 & 0 \\
%\frac{\alpha}{\sqrt{1-\alpha^2}} 
\beta
& \sqrt{1-\beta^2}\end{array}\right)
$$
such that
$$
\q= \A \mathbf{n}, \quad \hat{\q} = \hat{\A} \mathbf{n}.
$$
 We will assume $\alpha > \beta$ without loss of generality. Let's look at which values for $\n$ result in different assignments for the binary variables. Recall that the assignment is determined deterministically by the quadrant $\q^u$ and $\hat{\q}^u$ land in. Intuitively, the model with higher correlation $\alpha$ implies more similar values for the binary variables. For the $\alpha$-model (with $\A$):
$$
x^u_1 =  \begin{cases} 0, & \text{ if }n_1 > 0 \\
1, & \text{ if }n_1 < 0
\end{cases} , \quad x_2^u = \begin{cases} 0, & \text{ if }-n_2 < \frac{\alpha}{\sqrt{1-\alpha^2}}n_1\\
1, & \text{ if }-n_2 > \frac{\alpha}{\sqrt{1-\alpha^2}}n_1
\end{cases}.
$$
And for the $\beta$-model (with $\hat{\A}$):
$$
x^u_1 =  \begin{cases} 
0, & \text{ if }n_1 > 0\\
1, & \text{ if }n_1 < 0
\end{cases}, \quad 
x^u_2 =  \begin{cases} 
0, & \text{ if }-n_2 < \frac{\beta}{\sqrt{1-\beta^2}}n_1\\
1, & \text{ if }-n_2 > \frac{\beta}{\sqrt{1-\beta^2}}n_1
\end{cases}.
$$
Note that due to the construction both models agree on the value of the binary variable $x^u_1$.

\begin{figure*}
    \centering
    \includegraphics[scale=0.75]{thm_plot1.pdf}
\caption{Bivariate standard normal $\n$ and colors indicating which binary assignments are implied with $\alpha=0.5$ (left) and with $\beta=-0.5$ (center). For this case with zero means, with higher correlation value $\alpha$ we get more $00$ and $11$ assignments as can be seen from the rightmost plot. Grey points in the rightmost plot do not imply extra 00 or 11 assignments with either correlation value and are irrelevant for the proof. 
\label{fig:thm_plot1} }
\end{figure*}

With $\beta$ we get extra assignments such that $x^u_1=x^u_2=0$ if:
\begin{eqnarray}
n_1 &>& 0 \quad \text{AND} \quad 
-n_2 \in \left[\frac{\alpha}{\sqrt{1-\alpha^2}}n_1,\frac{\beta}{\sqrt{1-\beta^2}}n_1 \right] \label{eq:full1zero}
\end{eqnarray} 
Since $\alpha > \beta$ and $x/\sqrt{1-x^2}$ is increasing, the interval for $-n_2$ is empty, and no $\n$ implies $x^u_1=x^u_2=0$ with $\beta$ if not with $\alpha$. Suppose $\n$ is such that
\begin{eqnarray*}
n_1 &>& 0
%%%%%%%%
\quad  \text{ AND } \quad
-n_2 \in \left[\frac{\beta}{\sqrt{1-\beta^2}}n_1, \frac{\alpha}{\sqrt{1-\alpha^2}}n_1 \right].
\end{eqnarray*}
The binary values implied are $x^u_1=x^u_2=0$ with $\alpha$ and $x^u_1=0,x^u_2=1$ with $\beta$.
Since $\alpha > \beta$ and $x/\sqrt{1-x^2}$ is increasing, the interval for $-n_2$ has non-zero measure.
Thus there is a nonzero measure for obtaining extra $x^u_1=x^u_2=0$ with $\alpha$. See Figure~\ref{fig:thm_plot1} for pictorial representation of the situation when $\alpha=0.5$, $\beta=-0.5$. 
\end{proof}

\begin{proof}
We can focus here on bivariate models as the multivariate normal for $\q^u$ can be straightforwardly marginalized to the bivariate case.
Suppose the two models respectively imply:
\begin{equation}
\q^u \sim \mathcal{N}( \mub_{ \q}^u , \Sigmab_{ \q}^u),\quad \hat{\q}^u \sim \mathcal{N}( \hat{\mub}_{ \q}^u , \hat{\Sigmab}_{ \q}^u).\label{q_eq2}\end{equation}
Then the marginals are:
\begin{eqnarray*}
P(x_1^u=1)&=&\Phi(0|\mu_1,\sigma_1^2)=\Phi\left(-\frac{\mu_1}{\sigma_1}|0,1\right),\\
P(\hat{x}_1^u=1)&=&\Phi(0|\hat{\mu}_1,\hat{\sigma}_1^2)=\Phi\left(-\frac{\hat{\mu}_1}{\hat{\sigma}_1}|0,1\right),\end{eqnarray*}
where $\mu_1$, $\hat{\mu}_1$, $\sigma_1$, and $\hat{\sigma}_1$ denote the parameters in Equation~\ref{q_eq2}. For the models to imply the same distributions the marginals need to be the same. The same applies for $x_2^u$ with parameters $\mu_2$, $\hat{\mu}_2$, $\sigma_2$, and $\hat{\sigma}_2$. Since $\Phi$ is monotonically increasing, we can assume from here on:
$$
\mu_1\hat{\sigma}_1 = \hat{\mu}_1\sigma_1, \quad
 \mu_2 \hat{\sigma}_2 =\hat{\mu}_2 \sigma_2.
$$
Due to Equations~10 and~11 in the main paper we can also assume we are dealing with ``standardized'' models where the diagonal entries of the covariance matrices are equal to one for both models. We get:
$$
\mu_1 = \hat{\mu}_1,\quad \mu_2 =\hat{\mu}_2,\quad
\hat{\sigma}_1 = \sigma_1 =  \hat{\sigma}_2 = \sigma_2 = 1.
$$

The correlation/covariance matrices for $\q$ and $\hat{\q}$ are:
$$
%cor(\q)=
\Sigmab_{ \q}^u=
 \left( \begin{array}{ccc} 
 1 &\alpha \\
\alpha & 1 \end{array}\right),\quad 
\hat{\Sigmab}_{ \q}^u=
 \left( \begin{array}{ccc} 
 1 &\beta \\
\beta & 1 \end{array}\right)
$$
We study the difference in the implied binary distribution by the two models by creating the Gaussian distributions for $\q^u$ and $\hat{\q}^u$ from a single standard multivariate Gaussian source. The distributions can be formed from a standard normal $\mathbf{n}\sim \mathcal{N}(\mathbf{0}, \mathbf{I})$, for example by multiplying with matrices$$
\A= \left( \begin{array}{ccc} 
 1 & 0 \\
%\frac{\alpha}{\sqrt{1-\alpha^2}} 
\alpha
& \sqrt{1-\alpha^2}\end{array}\right), \quad \hat{\A}= \left( \begin{array}{ccc} 
 1 & 0 \\
%\frac{\alpha}{\sqrt{1-\alpha^2}} 
\beta
& \sqrt{1-\beta^2}\end{array}\right)
$$
such that 
$$
\q= \A \mathbf{n}+\mub, \quad \hat{\q} = \hat{\A} \mathbf{n}+\hat{\mub},
$$
where $\mub=\hat{\mub}$ as established above. We will assume $\alpha > \beta$ without loss of generality. Let's look at which values for $\n$ result in different assignments for the binary variables. Recall that the assignment is determined deterministically by the quadrant $\q^u$ and $\hat{\q}^u$ land in. Intuitively, the model with higher correlation $\alpha$ implies more similar values for the binary variables. For the $\alpha$ model:
$$
x_1^u =  \begin{cases} 0, & \text{ if }n_1 > -\mu_1 \\
1, & \text{ if }n_1 < -\mu_1
\end{cases} , \quad x_2^u = \begin{cases} 0, & \text{ if }-n_2 < \frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2\\
1, & \text{ if }-n_2 > \frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2
\end{cases}.
$$
And for the $\beta$ model:
$$
\hat{x}_1^u =  \begin{cases} 
0, & \text{ if }n_1 > -\mu_1 \\
1, & \text{ if }n_1 < -\mu_1
\end{cases}, \quad 
\hat{x}_2^u =  \begin{cases} 
0, & \text{ if }-n_2 < \frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2\\
1, & \text{ if }-n_2 > \frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2
\end{cases}.
$$
%Lets look at the probability of $\x=\mathbf{0}$ implied by the models, i.e. $P\left( \q  > \mathbf{0} \right)$, this is the probability that we draw $\mathbf{u}$ such that both elements of $\q$ are positive. 
Due to the construction both models agree on the value of the binary variable $x_1^u$.

In the zero-mean case presented above, we got more 00 \emph{and} 11 assignments with the higher correlation $\alpha$ than with the lower correlation $\beta$ (Figure~\ref{fig:thm_plot1}). Here we can only prove that we always get more 00 \emph{or} 11 assignments, since changing the mean complicates matters (Figures~\ref{fig:thm_plot3} and~\ref{fig:thm_plot2}). This is still enough for showing that the distributions are different. First, we show that the lower correlation $\beta$ cannot give extra 00 \emph{and} 11 assignments in comparison to $\alpha$ (separately for positive and negative $\alpha$).


\paragraph{Case $\alpha > 0$} With $\beta$ we get additional assignments such that $x_1^u=x_2^u=0$ if:
\begin{eqnarray}
n_1 > -\mu_1 \quad \text{AND} \quad 
-n_2 \in \left[\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2,\frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2 \right] \label{eq:full1} 
\end{eqnarray} 
Replacing $n_1$ with smaller $-\mu_1$ in the lower bound gives a necessary condition for this:
\begin{eqnarray}
-n_2 
 \in \left[-\frac{\alpha}{\sqrt{1-\alpha^2}}\mu_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2, \frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2 \right] \label{eq:nec1}
\end{eqnarray}
With $\beta$ we get additional assignments $x^u_1=x^u_2=1$ if:
\begin{eqnarray}
n_1 < -\mu_1 \quad \text{AND} \quad
-n_2 \in \left[\frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2,\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2\right] \label{eq:full2}
\end{eqnarray}
Replacing $n_1$ with larger $-\mu_1$ in the upper bound gives a necessary condition:
\begin{eqnarray}
-n_2 \in \left[\frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2,-\frac{\alpha}{\sqrt{1-\alpha^2}}\mu_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2\right]\label{eq:nec2}
\end{eqnarray}
Since the lower bound of Equation~\ref{eq:nec1} matches the upper bound of Equation~\ref{eq:nec2}, and the bound is constant with respect to $\n$, the two necessary conditions cannot both be fulfilled given any fixed model. Therefore, the conditions for which they are necessary, Equation~\ref{eq:full1} and Equation~\ref{eq:full2} respectively, cannot both be satisfied for any fixed model either. Note that either Equation~\ref{eq:full1} or Equation~\ref{eq:full2} can be satisfied alone.

\paragraph{Case $\alpha < 0$} Also $\beta <0$ here. With $\beta$ we get additional assignments such that $x^u_1=x^u_2=0$ if:
\begin{eqnarray}
n_1 > -\mu_1 \quad \text{AND} \quad -n_2
\in \left[\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2,\frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2 \right] \label{eq:full1neg}
\end{eqnarray} 
Replacing $\beta n_1$ with larger $-\beta \mu_1$ in the upper bound gives a necessary condition for this:
\begin{eqnarray}
-n_2
\in \left[\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2,-\frac{\beta}{\sqrt{1-\beta^2}}\mu_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2 \right] \label{eq:nec1neg}
\end{eqnarray}
With $\beta$ we get additional assignments $x^u_1=x^u_2=1$ if:
\begin{eqnarray}
n_1 < -\mu_1 \quad \text{AND} \quad
-n_2 \in \left[\frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2,\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2 \right] \label{eq:full2neg} 
\end{eqnarray}
Replacing $\beta n_1$ with smaller $-\beta \mu_1$ in the lower bound gives a necessary condition:
\begin{eqnarray}
-n_2 \in \left[-\frac{\beta}{\sqrt{1-\beta^2}}\mu_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2,\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2 \right]\label{eq:nec2neg}
\end{eqnarray}
Since the upper bound of Equation~\ref{eq:nec1neg} matches the lower bound of Equation~\ref{eq:nec2neg}, and the bound is constant with respect to $\n$, the two necessary conditions cannot both be fulfilled given any fixed model. Therefore, the conditions for which they are necessary, Equation~\ref{eq:full1neg} and Equation~\ref{eq:full2neg} respectively, cannot both be satisfied for any fixed model either. Note that either Equation~\ref{eq:full1neg} or Equation~\ref{eq:full2neg} can be satisfied alone.

\begin{figure*}
    \centering
    \includegraphics[scale=0.75]{thm_plot3.pdf}
\caption{Bivariate standard normal $\n$ and colors indicating which binary assignments are implied with $\alpha=0.5$ (left) and with $\beta=-0.5$ (center). For this case with $\mu_1=-1,\mu_2=-1$, with higher correlation value $\alpha$ we (provably) get more $00$ assignments as can be seen from the rightmost plot. Grey points in the rightmost plot do not imply extra 00 or 11 assignments with either correlation value and are irrelevant for the proof. \label{fig:thm_plot3} }
\end{figure*}

\paragraph{Extra 00 with $\alpha$} 
Suppose Equation~\ref{eq:full1} or Equation~\ref{eq:full1neg} is not satisfied. This means that no $\n$ implies $x^u_1=x^u_2=0$ with $\beta$ if not with $\alpha$. Suppose $\n$ is such that
\begin{eqnarray*}
n_1 &>& \max \left(-\mu_1, 
%%%%%%%%%
\mu_2 \left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right) \big/ \left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right) 
%%%%%%%%
\right)  \text{ and } \\
-n_2 &\in& \left[\frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2, \frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2 \right].
\end{eqnarray*}
The binary values implied are $x^u_1=x^u_2=0$ with $\alpha$ and $x^u_1=0,x^u_2=1$ with $\beta$.
Furthermore, the following shows that the interval for $-n_2$ has non-zero measure. The first multiplication is permitted as $x/\sqrt{1-x^2}$ is increasing and $\alpha > \beta$.
\begin{eqnarray*}
n_1 &>&
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)/\left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right)\quad || \cdot \left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right) \\
\left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right) n_1 &>&
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)\\
\frac{\alpha}{\sqrt{1-\alpha^2}}n_1  &>&
\frac{\beta}{\sqrt{1-\beta^2}}n_1+
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)\\
\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2
&>&
\frac{\beta}{\sqrt{1-\beta^2}}n_1+
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)+\frac{1}{\sqrt{1-\alpha^2}}\mu_2 \\
&=& \frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2.
\end{eqnarray*}
Thus there is a nonzero measure for obtaining extra $x^u_1=x^u_2=0$ with $\alpha$. See Figure~\ref{fig:thm_plot3} for pictorial representation of the situation when $\alpha=0.5$, $\beta=-0.5$, $\mu_1=-1,\mu_2=-1$. 



\begin{figure*}
    \centering
    \includegraphics[scale=0.75]{thm_plot2.pdf}
    %\includegraphics[scale=0.55]{idplot_time.pdf}
\caption{Bivariate standard normal $\n$ and colors indicating which binary assignments are implied with $\alpha=0.5$ (left) and with $\beta=-0.5$ (center). For this case with $\mu_1=1,\mu_2=1$, with higher correlation value $\alpha$ we (provably) get more $11$ assignments as can be seen from the rightmost plot. Grey points in the rightmost plot do not imply extra 00 or 11 assignments with either correlation value and are irrelevant for the proof.  \label{fig:thm_plot2} }
\end{figure*}

\paragraph{Extra 11 with $\alpha$} 
Suppose Equation~\ref{eq:full2} or Equation~\ref{eq:full2neg} is not satisfied. This means that no $\n$ implies $x^u_1=x^u_2=1$ with $\beta$ if not with $\alpha$.
Suppose $\n$ is such that
\begin{eqnarray*}
n_1 &<& \min \left(-\mu_1,  
%%%%%%%%
\mu_2 \left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right) \big/ \left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right) 
%%%%%%%%
\right)  \text{ and }\\
-n_2 &\in& \left[ \frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2, \frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2 \right].
\end{eqnarray*}
 The binary values implied are $x^u_1=x^u_2=1$ with $\alpha$ and $x^u_1=1,x^u_2=0$ with $\beta$.
Furthermore, the following shows that the interval for $-n_2$ has non-zero measure. The first multiplication is permitted as $x/\sqrt{1-x^2}$ is increasing and $\alpha > \beta$.
\begin{eqnarray*}
n_1 &<& 
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right) \big/ \left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right) \quad || \cdot \left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right) \\
\left(\frac{\alpha}{\sqrt{1-\alpha^2}}-\frac{\beta}{\sqrt{1-\beta^2}}\right)n_1 &<& 
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)  \\
\frac{\alpha}{\sqrt{1-\alpha^2}}n_1 &<&\frac{\beta}{\sqrt{1-\beta^2}}n_1+
\mu_2 \left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)  \\
\frac{\alpha}{\sqrt{1-\alpha^2}}n_1 + \frac{1}{\sqrt{1-\alpha^2}}\mu_2
&<&
\frac{\beta}{\sqrt{1-\beta^2}}n_1+
\mu_2\left(\frac{1}{\sqrt{1-\beta^2}} - \frac{1}{\sqrt{1-\alpha^2}}\right)
+\frac{1}{\sqrt{1-\alpha^2}}\mu_2 \\
&=& \frac{\beta}{\sqrt{1-\beta^2}}n_1+\frac{1}{\sqrt{1-\beta^2}}\mu_2.
\end{eqnarray*}
Thus there is a nonzero measure for obtaining extra $x^u_1=x^u_2=1$ with $\alpha$. 
See Figure~\ref{fig:thm_plot2} for pictorial representation of the situation when $\alpha=0.5$, $\beta=-0.5$, $\mu_1=1,\mu_2=1$.
\iffalse
\paragraph{Extra $11$ with $\alpha >0$ $\beta=0$} Suppose Equation~\ref{eq:full2} is not satisfied.
This means that no $\n$ implies $x^u_1=x^u_2=1$ with $\beta$ if not with $\alpha$. With $\alpha$ we get extra $x^u_1=x^u_2=1$ by sampling
$$
n_1 < \min(\mu_1,  \frac{\sqrt{1-\alpha^2}}{\alpha}\mu_2 - \frac{1}{\alpha}\mu_2    )  \quad \text{ and }\quad -n_2 \in [ \frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2, \mu_2]
$$
the interval for $-n_2$ has non-zero measure:
$$
\frac{\alpha}{\sqrt{1-\alpha^2}}n_1+\frac{1}{\sqrt{1-\alpha^2}}\mu_2
<
\frac{\alpha}{\sqrt{1-\alpha^2}}(\frac{\sqrt{1-\alpha^2}}{\alpha}\mu_2 - \frac{1}{\alpha}\mu_2)+\frac{1}{\sqrt{1-\alpha^2}}\mu_2
= \mu_2.
$$
There is a nonzero measure for obtaining extra $x^u_1=x^u_2=1$ with $\alpha$.
\fi
\end{proof}
\section{Proof of Theorem 3}

\begin{theorem}
%If two models
%$\mathcal{M}$
%and $\mathcal{M}'$ with $n=n_z$
%imply the same correlation matrices for $\q$ (in %a given segment)
% covariance matrices $\I + \dfrac{\pi}{8} \hat{\A} \hat{\Sigmab}_u {\hat{\A}}^\intercal  =\I + \dfrac{\pi}{8} \A \Sigmab_\z^u \A^\intercal$,
% then the means $\mub_\z^u$ can be adjusted such that the implied binary distributions are identical. \label{thm:means}
If two models
$\mathcal{M}$
and $\hat{\mathcal{M}}$ with $n=n_z$
imply the same correlation matrices for $\q^u$ (in a given segment)
% covariance matrices $\I + \dfrac{\pi}{8} \hat{\A} \hat{\Sigmab}_u {\hat{\A}}^\intercal  =\I + \dfrac{\pi}{8} \A \Sigmab_\z^u \A^\intercal$,
 then the means $\mub_\z^u$ can be adjusted such that the implied binary distributions are identical. %\label{thm:means}
\end{theorem}
\begin{proof}
If the models imply the same correlations for $\q^u$, they satisfy Equation~11 in the main paper.
Thus determine the positive diagonal matrices $\Qb^u$ from Equation~11  in the main paper, from the diagonal. Then solve for $\mub_\z^u$ from Equation~10 in the main paper since $\A$ and $\Qb^u$ are invertible. %\antti{Problem here in the less sources case! Need invertible $\A^T\A$ which it is.} 
Since the equations are satisfied, the implied binary distributions are identical.
\end{proof}

\section{Evaluation: Mean Cosine Similarity} \label{sec:mcs}

In the binary case, it is more relevant to evaluate the estimated \textbf{mixing matrix} than the sources, since the binarization process adds much more noise than simply adding Gaussian noise to the observations.
For this purpose, a similar procedure to mean correlation coefficient (MCC) is applied between the estimated mixing matrix and the true mixing matrix.

When there are only two components, the mixing matrix $\mathbf{A} \in \mathbb{R}^{2 \times 2}$ can be written considering its column vectors $\mathbf{A} = [\mathbf{a}_1, \mathbf{a}_2]$.
Each vector contains only two elements, so the correlation coefficient cannot be used, since $r(\mathbf{v}_1, \mathbf{v}_2) = \pm 1$ for all $\mathbf{v}_1, \mathbf{v}_2 \in \mathbb{R}^2$ for which it is defined. 
In addition, even if $n > 2$, the MCC is undesirable because the correlation coefficient subtracts the mean of each vector and is therefore invariant to constant shifts: $r(\mathbf{v}_1 + c\mathbf{1}, \mathbf{v}_2) = r(\mathbf{v}_1, \mathbf{v}_2)$ for any $c \in \mathbb{R}$, where $\mathbf{1}$ denotes the vector of ones.

Therefore, we employ the \textbf{Mean Cosine Similarity (MCS)} instead of the MCC. The MCS uses the cosine similarity -- instead of the correlation coefficient -- to determine whether the vectors of the true and estimated matrices are aligned:
\begin{equation}
    \cos( \mathbf{a}_1, \mathbf{a}_2)=
    \frac{\mathbf{a}_1 \cdot \mathbf{a}_2}{\|\mathbf{a}_1\|\|\mathbf{a}_2\|}
\end{equation}

Let us denote the $i^{\text{th}}$ column of a matrix $\mathbf{A} \in \mathbb{R}^{n \times n_s}$ as $\A[,i]$. In the MCS calculation, we aim to compare each column of $\mathbf{A}$ with each column of the estimated matrix $\hat{\mathbf{A}}$, thus getting a pair-wise cosine similarity. For simplicity, we consider a column permutation $p$ of matrix $\hat{\mathbf{A}}$ as $\hat{\mathbf{A}}[,p[i]]$. We compute the mean cosine similarity across all the columns for each permutation, and take the maximum, hence defining the MCS as:
\begin{equation}
    \text{MCS}(\mathbf{A}, \hat{\mathbf{A}}) = \max_p \left( \dfrac{1}{n_s} \sum_{i=1}^{n_s} \left\lvert \cos(\mathbf{A}[,i], \hat{\mathbf{A}}[,p[i]]) \right\rvert \right).
\end{equation}
Instead of actually going through all permutations, the computation can be efficiently performed via a linear assignment problem or a linear program.



\section{Variational Autoencoder for Binary Data (\texttt{linear iVAE})} 
\label{appendix_ivae}

\paragraph{Estimation}
The variational autoencoder\footnote{The notation here differs slightly from the previous in order to follow the notation in [Khemakhem \textit{et al.}, 2019] more closely.} iVAE
[Khemakhem \textit{et al.}, 2019] aims to estimate the observed data distribution $p(\mathbf{x}|\mathbf{u})=\int p(\mathbf{x}|\mathbf{z})p(\mathbf{z}|\mathbf{u})d\mathbf{z}$.
Given a dataset $\mathcal{D} = \{ (\mathbf{x}_i, \mathbf{u}_i) \}_{i}$, let $q_\mathcal{D}(\mathbf{x},\mathbf{u})$ be the empirical data distribution. The model learns by maximizing a lower bound $\mathcal{L}$ of the data log-likelihood 
\begin{equation}
    \mathbb{E}_{q_\mathcal{D}(\mathbf{x},\mathbf{u})}[ \log p_{\boldsymbol{\theta}} (\mathbf{x}|\mathbf{u})] \geq \mathcal{L}(\boldsymbol{\theta},\boldsymbol{\phi}).
\end{equation}
The loss function is: 
\begin{equation}
\begin{aligned}
\label{eq:ivae_loss}
    \mathcal{L}(\boldsymbol{\theta},\boldsymbol{\phi}) &:= \mathbb{E}_{q_\mathcal{D}(\mathbf{x},\mathbf{u})}[\mathbb{E}_{q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})}[\log p_{\boldsymbol{\theta}}(\mathbf{x}, \mathbf{z}|\mathbf{u}) - \log q_{\boldsymbol{\phi}} (\mathbf{z}|\mathbf{x},\mathbf{u})]] \\
    &= \mathbb{E}_{q_\mathcal{D}(\mathbf{x},\mathbf{u})}[\mathbb{E}_{q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})}[\log p_{\boldsymbol{\theta}}(\mathbf{x} | \mathbf{z}, \mathbf{u})] + \mathbb{E}_{q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})}[\log p_{\boldsymbol{\theta}}(\mathbf{z}|\mathbf{u})]
    -\mathbb{E}_{q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})}[\log q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})]].
\end{aligned}
\end{equation}

\begin{figure}[t]
    \centering
    \includegraphics[scale=0.8]{binary_ivae_illustration.pdf}
    \caption{Binary linear iVAE illustration. In VAE terminology: the inference model is equivalent to the encoder, and the mixing model is equivalent to the decoder. The iVAE uses an additionally observed variable $\mathbf{u}$ to estimate the inference model. Additionally, the iVAE estimates a ``prior'' model for such additionally observed variables. Different from the continuous iVAE, the mixing model does not model the noise explicitly. Also in contrast to the continuous iVAE, the outputs of the model are the estimated probabilities, not the estimated observations. To obtain the probability of each element being 1, a Sigmoid function is applied element-wise to the output of the mixing model. Variables in bold under the model names denote the transformations learned by the model and are described in detail in the text.}
    \label{fig:binary_ivae}
\end{figure}

To compute the loss function, the expectation over the data distribution is implemented as an average over data samples. In order to deal with the expectation over $q_{\boldsymbol{\phi}}(\mathbf{z|x,u})$, we use the reparameterization trick and \emph{draw} vectors $\mathbf{z}$ from $q_{\boldsymbol{\phi}}(\mathbf{z|x,u})$.

To further develop iVAEs for binary data---which we refer to as \texttt{linear iVAE} in this paper---we notice that we are working with a factorized Bernoulli observational model. The loss terms developed previously in the continuous iVAE model can remain the same for the inference model and the prior model.
However, the loss term referring to the \textbf{mixing model} should be modified, since the data follows a \textbf{multivariate Bernoulli distribution}. We draw $\z^{(i)} \sim q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})$ using the outputs of the inference model in the reparameterization trick $\z^{(i)} = \g(\x, \uu) + \exp\bigl(\tfrac{1}{2}\vb(\x, \uu)\bigr) \odot \boldsymbol{\epsilon}^{(i)}$, where $\boldsymbol{\epsilon}^{(i)} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ and the element-wise exponential converts the log-variance $\vb(\x, \uu)$ into a standard deviation. Thus, the loss term relating to the mixing model can be given as:
\begin{equation}
\begin{aligned}
    \mathbb{E}_{q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})}[\log p_{\boldsymbol{\theta}}(\mathbf{x} \vert \mathbf{z}, \mathbf{u})]
    &= \mathbb{E}_{q_{\boldsymbol{\phi}}(\mathbf{z}|\mathbf{x},\mathbf{u})}[\log p_{\boldsymbol{\theta}}(\mathbf{x} \vert \mathbf{z})] \approx \dfrac{1}{l} \sum_{i=1}^l \log p_{\boldsymbol{\theta}}(\mathbf{x} \vert \mathbf{z}^{(i)}) = \dfrac{1}{l} \sum_{i=1}^l \sum_{j=1}^{n} \log p_{\boldsymbol{\theta}}({x}_j | \z^{(i)}) \\
    &= \dfrac{1}{l} \sum_{i=1}^l \sum_{j=1}^{n} \left[x_j \log y_j^{(i)} + (1-x_j) \log (1-y_j^{(i)}) \right] \\
    &= \dfrac{1}{l} \sum_{i=1}^l \sum_{j=1}^{n} \log \text{Bernoulli}(x_j; y_j^{(i)}),
\end{aligned}
\end{equation}
where $y_j$ is the probability of the observation being 1, $0 \leq y_j \leq 1$, and it is modeled by applying an element-wise sigmoid function to the continuous output of the linear mixing model. Notice that $\y^{(i)}$ is a function of the estimated sources $\z^{(i)}$ drawn from the estimated posterior.
Hence, the expectation is approximated by computing the log-probability mass function of a Bernoulli distribution given such probability $y_j$.

\paragraph{Binary model}
In the model defined, all the transformations are linear, and the sources are drawn from a Gaussian distribution given their segment. Compared to the continuous iVAE, which uses nonlinear transformations in all the models, the binary model is linear and introduces changes to the mixing model and to the prior model. The prior model now estimates not only the log-variances but also the means.

When the observed variables are binary, we use a ``Bernoulli MLP'' 
[Kingma and Welling, 2014, Rezende et al., 2014]
as a decoder in the mixing model, which aims to estimate parameters from a Bernoulli distribution instead of a Normal distribution. The mixing model is modified from the continuous case by applying a sigmoid function element-wise to the output of the mixing model. In addition, in the binary case, we do not have an explicit factor accounting for the noise in the mixture, as illustrated in Figure \ref{fig:binary_ivae}.

In the following, we describe the model in more detail.
First of all, we notice that for simplicity and numerical stability when modeling the variances in both the inference model and the prior model, the transformations model the log-variances, which can easily be converted to the variances via exponentiation. With this trick, even a linear transformation can suffice for modeling the log-variances, thus making the model simpler.

The \textbf{prior model} is composed of a transformation modeling the prior mean, and a transformation modeling the prior log-variance.
The prior \textbf{mean} is modeled by 
\begin{equation}
    \begin{aligned}
        \boldsymbol{\eta}: \ & \R^m \rightarrow \mathbb{R}^{n_s} \quad
              \ & \mathbf{u} \mapsto \boldsymbol{\eta}(\mathbf{u})
    \end{aligned}
\end{equation}
where $\boldsymbol{\eta}$ is an affine transformation.
So the vector of means is given by $\boldsymbol{\eta} (\mathbf{u}) = \mathbf{W}_{\eta} \mathbf{u} + \mathbf{b}_{\eta}$, with matrix weights $\mathbf{W}_{\eta} \in \mathbb{R}^{n_s \times m}$, and a bias vector $\mathbf{b}_{\eta} \in \R^{n_s}$. 
The prior \textbf{log-variance} is modeled by
\begin{equation}
    \begin{aligned}
        \boldsymbol{\lambda}: \ & \R^m \rightarrow \mathbb{R}^{n_s} \quad
                  & \mathbf{u} \mapsto \boldsymbol{\lambda}(\mathbf{u})
    \end{aligned}
\end{equation}
where $\boldsymbol{\lambda}$ is an affine transformation.
The vector of log-variances is given by $\boldsymbol{\lambda}(\mathbf{u}) = \mathbf{W}_{\lambda} \mathbf{u} + \mathbf{b}_{\lambda}$, in which $\mathbf{W}_{\lambda} \in \mathbb{R}^{n_s \times m}$ are the weights, and $\mathbf{b}_{\lambda} \in \R^{n_s}$ are the biases. Notice that $\boldsymbol{\lambda}$ is unrelated to the notation from the exponential family, since we are modeling both the means and variances.

The \textbf{mixing model} learns a transformation
\begin{equation}
    \begin{aligned}
        \f: \ & \mathbb{R}^{n_s} \rightarrow \mathbb{R}^{n} \quad
             & \z \mapsto \f(\z)
    \end{aligned}
\end{equation}
where $\f$ is a linear transformation resulting in the continuous output $\f(\mathbf{z}) = \mathbf{W}_f \mathbf{z}$, in which $\mathbf{W}_{f} \in \mathbb{R}^{n \times n_s}$ is the matrix of weights.
Then, the probability of the estimated observed variables is given by
\begin{equation}
    \mathbf{y} = \text{Sigmoid}(\mathbf{W}_{f} \mathbf{z}).
\end{equation}
It is important to notice that each element of $\y$ is an individual probability of the particular observed variable being 1, $\{ y_i = P (x_i = 1) \}_{i=1}^{n}$.

The \textbf{inference model} has a transformation modeling the mean, and a transformation modeling the log-variance of the data. The data \textbf{mean} is modeled by
\begin{equation}
    \begin{aligned}
        \g: \ & \mathbb{R}^{n+m} \rightarrow \mathbb{R}^{n_s}  \quad
             & (\x, \uu) \mapsto \g(\x, \uu)
    \end{aligned}
\end{equation}
where $\g$ is an affine transformation.
We denote the concatenation of the vectors $\x$ and $\uu$ as $\x||\uu$.
The vector of means is given by $\g(\mathbf{x}, \mathbf{u}) = \mathbf{W}_{g} (\mathbf{x}||\mathbf{u}) + \mathbf{b}_g$, for a matrix $\mathbf{W}_{g} \in \mathbb{R}^{n_s \times (n+m)}$, and a bias vector $\mathbf{b}_g \in \R^{n_s}$.
The data \textbf{log-variance} is modeled by
\begin{equation}
    \begin{aligned}
        \vb: \ & \mathbb{R}^{n+m} \rightarrow \mathbb{R}^{n_s} \quad 
             & (\x, \mathbf{u}) \mapsto \vb(\x, \mathbf{u})
    \end{aligned}
\end{equation}
where $\vb$ is an affine transformation.
The vector of log-variances is given by $\vb(\mathbf{x}, \mathbf{u}) = \mathbf{W}_{v} (\mathbf{x}||\mathbf{u}) + \mathbf{b}_v$, where $\W_v \in \R^{n_s \times (n+m)}$ are the weights and $\mathbf{b}_v \in \R^{n_s}$ the biases.

\section{Further Details}

The experiments were run on computer clusters employing Intel Xeon E5-2680 v4
processors. The running times in Figure~5 (right) in the main paper (as well as all the results in all other experiments) were obtained using a single processor for a specific run. 

\end{document}
