\documentclass[accepted]{uai2023} 
\usepackage{natbib} 
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 

\usepackage{booktabs} 
\usepackage{tikz} 

\usepackage{amsmath,amsthm,amssymb}

\usepackage{subcaption}
\usepackage{natbib}
\usepackage{dblfloatfix}
\usepackage[ruled, vlined, nofillcomment, linesnumbered]{algorithm2e}
\usepackage{eda} 
\usepackage{prettyplots}
\usepackage{comment}

% Theorems
% Theorem-like environments declared with \newtheorem (amsthm is loaded above).
% `assumption` is a numbered environment headed "Assumption".
\newtheorem{assumption}{Assumption}
% The starred declarations create unnumbered environments.
\newtheorem*{problem}{Problem Statement}
\newtheorem*{postulate}{Postulate}

\usepackage{graphicx}
\usetikzlibrary{calc}
% \usetikzlibrary{fit}
\usetikzlibrary{positioning}
\usetikzlibrary{external}
% \tikzexternalize


% Paper title; the explicit \\ forces a line break inside the title.
\newcommand{\ourmaintitle}{Causal Discovery with Hidden Confounders\\ using the Algorithmic Markov Condition}
% Alias of \ourmaintitle; this is the macro later passed to \title.
\newcommand{\ourtitle}{\ourmaintitle}
% Name of the proposed method, set in small caps.
% NOTE(review): \xspace is not provided by any package loaded explicitly in the
% visible preamble -- presumably it comes from one of the local packages; confirm.
\newcommand{\ourmethod}{\textsc{cdhc}\xspace}
% Project URL; \codeurl and \apxurl below both expand to this same URL.
\newcommand{\oururl}{\url{https://eda.rg.cispa.io/prj/pepsi/}}
\newcommand{\codeurl}{\oururl}
\newcommand{\apxurl}{\oururl}


% Build switches. Each \newif creates a conditional (initially false) together
% with its \<name>true / \<name>false setters.
\newif\ifapx
\newif\ifpdf
% Set the pdf switch to true.
\pdftrue
% Set the apx switch to false (same as the \newif default, stated explicitly).
\apxfalse 


% Custom colour blue1, used by the ourblue style below.
\definecolor{blue1}{RGB}{6, 131, 178}
% Named TikZ styles: each bundles a colour and, for most of them, a line width and an opacity.
% NOTE(review): the colour names blue(munsell), deepcarrotorange, plum(traditional),
% dollarbill, mambacolor5 and red(ryb) are not defined in the visible preamble --
% presumably supplied by one of the local packages; confirm.
\tikzset{ourcolor/.style={blue(munsell), line width = 1.2pt, opacity = .9},
	ourorange/.style={deepcarrotorange, line width = 1.1pt,opacity = .7 }, 
	ourbl/.style={plum(traditional), line width = 1.1pt,opacity = .9}, 
	ourgreen/.style={dollarbill, line width = 1.1pt,opacity = .7},
	ouryellow/.style={mambacolor5, line width = 1.1pt,opacity = .9},
	ourred/.style={red(ryb)!80},  
	ourblue/.style={blue1!90} 
}

% Plot mark "hexagon": a filled six-sided polygon.
% All six vertices lie at distance 1.1547\pgfplotmarksize from the origin
% (1.1547 is approximately 2/sqrt(3)). The path starts at the top vertex and
% visits the polar angles 150, 210, 270, 330 and 30 degrees in turn.
\pgfdeclareplotmark{hexagon}
{%
  \pgfpathmoveto{\pgfqpoint{0pt}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{150}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{210}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{270}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{330}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{30}{1.1547\pgfplotmarksize}}
  % Close the outline back to the first vertex, then fill it.
  \pgfpathclose
  \pgfusepathqfill
}

% algorithm2e configuration shared by the pseudo-code in this document.
% \tcpas: comment macro whose text is delimited by literal braces { ... }.
\SetKwComment{tcpas}{\{}{\}}
% Typeset comments and arguments in the normal text font.
\SetCommentSty{textnormal}
\SetArgSty{textnormal}
% \Do{<condition>}{<body>}: do ... while loop.
\SetKwRepeat{Do}{do}{while}
% Plain keywords.
\SetKw{False}{false}
\SetKw{True}{true}
\SetKw{Null}{null}
% Input/output header entries; note the two trailing spaces in the "input  " label.
\SetKwInOut{Output}{output}
\SetKwInOut{Input}{input  }
\SetKw{AND}{and}
\SetKw{OR}{or}
\SetKw{Continue}{continue}

% \independent: relation symbol built from two overlaid copies of \perp.
% \mathpalette calls \independenT with the current math style as #1 and \perp as #2.
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
% Helper: typesets #2 in style #1 with zero width (\rlap), then a second copy
% shifted right by 2mu; the result is spaced as a relation (\mathrel).
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}

% Matrix shorthands for the linear model X = A^T X + B^T Z + eps.
% All transposes use \top so that \Ai and \Ait match \At, \Bt and \Ct.
% \Ai: inverse of (I - A^T); \Ait: its inverse transpose.
\newcommand{\Ai}{\ensuremath{(I-A^\top)^{-1}}}
\newcommand{\Ait}{\ensuremath{(I-A^\top)^{-\top}}}
% Transposes of A, B and C.
\newcommand{\At}{A^\top}
\newcommand{\Bt}{B^\top}
\newcommand{\Ct}{C^\top}
% Variable aliases. Each currently expands to the plain letter, so the notation
% for a variable can be changed in one place.
\newcommand{\Xb}{X}
\newcommand{\xb}{x}
\newcommand{\yb}{y}
\newcommand{\Zb}{Z}
\newcommand{\zb}{z}
\newcommand{\Wb}{W}
\newcommand{\wb}{w}
% Distribution shorthands built from the aliases above: P(X), P(Z), P(X, Z), Q(X, Z).
\newcommand{\Px}{P(\Xb)}
\newcommand{\Pz}{P(\Zb)}
\newcommand{\Pxz}{P(\Xb, \Zb)}
\newcommand{\Qxz}{Q(\Xb, \Zb)}
% Calligraphic symbols; \ensuremath makes them usable in text and in math mode.
\newcommand{\M}{\ensuremath{\mathcal{M}}}
\newcommand{\Ac}{\ensuremath{\mathcal{A}}}
\newcommand{\Pc}{\ensuremath{\mathcal{P}}}
% Fraktur M (math mode only).
\newcommand{\Mf}{\mathfrak{M}}
% Symbol used for the normal distribution; currently a plain N.
\newcommand{\N}{N}
% arg min as a real math operator: upright font, operator spacing, and the
% subscript set underneath in display style (starred form).
\DeclareMathOperator*{\argmin}{arg\,min}
% Graph symbols: G^*, G_{X,Z} and G_X.
\newcommand{\gt}{\ensuremath{G^{\ast}}}
\newcommand{\Gxz}{\ensuremath{G_{X,Z}}}
\newcommand{\Gx}{\ensuremath{G_X}}

% \norm{x}: argument wrapped in double vertical bars.
\newcommand\norm[1]{\lVert#1\rVert}

% Upright "MB" label (math mode).
\newcommand{\MB}{\text{MB}}
% F_1 symbols carrying the superscripts "net" and "conf".
\newcommand{\Fnet}{F_{1}^{\text{net}}}
\newcommand{\Fconf}{F_{1}^{\text{conf}}}
% \leq and = with a "+" set on top.
\newcommand{\leqp}{\overset{+}{\leq}}
\newcommand{\eqp}{\overset{+}{=}}

% Names of algorithms, typeset in small caps and followed by \xspace so that a
% following space in running text is handled automatically.
\newcommand{\GFCI}{\textsc{gfci}\xspace}
\newcommand{\RFCI}{\textsc{rfci}\xspace}
\newcommand{\FCI}{\textsc{fci}\xspace}
\newcommand{\GES}{\textsc{ges}\xspace}
\newcommand{\GGSL}{\textsc{ggsl}\xspace}
\newcommand{\NOTEARS}{\textsc{notears}\xspace}
\newcommand{\CoCa}{\textsc{coca}\xspace}
\newcommand{\tofft}{\textsc{3off2}\xspace}
\newcommand{\opt}{\textsc{opt}\xspace}
\newcommand{\DCD}{\textsc{dcd}\xspace}


% Upright math operators: \Pa prints "Pa", \cov prints "cov", \diag prints "diag".
\DeclareMathOperator{\Pa}{Pa}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\diag}{diag}


% Title block.
% NOTE(review): the page style "numbered" and the \author[..]/\affil[..] syntax
% are presumably provided by the uai2023 class; confirm.
\pagestyle{numbered}
\title{\ourtitle}

% Both authors carry affiliation index 1.
\author[1]{David Kaltenpoth}
\author[1]{Jilles Vreeken}
\affil[1]{%
  CISPA Helmholtz Center for Information Security\\
  Saarbr\"ucken

}
  \begin{document}
\maketitle
\begin{abstract}
 Causal sufficiency is a cornerstone assumption in causal discovery. It is, however, both unlikely to hold in practice and unverifiable. When it does not hold, existing methods struggle to return meaningful results.
 In this paper, we show how to discover the causal network over both observed \emph{and} unobserved variables. Moreover, we show that the causal model is identifiable in the sparse linear Gaussian case. More generally, we extend the algorithmic Markov condition to include latent confounders. 
 We propose a consistent score based on the Minimum Description Length principle to discover the full causal network, including latent confounders. Based on this score, we develop an effective algorithm that finds those sets of nodes for which the addition of a confounding factor $Z$ is most beneficial, then fits a new causal network over both observed as well as inferred latent variables.
\end{abstract}

\section{Introduction}
\label{sec:intro}

Discovering causal relationships from observational data is one of the most important open problems in science~\citep{pearl2009causality}.
Causal discovery methods aim to identify causal networks from data by reporting only edges that cannot be explained away by any other variables.
Most typical causal discovery methods have in common that unless \emph{all} relevant variables have been measured, they return models that include (many) spurious edges and thus lack a causal interpretation. The most commonly used approach is to wish the problem away by assuming \emph{causal sufficiency}, i.e., that all common causes of all observed variables are observed.
However, doing this does not make the issue disappear in practice; in many applications, including epidemiology~\citep{kesteloot2006dynamics}, economics~\citep{angrist2009mostly}, and bio-medicine~\citep{imbens2015causal}, we do not know all the relevant variables, nor would we be able to measure them even if we knew. 

In this paper, we present an approach to causal discovery that does not require causal sufficiency over the observed variables.
We give conditions under which it is possible to identify joint confounders, and show how to infer these using factor analysis, as well as how to construct a causal network without spurious edges over both the observed $X$ and the discovered confounders $Z$.

This may seem impossible. After all, given only a sample from $\Px$, there exist infinitely many joint distributions $\Pxz$ consistent with the marginal $\Px$, and picking out the $\Pxz$ that corresponds to the true causal mechanism sounds far-fetched. It is not.
First, we can exploit the fact that a causal graph discovered over the observed variables $X$ will contain many spurious edges $X_i \rightarrow X_j$ when both are affected by the hidden latent variable $Z$~\citep{elidan2000discovering}.
That is, by focusing on those subsets of $X$ that are densely connected in the graph, we can determine which variables are likely to share a hidden confounder.

While not every densely connected set of variables necessarily shares a hidden confounder, we can use the \emph{algorithmic Markov condition} (AMC) to find the simplest causal model describing the data~\citep{janzing2010causal}. In particular, such a model may include latent variables $Z$ affecting the observed $X$ so as to respect the \emph{independence of causal mechanisms}~\citep{parascandolo2018learning}.

Putting these two ideas together, we have the following natural approach: run a causal discovery algorithm on the observed data, find sets of densely connected variables in the learned graph, learn a latent factor model for each set, and then use the AMC to determine which so-found $\Pxz$ are simpler than $\Px$. If so, add the best newly discovered confounder $Z$ to $X$ and iterate until convergence. 

We show that our approach is both theoretically and empirically sound. It provably recovers the true set of confounded nodes under general conditions, while for the sparse linear Gaussian setting, it is consistent for recovering the entire model. 
Empirical evaluation shows it to be highly accurate. It improves over both methods that assume causal sufficiency and those that do not.
All code, data, results, and proofs can be found online on the authors' website.\!\footnote{\codeurl} 


\section{Theory}
\label{sec:theory}

We first describe our problem setting. We then prove identifiability for sparse linear Gaussian (SLG) models and provide a framework for causal discovery under latent confounding. With this, we derive a consistent score for the SLG.

\subsection{Problem Setting}
\label{sec:problem}
Let $X = (X_1,\ldots,X_m)$ and $Z = (Z_1,\ldots,Z_l)$ be two sets of variables with joint distribution $\Pxz$, where $X$ are observed and $Z$ unobserved variables. Our goal is to discover a network over $X, Z$, that is, a directed acyclic graph (DAG) $G_{X,Z} = (V, E)$ with vertices $X \cup Z$ and edges capturing the causal relationships in $\Pxz$. By marginalizing over $Z$, we obtain a distribution $\Px$ with a corresponding network $\Gx$. When the variables are clear from the context, we write $G$ for $\Gxz$, respectively $\Gx$.

To permit identifiability of the network $\Gxz$, we have to make some common assumptions~\citep{koller2009probabilistic}.
First, \emph{Causal Faithfulness}: if $U$ and $V$ are independent given $W$ in $\Pxz$, then $U$ and $V$ are $d$-separated by $W$ in $G$.
Second, the \emph{Causal Markov Condition}: each $Y \in X \cup Z$ is independent of its non-descendants given its parents $\Pa_G(Y)$. 
Third, we do not assume \emph{Causal Sufficiency} over $X$, but we \emph{do} assume it over $X \cup Z$. That is, we assume that all common parents of at least two variables $U, V \in X \cup Z$ are included in $\Xb \cup \Zb$. 
In other words, all non-causal correlations can be \emph{explained away} by conditioning on the right variables.
Last, we assume that all $Z_j$ are jointly independent and that no reverse causation exists, i.e., $\Pa(Z_j) = \emptyset$. 
Under these assumptions, we have
\begin{equation*}
  \Pxz = \prod_{i = 1}^mP(X_i\mid \Pa_i)\prod_{j = 1}^lP(Z_j)\,,
\end{equation*}
where $\Pa_i = \Pa_G(X_i)$. Such factorizations are a cornerstone of causal learning~\citep{pearl2009causality}, allowing for the identification of many causal effects. Our goal is the following.
\begin{problem}
  Given a sample $x^n$ only from the observed distribution $\Px$, discover
  \begin{itemize}
  \item a (small) set of latent variables $Z$
  \item a (sparse) network $G$ over $X$ and $Z$
  \item and a (simple) joint distribution $\Pxz$ such that
    \begin{equation*}
      \Pxz = \prod_{i = 1}^mP(X_i\mid \Pa_i)\prod_{j = 1}^lP(Z_j)\,,
    \end{equation*}
    factorizes according to the discovered $G$.
  \end{itemize}
\end{problem}
In the following, we note a simple property laying the foundations for discovering all three of these components. 

% Figure: schematic of the structural assumptions. Two dashed latent nodes Z_1, Z_2
% sit above two groups of observed nodes (X_1..X_k and X_{k+1}..X_m); each group
% is framed by a dotted rectangle labelled S_1 / S_2.
\begin{figure}[t]
  \centering
  \begin{tikzpicture}[every node/.style={circle, black, draw=black, minimum size=1.2cm}]
    % Latent nodes (dashed outline).
    \node[black!80, dashed] (Z1) {$Z_1$};
    \node[black!80, dashed, right=3cm of Z1] (Z2) {$Z_2$};
    % Observed nodes below each latent node.
    \node[below left=1cm and 0.25cm of Z1] (X1) {$X_1$};
    \node[below right=1cm and 0.25cm of Z1] (Xk) {$X_k$};
    \node[below left=1cm and 0.25cm of Z2] (Xk1) {$X_{k+1}$};
    \node[below right=1cm and 0.25cm of Z2] (Xm) {$X_m$};
    % Ellipsis placeholders halfway between the outer nodes of each group.
    \node[white, black, draw=none] (mid1) at ($(X1)!0.5!(Xk)$) {$\cdots$};
    \node[white, black, draw=none] (mid2) at ($(Xk1)!0.5!(Xm)$) {$\cdots$};
    % Dotted frames around each group; the empty nodes S1/S2 at the frame centres
    % serve as anchors for the inter-group edges below.
    \draw[dotted] ($(X1.north west)+(-.25,.25)$) rectangle ($(Xk.south east)+(0.25, -0.25)$) node[white, pos=.5] (S1) {};
    \node[below=-0.25cm of S1, draw=white] (S1lab) {$S_1$};
    \draw[dotted] ($(Xk1.north west)+(-.25,.25)$) rectangle ($(Xm.south east)+(0.25, -0.25)$) node[white, pos=.5] (S2) {};
    \node[below=-0.25cm of S2, draw=white] (S2lab) {$S_2$};
    % Solid edges: Z_i -> every node in S_i. Dotted thick edges: the few other
    % edges incoming to a group. The S1 -> S2 edge is drawn once only; drawing it
    % a second time with a different line width overlays two dot patterns.
    \path[->]
    (Z1) edge (X1)
    (Z1) edge (mid1)
    (Z1) edge (Xk)
    (Z2) edge (Xk1)
    (Z2) edge (mid2)
    (Z2) edge (Xm)
    (S1) edge[dotted, bend right=60, thick] (S2)
    (Z1) edge[dotted, bend left=30, thick] (S2)
    (S2) edge[dotted, thick, loop below, min distance=20mm, in=315, out=225] (S2)
    ;
  \end{tikzpicture}
  \caption{Structural assumptions 1 and 2 of our model. Each $Z_i$ has an edge towards all nodes in $S_i$ (solid), but only few other edges (dotted) are incoming to each $S_i$.}
  \label{fig:model}
\end{figure}
\subsection{Structure of Latent Confounding}
\label{sec:graph-confounding}
To solve our problem, we start with the following observation.
Whenever a set of variables $X_S = (X_i)_{i \in S}$, $S \subseteq \left\{ 1,\ldots,m \right\}$, are co-caused by an unmeasured $Z$, no pair $X_i, X_j \in X_S$ can be made independent by conditioning on any other subset $W \subset X$.
Thus, when $G$ captures the independences of $P$, all pairs in $X_S$ are connected. That is, $G$ contains a \emph{clique} over $X_S$~\citep{elidan2000discovering}.

\begin{proposition}[Confounders and Cliques]\label{prop:clique}
  Let $\Pxz$ be the joint distribution of $X, Z$ where $Z$ is one-dimensional and let $S = \left\{ i : Z \rightarrow X_i \right\}$.
  Then any graph $\Gx$ capturing the correlations in $\Px$ contains a clique over $X_S$.
\end{proposition}
When $Z$ is multivariate, each $Z_j$ induces its own clique in $G$, all of which may overlap.
Next, we show that this graphical characterization of confounding is already sufficient to identify the true model in the sparse linear Gaussian case.


\subsection{Identifiability for the Sparse Linear Gaussian Model}
\label{sec:ident}
To prove identifiability of the causal model, we assume that $\Pxz$ is given by a linear Gaussian structural causal model (SCM;~\citealp{pearl2009causality})
\begin{equation}
  \label{eq:gauss-sem}
  X = \At X + \Bt Z + \epsilon
\end{equation}
where $A$ encodes the DAG $G$, $Z \sim N(0, I)$ and $\epsilon \sim N(0, \diag(\sigma_{\epsilon}^2))$. We further make the following assumptions on the causal model generating our data $X, Z$.

\begin{figure*}
  \centering
  % Panel (a): a single latent confounder pointing to all ten observed nodes,
  % plus six edges among the observed nodes. The latent node is labelled Z_1 to
  % match the figure caption and the S_1 frame.
  \begin{subfigure}{0.33\textwidth}
    \centering
    \begin{tikzpicture}[every node/.style={circle,draw}, scale=0.66, transform shape]
      % X_1..X_5 on the bottom row (y = -2), X_6..X_10 on the top row (y = 4),
      % both rows using the same x-positions.
      \foreach \i in {1,...,5} {
        \node (x\i) at (-1.5*\i, -2) {$X_{\i}$};
      }
      \foreach \i in {6,...,10} {
        \node (x\i) at (-1.5*\i+7.5, 4) {$X_{\i}$};
      }
      % Latent node in the centre with an edge to every observed node.
      \node[black!80, dashed] (Z) at (-4.5, 1) {$Z_1$};
      \foreach \i in {1,...,10} {
        \draw[-{latex[]}] (Z) -- (x\i);
      }
      % The six edges between observed nodes.
      \path[-{latex[]}]
      (x1) edge (x2)
      (x3) edge (x4)
      (x5) edge (x10)
      (x6) edge (x7)
      (x7) edge (x8)
      (x8) edge (x9);
      % Dashed frame around the whole picture, labelled S_1.
      \node[rectangle, draw, dashed, minimum width=7.5cm, minimum height=7cm, label=left:$S_1$] (S1) at (-4.5,1) {};
    \end{tikzpicture}
    \caption{}
  \end{subfigure}%
  \hfill
  % Panel (b): two latent confounders, each pointing to five observed nodes.
  \begin{subfigure}{0.33\textwidth}
    \centering
    \begin{tikzpicture}[every node/.style={circle,draw}, scale=0.66, transform shape]
      % X_1..X_5 on the row y = 0, X_6..X_10 on the row y = 2 (same x-positions).
      \foreach \i in {1,...,5} {
        \node (x\i) at (-1.5*\i, 0) {$X_{\i}$};
      }
      \foreach \i in {6,...,10} {
        \node (x\i) at (-1.5*\i+7.5, 2) {$X_{\i}$};
      }
      % Latent nodes below and above the two rows.
      \node[black!80, dashed] (z1) at (-4.5, -2) {$Z_1$};
      \node[black!80, dashed] (z2) at (-4.5, 4) {$Z_2$};
      % Z_1 -> X_i and Z_2 -> X_{i+5} for i = 1..5; \j holds i+5.
      \foreach \i in {1,...,5} {
        \pgfmathtruncatemacro{\j}{\i+5}
        \path[-{latex[]}]
        (z1) edge (x\i)
        (z2) edge (x\j);
      }
      % Additional edges among observed nodes: one within the lower row, one from
      % the lower row into the upper row.
      \path[-{latex[]}]
      (x1) edge (x2)
      (x1) edge (x8);
      % Dashed frames around the two rows, labelled S_1 and S_2.
      \node[rectangle, draw, dashed, minimum width=7.5cm, minimum height=1.5cm, label=left:$S_1$] (S1) at (-4.5,0) {};
      \node[rectangle, draw, dashed, minimum width=7.5cm, minimum height=1.5cm, label=left:$S_2$] (S2) at (-4.5,2) {};
    \end{tikzpicture}
    \caption{}
  \end{subfigure}%
  \hfill
  % Panel (c): same layout as panel (b), with Z_2 additionally pointing to X_4
  % and two red dashed edges drawn on top.
  \begin{subfigure}{0.33\textwidth}
    \centering
    \begin{tikzpicture}[every node/.style={circle,draw}, scale=0.66, transform shape]
      % X_1..X_5 on the row y = 0, X_6..X_10 on the row y = 2 (same x-positions).
      \foreach \i in {1,...,5} {
        \node (x\i) at (-1.5*\i, 0) {$X_{\i}$};
      }
      \foreach \i in {6,...,10} {
        \node (x\i) at (-1.5*\i+7.5, 2) {$X_{\i}$};
      }
      % Latent nodes below and above the two rows.
      \node[black!80, dashed] (z1) at (-4.5, -2) {$Z_1$};
      \node[black!80, dashed] (z2) at (-4.5, 4) {$Z_2$};
      % Z_1 -> X_i and Z_2 -> X_{i+5} for i = 1..5; \j holds i+5.
      \foreach \i in {1,...,5} {
        \pgfmathtruncatemacro{\j}{\i+5}
        \path[-{latex[]}]
        (z1) edge (x\i)
        (z2) edge (x\j);
      }
      % Extra solid edges: Z_2 into the lower row, and X_1 into the upper row.
      \path[-{latex[]}]
      (z2) edge (x4)
      (x1) edge (x8);
      % Red dashed edges: the hypothetical additions discussed in the figure caption.
      \path[-{latex[]}, red, dashed]
      (z1) edge (x7)
      (x9) edge (x4);
      % Dashed frames around the two rows; labels are placed on the right here.
      \node[rectangle, draw, dashed, minimum width=7.5cm, minimum height=1.5cm, label=right:$S_1$] (S1) at (-4.5,0) {};
      \node[rectangle, draw, dashed, minimum width=7.5cm, minimum height=1.5cm, label=right:$S_2$] (S2) at (-4.5,2) {};
    \end{tikzpicture}
    \caption{}
  \end{subfigure}%
  \hfill
  \caption{Example graphs illustrating our structural assumptions. (a) All observed variables are confounded by the same factor $Z_1$, and 6 edges exist between its children $S_1$. (b) Two different confounders affecting five nodes each, and one additional edge incoming to each of the sets. (c) $Z_2$ affects one of the nodes in $S_1$. Furthermore, if we added either of the dashed red edges, too many edges incoming into $S_1$, respectively, $S_2$ would render the model unidentifiable.}
  \label{fig:examples}
\end{figure*}
\begin{assumption}
  There exists a partition of the variables $X$ into $l$ disjoint sets $S_1,\ldots, S_l$ of sizes $ \left|S_j\right|  \geq 4$ such that for each variable $X_i \in S_j$ the direct causal effect $b_{ij}$ of $Z_j \rightarrow X_i$ is non-zero, $b_{ij} \neq 0$.
\end{assumption}
This assumption guarantees that each $Z_j$ has an influence on a subset $S_j$ of the variables $X$ that is sufficiently large to recover its parameters $b_{ij}$. Of course, this would not avail us much if the overlaps between sets are too large, e.g., when two variables $Z_j \neq Z_k$ have exactly the same sets $S_j = S_k$ of downstream effects. To prevent such cases, we introduce our next assumption.
\begin{assumption}
  There are at most $\left| S_j \right|-4$ edges incoming to vertices in $S_j$, aside from the edges $Z_j \rightarrow S_j$.
\end{assumption}
This assumption ensures that the different $Z_j, Z_k$ cannot have too much overlap in their $S_j$, either through direct connections of $Z_j \rightarrow S_k$, or through indirect paths $Z_j \rightarrow S_j \rightarrow S_k$. That is, the sets $S_j$ are only weakly connected to each other to ensure distinguishability between the effects of different $Z_j$.
Note in particular that since models with $Z_j \rightarrow Z_k$ are indistinguishable from models where each node $X_i \in S_k$ also has an edge $Z_j \rightarrow X_i$, this assumption \emph{requires} $Z$ to be jointly independent.
Likewise, as we saw in the first example, there also cannot be too many connections between variables \emph{within} $S_j$, as this makes it impossible to tell which correlations are due to $Z_j$, and which due to causal effects of variables within $S_j$.

When all assumptions hold, the causal model is identifiable.

\begin{theorem}\label{thm:gauss-id}
  Let our distribution $\Pxz$ be described by the linear Gaussian SCM given in Eq.~\eqref{eq:gauss-sem}
  \begin{align*}
    X = \At X + \Bt Z + \epsilon\,,
  \end{align*}
  for some $Z$ of dimension $l \leq m/4$.
  Further, let assumptions 1--3 hold. Then the number $l$ of confounders and their parameters $B$ are identifiable up to column permutations and rescaling. Furthermore, if all noise variables $\epsilon$ have equal variances, then $A$ is also identifiable.
\end{theorem}

Unlike this sparse linear Gaussian (SLG) case, causal models with latent variables are generally overparametrized and thus unidentifiable.
As we show next, however, it is possible to identify \emph{whether or not} latent confounders are involved. 

\subsection{Algorithmic Model of Causality}
\label{sec:amc}
To determine whether variables $X$ are influenced by latent confounders, we introduce the algorithmic model of causality. We begin by studying $\Px$ under the assumption that no latent variables $Z$ are involved. 
In the algorithmic model of causality, the distribution $\Px$ with graph $G$ corresponds to a set of programs $f_i$ describing how each $X_i$ is generated from $\Pa_i$ and mutually independent noise $\epsilon$
\begin{align*}
  X_i = f_i(\Pa_i, \epsilon_i)\,, \quad \epsilon_i \independent \Pa_i\,.
\end{align*}
We can measure the complexity of such a description of $\Px$ using Kolmogorov complexity~\citep{li2009introduction}. Given a universal Turing machine $U$, it measures the length of the shortest program $p$ approximating a given function $f$,
\begin{align*}
  K(f) \coloneqq \min_p \left\{ \left| p \right| \mid \forall u \forall q: \left| U(u, p, q) - f(u) \right| \leq 1/q \right\}\,.
\end{align*}
Kolmogorov complexity optimally utilizes all available information and therefore measures the length of the best compression of $f$.
If $\Px$ is the distribution generated by a causal process, the \emph{true} causal model should also compress $\Px$ \emph{best}. This notion is captured by the algorithmic Markov condition (AMC)~\citep{janzing2010causal}.
\begin{postulate}[Algorithmic Markov Condition]
  Let $\gt$ be the true causal network for $\Px$. Then
  \begin{align}\label{eq:amc}
    K(\Px) \eqp \sum_{i = 1}^m K(P(X_i \mid \Pa_{\gt}(X_i)))\,,
  \end{align}
  where $\eqp$ denotes equality up to an additive constant that depends on the Turing machine $U$ but is independent of $P$.
\end{postulate}
In the bivariate case, if $X$ causes $Y$, it says
\begin{align*}
  K(P(X)) + K(P(Y \mid X)) \leqp K(P(Y)) + K(P(X \mid Y)).
\end{align*}

The AMC states that the true network $\gt$ not only provides the best factorization of $P$ but the best compression of $P$ in general.
In particular, $P(X_i \mid \Pa_i)$ and $P(X_j \mid \Pa_j)$ cannot be compressed better jointly than separately, that is, they do not share any structure. 
In other words, the causal mechanisms generating $X_i$ and $X_j$ are \emph{algorithmically independent} of each other~\citep{janzing2010causal}.

This independence breaks under latent confounding. Consider the case where $(X_1,\ldots,X_4)$ are given by $X = \Bt Z + \epsilon$ with unobserved univariate $Z \sim N(0, 1)$.
Then $\Px$ is fully described by its covariances $\sigma_{ij} = b_ib_j$. However, for $4$ variables, there are $6$ covariances described by the four parameters $b_1,\ldots,b_4$, rendering them dependent.

Finding models with independent causal mechanisms, therefore, requires us to consider models with latent factors, i.e., distributions $\Pxz$ with marginals $\Px$ and $P(Z)=\prod_j P(Z_j)$, the class of which we refer to as $\Pc$. The question is whether it is meaningful to compare $\Pxz$ with $\Px$ in terms of Kolmogorov complexity. 
The following theorem provides a positive answer to this question.

\begin{theorem}[Kolmogorov Does Not Incorrectly Detect Confounders]\label{thm:kc-ineq}
  For any distribution $\Px$, we have
  \begin{align}\label{eq:inf}
    \inf_{\Pxz \in \Pc} K(\Pxz) \leqp K(\Px)\,,
  \end{align}
  where the infimum is over all joint distributions $\Pxz$ with fixed marginal $\Px$ and independent $Z$.
  Conversely, if a joint distribution $\Pxz \in \Pc$ exists such that
  \begin{align}\label{eq:conf-better}
    K(\Pxz) < K(\Px)\,,
  \end{align}
  then the true generating mechanism of $X$ includes latent variables influencing some subset $\Xb_S$.
\end{theorem}
First, this result shows that there is no intrinsic bias towards one type of distribution being more complex than the other.
Second, a confounded model can only offer the best description of the data if the true generating mechanism involves unobserved variables.
In this case, we can write 
\begin{align*}
  K(P(X, Z)) = \sum_{i = 1}^m K(P(X_i \mid \Pa(X_i))) + \sum_{j = 1}^l K(P(Z_j))\,,
\end{align*}
where the parents of $X_i$ are among both $X$ and $Z$.
Note that while we cannot incorrectly infer latent confounders where there are none, we may miss latent variables that do exist.

\subsection{Instantiating the AMC}
\label{sec:amc-mdl}

Using Kolmogorov complexity, we have developed a framework for discovering latent confounders affecting $\Px$.
To use it in practice, however, we need to address two challenges: access only to a sample $x$ and incomputability of Kolmogorov complexity~\citep{li2009introduction}.

The first issue is resolved in the large sample limit, where measuring the suitability of a causal model using $K(x)$ instead of $K(\Px)$ leads to the same decisions~\citep{marx2021formally}.
To solve the second issue, we use the Minimum Description Length (MDL) principle~\citep{grunwald2007minimum}. In its simplest form, to encode data $x$, we choose a model $M$ and encode both $M$ and $x$ given $M$ as
\begin{align*}
  L(x, M) =  L(x \mid M) + L(M)\,.
\end{align*}
We then want to find the best model $M^{\ast}$ in some model class $\M$.
For any $\M$, we have $K(x) \leq \inf_{M \in \M} L(x, M)$, so that MDL provides a well-founded upper bound on $K$. 

In this work, our focus is not on identifying a single best model but on determining whether a subset $X_S$ of the observed variables is confounded or not. To distinguish between model classes, we use refined MDL~\citep{grunwald2007minimum} 
\begin{equation}\label{eq:bayes-mdl}
  L(x, \M) \coloneqq -\log \int\limits_{M \in \M}P(\xb \mid M)Q(M)\,dM\,,
\end{equation}
where $Q$ is a prior on the models $M \in \M$. 

For this score to be sound, we next establish a result similar to Eq.~\eqref{eq:conf-better}. More precisely, we show that confounded models obtain a better score only if causal sufficiency is violated. To do this formally, we require two things.
First, a class $\M_0$ of models without latent factors. Second, a set $\Mf$ of model classes $\M$ describing the observed distribution $\Px$ as marginal of a joint distribution $\Pxz$ in some distinct way.
For data $x$ sampled from $\Px$, Eq.~\eqref{eq:conf-better} then corresponds to asking whether or not
\begin{align}\label{eq:mdl-inf}
  \inf_{\M \in \Mf} L(\xb, \M) < L(\xb, \M_0)\,.
\end{align}
The following theorem tells us that, indeed, Eq.~\eqref{eq:mdl-inf} holds only if latent variables were involved in generating $x$.
\begin{theorem}[MDL Does Not Incorrectly Detect Confounders]\label{thm:onesided-mdl}
  Let $\xb^n$ be the observed part of an i.i.d. sample from $P \in \Mf \cup \left\{ \M_0 \right\}$.
  Further, assume that Eq.~\eqref{eq:mdl-inf} holds $P$-almost surely as $n \rightarrow \infty$. Then $P \in \Mf$.
\end{theorem}

Just as in the case of algorithmic causality, i.e., Thm.~\ref{thm:kc-ineq}, it is generally impossible to guarantee recovery of the exact true model. However, for the SLG, it is possible to define a consistent score, 
requiring no further assumptions than those we already made for identifiability in Sec.~\ref{sec:ident}.

\begin{theorem}[Consistency of BIC for SLGs]\label{thm:gauss-cons}
  Let $x = x^n$ be a sample from the SLG of Eq.~\eqref{eq:gauss-sem} and let assumptions (A1)--(A3) hold.
  Let $\M$ be the corresponding model class and $\M_0$ the restriction of $\M$ to models with $B = 0$.
  Let
  \begin{equation}\label{eq:penalized-ml}
    L(x^n, M) = -\log P(x^n \mid A, B, \sigma_{\epsilon}^2) + \lambda \norm{A}_0 + \lambda\norm{B}_0
  \end{equation}
  and $\widehat{A}, \widehat{B}$ its minimizers. Then for $\lambda = \log(n)/2$, the score $L$ is consistent for detecting confounders. That is,
  \begin{align*}
    \lim_{n \rightarrow \infty}P\left(\min_{M \in \M} L(x^n, M) < \min_{M \in \M_0} L(x^n, M)\right)= 1\,.
  \end{align*}
  Further, $\widehat{A}$ and $\widehat{B}$ converge to the true $A, B$ in probability,
  \begin{align*}
    \lim_{n \rightarrow \infty} P(\widehat{A} = A, \widehat{B} = B) = 1\,.
  \end{align*}
\end{theorem}
While it is remarkable that in the linear Gaussian case, the full causal model can be recovered from a sample $x$ from $\Px$ only, we have to solve two problems before we can put this into practice.  
First, we do not know $B$, nor even which of the exponentially many subsets $X_S \subseteq X$ are affected by any one $Z_j$.
Second, even knowing $B$ and $Z$, optimizing Eq.~\eqref{eq:penalized-ml} is NP-hard~\citep{peters2012identifiability}.
We, therefore, next develop a good heuristic as to which subsets $X_S$ are likely affected by $Z$ 
and show how standard causal discovery algorithms can be leveraged to find a causal network over both the observed $X$ and the latent $Z$.

\section{The \textsc{CDHC} algorithm}
\label{sec:algo}
With the theory we developed, we can now introduce \ourmethod, our method for Causal Discovery with Hidden Confounders.

\subsection{Finding Confounded Variables}
\label{sec:seed}

To find a causal network over both $X$ and its confounders $Z$, we first need to determine which subsets of $\Xb$ are likely to be confounded. A structure learning algorithm $\Ac$ can help us determine these sets by discovering connected subsets of variables (Prop.~\ref{prop:clique}). If $\Xb_S$ are densely connected in the discovered graph $G$, then the Markov boundary of each $X_i \in \Xb_S$ satisfies $\MB(X_i) \approx \Xb_S$.
Therefore, we consider the Markov boundary of each node as the seed sets $S$ over which we may infer latent confounders.

\subsection{Learning Latent Confounders}
\label{sec:choices}

To evaluate a proposed set of confounded nodes and its associated graph $G$, we introduce a causal model including latent factors as follows.
Based on Theorem~\ref{thm:gauss-id}, given a graph $G$ over $(X, Z)$, we assume that the data is generated from a model in a class $\M = \M(G)$ similar to Probabilistic PCA (PPCA)~\citep{tipping1999probabilistic}
\begin{equation}
  \begin{gathered}
    Z_i \sim \N(0, 1)\,, \;\;\qquad\epsilon \sim N(0, \sigma_{\epsilon}^2) \\
    A_{ij}  \sim \N(0, \sigma_a^2)\,,\;\;\qquad B_{ij} \sim \N(0, \sigma_b^2)\\
    X = \At X + \Bt Z + \epsilon\,,
  \end{gathered}\label{eq:ppca}
\end{equation}
where entries of $A, B$ are nonzero only when their corresponding edges are in $G$. By marginalizing out $Z$, we obtain
\begin{align*}
  X \mid A, B \sim N(0, C(\Bt B + \sigma_{\epsilon}^2 I)\Ct )\,,
\end{align*}
where $C = \Ai$. Since the $A_{ij}$ are sampled from a continuous distribution, assumption (A3) holds almost surely so that, unlike PPCA, we do not require $A = 0$.

To evaluate the fit of our model class $\M$ to our data $x$, we use the score $L(x, \M)$ defined in Eq.~\eqref{eq:bayes-mdl}
\begin{equation}\label{eq:cdhc-score}
  L(x, \M) = -\log \int P(x \mid A, B) P(A, B)\,dA\,dB\,,
\end{equation}
which we can estimate using standard variational methods~\citep{kucukelbir2017automatic}.
This score is suitable in that it is consistent for causal discovery with latent confounders.
\begin{theorem}[Consistency of MDL for SLGs]\label{thm:cdhc-cons}
  Let the assumptions of Thm.~\ref{thm:gauss-cons} hold.
  Then the minimizer $\widehat{G}$,
  \begin{align*}
    \widehat{G} = \argmin_G L(x^n, \M(G))\,,
  \end{align*}
  converges to the ground truth $\gt$ in probability,
  \begin{equation*}
    \lim_{n \rightarrow \infty}P(\widehat{G} = G^{\ast}) = 1\,.
  \end{equation*}
\end{theorem}

% Pseudo-code of the method (algorithm2e). Outer do-while: repeat while G changes.
% Per node: seed the confounded set with its Markov boundary, refine the set
% greedily (forward then backward), sample the latent variable and refit a graph.
\begin{algorithm}[tb!]
  % Comment style local to this algorithm: small blue typewriter text.
  \newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}
  \SetCommentSty{mycommfont}
  \caption{\textsc{\ourmethod}}
  \label{alg:method}
  \Input{data $\xb$ sampled from $\Px$, algorithm $\Ac$}
  \Output{graph $G$ and distribution $\Pxz$}
  $G = (V, E) \leftarrow$ Graph inferred over $\xb$ using $\Ac$\;
  \Do { $G$ changes } {
    % The loop variable is typeset in math mode together with its index set.
    \ForEach {$i \in \left\{ 1,\ldots,m \right\}$}{
      $X_S \leftarrow$ Markov boundary of $X_i$ in $G$\;
      $G' \leftarrow (V \cup \left\{ Z \right\}, E \cup \left\{ Z \rightarrow X_S \right\})$\;
      \tcp{Forward phase}
      \Do{$L(\xb, G')$ decreases}
      {
        $j \leftarrow \argmin_{j \notin S} L(x, G' \cup \left\{Z \rightarrow X_j \right\})$\;
        $(S, G') \leftarrow (S \cup \left\{ j \right\}, G' \cup \left\{Z \rightarrow X_j \right\})$\;
      }
      \tcp{Backward phase}
      \Do{$L(\xb, G')$ decreases}{
        $j \leftarrow \argmin_{j \in S} L(x, G' \setminus \left\{ Z \rightarrow X_j \right\})$\;
        $(S, G') \leftarrow (S \setminus \left\{ j \right\}, G' \setminus \left\{Z \rightarrow X_j \right\})$\;
      }
      $z \leftarrow$ sample from $P(Z \mid X)$\;
      $G[i] \leftarrow$ Graph inferred over $(\xb, z)$ using $\Ac$\;
    }
    \tcp{Use the model with best confounder}
    $G \leftarrow \argmin_{G[i]} L((x, z), G[i])$\;
  }
  \Return{$G$ and the $P(X, Z)$ associated with $G$}
\end{algorithm}
With this guarantee that our score is sound, we now introduce our method for discovering the entire causal network.
\subsection{Discovering the Causal Network}
\label{sec:putting}

We can now put all of the above together and present \ourmethod. We give the pseudo-code as Algorithm~\ref{alg:method}.
We first (line 1) discover a graph $G$ over the observed data $x$ using a score-based structure discovery algorithm $\Ac$, such as \GES~\citep{chickering2002learning}, \GGSL~\citep{gao2017local} or \NOTEARS~\citep{zheng2018dags}.
We then consider every node $X_i$ and initialize the confounded set $X_S$ with the Markov boundary $\MB(X_i)$, adding a node $Z$ and edges $Z \rightarrow X_S$ to $G$ (l. 4-5).
We refine $S$ by greedily adding nodes (l. 6-9), then removing nodes (l. 10-13). After finding the locally optimal set $S$, we sample $z$ from $P(Z \mid X)$ and fit a network over $(x, z)$ using $\Ac$ (l. 14-15).
Out of all these networks, we update $G$ to be the best of them (l. 16) and iterate until convergence (l. 17).
Finally, we return the discovered network $G$ and distribution $\Pxz$ over $X$ and its inferred confounders $Z$ (l. 18).

Note that since our score strictly decreases at every step, our method necessarily converges. Furthermore, we can show that in the large sample limit we are guaranteed to recover the true set of confounded nodes.

\begin{proposition}[Consistency of \ourmethod for Discovering Confounded Nodes]\label{prop:algo}
  Let $\xb^n$ be an i.i.d.\ sample from $P \in \M(\gt)$ defined in Eq.~\eqref{eq:ppca}, let assumptions (A1-3) hold and let $S_i^{\ast}$ be the set of nodes affected by $Z_i$. Assume that $\bigcap_{s \in S_i^{\ast}} \MB_{G^{\ast}}(X_s) \setminus \left\{ Z_i \right\} \subsetneq S_i^{\ast}$.
  Let $\Ac$ be consistent for recovering the Markov equivalence class of the graph $\Gx$ for distribution $\Px$.
  Let $\widehat{S}_i$ be the set of nodes confounded by $Z_i$ discovered by \ourmethod. Then
  \begin{equation}
    \label{eq:cdhc-cons}
    \lim_{n \rightarrow \infty}P(\widehat{S}_i = S_i^{\ast}) = 1.
  \end{equation}
\end{proposition}
While we can recover the correct sets of confounded nodes, to recover the entire graph $\gt$, we need additional assumptions such as those outlined in Theorem~\ref{thm:gauss-cons}.
% Note that without further assumptions on $\Ac$, we cannot guarantee that the entire ground truth $\gt$ will be recovered.


\begin{figure*}[t!]
  \vspace*{1.1cm}
  \centering
  \quad\quad\quad
  \begin{minipage}[b][3.5cm]{0.18\linewidth}
    \subfloat[Ground truth]{
    \begin{tikzpicture}[scale=0.3]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}
      \node (X0) at ($(X1)!0.5!(X4)$) {$X_0$};

      \path[thick, color=black!50, ->]
      \foreach \a in {X1, X2, X3}{(X0) edge (\a)}
      (X1) edge (X2)
      (X1) edge (X6)
      (X3) edge (X4)
      (X4) edge (X5)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
    \vspace*{0.75cm}
  \end{minipage}
  \hfill
  \begin{minipage}[b][3.5cm]{0.18\linewidth}
    \subfloat[\GES]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}

      \path[thick, color=black!50, ->]
      (X2) edge[ourred] (X1)
      (X2) edge[ourred] (X3)
      (X3) edge[ourred] (X1)
      (X1) edge (X6)
      (X3) edge (X4)
      (X5) edge[ourred] (X4)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
    \vspace{0em}
    \subfloat[\ourmethod-\GES]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}
      \node[color=green(ryb)] (X0) at ($(X1)!0.5!(X4)$) {$Z$};

      \path[thick, color=black!50, ->]
      \foreach \a in {X1, X2, X3}{(X0) edge[color=green(ryb)] (\a)}
      (X2) edge[ourred] (X1)
      (X1) edge (X6)
      (X3) edge (X4)
      (X5) edge[ourred] (X4)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
  \end{minipage}%
  \begin{minipage}[b][3.5cm]{0.18\linewidth}
    \subfloat[\GGSL]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}

      \path[thick, color=black!50, ->]
      (X1) edge (X2)
      (X3) edge[ourred] (X2)
      (X3) edge[ourred] (X1)
      (X1) edge (X6)
      (X3) edge (X4)
      (X4) edge (X5)
      (X4) edge[ourred] (X6)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
    \vspace{0em}
    \subfloat[\ourmethod-\GGSL]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}
      \node[color=green(ryb)] (X0) at ($(X1)!0.5!(X4)$) {$Z$};

      \path[thick, color=black!50, ->]
      \foreach \a in {X1, X2, X3}{(X0) edge[color=green(ryb)] (\a)}
      (X1) edge (X2)
      (X1) edge (X6)
      (X3) edge (X4)
      (X4) edge (X5)
      (X4) edge[ourred] (X6)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
  \end{minipage}
  \begin{minipage}[b][3.5cm]{0.18\linewidth}
    \subfloat[\NOTEARS]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}
      \path[thick, color=black!50, ->]
      (X1) edge (X2)
      (X1) edge[ourred] (X3)
      (X2) edge[ourred] (X3)
      (X1) edge (X6)
      (X3) edge (X4)
      (X4) edge (X5)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
    \vspace{0em}
    \subfloat[\ourmethod-\textsc{nt}]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}
      \node[color=green(ryb)] (X0) at ($(X1)!0.5!(X4)$) {$Z$};

      \path[thick, color=black!50, ->]
      \foreach \a in {X1, X2, X3}{(X0) edge[color=green(ryb)] (\a)}
      (X1) edge (X2)
      (X1) edge (X6)
      (X3) edge (X4)
      (X4) edge (X5)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
  \end{minipage}
  \begin{minipage}[b][3.5cm]{0.18\linewidth}
    \subfloat[\GFCI]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}
      \path[thick, color=black!50, ->]
      (X1) edge[dotted, -, ourblue] (X2)
      (X1) edge[dotted, -, ourblue] (X3)
      (X2) edge[-, ourblue] (X3)
      (X1) edge[dotted, -, ourblue] (X6)
      (X3) edge[dotted, -, ourblue] (X4)
      (X4) edge (X5)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
    \vspace{0em}
    \subfloat[\DCD]{
    \begin{tikzpicture}[scale=0.225]
      \foreach \a [count=\c] in {X1, X2, X3, X4, X5, X6}{\node at (\c*360/6+0:4) (\a) {$X_{\c}$};}

      \path[thick, color=black!50, ->]
      (X1) edge (X2)
      (X1) edge[-, ourblue] (X3)
      (X2) edge[-, ourblue] (X3)
      (X1) edge (X6)
      (X3) edge (X4)
      (X4) edge (X5)
      (X5) edge (X6)
      ;
    \end{tikzpicture}
    }
  \end{minipage}
  \hfill
  \caption{Application of \ourmethod to synthetic data generated from the network shown in (a). When $X_0$ is withheld, all base algorithms $\Ac$ (b, d, f) find a clique of spurious edges on $X_1, X_2, X_3$ (red). \GFCI (h) and \DCD (i) indicate that some pairs among $X_1$, $X_2$, and $X_3$ are confounded (blue) but cannot tell that they share the same confounder.
    In contrast, by applying \ourmethod (c, e, g), we discover a confounder capturing the effect of $X_0$ (green) and obtain higher quality networks in all cases. }
  \label{fig:improve}
\end{figure*}

\subsection{Complexity}
\label{sec:complexity}
Last, we analyze the runtime complexity of \ourmethod.
\ourmethod employs a loop (l. 2-17) whose inside has complexity $O(C(m, n) + m^2n) = O(C(m, n))$ -- the former, $C(m, n) = \Omega(m^2n)$, is the runtime for running $\Ac$ and the latter $m^2n$ for finding $S$.
Since we can find at most $O(m)$ non-overlapping confounded sets, our worst case runtime is therefore on the order of $O(mC(m, n))$.
In general, only a few variables are confounded, so that in practice our runtime is roughly $O(C(m, n))$---the same as that of $\Ac$ itself.

\section{Related Work}\label{sec:related}
Causal inference is one of the most important problems in statistical inference and has attracted a lot of research attention~\citep{pearl2009causality,spirtes2000causation}. Unfortunately, latent confounding makes it impossible to infer causality from observational data without making additional assumptions~\citep{pearl2009causality}.
Traditional constraint-based~\citep{spirtes2000causation,zhang2008completeness} and score-based~\citep{chickering2002learning,gao2017local,zheng2018dags} causal discovery methods can reconstruct the true causal network up to Markov equivalence when causal sufficiency holds.

When causal sufficiency does not hold, a number of algorithms such as the \FCI family~\citep{spirtes2000causation,colombo2012learning,ogarrio2016hybrid}, \tofft~\citep{affeldt20163off2} and \DCD~\citep{bhattacharya2021differentiable} can find complete partially directed acyclic graphs which can capture correlations due to confounders. However, these networks are generally difficult to interpret and cannot determine which sets of variables share the same latent confounder.

To make the resulting causal networks more interpretable, observational and experimental data can be combined to improve results~\citep{kallus2018causal,kocaoglu2019characterization}. However, these methods are generally restricted in their ability to rule out or corroborate the existence of latent variables by the scarcity of available experimental data.

Other research controls causal estimates for latent confounders. To do so, \cite{hoyer2008estimation} solve the overcomplete ICA problem to correct the estimated causal effect of $\Xb$ on $Y$ for confounders, whereas \cite{wang2018blessings} and \cite{ranganath2018multiple} use factor models. 

Until recently, little research had tackled the problem of determining which variables share the same latent confounders.
\cite{janzing2018detecting} considered the case of determining whether the variables $\Xb$ are confounded by finding deviations of the regression vector from theoretical properties in high-dimensional regression. \cite{kaltenpoth2019we} use the AMC~\citep{janzing2010causal} to infer whether two sets of variables $\Xb$ and $Y$ are causally related or jointly confounded.
\cite{silva2006learning} proposed a model based on low-rank correlation structures between observed variables. \cite{elidan2000discovering} proposed an algorithm for replacing semi-cliques in a discovered causal graph with single nodes based on the idea that such cliques are likely due to latent confounding. 

More recently, approaches based on trek-separation~\citep{kummerfeld2016causal}, rank constraints~\citep{huang2022latent}, heterogeneous data sources~\citep{zhou2022causal}, and overcomplete ICA~\citep{xie2020generalized, adams2021identification} have been used to obtain the (hierarchical) latent structure of the observed variables. They do not, however, permit edges between the observed variables.
This is in line with the recent field of causal representation learning~\citep{scholkopf2021toward}, where it is assumed that \emph{all} observed correlations are due to causal relations between the unobserved variables. While such assumptions are realistic for data such as images (pixels don't cause each other), they are not reasonable for data gathered from physical, biological, or social systems.
\begin{figure*}[b]
  \centering
  \captionsetup[subfigure]{oneside,margin={1cm,0cm}}
  \begin{NoHyper}
    \ref{mylegend}
  \end{NoHyper}\\
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}
      \begin{axis}[eda line, ymax = 1.0, ymin = 0, ytick={1.0,0.8,0.6,0.4,0.2,0},
        ylabel={$\Fnet$}, xlabel={Decision Rate},
        ylabel near ticks, xlabel near ticks,
        tick label style={font=\footnotesize},
        label style={font=\footnotesize},
        height=2.95cm, width=\columnwidth,
        legend pos=south west, legend columns=1,
        legend style={nodes={scale=0.8, transform shape}, at={(0.00,0.4)},anchor=north west},
        xmin=0, xmax=1, xtick={0,0.25,0.5,0.75,1}]
        \addplot[ourcolor] table[x index = 0, y   index=1, header = false] {expres/synth/cdhc-ges.dat};
        \addplot[dotted, ourcolor] table[x index = 0, y   index=1, header = false] {expres/synth/cdhc-notears.dat};
        \addplot[dashed, ourcolor] table[x index = 0, y   index=1, header = false] {expres/synth/cdhc-ggsl.dat};
        \addplot[ourgreen] table[x index = 0, y   index=1, header = false] {expres/synth/notears.dat};
        \addplot[ouryellow] table[x index = 0, y   index=1, header = false] {expres/synth/tofft.dat};
        \addplot[ourorange] table[x index = 0, y   index=1, header = false] {expres/synth/gfci.dat};
        \addplot[ourbl] table[x index = 0, y   index=1, header = false] {expres/synth/dcd.dat};
      \end{axis}
    \end{tikzpicture}%
    \caption{}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}
      \begin{axis}[eda line, ymax = 1.0, ymin = 0, ytick={1.0,0.8,0.6,0.4,0.2,0},
        ylabel={$\Fconf$}, xlabel={Decision Rate},
        ylabel near ticks, xlabel near ticks,
        tick label style={font=\footnotesize},
        label style={font=\footnotesize},
        height=2.95cm, width=\columnwidth,
        legend pos=south west, legend columns=1,
        legend style={nodes={scale=0.8, transform shape}, at={(0.00,0.4)},anchor=north west},
        xmin=0, xmax=1, xtick={0,0.25,0.5,0.75,1}]
        \addplot[ourcolor] table[x index = 0, y   index=2, header = false] {expres/synth/cdhc-ges.dat};
        \addplot[dotted, ourcolor] table[x index = 0, y   index=2, header = false] {expres/synth/cdhc-notears.dat};
        \addplot[dashed, ourcolor] table[x index = 0, y   index=2, header = false] {expres/synth/cdhc-ggsl.dat};
        \addplot[ourgreen] table[x index = 0, y   index=2, header = false] {expres/synth/notears.dat};
        \addplot[ouryellow] table[x index = 0, y   index=2, header = false] {expres/synth/tofft.dat};
        \addplot[ourorange] table[x index = 0, y   index=2, header = false] {expres/synth/gfci.dat};
        \addplot[ourbl] table[x index = 0, y   index=2, header = false] {expres/synth/dcd.dat};
      \end{axis}
    \end{tikzpicture}%
    \caption{}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}
      \begin{axis}[eda line, ymax = 350, ymin = 0, ytick={0,100,...,300},
        ylabel={SHD}, xlabel={Decision Rate},
        ylabel near ticks, xlabel near ticks,
        tick label style={font=\footnotesize},
        label style={font=\footnotesize},
        height=2.95cm, width=\columnwidth,
        legend pos=north west, legend columns=1,
        legend style={nodes={scale=0.6, transform shape}, at={(-0.045,1.05)},anchor=north west },
        xmin=0, xmax=1, xtick={0,0.25,0.5,0.75,1}]
        \addplot[ourcolor] table[x index = 0, y  index=3, header = false] {expres/synth/cdhc-ges.dat};
        \addplot[dotted, ourcolor] table[x index = 0, y  index=3, header = false] {expres/synth/cdhc-notears.dat};
        \addplot[dashed, ourcolor] table[x index = 0, y  index=3, header = false] {expres/synth/cdhc-ggsl.dat};
        \addplot[ourgreen] table[x index = 0, y  index=3, header = false] {expres/synth/notears.dat};
        \addplot[ouryellow] table[x index = 0, y  index=3, header = false] {expres/synth/tofft.dat};
        \addplot[ourorange] table[x index = 0, y  index=3, header = false] {expres/synth/gfci.dat};
        \addplot[ourbl] table[x index = 0, y  index=3, header = false] {expres/synth/dcd.dat};
      \end{axis}
    \end{tikzpicture}%
    \caption{}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}
      \begin{axis}[eda line, ymax = 450, ymin = 0, ytick={0,100,...,400},
        ylabel={SID}, xlabel={Decision Rate},
        ylabel near ticks, xlabel near ticks,
        tick label style={font=\footnotesize},
        label style={font=\footnotesize},
        height=2.95cm, width=\columnwidth,
        legend columns=7,
        legend to name={mylegend},
        xmin=0, xmax=1, xtick={0,0.25,0.5,0.75,1}]
        \addplot[ourcolor] table[x index = 0, y index = 4, header = false] {expres/synth/cdhc-ges.dat};
        \addplot[dotted, ourcolor] table[x index = 0, y index = 4, header = false] {expres/synth/cdhc-notears.dat};
        \addplot[dashed, ourcolor] table[x index = 0, y index = 4, header = false] {expres/synth/cdhc-ggsl.dat};
        \addplot[ourgreen] table[x index = 0, y index = 4, header = false] {expres/synth/notears.dat};
        \addplot[ouryellow] table[x index = 0, y index = 4, header = false] {expres/synth/tofft.dat};
        \addplot[ourorange] table[x index = 0, y index = 4, header = false] {expres/synth/gfci.dat};
        \addplot[ourbl] table[x index = 0, y index = 4, header = false] {expres/synth/dcd.dat};
        \addlegendentry{\ourmethod-\GES}
        \addlegendentry{\ourmethod-\textsc{nt}}
        \addlegendentry{\ourmethod-\GGSL}
        \addlegendentry{\NOTEARS}
        \addlegendentry{\tofft}
        \addlegendentry{\GFCI}
        \addlegendentry{\DCD}
      \end{axis}
    \end{tikzpicture}%
    \caption{}
  \end{subfigure}
  \caption{Evaluation on synthetic data. $F_1$ scores for (a) network recovery and (b) confounded set recovery (higher is better), and (c) Structural  Hamming Distance and (d) Structural Intervention Distance (lower is better).
    Each figure shows the average score over increasing fractions of all datasets, sorted in descending order by the confidence of each method.}
  \label{fig:conf-f1}
\end{figure*}

\section{Experiments}
\label{sec:exps}
In this section, we evaluate \ourmethod empirically. We are interested in two things: first, how well it recovers the set of confounded nodes $S^{\ast}$, and second, how well it recovers the entire network.
We compare \ourmethod against \NOTEARS~\citep{zheng2018dags}, \tofft~\citep{affeldt20163off2}, \DCD~\citep{bhattacharya2021differentiable} and \GFCI~\citep{ogarrio2016hybrid}.\!\footnote{\GFCI is part of a group of methods, including \FCI and \RFCI. Preliminary experiments corroborated previous findings~\citep{ogarrio2016hybrid} that \GFCI performs better than its relatives.}
We instantiate \ourmethod with different causal discovery algorithms $\Ac$ and refer to \ourmethod using $\Ac$ as \ourmethod-$\Ac$. Specifically, we use \GES~\citep{chickering2002learning}, \GGSL~\citep{gao2017local}, and \NOTEARS~\citep{zheng2018dags}. When clear from the context, we write \ourmethod for \ourmethod-\GES.
We implement \ourmethod in Python. For comparison with all other methods, we use the implementations provided by the respective authors.
All experiments finished within minutes on a commodity laptop. All code and data can be found online, along with additional experiments postponed in the interest of space.\!\footnote{\codeurl}

\subsection{Synthetic Data}
\label{sec:synth}

We evaluate \ourmethod on synthetic data by generating a random acyclic graph $G$ of size $m$ from the Erd\H{o}s--R\'enyi model with parameter $p=0.3$. We model the causal relationships via a linear SEM, $X = AX + \alpha BZ + \epsilon$ where $A_{ij} \neq 0$ if and only if $(i, j) \in E(G)$. Nonzero values of $A, B, Z, \epsilon$ are all drawn from $\N(0, 3)$. The parameter $\alpha \sim U[1, 8]$ determines the relative strength of confounding.
Using this model, we generate $1000$ data sets over $m = 50$ variables, of which ten nodes are confounded.
Before moving to the general case, we begin by studying how \ourmethod improves over base algorithms $\Ac$ on an illustrative example with $\text{dim}(Z) = 1$.

\noindent\emph{Comparison with Base Algorithms}
We begin by showing that \ourmethod produces better results than standard discovery algorithms when not all variables are observed.
We consider the network shown in Fig.~\ref{fig:improve}a containing nodes $X_0, \ldots, X_6$, of which $X_1, X_2, X_3$ are confounded by $X_0$.
When withholding $X_0$, none of the base methods find the correct structure over $X_1, X_2, X_3$. 
Furthermore, while \GFCI and \DCD find the variables to be confounded, they cannot tell that all variables share the same confounder.
In contrast, by applying \ourmethod, we consistently find that $X_1, X_2, X_3$ are confounded while maintaining the quality of the remaining edges.

\noindent\emph{Confidence and Performance}
Since we generally do not have access to the ground truth network in practice, we next test how well we can predict the performance of each method from an easily observable quantity. That is, we compare the confidence---the improvement of the discovered network compared to a baseline---of each method to a range of metrics measuring the quality of our results. We provide details on the computation of the confidences in Appendix A.3.

\begin{table}[t]
  \centering
  \begin{tabular}[]{rrrrrr}
    & \multicolumn{5}{c}{Number of confounders} \\
    Method & 1 & 2 & 3 & 4 & 5 \\
    \cmidrule(lr){1-6}
    \ourmethod & 0.43 & 0.38 & 0.35 & 0.23 & 0.15 \\
    \DCD & 0.35 & 0.18 & 0.11 & 0.07 & 0.03 \\
    \tofft & 0.36 & 0.20 & 0.14 & 0.11 & 0.04 \\
    \GFCI & 0.22 & 0.11 & 0.05 & 0.02 & 0.01 \\
  \end{tabular}
  \caption{Comparison of \ourmethod, \DCD, \tofft and \GFCI for graphs with varying numbers of latent confounders. While all methods perform well, only \ourmethod maintains its performance as the number of latent factors increases.}
  \label{tbl:higher}
\end{table}

We evaluate each method based on four criteria: (1) the $F_1$ score for network recovery including the confounder ($\Fnet$), (2) the $F_1$ score for the recovery of the set of confounded nodes ($\Fconf$), (3) the Structural Hamming Distance (SHD) between the discovered and true network, and (4) the Structural Intervention Distance (SID) to measure differences in causal interpretations~\citep{peters2013structural}.

We show the results in the form of decision rate (DR) plots in Fig.~\ref{fig:conf-f1}. First, we sort the result of each method by their confidence. Then we plot the confidence against each metric.
On the left of each plot, we include data sets where each method is most confident and increase this number of sets until all are included on the right.
We see that \ourmethod outperforms its competitors by a large margin. Interestingly, the choice of $\Ac$ has little influence on the performance of \ourmethod, and the gap between each $\Ac$ and \ourmethod-$\Ac$ is comparable. We include further results for \GGSL and \GES in Appendix A.4.

\begin{figure}[t]
  \centering
  \begin{tikzpicture}[]
    \begin{axis}[
      xmin=-0.6, xmax=0.6,
      ymin=-0, ymax=1,
      width=\linewidth,
      tick style        = {thin, black, major tick length=2pt},
      x tick label style  = {font=\scriptsize, yshift = 1pt},
      y tick label style  = {font=\scriptsize, xshift = 1pt},
      axis lines=left,
      height=5cm,
      width=6cm
      ]
      \draw (30, 0) -- (600, 100);
      \draw (0, 0) -- (1300, 0);
      \draw (1180, 0) -- (600, 100);

      \draw (600, 30) -- (890, 50);
      \draw (600, 30) -- (315, 50);
      \draw (600, 0) -- (600, 30);

      \addplot [
      scatter, scatter/use mapped color={draw=mambacolor1, fill=mambacolor1},
      scatter src=explicit,
      only marks,
      opacity=0.5,
      mark=hexagon,mark size=\pgfkeysvalueof{/pgfplots/width}/100/2,
      z buffer=sort] table [meta index=2] {expres/hexbins.dat };
      \node[text width=1cm] at (400, 18) {\small \opt better};
      \node[text width=1cm] (cdhc)at (1125, 50) {\small \ourmethod better};
      \node[text width=1cm] (cdhc-center) at (850, 18) {\phantom{\small \ourmethod better}};
      \node[text width=1cm] (both) at (642, 60) {\small Both equally good};
      \path[thick, color=black!100, ->]
      (cdhc) edge (cdhc-center)
      ;
    \end{axis}
  \end{tikzpicture}
  \caption{Significance assessment of the improvement of \ourmethod against its competitors.}
  \label{fig:bayes-comp}
\end{figure}

\begin{figure*}[t]
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}[scale=0.325]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}
      \node (uvrI) at ($(uvrA)!0.5!(uvrY)$) {};
      \node (lexA) at ($(recA)!0.5!(uvrI)$) {lexA};

      \path[thick, color=black!50, ->]
      \foreach \a in {umuD,uvrD,uvrA,uvrY,ruvA,polB}{(lexA) edge (\a)}
      (recA) edge[bend right=20] (lexA)
      (lexA) edge[bend right=20] (recA)
      ;
    \end{tikzpicture}
    \caption{Ground Truth}
    \label{fig:sosa}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}[scale=0.325]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}
      \node (uvrI) at ($(uvrA)!0.5!(uvrY)$) {};
      \node (lexA) at ($(recA)!0.5!(uvrI)$) {Z};

      \path[thick, color=black!50, ->]
      \foreach \a in {umuD,uvrY,ruvA,polB,recA}{(lexA) edge[green(ryb)] (\a)}
      ;
    \end{tikzpicture}
    \caption{\ourmethod}
    \label{fig:sosb}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}[scale=0.325]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}

      \path[thick, dotted, ourblue]
      (recA) edge[] (umuD)
      (recA) edge[solid] (uvrA)
      (recA) edge[] (uvrY)
      (recA) edge[] (polB)

      (umuD) edge[] (uvrD)
      (umuD) edge[] (uvrA)
      (umuD) edge[] (uvrY)
      (umuD) edge[] (ruvA)

      (uvrD) edge[] (uvrA)
      (uvrD) edge[] (uvrY)
      (uvrD) edge[] (polB)

      (uvrA) edge[] (uvrY)
      (uvrA) edge[solid] (ruvA)
      (uvrA) edge[] (polB)

      (uvrY) edge[solid] (ruvA)

      (ruvA) edge[] (polB) ;
    \end{tikzpicture}
    \caption{\GFCI}
    \label{fig:sosc}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.24\textwidth}
    \centering
    \begin{tikzpicture}[scale=0.325]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}

      \path[thick, solid, ourblue]
      (recA) edge[] (uvrA)
      (recA) edge[] (uvrY)
      (recA) edge[] (polB)
      (recA) edge[] (ruvA)

      (umuD) edge[] (uvrD)
      (umuD) edge[] (uvrY)

      (uvrD) edge[] (uvrA)
      (uvrD) edge[] (ruvA)
      (uvrD) edge[] (polB)

      (uvrA) edge[] (uvrY)
      (uvrA) edge[] (ruvA)

      (uvrY) edge[] (ruvA)
      (uvrY) edge[] (polB)

      ;
    \end{tikzpicture}
    \caption{\DCD}
    \label{fig:sosd}
  \end{subfigure}
  \hfill
  \caption{
    Results on SOS DNA repair network in E. coli.
    \ourmethod (b) discovers a confounder $Z$ capturing five out of the seven edges (green) in the ground truth network (a).
    In contrast, \GFCI (c) and \DCD (d) find many pairs of nodes that are confounded (solid blue), and, in the case of \GFCI, yet more pairs that \emph{might} be confounded (dotted). However, they do not discover that all nodes share the same confounder, making the resulting networks challenging to interpret.
    }
  \label{fig:sos}
\end{figure*}
\noindent\emph{Higher-dimensional $Z$}
We next consider the effect of including multiple confounders $Z_i$ in our causal model, each influencing non-overlapping sets of $5$ variables in a network of $m=50$ variables. We show the $\Fconf$ scores for one to five confounders in Table~\ref{tbl:higher}. We omit \NOTEARS, \GGSL, and \GES since none of them are designed for this task.

We see that for one to three confounders, \ourmethod performs at a consistent level, but for four and five confounders, its performance decreases due to the difficulty of finding additional sets of confounded nodes. 
In contrast, while \DCD, \tofft, and \GFCI perform well for single-dimensional confounders, their performance drops immediately upon addition of a second latent confounder. The reason for this is instructive: since they do not model the confounder but instead indicate whether pairs of variables are confounded, they cannot distinguish between different confounders.

\noindent\emph{Significance}
To verify that \ourmethod significantly outperforms its competitors, we use the Bayesian signed rank test~\citep{benavoli2014bayesian}.
It explicitly models the probability that one model is significantly better than the other \emph{in practice} by introducing a \emph{region of practical equivalence} (rope) specified by parameter $r$~\citep{benavoli2014bayesian}. Two methods are considered to perform equally well if the difference in scores for the methods lies in $[-r, r]$. 
We pick $r = 0.05$~\citep{benavoli2014bayesian} but the conclusion remains the same for values $r \in (0, 0.15]$.
Since the test was designed for \emph{two} competing methods, for each dataset, we compare \ourmethod with the best-performing competitor, which we refer to as \opt. For each dataset $k$, we compute the $\Fnet$ scores for both \ourmethod and \opt and compute their differences. We include more detail in Appendix A.5.
We show the estimated posterior distribution resulting from this comparison in Fig.~\ref{fig:bayes-comp}. Here, the top region contains points where both methods are practically equally good, while the bottom left and right corners contain points where \opt and \ourmethod, respectively, perform better. We see that all points lie squarely in the region where \ourmethod outperforms \opt, suggesting that \ourmethod performs significantly better than its competitors.


\subsection{Case Study: Cellular Signaling}
\label{sec:real}
Finally, we consider real-world data to investigate the interpretability of the results returned by \ourmethod. In particular, we consider the SOS DNA repair network in E. coli~\citep{ronen2002assigning}. This data consists of protein levels of eight genes measured every five minutes for five hours, resulting in only 60 samples. Since the governing relationships in gene regulation are highly nonlinear, this tests the applicability of \ourmethod even when our assumptions do not hold.

Since the ground truth network has been established~\citep{perrin2003gene}, we can test \ourmethod by excluding a gene known to have a downstream causal effect on other genes. An excellent candidate is \emph{lexA} as it has a causal influence on \emph{all} of the other genes: it is upstream of six genes and has a bidirectional relationship with the seventh (Fig.~\ref{fig:sosa}). We also applied the other methods on the same data, including here the results of both \GFCI and \DCD, and postpone the networks discovered by the other methods to Appendix A.7.

We show the results in Fig.~\ref{fig:sos}. For clarity, we focus only on discovering which nodes are confounded.
In Fig.~\ref{fig:sosb}, we find a striking similarity between the $Z$ discovered by \ourmethod and the true common parent \emph{lexA}.
\ourmethod correctly identifies five out of seven relationships: four out of six downstream effects, as well as one of the two edges between recA and lexA---which is the most a DAG can do, given that the two edges are mutually exclusive.
Next, for \GFCI (Fig.~\ref{fig:sosc}) and \DCD (Fig.~\ref{fig:sosd}), we indicate definite confounding by solid edges and correlations which could be due to either confounding or causation by dotted edges.
\GFCI indicates definite confounding for only three out of 16 pairs, while we cannot be certain for the other pairs. The resulting network of \DCD indicates definite confounding for many pairs of variables.
However, neither method can determine that all variables share the \emph{same} latent confounder. The results of both \GFCI and \DCD are consistent with many different structures and provide no well-founded way of choosing one over the other. That is, since \GFCI and \DCD indicate only pairwise confounding without any way to evaluate whether nodes are jointly confounded, interpretation of the resulting networks is difficult.
Overall, despite low sample size and violations of our model assumptions, \ourmethod finds a readily interpretable network close to the ground truth.

\section{Discussion and Conclusion}
\label{sec:discussion}
We studied the problem of discovering a causal network over both $X$ and its latent confounders $Z$. 
In particular, by exploiting the structure among confounded nodes $X_S$ we proved identifiability in the sparse linear Gaussian model.

We derived a general approach for discovering sets of confounded nodes from the algorithmic model of causality by explicitly modeling latent variables. We showed that including latent variables is only beneficial when the observed $X$ are indeed confounded.
We used MDL to determine which sets of variables are confounded and showed that \ourmethod is consistent for recovering the set of confounded nodes when combined with a consistent causal discovery method.

Evaluation of \ourmethod on synthetic data showed that it outperformed the state of the art on all considered metrics. Furthermore, on real data it generated results close to the ground truth despite small sample size and large deviations from its model assumptions. We further obtained more readily interpretable results than our competitors.

For the future, we are interested in more general identifiability results as well as developing better algorithms which do not require multiple passes of a causal discovery algorithm.

\bibliography{abbrev, bib-paper}

\end{document}
