\documentclass[accepted]{uai2023} 
\usepackage{natbib} 
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 

\usepackage{booktabs} 
\usepackage{tikz} 

\usepackage{amsmath,amsthm,amssymb}

\usepackage{subcaption}
\usepackage{natbib}
\usepackage{dblfloatfix}
\usepackage[ruled, vlined, nofillcomment, linesnumbered]{algorithm2e}
\usepackage{eda} 
\usepackage{prettyplots}
\usepackage{comment}

% Theorem-like environments (declared via amsthm, loaded above).
% `assumption` is numbered with its own counter; the starred declarations
% create the unnumbered environments `problem` and `postulate`.
\newtheorem{assumption}{Assumption}
\newtheorem*{problem}{Problem Statement}
\newtheorem*{postulate}{Postulate}

\usepackage{graphicx}
\usetikzlibrary{calc}
\usetikzlibrary{positioning}
\usetikzlibrary{external}
% \tikzexternalize


% Title and project macros.
% \ourmaintitle holds the three-line title of this supplement; \ourtitle is an
% alias for it and is what gets passed to \title further below.
\newcommand{\ourmaintitle}{Causal Discovery with Hidden Confounders\\ using the Algorithmic Markov Condition\\ (Supplementary Material)}
\newcommand{\ourtitle}{\ourmaintitle}
% Name of the proposed method, typeset in small caps.
% NOTE(review): \xspace is not loaded explicitly in this file -- presumably it
% is provided by the class or by eda/prettyplots; confirm.
\newcommand{\ourmethod}{\textsc{cdhc}\xspace}
% Project URL; \codeurl and \apxurl are aliases for the same address.
\newcommand{\oururl}{\url{https://eda.rg.cispa.io/prj/pepsi/}}
\newcommand{\codeurl}{\oururl}
\newcommand{\apxurl}{\oururl}


% Build switches: \ifpdf is initialised to true and \ifapx to false.
% Neither switch is tested anywhere in the part of the file visible here.
% NOTE(review): the name \ifpdf coincides with the engine test of the same
% name provided by the iftex/ifpdf packages -- confirm that overriding it
% here is intended.
\newif\ifapx
\newif\ifpdf
\pdftrue
\apxfalse 


% Colours and TikZ styles for plot lines.
% Only blue1 is defined here. The other colour names used in the styles
% (blue(munsell), deepcarrotorange, plum(traditional), dollarbill,
% mambacolor5, red(ryb)) are not defined in this file -- presumably they come
% from prettyplots or eda; verify.
\definecolor{blue1}{RGB}{6, 131, 178}
\tikzset{ourcolor/.style={blue(munsell), line width = 1.2pt, opacity = .9},
	ourorange/.style={deepcarrotorange, line width = 1.1pt,opacity = .7 }, 
	ourbl/.style={plum(traditional), line width = 1.1pt,opacity = .9}, 
	ourgreen/.style={dollarbill, line width = 1.1pt,opacity = .7},
	ouryellow/.style={mambacolor5, line width = 1.1pt,opacity = .9},
	ourred/.style={red(ryb)!80},  
	ourblue/.style={blue1!90} 
}

% Plot mark `hexagon': a filled regular hexagon with one vertex pointing up.
% All vertices lie at distance 1.1547\pgfplotmarksize from the mark origin
% (1.1547 is approximately 2/sqrt(3), so the centre-to-edge distance is
% approximately \pgfplotmarksize).
\pgfdeclareplotmark{hexagon}
{%
  % Start at the top vertex (the 90-degree position) ...
  \pgfpathmoveto{\pgfqpoint{0pt}{1.1547\pgfplotmarksize}}
  % ... then visit the remaining five vertices in 60-degree steps.
  \pgfpathlineto{\pgfqpointpolar{150}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{210}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{270}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{330}{1.1547\pgfplotmarksize}}
  \pgfpathlineto{\pgfqpointpolar{30}{1.1547\pgfplotmarksize}}
  \pgfpathclose
  % The path is filled only; no outline is stroked.
  \pgfusepathqfill
}

% algorithm2e configuration (the package is loaded in the preamble above).
% \tcpas: comment macro whose text is enclosed in literal braces.
\SetKwComment{tcpas}{\{}{\}}
% Typeset comments and arguments in the normal text font.
\SetCommentSty{textnormal}
\SetArgSty{textnormal}
% do ... while loop construct.
\SetKwRepeat{Do}{do}{while}
% Literal keywords.
\SetKw{False}{false}
\SetKw{True}{true}
\SetKw{Null}{null}
% Headers for the input/output declarations of an algorithm.
\SetKwInOut{Output}{output}
\SetKwInOut{Input}{input  }
% Logical connectives and loop control keywords.
\SetKw{AND}{and}
\SetKw{OR}{or}
\SetKw{Continue}{continue}

% \independent: relation symbol built from two copies of \perp, the second
% shifted right by 2mu over the first. \mathpalette hands the current math
% style to the helper \independenT, so the symbol adapts to sub-/superscripts.
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
% Helper: #1 is the math style supplied by \mathpalette, #2 the symbol to double.
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}

% Notation shortcuts.
% Matrix expressions.
% NOTE(review): \Ai and \Ait appear to write the transpose as ^t, whereas
% \At, \Bt and \Ct use ^\top -- confirm whether the notation should be unified.
\newcommand{\Ai}{\ensuremath{(I-A^t)^{-1}}}
\newcommand{\Ait}{\ensuremath{(I-A^t)^{-t}}}
\newcommand{\At}{A^\top}
\newcommand{\Bt}{B^\top}
\newcommand{\Ct}{C^\top}
% Variables and their realisations; all currently expand to plain letters.
\newcommand{\Xb}{X}
\newcommand{\xb}{x}
\newcommand{\yb}{y}
\newcommand{\Zb}{Z}
\newcommand{\zb}{z}
\newcommand{\Wb}{W}
\newcommand{\wb}{w}
% Distributions over \Xb and \Zb.
\newcommand{\Px}{P(\Xb)}
\newcommand{\Pz}{P(\Zb)}
\newcommand{\Pxz}{P(\Xb, \Zb)}
\newcommand{\Qxz}{Q(\Xb, \Zb)}
% Calligraphic and fraktur symbols.
\newcommand{\M}{\ensuremath{\mathcal{M}}}
\newcommand{\Ac}{\ensuremath{\mathcal{A}}}
\newcommand{\Pc}{\ensuremath{\mathcal{P}}}
\newcommand{\Mf}{\mathfrak{M}}
\newcommand{\N}{N}
% Declared as a proper math operator (amsmath): upright font, operator
% spacing, and subscripts set underneath in display style.
\DeclareMathOperator*{\argmin}{arg\,min}
% Graph symbols.
\newcommand{\gt}{\ensuremath{G^{\ast}}}
\newcommand{\Gxz}{\ensuremath{G_{X,Z}}}
\newcommand{\Gx}{\ensuremath{G_X}}

% \norm{x}: argument enclosed in double vertical bars.
\newcommand\norm[1]{\lVert#1\rVert}

% Upright label "MB" (presumably "Markov blanket" -- confirm against the main text).
\newcommand{\MB}{\text{MB}}
% F_1 symbols carrying the superscript labels "net" and "conf".
\newcommand{\Fnet}{F_{1}^{\text{net}}}
\newcommand{\Fconf}{F_{1}^{\text{conf}}}
% Relations <= and = with a plus sign stacked on top.
\newcommand{\leqp}{\overset{+}{\leq}}
\newcommand{\eqp}{\overset{+}{=}}

% Names of the compared methods, typeset in small caps; the trailing \xspace
% is meant to take care of the space after the macro in running text.
\newcommand{\GFCI}{\textsc{gfci}\xspace}
\newcommand{\RFCI}{\textsc{rfci}\xspace}
\newcommand{\FCI}{\textsc{fci}\xspace}
\newcommand{\GES}{\textsc{ges}\xspace}
\newcommand{\GGSL}{\textsc{ggsl}\xspace}
\newcommand{\NOTEARS}{\textsc{notears}\xspace}
\newcommand{\CoCa}{\textsc{coca}\xspace}
\newcommand{\tofft}{\textsc{3off2}\xspace}
% \opt is used in the significance-testing section for the best-performing competitor.
\newcommand{\opt}{\textsc{opt}\xspace}
\newcommand{\DCD}{\textsc{dcd}\xspace}


% Upright math operators (amsmath): Pa, cov and diag.
\DeclareMathOperator{\Pa}{Pa}
\DeclareMathOperator{\cov}{cov}
\DeclareMathOperator{\diag}{diag}


% Front matter: page style, title, and the two authors with their shared
% affiliation (index 1).
\pagestyle{numbered}
\title{\ourtitle}

\author[1]{David Kaltenpoth}
\author[1]{Jilles Vreeken}
% No blank line inside \affil: a blank line would insert a paragraph break
% into the affiliation block.
\affil[1]{%
  CISPA Helmholtz Center for Information Security\\
  Saarbr\"ucken
}
  \begin{document}
\appendix
\onecolumn
\maketitle
\section{Appendix}
\label{sec:apx}


\subsection{Proofs}
\label{sec:proofs}

\begin{proof}[Proof of Proposition 1]
  For any two $i, j \in S$ we know that, since they are direct descendants of $Z$, $X_i \not \independent X_j \mid U$ for any $U \subset \left\{ X_1,\ldots,X_m \right\} \setminus \left\{ X_i, X_j \right\}$. Hence all edges $\left\{ X_i, X_j \right\}$ are in $G$ so that $S$ is a clique in $G$.
\end{proof}

\begin{proof}[Proof of Theorem 2]
  \note{fix}
  We prove this statement in two steps. First, we show that all $b_{ij}$ are identifiable. Let $i \in \left\{ 1,\ldots,m \right\}$ and $j \in \left\{ 1,\ldots,l \right\}$.
  Then, by assumption (A2) there exists a distinct quadruple $(X_i, X_u, X_v, X_w)$ of nodes that are conditionally independent given $Z_j$. 
  For every quadruple $(X_i, X_u, X_v, X_w)$ to be dependent conditional on $Z_j$, each such quadruple would need to contain either an edge between two of its nodes or two nodes with a common predecessor, which would require at least $\left| S_j \right|-3$ incoming edges to $S_j$ from sources that are not $Z_j$.

  Therefore, for any two variables $(X_{\lambda}, X_{\mu})$ in our quadruple we know that $\sigma_{\lambda\mu} = \cov(X_{\lambda}, X_{\mu}) = b_{\lambda}b_{\mu}$ and in particular
  \begin{align*}
    \sigma_{iu}\sigma_{vw} = b_ib_ub_vb_w = \sigma_{iv}\sigma_{uw}.
  \end{align*}
  We can therefore write 
  \begin{align*}
    b_i^2 =  \sigma_{iu}\sigma_{iv}/\sigma_{uv}.
  \end{align*}
  Furthermore, no quadruple $(X_i, X_{u'}, X_{v'}, X_{w'})$ that is not conditionally independent given $Z$ can satisfy the constraint $\sigma_{iu'}\sigma_{v'w'} = \sigma_{iv'}\sigma_{u'w'}$ by assumption (A3).
  Hence, all $b_{ij}^2$ are identifiable, and since we know the sign of at least one $b_{ij}$ for the given $j$, we therefore know the sign of each $b_{ij}$ for fixed $j$. However, since $j$ was arbitrary, $B$ is identifiable in its entirety.

  Now, knowing the values of $B$ we can determine the distribution $P(X \mid Z)$, which depends only on $A$ and $\sigma_{\epsilon}^2$.
  Since $X \mid Z$ is now a linear Gaussian SEM with equal variances, the identifiability of $A$ and $\sigma_{\epsilon}^2$ follows from the work of~\cite{peters2012identifiability} on the identifiability of the equal variance model.
\end{proof}

\begin{proof}[Proof of Theorem 3]
  To prove the first statement, let $\Zb$ be jointly independent and let there be no edges $\Xb \rightarrow \Zb$. Pick $P$ such that $P(\Zb = 0) = 1$. Then $\Zb$ contains no information about $\Xb$ so that $K(P(\Xb, \Zb)) \leq K(\Px) + K(\Pz) = K(\Px) + O(1)$, with constant $K(\Pz) = O(1)$ independent of $\Px$.

  For the second statement, consider the case where the true generating mechanism for $\Xb$ does not include any latent variables for any subset $\Xb_S$.
  Then as noted in the AMC and the discussion preceding it, \emph{all} information needed to compress $\Px$ is already present in the graph $G_X^{\ast}$ giving the optimal factorization of $\Px$. Hence $K(\Pxz) \geq K(\Px) + K(P(\Zb \mid \Xb)) > K(\Px)$.
\end{proof}
\begin{proof}[Proof of Theorem 4]
  Assume that Eq. (4) holds in the limit $n \rightarrow \infty$. Then there is a model class $\M \in \Mf$ such that $L(\xb \mid \M) < L(\xb \mid \M_0)$. Further, as we use a refined MDL score that means there is an optimal $P^{\ast} \in \M$ such that $L(\xb \mid \M) = L(\xb \mid P^{\ast}) + O(n^{-1})$.  Due to the consistency~\citep{grunwald2007minimum} of refined MDL scores this means that almost surely $\Xb$ has generating distribution $P^{\ast}(\Xb)$. As  $P^{\ast}$ is a joint distribution over $(\Xb, \Zb)$ this means that $\xb$ is the observed part of a joint sample $(\xb, \zb)$ from $P^{\ast}(\Xb, \Zb)$.
\end{proof}

\begin{proof}[Proof of Theorem~5]
  As we've seen in the proof of Theorem 2, for each $X_i$ there exists a distinct quadruple $(X_i, X_u, X_v, X_w)$ conditionally independent given $Z_j$ by (A2).
  Hence, all correlations between these four variables can be explained by the parameters in $B$.
  Furthermore, by Proposition 1, no pair of variables $X_\mu, X_\lambda$ can be $d$-separated in any DAG over $X$, so that by setting $b_{ij} = 0$ we would require \emph{at least} four additional entries of $A$ to be non-zero, instead of only one in $B$.

  Hence, since in the limit we have $ \widehat{b}_{ij} \widehat{b}_{vj} - \sigma_{iv} \rightarrow 0$, the matrix $\widehat{B}$ converges towards $B$. Furthermore, given a good approximation of $P(X)$  and of $B$, we obtain a good approximation of $A$ by the results of~\cite{van2013ell}.
\end{proof}

\begin{proof}[Proof of Theorem 6]
  This follows directly from Theorem 5 and the fact that for $n \rightarrow \infty$, our MDL score is equivalent to BIC~\citep{grunwald2007minimum}.
\end{proof}
\begin{proof}[Proof of Proposition 7]
  By Proposition 1, we know that $S_j^{\ast}$ forms a clique in the graph $\gt$ inferred by a consistent $\Ac$.
  This clique is maximal due to no node being in the Markov Blanket of all $s \in S_j^{\ast}$. Further, since $\xb^n$ is a sample from Eq. (6) we know from the MDL principle for selecting nested model classes~\citep{grunwald2007minimum} that in the limit no other set can be compressed better by introducing a confounder than $S_j^{\ast}$ itself. %Hence we will discover the right $S^{\ast}$.
\end{proof}

\subsection{Implementation Details}
\label{sec:implem-detail}

We implemented \ourmethod in Python using PyMC3 %~ \cite{salvatier2016probabilistic}
for posterior inference using ADVI with default parameters. % ~\cite{kucukelbir2017automatic}
All code is available for research purposes. Experiments were run single-threaded on a standard commodity laptop and each experiment finished within minutes.

\subsection{Computing Confidences}
\label{sec:confidence}

For \ourmethod we measure its confidence as the relative gain in compression due to addition of confounders, $C_{\Ac} = (L_{\Ac} - L_{\text{\ourmethod-}\Ac})/L_{\Ac} \geq 0$ and for \NOTEARS we use the normalized difference between the initial (empty network) and final (discovered network) score obtained from optimization.
None of \GFCI, \tofft or \DCD come with readily computable confidence scores so we treat them in \emph{the way most favorable to them} by assuming that their best performances are also their most confident.

\subsection{Additional results for \GGSL and \GES}
\label{sec:additional}

We now provide additional details on the results of \GGSL and \GES in Table~\ref{tbl:gg}. We compute all confidences as described in the previous section. For comparison we also include the results of \ourmethod. Since they are most interpretable, we include only the $\Fnet$ score here. For other metrics too, however, both \GGSL and \GES perform similarly to \NOTEARS so that \ourmethod significantly outperforms them. In particular, neither of them can find any confounders.

% Table of $\Fnet$ scores for \ourmethod, \GGSL and \GES at three decision
% rates; the best score per column is set in bold.
\begin{table}[htbp]
  \centering
  \begin{tabular}{rrrr}
    \toprule
    & \multicolumn{3}{c}{Data evaluated} \\
    \cmidrule(lr){2-4}
    Method & 1\% & 50\% & 100\% \\
    \midrule
    \ourmethod & \textbf{0.92} & \textbf{0.85} & \textbf{0.53} \\
    \GGSL & 0.68 & 0.42 & 0.24 \\
    \GES & 0.64 & 0.35 & 0.21 \\
    \bottomrule
  \end{tabular}
  \caption{Comparison of \ourmethod, \GGSL and \GES in terms of $\Fnet$ scores. The performance of each method is shown for each of 1\%, 50\% and 100\% of the data evaluated (corresponding to leftmost, median and rightmost points in a decision-rate plot). We see that \ourmethod clearly outperforms both competitors by a large margin and further that both \GGSL and \GES perform similarly to \NOTEARS.}
  \label{tbl:gg}
\end{table}

\subsection{Additional details on Significance Testing}
\label{sec:significance}
To verify that \ourmethod significantly outperforms its competitors we use the Bayesian signed rank test~\citep{benavoli2014bayesian}.
It explicitly models the probability that one model is significantly better than the other \emph{in practice} by introducing a \emph{region of practical equivalence} (rope) specified by parameter $r$. Two methods are considered to perform equally well if the difference of scores for the methods lies in $[-r, r]$.
We pick $r = 0.05$~\citep{benavoli2014bayesian} but the conclusion remains the same for values $r \in (0, 0.15]$. 
Since the test was designed for two competing methods, for each dataset we compare \ourmethod with the best-performing of its competitors, which we refer to as \opt.

To compare the two methods over all samples, we aggregate the $F_1$ scores for both \ourmethod and \opt and take their differences $z_i = F_{1,i}^{\text{\opt}}-F_{1,i}^{\text{\ourmethod}}, i \in \left\{ 1,\ldots,q \right\}$.
To include the prior assumption that both methods are equally good, we include a pseudo-observation $z_0 = 0$, i.e. that both methods are precisely equally good.
We take weights $w = (w_0,\ldots,w_{q}) \sim \text{Dirichlet}(s,1,\ldots,1)$ where $s$ corresponds to the number of times we obtained  $z_0$. This is commonly set to be $s = 0.5$, but due to our large number of experiments its influence on the posterior is minor. The posterior probabilities are computed as
\begin{align*}
  \theta_{\text{\opt}} &= \sum_{i, j = 0}^qw_iw_jI_{(2r, \infty)}(z_i + z_j)\\
  \theta_{\text{rope}} &= \sum_{i, j = 0}^qw_iw_jI_{[-2r, 2r]}(z_i + z_j)\\
  \theta_{\text{\ourmethod}} &= \sum_{i, j = 0}^qw_iw_jI_{(-\infty, -2r)}(z_i + z_j)
\end{align*}
where $\theta_{\text{\opt}}, \theta_{\text{\ourmethod}}$ are the posterior probabilities that \opt, respectively \ourmethod, are better by at least a margin $r$, while $\theta_{\text{rope}}$ is the posterior probability that they perform equally well up to said margin.
The distribution of $\theta$ is not analytically tractable, but we can evaluate it empirically by sampling values for $w$.
Such a sample is precisely what is depicted in Fig. 5 in barycentric coordinates.





\subsection{Realistic Data: REGED}
\label{sec:reged}
  Next, we consider realistic synthetic data from REGED~\citep{guyon2008datasets}, which is based on human lung-cancer microarray gene expression data. Since the available samples are non-i.i.d., the causal relationships are nonlinear, and the ground truth is known from gene intervention studies, it provides a good benchmark for \ourmethod.

  To make \ourmethod applicable to the REGED dataset we consider the following setup. For each node $X_i$ in the ground truth graph $G^{\ast}$ with $k \geq 5$ children, the set of which we denote by $C = C_i$, we select a random subset $R = R_i$ also consisting of $k$ nodes of $G^{\ast}$ which are not contained in the Markov boundary of any $X_i$ in $G^{\ast}$ and do not have a common parent. We then consider the induced subgraph $G_i$ over the nodes $C_i \cup R_i \cup \left\{ i \right\}$. However, the data given to each method is only over the variables $\Xb_{C \cup R}$ from which we compute the results in Fig.~\ref{fig:reged}.

  We show the results for $\Fnet$ for different methods in a DR plot in Fig.~\ref{fig:reged}. Even though the data violates our assumptions, \ourmethod outperforms its competitors by a large margin. Moreover, even for those sets of variables where \ourmethod is only moderately confident, it still performs better than its competitors at their \emph{most} confident. This suggests that \ourmethod works reliably even when the true model deviates from our assumptions.
\begin{figure}[t]
  \centering
  % Decision-rate plot: five curves, all read from expres/reged-comp.dat
  % (column 0 on the x-axis, columns 1--5 on the y-axis, one per method).
  \begin{tikzpicture}
    \begin{axis}[eda line,
      ymax = 1.0, ymin = 0.0, ytick={0.0,0.2,...,1.0},
      ylabel={$\Fnet$}, xlabel={Decision Rate},
      ylabel near ticks, xlabel near ticks,
      tick label style={font=\footnotesize},
      label style={font=\footnotesize},
      height=3.05cm, width=0.45\linewidth,
      legend pos=north east, legend columns=2,
      legend style={nodes={scale=0.65, transform shape}, at={(0.65,1.05)},anchor=north west },
      xmin=0, xmax=1, xtick={0,0.25,0.5,0.75,1}]
      % Dashed horizontal guide lines.
      % NOTE(review): these coordinates lie outside the axis range set above
      % (xmax = ymax = 1); how they are mapped depends on the pgfplots compat
      % level, which is not set in this file -- confirm they render as intended.
      \draw[dashed, black!50] (0,20) -- (100,20);
      \draw[dashed, black!50] (0,40) -- (100,40);
      \draw[dashed, black!50] (0,60) -- (100,60);
      \draw[dashed, black!50] (0,80) -- (100,80);
      % One curve per method; the line styles are defined via \tikzset in the preamble.
      \addplot[ourcolor] table[x index = 0, y index = 1, header = false] {expres/reged-comp.dat};
      \addplot[ourgreen] table[x index = 0, y index = 2, header = false] {expres/reged-comp.dat};
      \addplot[ouryellow] table[x index = 0, y index = 3, header = false] {expres/reged-comp.dat};
      \addplot[ourorange] table[x index = 0, y index = 4, header = false] {expres/reged-comp.dat};
      \addplot[ourbl] table[x index = 0, y index = 5, header = false] {expres/reged-comp.dat};
      % Legend entries are assigned to the plots above in the same order.
      \addlegendentry{\ourmethod}
      \addlegendentry{\NOTEARS}
      \addlegendentry{\tofft}
      \addlegendentry{\GFCI}
      \addlegendentry{\DCD}
    \end{axis}
  \end{tikzpicture}%
  \caption{[Higher is better] Decision rate for \ourmethod and its competitors on the REGED dataset. Overall, \ourmethod outperforms all other methods both for points where they are confident as well as those where they are not.
  }
  \label{fig:reged}
\end{figure}


\subsection{Other methods on the SOS Network}
\label{sec:sos-comp}

In Fig.~\ref{fig:sos2} we show the results of \tofft and \NOTEARS on the SOS dataset. Like \GFCI, \tofft can only give indications about which pairs might be confounded, and for the majority of pairs it is not confident. Meanwhile, by its very design \NOTEARS has no notion that confounders might be involved.

\begin{figure*}[t]
  \hfill
  \begin{subfigure}[b]{0.3\textwidth}
    \centering
    % Ground-truth network: seven gene nodes are placed on a circle of radius 4
    % (before scaling), with lexA positioned between recA and the uvrA/uvrY midpoint.
    \begin{tikzpicture}[scale=0.5]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}
      % Helper node with empty text halfway between uvrA and uvrY; only used to place lexA.
      \node (uvrI) at ($(uvrA)!0.5!(uvrY)$) {};
      \node (lexA) at ($(recA)!0.5!(uvrI)$) {lexA};

      % Arrows from lexA to the six other genes, plus a pair of bent arrows
      % in both directions between recA and lexA.
      \path[thick, color=black!50, ->]
      \foreach \a in {umuD,uvrD,uvrA,uvrY,ruvA,polB}{(lexA) edge (\a)}
      (recA) edge[bend right=20] (lexA)
      (lexA) edge[bend right=20] (recA)
      ;
    \end{tikzpicture}
    \caption{Ground Truth}
  \end{subfigure}
  % \hspace*{-4cm}
  \hfill
  \begin{subfigure}[b]{0.3\textwidth}
    \centering
    % Result for 3off2: same circular node layout as the ground-truth panel.
    % Edges are undirected; the path default is dotted grey, and edges marked
    % [black, solid] override it (presumably the pairs the method is more
    % confident about -- confirm against the text).
    \begin{tikzpicture}[scale=0.5]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}

      \path[thick, dotted, color=black!50]
      (recA) edge[] (umuD)
      (recA) edge[black, solid] (uvrA)
      (recA) edge[] (uvrY)

      (umuD) edge[] (uvrA)
      (umuD) edge[black, solid] (uvrY)
      (umuD) edge[] (ruvA)

      (uvrD) edge[] (polB)

      (uvrA) edge[black, solid] (ruvA)
      (uvrA) edge[] (polB)

      (uvrY) edge[black, solid] (ruvA)
      ;
    \end{tikzpicture}
    \caption{\tofft}
  \end{subfigure}
  \hfill
  \begin{subfigure}[b]{0.3\textwidth}
    \centering
    % Result for NOTEARS: only the seven gene nodes are drawn, with no edges.
    \begin{tikzpicture}[scale=0.5]
      \foreach \a [count=\c] in {recA,umuD,uvrD,uvrA,uvrY,ruvA,polB}{\node at (\c*360/7+40:4) (\a) {\a};}
    \end{tikzpicture}
    \caption{\NOTEARS}
  \end{subfigure}
  \hfill
  \caption{\tofft and \NOTEARS on the SOS dataset. As before, only (potentially) confounded pairs are drawn in the figures. We see that \tofft, like \GFCI and \DCD, cannot determine all nodes to be jointly confounded. Meanwhile \NOTEARS assumes causal sufficiency and therefore finds no indication of confounding.}
  \label{fig:sos2}
\end{figure*}
\bibliography{abbrev, bib-paper}
\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
