% !TEX root = ../main.tex

\section{Causal Discovery and GES}\label{sec:background}  

We first review the causal discovery problem and the necessary details of the greedy
equivalence search (GES) method.

\subsection{Causal Graphical Models }
Causal discovery aims to identify cause-and-effect relationships between random
variables $\{X_1, \dots, X_d\}$. We reason about causal relationships using causal
graphical models (CGM). A CGM has two components:
\begin{enumerate}
\item a directed acyclic graph (DAG), $G^* = (V,E)$, where a node $j \in V$ represents
variable $X_j$ and an edge $(j, k) \in E$ denotes a direct causal link from $X_j$ to
$X_k$,
\item conditional distributions $p^*(X_j \mid X_{\Pa^{G^*}_j})$, defining the
distribution of $X_j$ given its causal parents $X_{\Pa^{G^*}_j}$.
\end{enumerate}
The joint distribution of the variables $X_1, \dots, X_d$ then factorizes as
\begin{equation}\label{eq:factor_joint}
p^*(X) = \prod\limits_{j \in V} p^*(X_j \mid X_{\Pa^{G^*}_j}).
\end{equation}
The goal of causal discovery is to recover the graph $G^*$ from the joint distribution
$p^*$ or from samples drawn from $p^*$. 

However, multiple CGMs with different graphs can generate the same $p^*\!.$ Two
important concepts address this difficulty: faithfulness and Markov equivalence
\citep{spirtes2000causation}.

\parhead{Faithfulness.} In \Cref{eq:factor_joint}, $G^*$ induces a factorization of
$p^*$, which, in turn, induces independencies between variables: each $X_j$ is
independent of its non-descendants given its parents $X_{\Pa^{G^*}_j}$
\citep{pearl1988probabilistic}. Conversely, we say that a distribution $p^*$ and a
graph $G^*$ are \textit{faithful} if the independencies in $p^*$ are exactly those
implied by $G^*$. \looseness=-1

%(Notice it is the lack of edge in $G^*$ that imposes independencies in $p^*$.)
For example, $H=(\{1,2\}, \{1\rightarrow 2\})$ and $q=q(X_1)q(X_2)$ form a
valid CGM. But the independence $X_1 \indep X_2$ present in $q$ is not suggested by $H$.
Rather, $q$ is faithful to $H'=(\{1,2\}, \varnothing)$, which has no superfluous edges
like $1 \rightarrow 2$.

Limiting the search to faithful graphs reduces the possible CGMs that could have
generated $p^*$. But this is not enough.



\parhead{Markov Equivalence.} 
Two distinct graphs can both be faithful to $p^*$ if they induce the same set of
independencies on $p^*$. For example, $A\rightarrow B \rightarrow C$ and $A \leftarrow B
\leftarrow C$ impose the same set of independencies,  $\{A \indep C \mid B\}$. 

Graphs inducing the same independencies are called \textit{Markov equivalent}; they form
\textit{Markov equivalence classes} (MEC).
Since $G^*$ is identifiable only up to Markov equivalence, the task of causal discovery
becomes finding the MEC of $G^*$. 

\begin{remark}
    With more assumptions (e.g., about the form of $p^*$) or special data (e.g.,
    interventions), other causal discovery methods focus on identifying the possible
    $G^*$ beyond Markov equivalence. See \citet{glymour2019review} for an excellent
    review. We focus on Markov equivalence classes.
\end{remark}

\subsection{Score-based Causal Discovery}
In this work, we assume access to $n$ iid samples, denoted $\data = \{(x_1^{i}, \dots,
x_d^{i})\}_{i=1}^n$, from a distribution $p^*$ that is faithful to some $G^*$. We aim to
recover the MEC of $G^*$ from $\data$.

Greedy Equivalence Search (GES) searches for the MEC of $G^*$ among all the possible
MECs. It does so by searching for the MEC whose DAGs maximize a specific score function.
It is a particular case of score-based causal discovery.


{Score-based methods} assign a score $S(G; \data)$ to every possible DAG $G$ given the
data $\data$. The score function $S$ is designed to be maximized by the true graph
$G^*$. This turns causal discovery into an optimization problem:
\begin{equation}
    G^* = \argmax_{G} S(G; \data).
\end{equation}
Some scores have properties that are important for GES.

\begin{definition}
    A score $S$ is \emph{score equivalent} if it assigns the same score to all the
    graphs in the same MEC.
\end{definition}

A score equivalent $S$ enables defining the score of a MEC as the score of any of its
constituent graphs. 

\begin{definition}[Local Consistency, \citep{chickering2002optimal}]
Let $\data$ contain $n$ iid samples from some $p^*$. Let $G$ be any DAG and $G'$ be a
different DAG obtained by adding the edge $i \rightarrow j$ to $G$. A score $S$ is
\emph{locally consistent} if both hold:
\begin{enumerate}
    \item $X_i \not\hspace*{-0.5mm}\indep_{p^*} X_j \mid X_{\Pa_j^G} \Rightarrow S(G';
    \data)>S(G; \data)$,
\item $X_i \indep_{p^*} X_j \mid X_{\Pa_j^G} \Rightarrow S(G'; \data)<S(G; \data)$.
\end{enumerate}
The independence statements are with respect to $p^*$.

\end{definition}

A locally consistent score increases when we add an edge that captures a dependency not
yet represented in the graph. It decreases when we add an edge that does not capture any
new dependency.

\subsection{Greedy Equivalence Search}
\citet{chickering2002optimal} introduced Greedy Equivalence Search (GES). It is a
score-based method that is guaranteed to return the MEC of $G^*$ whenever the score is
locally consistent. The main characteristic of GES is to navigate the space of MECs.

GES begins with the MEC of the empty DAG and iteratively modifies it to improve the
score. At each step, only a few modifications are allowed. These
modifications are of three types: insertions, deletions, and reversals. 
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{fig/mecv2.pdf}
    \caption{Illustration of insertions from a MEC A to MECs B or C: (i) choose a DAG in
    A, (ii) insert the edge $2\leftarrow3$ to obtain another DAG, (iii) consider its MEC.
    Each MEC has all its DAGs on a white plate, and its canonical PDAG on a gray
    plate (see \Cref{sec:implementation} for a definition).}
    \label{fig:mec}
\end{figure}

\parhead{MEC modifications.} An \textit{insertion} on MEC $M$ selects a DAG $G$ in $M$,
adds an edge $x \rightarrow y$ to $G$ to obtain a different DAG $G'$ and replaces $M$
with the MEC of $G'$ (see \Cref{fig:mec}).

A \textit{deletion} on MEC $M$ selects a DAG $G$ in $M$, removes an edge
$x \rightarrow y$ from $G$ to obtain a different DAG $G'$ and replaces $M$ with the MEC
of $G'$.

A \textit{reversal} on MEC $M$ selects a DAG $G$ in $M$, reverses an edge
$x \rightarrow y$ to $x \leftarrow y$ in $G$ to obtain a different DAG $G'$ and replaces
$M$ with the MEC of $G'$.

GES applies these modifications in three separate phases.

\parhead{Phase 1: Insert.} First, GES finds all possible insertions, 
applies the one leading to the largest score increase, and repeats until no insertion
increases the score.

\parhead{Phase 2: Delete.} Then, GES finds all possible deletions, applies the one
leading to the largest score increase, and repeats until no deletion increases the
score.

The MEC obtained at the end of phase 2 is exactly the MEC of $G^*$ if the score is
locally consistent \citep{chickering2002optimal}.

\parhead{Phase 3: Reverse.} In theory, phases 1 and 2 are sufficient to recover the
MEC of $G^*$. Yet, \citet{hauser2012characterization} showed that adding a third
phase with reversals can improve the search in practice with finite data (where the score might not be
locally consistent). In this phase, GES finds all possible reversals for the MEC,
applies the one that increases the score most, and repeats until none do.

The pseudocode of GES is given in \Cref{alg:ges-vanilla}.

\begin{algorithm}[t]
    \caption{Greedy Equivalence Search (GES)}\label{alg:ges-vanilla}  
    \DontPrintSemicolon
    
    \KwIn{Data $\data \in \R^{n\times d}$, score function $S$}  
    \KwDefine{$\delta_{\data,M}(O) = S(\text{Apply}(O,M) ; \data) - S(M; \data)$}  
    \KwOut{MEC of $G^*$}  
    $M \leftarrow \{([d], \varnothing)\}$ \tcp*{Empty graph's MEC}  
    $\mathcal{I} \leftarrow$ get all insertions valid for $M$ \;  
    \While{$|\mathcal{I}| > 0$}{  
        $O^* \leftarrow \argmax_{I \in \mathcal{I}}\{\delta_{\data,M}(I)\}$ \tcp*{Get
        best insertion}  
        \lIf{$\delta_{\data,M}(O^*) \leq 0$}{\textbf{break}}  
        $M \leftarrow$ Apply($O^*, M$) \tcp*{Apply best insertion}  
        $\mathcal{I} \leftarrow$ get all insertions valid for $M$ \;  
    }  
    $\mathcal{D} \leftarrow$ get all deletions valid for $M$ \;  
    \While{$|\mathcal{D}| > 0$}{  
        $O^* \leftarrow \argmax_{D \in \mathcal{D}} \{\delta_{\data,M}(D)\}$ \tcp*{Get
        best deletion}  
        \lIf{$\delta_{\data,M}(O^*) \leq 0$}{\textbf{break}}  
        $M \leftarrow$ Apply($O^*, M$) \tcp*{Apply best deletion}  
        $\mathcal{D} \leftarrow$ get all deletions valid for $M$ \;  
    }  
    \tcc{(Optional) 3rd phase like above but with reversals}  
    \Return $M$ \;  
\end{algorithm}

\parhead{Correctness.} GES's correctness relies on two properties:
    (i) the greedy scheme will reach the global maximum of any locally consistent
    score \citep[Lemma 10]{chickering2002optimal}, and 
    (ii) the true graph $G^*$ and its MEC are the unique global maximizers of any
    locally consistent score \citep[Proposition 8]{chickering2002optimal}.

With these two properties, GES is guaranteed to recover the MEC of $G^*$ when the score
is locally consistent.
\label{sec:greedy_search}

\parhead{The BIC score.} A score commonly used by GES is the Bayesian Information Criterion
(BIC) \citep{schwarz1978estimating}. Given a model class
$\mathcal{M}_G$ for each $G$, and our data $\data$ with its $n$ samples, the BIC defines a score:
\begin{equation}
    S(G; \data) = \log p_{\hat\theta}(\data ; G) - \frac{\alpha}{2} \log n \cdot |\hat\theta|,
\end{equation}
where
$\alpha$ is a hyperparameter, $\hat \theta$ is the likelihood maximizer over
$\mathcal{M}_G$, and the number of parameters $|\hat\theta|$ is usually the number of edges in $G$. The BIC is a model selection criterion trading off log-likelihood and model complexity. Models with higher BIC are preferred. The original BIC has $\alpha=1$. 


\parhead{Gaussian Linear Models. } A common model class used for continuous data with the BIC is the class of Gaussian linear models, where given a graph $G$, each variable is Gaussian with a linear
conditional mean and a specific variance:
\begin{equation}\textstyle
    p_{\theta}(x_j \mid x_{\Pa_j^G}) \sim \mathcal{N}\Bigl(\sum_{k \in \Pa_j^G} 
    \theta_{jk} x_k + \theta_{j0}, \theta_{j(d+1)}^2\Bigr).
\end{equation}
For Gaussian linear models, the BIC is score equivalent. It can also be locally
consistent under some conditions.

\begin{theorem}[Local Consistency of BIC \citep{haughton1988choice,chickering2002optimal}]
    For $\alpha>0$, the BIC for Gaussian linear models is locally consistent once $n$
    is large enough.
    \label{th:local_consistency}
\end{theorem}

% More generally, the local consistency theorem also applies to most model classes of the exponential family (e.g. the multinomial distribution for discrete data).
Hence, for Gaussian linear models, GES is guaranteed to recover the MEC of $G^*$ with
infinite data. 
In practice, however, data is finite and GES can return incorrect MECs. 

\parhead{Example of Failure.} In \Cref{fig:simulation_main}, we report the performance
of GES (orange) on simulated data, along with its variants and the proposed XGES. We
simulate CGMs $(G^*, p^*)$ for $d \in\{25,50\}$ variables, $\rho d \in \{2d,3d,4d\}$
edges ($\rho$ is an edge density parameter) and draw $n=10{,}000$ samples from $p^*$. The
simulation is detailed in \Cref{sec:experiments:evaluation_setup}. We compare the
methods' results to the true graph $G^*$ using the structural Hamming distance for MECs
(SHD), which counts the number of different edges between graphs of two MECs (see
\Cref{sec:experiments:evaluation_setup}). \Cref{fig:simulation_main} shows that GES can fail (SHD $> 0$),
especially in denser graphs.

In addition, we confirm that including the third phase of reversals in GES improves its
performance (GES-r, in green).


% figure: simulation_main
\begin{figure}[t]
    \centering
    \includegraphics[width=1\linewidth]{fig/simulation_main_shd.pdf}
    \caption{Performance comparison of GES and XGES variants, measured with SHD for
    different edge densities $\rho$. XGES heuristics outperform GES and its variants in
    all scenarios. The dashed lines indicate the number of edges of the true graph. Each boxplot
    is computed over 30 seeds.\looseness=-1}
    \label{fig:simulation_main}
\end{figure}

