% !TEX root = ../main.tex

\section{Extremely Greedy Equivalence Search}\label{sec:xges} 


In this section, we first investigate why GES can fail and then
propose simple solutions to mitigate failure.

\subsection{Scenarios of GES failure}
\label{sec:ges_failure}
As reviewed in \Cref{sec:greedy_search}, GES's correctness relies on two conditions: (i)
the ability of its greedy search to find the score's global maximizer, and (ii) the true
graph's MEC being the score's global maximizer (and the only one).

These two properties might not hold if the data cannot render $S$
locally consistent. We investigate GES failure.
Is the issue that GES cannot reach the global maximizer with greedy search? If so,
changing the search heuristic could help bypass local maxima. Or is it that GES
effectively finds the global maximizer, but this maximizer is not the true graph?
In this case, designing a new score might help. To check these hypotheses, we conduct
empirical experiments.
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{fig/ges_fail.pdf}
    \caption{
    Empirical study of GES failure, on 90 simulated datasets with varying numbers of variables $d$ and graph densities $\rho$. (left) Differences in BIC
    between GES and ground-truth are negative. GES does not find the score's
    global maximum. (right) Ratios of GES-edges to true edges exceed 1. GES returns many more edges than the true graph.
}
    \label{fig:ges_fail}
\end{figure}



\parhead{Greedy Optimization Fails.}
We apply GES on the data simulated for \Cref{fig:simulation_main}, this time including
$d \in \{10,50,100\}$ variables. We obtain a MEC $\hat M$ that we compare to $G^*$ with:
\begin{equation}
    \Delta S = \frac{S(\hat M; \data) - S(G^*; \data)}{d}.
\end{equation}
A negative $\Delta S$ indicates that GES failed to identify a global maximizer and that
$G^*$ still scores higher than $\hat M$ (note that it does not imply $G^*$ is the global maximizer). A positive $\Delta S$ demonstrates
that $G^*$ is no longer the global maximizer and that GES legitimately identified a
MEC with high score.

In \Cref{fig:ges_fail} (left), we consistently find $\Delta S < 0$ across different
numbers of variables $d$ and edge densities $\rho$. This is evidence that GES fails to
reach the global maximum and that the true graph $G^*$ still has a better score than
$\hat M$.

We measure next the ratio $\zeta = |\hat M| /|G^*|$ between the number of edges found by
GES and the number of true edges. \Cref{fig:ges_fail} (right) shows that GES
over-inserts edges, i.e., $\zeta > 1$, especially with dense graphs. The idea behind GES
is to over-insert edges in the first phase and then delete them in the second phase.
However, we hypothesize that over-inserting may lead GES into local maxima before the
second phase can correct it.

Following these two observations, we focus on ensuring that GES reaches the global
maximizer. To do so, we design novel search heuristics aimed at preventing
over-insertion.



\subsection{The Heuristic XGES-0}\label{sec:xges-0}

GES considers insertions, deletions, and optionally reversals in three separate phases.
Rather, we consider all operations simultaneously, where deletions, insertions, and
reversals can interleave in any order. And when both insertions and deletions can 
increase the score, we prioritize deletions. 

\parhead{Heuristic XGES-0.} At each step, identify all the valid insertions,
deletions, and reversals. If some deletions would increase the score, apply the best one.
Otherwise, if some reversals would increase the score, apply the best one. Otherwise,
apply the best insertion. Repeat until no deletions, reversals, or insertions can increase
the score. 



\begin{algorithm}[t]
    \DontPrintSemicolon  
     % align line on the right
    \KwIn{Data $\data \in \mathbb{R}^{n\times d}$, score $S$.}
    \KwDefine{$\delta_{\data,M}(O) = S(\text{Apply}(O,M) ; \data) - S(M; \data)$}  
    \KwOut{MEC of $G^*$}  
    $M \leftarrow \{([d], \varnothing)\}$\;  
    $\mathcal{I},\mathcal{D},\mathcal{R} \leftarrow$ all insertions, deletions, reversals
    valid for $M$\;  
    \While{$|\mathcal{I}| + |\mathcal{D}| + |\mathcal{R}| > 0$}{  
        \uIf{$|\mathcal{D}| > 0$ \textnormal{\textbf{and}} $\max_{D \in \mathcal{D}}
        \{\delta_{\data,M}(D)\} > 0 $}{  
            $O^* \leftarrow \argmax_{D \in \mathcal{D}} \{\delta_{\data,M}(D)\}$ \;  
        }  
        \uElseIf{$|\mathcal{R}| > 0$ \textnormal{\textbf{and}} $\max_{R \in \mathcal{R}}
        \{\delta_{\data,M}(R)\} > 0 $}{  
            $O^* \leftarrow \argmax_{R \in \mathcal{R}} \{\delta_{\data,M}(R)\}$\;  
        }  
        \uElseIf{$|\mathcal{I}| > 0$ \textnormal{\textbf{and}} $\max_{I \in \mathcal{I}}
        \{\delta_{\data,M}(I)\} > 0 $}{  
            $O^* \leftarrow \argmax_{I \in \mathcal{I}} \{\delta_{\data,M}(I)\}$\;  
        }  
        \uElse{  
            \textbf{break} \tcp*{No more operations available}  
        }  
        $M \leftarrow$ Apply($O^*, M$) \;
        $\mathcal{I},\!\mathcal{D},\!\mathcal{R} \leftarrow\!$ all insertions, deletions, reversals
        valid for \!$M$\;  
    }  
    \Return $M$\;  
    \caption{XGES-0.   
    }  
    \label{alg:xges-0}  
\end{algorithm}

We call this heuristic XGES-0 for eXtremely Greedy Equivalence Search and we detail it in
\Cref{alg:xges-0}. XGES-0 retains the same theoretical correctness as GES.

\begin{restatable}[]{theorem}{thXgesZero}
    For any locally consistent score $S$, the MEC $\hat M$ returned by XGES-0 contains
    the true graph $G^*$.
    \label{thm:xges0}
\end{restatable}


The proof leverages the same theorems as those used to prove GES's correctness. It is
provided in \Cref{appendix:sec:theoretical_guarantees_xges0}.

In \Cref{fig:simulation_main}, we find empirically that XGES-0 (red) obtains MECs
with better scores and closer to $G^*$ than GES. Early deletions effectively reduce
encounters with local maxima.

\begin{remark}
    Alongside GES, \citet{chickering2002optimal} proposed a variant called OPS that
    also considered insertions and deletions simultaneously. But OPS did not prioritize
    deletions over insertions, resulting in no improvement over GES in practice. 
    We provide more details in \Cref{appendix:sec:empirical:ops}.
    %Mathematically, deleting an edge can only increase the BIC score by at most $\frac{\alpha}{2} \log n$ which is usually smaller than the increase from inserting an edge. Hence even if OPS considered deletions, it would still mostly insert edges first and encounter the same local maxima as GES.
\end{remark}

\subsection{The Heuristic XGES}

Building upon XGES-0, we introduce the heuristic XGES. XGES complements XGES-0. It
repeatedly uses XGES-0, each time deleting an edge that causes a local maximum.

\parhead{Heuristic XGES.} XGES begins by applying XGES-0 until no operations can
increase the score. Then, it enumerates all valid deletions, all of which will
decrease the score. For each deletion, XGES copies the MEC and resumes XGES-0 on the
copy until termination, but without ever reinserting the edge removed by the deletion.
If the final score is worse than that of the original MEC, the copy is discarded, and the search
continues with the next deletion. If the final score is better, the copy becomes the new
MEC. XGES then restarts with the new MEC and all its new deletions. XGES stops once all
deletions of a MEC have been tried. We provide the pseudocode of XGES in \Cref{alg:xges}
in \Cref{appendix:sec:theoretical_guarantees_xges}.


\parhead{Intuition.} The XGES heuristic aims to remove incorrect edges that were
inserted early in the search and might be causing local maxima preventing their
deletion. By forcefully deleting these edges, XGES can get around the local maximum and
discover better graphs.


\begin{restatable}[]{theorem}{thXges}
    For any score $S$, XGES returns a MEC $\hat M$ with a higher or equal score than
    XGES-0. If $S$ is locally consistent, then $\hat M$ contains the true graph $G^*$. 
    \label{thm:xges}
\end{restatable}

The proof follows from the design of XGES and by \Cref{thm:xges0}. 

\Cref{fig:simulation_main} illustrates the performance of XGES (purple), showing that it
significantly improves over all other GES variants.  XGES enables
non-trivial causal discovery in denser graphs ($\rho\geq2$) with many variables
($d\geq50$). However, XGES is computationally more expensive than
XGES-0. To alleviate this, we develop an efficient implementation for it.
