% !TEX root = ../main.tex

\section{Empirical Studies}\label{sec:experiments} We compare the XGES heuristics to
different variants of GES. We find that
XGES recovers causal graphs significantly more accurately and up to 10 times faster.


\subsection{Evaluation Setup}
\label{sec:experiments:evaluation_setup}
\parhead{Data Simulation.}
We simulate CGMs and data for different numbers of variables $d$, edge density $\rho$
(average number of parents) and number of samples $n$. 
We first draw a random DAG $G^*$
from an Erd\H{o}s--R\'enyi distribution. We then obtain $p^*$ by choosing each conditional
distribution $p^*(x_i \mid x_{\Pa^i_{G^*}})$ as a Gaussian $x_i \sim
\mathcal{N}(W_i^\top x_{\Pa^i_{G^*}}, \varepsilon_i)$ where $W_i, \varepsilon_i$ are
randomly selected. To ensure faithfulness, we sample $W_i$ away from $0$. More details are
in \Cref{appendix:sec:simulated_data}.

\parhead{Baseline Algorithms.}
We compare our algorithms against GES without reversals (GES), and with reversals
(GES-r, a.k.a.\ GIES), using the C++ implementation in the R package \texttt{pcalg}
\citep{kalisch2012causal}. We also include fast-GES (fGES) from the Java software Tetrad
\citep{ramsey2017million}.
An additional baseline, OPS, is provided in \Cref{appendix:sec:empirical:ops}.

\parhead{Evaluation Metrics.}
We evaluate the algorithms with the structural Hamming distance on MECs (SHD) between
the method's result $\hat M$ and the ground-truth MEC $M^*$ \citep{peters2014causal}. The SHD is the number
of edges that differ between the CPDAGs of $\hat M$ and $M^*$.
We also consider causal discovery as a binary classification task, where $\hat M$ predicts the presence of edges in $M^*$. We report the F1 score, precision, and recall for this task. Error bars are computed over multiple random datasets (seeds) and reported as bootstrapped 95\% confidence intervals \citep{waskom2021seaborn}.


\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{fig/simulation_correlation_speed.pdf}
    \caption{
    (left) The BIC scores of the graphs returned by each method are strongly correlated with the SHD to ground truth (shown for $d=50$, $\rho=3$, 30 seeds). XGES finds the highest scores and lowest SHDs. (right) Runtime of GES and XGES for a wide
    range of $d$. XGES-0 is up to 30 times faster than GES, and XGES up to 10 times
    faster. fGES may have overhead due to Java while other methods are in C++.}
    \label{fig:simulation_correlation_speed}
\end{figure}

\subsection{Results}
\parhead{General Performance.} % when we vary d, rho and fix n=10000, alpha=2
In \Cref{fig:simulation_main}, we find that XGES-0 and XGES outperform all baselines.
The improvement is more significant for larger density $\rho$ and larger $d$. The
conclusions are identical with precision and recall in \Cref{fig:simulation_main_precision_recall} of \Cref{appendix:sec:empirical:f1_score},
which are both improved by XGES. We emphasize that even though XGES favors deleting
edges, its recall (the proportion of true edges recovered) is higher than that of GES.
We also report the F1 metric in \Cref{appendix:sec:empirical:f1_score}. We note that the
performance of fGES is slightly worse than GES. We explain in
\Cref{appendix:subsec:fast_ges} that one of the optimizations of fGES removes
some valid insertions.

\parhead{Choice of Metrics.}
\Cref{fig:simulation_correlation_speed} (left) shows that the BIC scores of the graphs returned by each method are strongly correlated with the SHD to ground truth. This is a reassuring observation: maximizing the BIC score on finite data is indeed a good proxy for minimizing the SHD to the ground truth.


\parhead{Impact of the Edge Density $\rho$.}
In \Cref{fig:simulation_d_and_rho}, we see that when $\rho=1$ (a very sparse graph where
nodes have one parent on average), all methods perform similarly well. The
advantage of XGES over GES is visible as soon as $\rho=2$ and widens as $\rho$ increases; see also
\Cref{fig:simulation_main}.

\parhead{Robustness to the Sample Size $n$.}
In \Cref{fig:simulation_n_and_alpha}~(left), we vary the number of samples $n$ and fix
$d=50$ and $\rho=3$. XGES's performance improves with $n$, consistent with
\Cref{th:local_consistency}. In contrast, GES and its variants are hurt when $n$ increases
beyond $10^4$. This does not contradict GES's correctness in the limit of
infinite data. Instead, it reveals that the finite-sample behavior of GES is nontrivial and that GES may require very large $n$---beyond what is practical---to perform well.\looseness=-1

In \Cref{fig:simulation_double_descent}, we study sample sizes up to $n=10^8$ on a small graph with $d=15$ and $\rho=2$. We again find that GES worsens around $10^4$ samples, but this time its performance recovers after $10^5$ samples, thereby exhibiting a double-descent behavior. We discuss it in more detail in \Cref{appendix:sec:empirical:double_descent}.


% figure: simulation_n and simulation_alpha
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{fig/simulation_n_and_alpha.pdf}
    \caption{Performance of GES and XGES when varying (left) the number of samples $n$, and
    (right) the regularization strength $\alpha$. Increasing $n$ improves XGES while it hurts GES and its
    variants. Increasing
    $\alpha$ initially improves GES but eventually hurts all methods. The dashed lines indicate the number of edges of the true graph. Error bars over 30 seeds.\looseness=-1}
    \label{fig:simulation_n_and_alpha}
\end{figure}

\parhead{Robustness to the Regularization Strength $\alpha$.}
In \Cref{fig:simulation_n_and_alpha} (right), we vary
$\alpha$ and fix $d=50$, $\rho=3$ and $n=10000$. We find that increasing $\alpha$ helps
GES from $\alpha=1$ to $\alpha=10$ but then hurts it. No value of $\alpha$ enables GES to
catch up to XGES. Echoing \Cref{sec:ges_failure}, we conclude that the solution to GES's
over-inserting is not to change the score function, but to change the search strategy,
as XGES does.


\parhead{Robustness to the Data Simulation.}
We vary the procedure to sample the weights $W_i$ in two ways: changing the scale
and changing the shape of their distribution. We report the results in
\Cref{appendix:sec:empirical:impact_of_simulation} with \Cref{fig:simulation_epsilon,fig:simulation_main_negative}.
We find similar conclusions as in \Cref{fig:simulation_main}.


\parhead{Implementation Speed.}
We measure the runtime of the different methods for a wide range of $d$ and 
$\rho \in \{2,4\}$ in \Cref{fig:simulation_correlation_speed} (right). We find that XGES-0 is up to 30 times
faster than GES, and XGES up to 10 times faster. While fGES's slower runtime may be 
attributed to Java overhead, the other methods are implemented in C++.
Higher densities slow down all methods, with a stronger impact on GES, which is consistent
with GES over-inserting in denser graphs.

We find the same conclusions by reporting the number of calls to the scoring function as another measure
of efficiency in \Cref{appendix:sec:empirical:number_of_calls}. Interestingly, even though XGES repeatedly applies XGES-0, it only makes around one order of magnitude more BIC score evaluations than XGES-0.

\section*{Conclusion and Future Work}
We introduced XGES, an algorithm that significantly improves on GES. With XGES, we can learn larger and denser graphs from data. XGES offers several avenues for future work. One direction is to study its finite-sample guarantees. A second is to study its applicability to more complex model classes beyond linear models. Another is to relax its assumptions, e.g., to handle unfaithful graphs or scores that are not asymptotically locally consistent \citep{schultheiss2023pitfalls}. Finally, its efficient implementation could be used to analyze large real-world datasets.\looseness=-1
