% !TEX root = ../main.tex

\section{Empirical Studies}

\subsection{Sample Size and Regularization Strength}
\label{appendix:sec:empirical:sample_size_and_regularization_strength}
We reproduce \Cref{fig:simulation_n_and_alpha} from the main text in
\Cref{fig:simulation_n_and_alpha_copy}, which shows the impact of the sample size $n$
and the regularization strength $\alpha$ on the performance of GES and XGES variants. We
complete it with \Cref{fig:simulation_n_and_alpha_edges} to show the corresponding
number of predicted edges. It indeed reveals that increasing $n$ causes GES and its
variants to over-insert.

\begin{figure}[h]
    \centering
    \begin{subfigure}{0.45\linewidth}
        \centering
        \includegraphics[width=\linewidth]{fig/simulation_n_and_alpha.pdf}
        \caption{Evolution of the performance of GES and XGES variants with the number of
        samples $n$.}
        \label{fig:simulation_n_and_alpha_copy}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.45\linewidth}
        \centering
        \includegraphics[width=\linewidth]{fig/simulation_n_and_alpha_edges.pdf}
        \caption{Evolution of the number of predicted edges with the number of samples $n$.}
        \label{fig:simulation_n_and_alpha_edges}
    \end{subfigure}
    \caption{Evolution of the performance of GES and XGES variants with (left) the
    number of samples $n$, and (right) the regularization strength $\alpha$. Increasing
    $n$ hurts GES and its variants (as it reduces the BIC regularization and
    over-insertion is exacerbated) while it helps XGES. Increasing the BIC
    regularization with $\alpha$ helps GES but without letting it catch up with XGES.
    Missing points indicate the method did not run.}
\end{figure}

\begin{figure}[h]
    \centering
    \begin{subfigure}{0.49\linewidth}
        \centering
        \includegraphics[width=\linewidth]{fig/simulation_main_recall.pdf}
        \caption{Recall of GES and XGES on simulated data. }
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.49\linewidth}
        \centering
        \includegraphics[width=\linewidth]{fig/simulation_main_precision.pdf}
        \caption{Precision of GES and XGES on simulated data.}
    \end{subfigure}
    \begin{subfigure}{0.49\linewidth}
        \centering
        \includegraphics[width=\linewidth]{fig/simulation_main_f1.pdf}
        \caption{F1 of GES and XGES on simulated data.}
    \end{subfigure}
    \caption{Performance of GES and XGES on simulated data measured with precision, recall and F1. We use $n=10000$, $\alpha=2$, and 30 seeds (same data as \cref{fig:simulation_main}). XGES outperforms other methods in the three metrics.}
    \label{fig:simulation_main_precision_recall}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.7\linewidth]{fig/simulation_main_score_diff.pdf}
    \caption{Difference in score between the true graph/MEC $G^*$ and the graph/MEC returned by GES and XGES on simulated data. XGES returns graphs with higher scores than GES.}
    \label{fig:simulation_main_score_diff}
\end{figure}


\subsection{Complementary metrics: precision, recall, F1 score, BIC score}
\label{appendix:sec:empirical:f1_score}
We complement \Cref{fig:simulation_main} from the main text with
\Cref{fig:simulation_main_precision_recall} to show the F1 score of GES and XGES for the same simulated data, as well as the precision/recall breakdown. We also report the BIC score of the graphs returned by GES and XGES in \Cref{fig:simulation_main_score_diff}. 
We reach conclusions similar to those in the main text.

The methods return a MEC $\hat M$ to predict the true MEC $M^*$. The F1 score, precision
and recall are defined for binary classification problems. For each ordered pair of
nodes $(i,j)$ we say $M$ contains $(i,j)$ if $(i,j)$ is directed in $M$ from $i$ to $j$
or if $(i,j)$ is undirected in $M$ (but not if $(j,i)$ is directed in $M$ from $j$ to $i$). Then, the binary classification problem is to predict for each $(i,j)$ if
$M^*$ contains $(i,j)$ or not, predicted by whether $\hat M$ contains $(i,j)$ or not.

\clearpage
\newpage
\subsection{More variables and edge densities}
\label{appendix:sec:empirical:more_variables_and_edge_densities}
We show the performances of the methods on more combinations of $d$ and $\rho$ in
\Cref{fig:simulation_d_and_rho}. We fixed $n=10000, \alpha=2$.
We find similar trends as in \Cref{fig:simulation_main} and \Cref{fig:simulation_correlation_speed}.

For edge density $\rho=1$, we find no significant difference between GES and XGES. This
is expected as the true graph is really sparse and the methods are less likely to
encounter local optima.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.98\linewidth]{fig/simulation_shd_wide_d.pdf}
    \caption{Performance of GES and XGES on more combinations of $d$ and $\rho$. We fixed
    $n=10000, \alpha=2$ and we report error bars and averages over 5 seeds. XGES consistently outperforms GES and its variants. The dashed lines indicate the number of true edges.}
    \label{fig:simulation_d_and_rho}
\end{figure}


\subsection{Number of calls to the scoring function}
\label{appendix:sec:empirical:number_of_calls}
We show the number of calls to the scoring function for the different methods in
\Cref{fig:simulation_number_of_calls}. We find that even though XGES repeatedly applies
XGES-0, it only makes around one order of magnitude more BIC score evaluations. fGES was
not included because we could not extract the number of calls from the Tetrad
implementation. 

\begin{figure}[h]
    \centering
    \includegraphics[width=0.4\linewidth]{fig/n_bic_eval.pdf}
    \caption{Number of calls to the scoring function for the different methods. We fixed
    $n=10000, \alpha=2$, and we report error bars and averages over 5 seeds.}
    \label{fig:simulation_number_of_calls}
\end{figure}

\subsection{The OPS variant}
\label{appendix:sec:empirical:ops}
\citet{chickering2002optimal} evaluated GES against a variant called OPS that also
    considered insertions and deletions simultaneously. But OPS did not prioritize
    deletions over insertions, resulting in limited changes to GES in their experiments.
    We show the performance of OPS in \Cref{fig:simulation_ops}. We
    corroborate the results of \citet{chickering2002optimal} that OPS performs
   similarly to GES.

 Mathematically, deleting an edge can only increase the BIC score by at most $\frac{\alpha}{2} \log n$, which is usually smaller than the increase from inserting an edge.
 Hence even if OPS considers deletions and insertions ``together'', we conjecture that most deletions are only considered at the end of the search because insertions have higher scores and are applied first. OPS then encounters the same local maxima as GES. 
 This highlights the importance of prioritizing deletions over insertions with XGES.\looseness=-1

 \begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{fig/simulation_main_shd_with_ops.pdf}
    \caption{Same experiment as \Cref{fig:simulation_main} but with the addition of the OPS variant. OPS performs very similarly to GES, suggesting that XGES-0's heuristic favoring deletions over insertions is important.}
    \label{fig:simulation_ops}
 \end{figure}


\subsection{The double descent of GES with the sample size}
\label{appendix:sec:empirical:double_descent}
We show the performance of GES and XGES on a small graph with $d=15$ and $\rho=2$ for
sample sizes up to $n=10^8$ in \Cref{fig:simulation_double_descent}. XGES monotonically and quickly improves to an SHD of 0. We find that GES
improves from $n=10^2$ to $n=10^4$ but worsens around $n=10^4$. It eventually improves
again after $10^6$. The error bands are bootstrapped 95\% confidence intervals over 30 
seeds. We chose the graph to be small so that we could observe the second descent of
GES with sample sizes up to $10^8$. However, we doubt that such large sample sizes
are practical. We further believe that the issue worsens with larger graphs. (\textit{Note: We reimplemented GES and GES-r to scale to $n=10^8$; we verified that we obtained the same results as the original implementations for $n$ up to $10^6$. See parameter \texttt{-b} in the XGES code.})

When the sample size increases, the strength of the BIC regularization relative to the likelihood decreases as $\frac{\log(n)}{n}$. We hypothesize that GES worsens because
that decrease might lead to over-inserting, and to encountering local optima.


\begin{figure}[h]
    \centering
    \includegraphics[width=0.7\linewidth]{fig/simulation_double_descent.pdf}
    \caption{Performance of GES and XGES on a small graph with $d=15$ and $\rho=2$ for
    sample sizes up to $n=10^8$. We find that GES worsens around $n=10^4$ but improves
    after $n=10^6$. It exhibits a double descent behavior with the sample size.}
    \label{fig:simulation_double_descent}
\end{figure}


\subsection{The impact of the simulation}
\label{appendix:sec:empirical:impact_of_simulation}

We describe the simulation procedure and show the impact of changing it. 

\subsubsection{The simulation procedure}
\label{appendix:sec:simulated_data}

In the experiments, we use the following procedure to obtain simulated data.

\begin{enumerate}
    \item Select $d$ and $\rho$.
    \item Draw a DAG $G^*$ as follows:
        \begin{itemize}[leftmargin=*]
            \item $G^* \sim \text{Erd\H{o}s--R\'enyi}(\text{\#nodes}=d, p=\frac{2\rho}{d-1})$.
            \item Orient each edge $(i,j)$ from $\min(i,j)$ to $\max(i,j)$, which forms a
            DAG.
            \item Draw a permutation $\sigma$ of $[d]$ and relabel each node $i \mapsto
            \sigma(i)$. 
        \end{itemize}
    \item Choose a distribution $p^*$ as follows for each $i
    \in [d]$:
    \begin{itemize}[leftmargin=*]
        \item Sample weights $W_i$ away from $0$ to ensure faithfulness.
        \begin{enumerate}
            \item In most simulations, we draw $W_i \in [1,3]^{\left|\Pa^i_{G^*}\right|}$ from
            $\mathcal{U}([1,3])$.
            \item In \Cref{fig:simulation_main_negative}, we change the simulation to sample weights $W_i \in
            [-3,-1] \cup [1,3]$, by additionally drawing a Bernoulli variable $b_i \sim
            \mathcal{B}(0.5)$ and setting $W_i \leftarrow -W_i$ if $b_i=1$.
        \end{enumerate}
        
        
        \item $L_1$-normalize $W_i$ as $W_i \leftarrow W_i / \sum_j |W_{ij}|$.
        \item Draw the scale of the Gaussian $\varepsilon_i \sim \mathcal{U}(0, \varepsilon_{\text{max}})$. 
        \begin{enumerate}
            \item By default, in most plots, $\varepsilon_{\text{max}}=0.5$.
            \item We vary it in \Cref{fig:simulation_epsilon}.
        \end{enumerate}
        
        
        \item Define $p^*(x_i \mid x_{\Pa^i_{G^*}}) = \mathcal{N}(W_i^\top
        x_{\Pa^i_{G^*}}, \varepsilon_i^2)$.
    \end{itemize}
    \item Draw $n$ i.i.d. samples from $p^*$.
\end{enumerate}


\subsubsection{Varying the scale of the Gaussian noise}

We show the impact of varying the scale of the Gaussian noise $\varepsilon_{\text{max}}$
in \Cref{fig:simulation_epsilon}. GES and XGES are almost unaffected by
the scale of the noise. This is expected, as the methods explicitly model the noise variable.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.9\linewidth]{fig/simulation_noise.pdf}
    \caption{Performance of GES and XGES on different scales of the Gaussian noise. We
    fixed $n=10000, \alpha=2$.}
    \label{fig:simulation_epsilon}
\end{figure}


\clearpage
\subsubsection{Different sampling of weights}
We find similar conclusions when we change the sampling of weights from $W_i \in [1,3]$
to $W_i \in [-3,-1] \cup [1,3]$. We show the results in
\Cref{fig:simulation_main_negative}.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.5\linewidth]{fig/simulation_main_negative.pdf}
    \caption{Performance of GES and XGES similar to \Cref{fig:simulation_main} but with
    a different sampling of weights $W$. $n=10000, \alpha=2$. }
    \label{fig:simulation_main_negative}
\end{figure}


% The Tetrad implementation, called fGES, considers a single Insert(X,Y,T S) per (X,Y) as
% the best insert over all T (in term of score, not validity). Yet, the best T might lead
% to an invalid insert, and so all X->Y insert (potentially with another T), will be
% ignored. 



\input{tex/9a.appendix.tex}