% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: outputs <b><sep><a>, i.e. the two mandatory
% arguments in reverse order joined by the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Reframed GES with a Neural Conditional Dependence Measure}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<xinwei.shen@connect.ust.hk>?Subject=Your UAI 2022 paper}{Xinwei~Shen}{}}
\author[2]{Shengyu~Zhu}
\author[3]{Jiji~Zhang}
\author[2]{Shoubo~Hu}
\author[2]{Zhitang~Chen}
% Add affiliations after the authors
\affil[1]{%
%    Department of Mathematics\\
    Hong Kong University of Science and Technology
%    Hong Kong, China
}
\affil[2]{%
    Huawei Noah's Ark Lab
%    China
}
\affil[3]{%
    Hong Kong Baptist University
%    Hong Kong, China
  }
 
   \usepackage{amsmath, amssymb, amsthm}
\usepackage{mathrsfs, bm}
\usepackage[title]{appendix}
\usepackage{todonotes}
\usepackage{xcolor}
\usepackage{enumitem,booktabs,subfigure}
\usepackage{comment}
\usepackage{multirow}
\usepackage[utf8]{inputenc} 
\usepackage[T1]{fontenc}
\usepackage{algorithm}
\usepackage{algorithmic}

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
\usepackage{xr}
% \addFileDependency{<file>}: announce <file> as a dependency of this document.
%   - \typeout prints "(<file>)" to the terminal/log; presumably this is what
%     lets build tools such as latexmk notice the file -- confirm against the
%     build setup.
%   - \@addtofilelist hands the name to the LaTeX kernel's file-list hook
%     (the one behind \listfiles).
%   - If the file cannot be found, "No file <file>." is printed as well.
% Every line of the body ends in % so that the expansion contains no space
% tokens and the macro is also safe to use outside the preamble.
\makeatletter
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: import the labels of <basename> through
% xr's \externaldocument and register both <basename>.tex and <basename>.aux
% as file dependencies via \addFileDependency.
% Line ends are commented out so the expansion contains no stray spaces.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% put all the external documents here!
\myexternaldocument{shen_325-supp}

% Review annotations.
% Background color used by the inline notes that \xs produces.
\definecolor{columbiablue}{rgb}{0.61, 0.87, 1.0}
% \xs{<note>}: inline todonotes box on the columbiablue background.
\newcommand{\xs}[1]{%
  \todo[inline,color=columbiablue]{#1}%
}

% \revise{<text>}: typesets <text> inside a group via \textcolor{black}.
% The color is black here, so presumably revised text looks like the
% surrounding text -- change the color name to highlight revisions.
\newcommand{\revise}[1]{%
  {\textcolor{black}{#1}}%
}


% ---- Notation shortcuts (intended for math mode) ----
% NOTE(review): all of these use \def, which silently overwrites any existing
% command of the same name.

% Bold digit one (presumably the indicator function -- confirm against usage).
\def\ind{\mathbf{1}}
% Blackboard-bold letters (\mathbb).
\def\bbE{\mathbb{E}}
\def\bbR{\mathbb{R}}
\def\bbP{\mathbb{P}}
% Calligraphic letters (\mathcal); \cG and \cP denote graphs in the text.
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cC{\mathcal{C}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cP{\mathcal{P}}
\def\cX{\mathcal{X}}
\def\cY{\mathcal{Y}}
\def\cZ{\mathcal{Z}}
\def\cN{\mathcal{N}}
\def\cU{\mathcal{U}}
\def\cT{\mathcal{T}}
\def\cH{\mathcal{H}}
% Bold lowercase letters via \bm (bm package).
\def\bx{\bm{x}}
\def\bb{\bm{b}}
\def\ba{\bm{a}}
\def\bc{\bm{c}}
\def\by{\bm{y}}
\def\bz{\bm{z}}
% Upright bold capitals via \mathbf; \bV, \bE, \bD and \bT are used in the
% text for the node set, edge set, data sample and a neighbor subset.
\def\bD{\mathbf{D}}
\def\bE{\mathbf{E}}
\def\bV{\mathbf{V}}
\def\bT{\mathbf{T}}
\def\bH{\mathbf{H}}
\def\bZ{\mathbf{Z}}
% Bold capitals via \bm.
\def\bA{\bm{A}}
\def\bW{\bm{W}}
\def\bX{\bm{X}}
\def\bY{\bm{Y}}
% Tilde-accented letters.
\def\tf{\tilde{f}}
\def\tg{\tilde{g}}
% NOTE(review): \th is normally the text command for the letter thorn (T1
% encoding); this \def overrides it -- confirm that no text or bibliography
% entry relies on the original meaning.
\def\th{\tilde{h}}
% \def\regf{h}
% Conditional-independence symbol: two \perp glyphs pulled together with
% negative thin spaces (used in the text as X\indpt Y\mid Z).
\newcommand{\indpt}{\perp\!\!\!\perp}
% argmin operator; the starred declaration places limits underneath in
% display style.
\DeclareMathOperator*{\argmin}{argmin}
% \pa: parent set, used in the text as \pa^\cG_i.
% \nd: presumably the non-descendant set -- confirm against usage.
\def\pa{\mathbf{Pa}}
\def\nd{\mathbf{Nd}}

% Arrow with a "p" on top (presumably convergence in probability).
\def\pto{\overset{p}{\to}}

% Theorem-like environments (amsthm).
% "plain" style: proposition, lemma and corollary share the theorem counter,
% so theorem must be declared first.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}%[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% assumption and definition each get their own independent counter.
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}

% "remark" style: the starred declarations make these environments unnumbered.
\theoremstyle{remark}
\newtheorem*{example}{Example}
\newtheorem*{remark}{Remark}



\begin{document}
\maketitle

\begin{abstract}
In a nonparametric setting, the causal structure is often identifiable only up to Markov equivalence, and for the purpose of causal inference, it is useful to learn a graphical representation of the Markov equivalence class (MEC).  In this paper, we revisit the Greedy Equivalence Search (GES) algorithm, which is widely cited as a score-based algorithm for learning the MEC of the underlying causal structure. We observe that in order to make the GES algorithm consistent in a nonparametric setting, it is not necessary to design a scoring metric that evaluates graphs. Instead, it suffices to plug in a consistent estimator of a measure of conditional dependence to guide the search. We therefore present a reframing of the GES algorithm, which is more flexible than the standard score-based version and readily lends itself to the nonparametric setting with a general measure of conditional dependence. In addition, we propose a neural conditional dependence (NCD) measure, which utilizes the expressive power of deep neural networks to characterize conditional independence in a nonparametric manner. We establish the optimality of the reframed GES algorithm under standard assumptions and the consistency of using our NCD estimator to decide conditional independence. Together these results justify the proposed approach. Experimental results demonstrate the effectiveness of our method in causal discovery, as well as the advantages of using our NCD measure over kernel-based measures. % in terms of computational complexity 
\end{abstract}




\section{Introduction}

%background on causal discovery; 
 Causal structure learning is a fundamental problem in various disciplines of science, and flexible solutions to this problem have potentially wide applications
\citep{pearl2009causality, koller2009probabilistic, peters2017elements}, e.g., inferring causal relationships among phenotypes \citep{neto2010causal,zhang2015estimation}, and finding causes in earth system sciences \citep{runge2019inferring} and telecommunication networks \citep{ng2019masked}. In many scenarios, it is expensive or even impossible to perform interventions or randomized experiments in order to discover the causal relationships. This limitation inspires the need to infer or at least systematically produce plausible hypotheses of causal structures from purely observational data, which is often known as causal discovery. General assumptions relating the data distribution to the unknown causal structure have been leveraged to make causal discovery feasible, including the well-known Markov condition and faithfulness assumption~\citep{spirtes2000causation}.


%different categories and their pro/cons --> to motivate our CGES; advantage of our method
Suppose the unknown causal structure can be properly represented by a directed acyclic graph (DAG) over the observed variables. The last one and a half decades have seen a host of results on the identifiability of the causal DAG from the observational data distribution, under various parametric or semi-parametric assumptions~\citep{shimizu2006lingam, Hoyer2009anm, zhang2012pnl,buhlmann2014cam, peters2014equal}. However, in a nonparametric setting, from the observational distribution, the causal structure is known to be identifiable only up to Markov equivalence. Despite this limitation, it remains a worthy task to learn a graphical representation of the Markov equivalence class (MEC), known as a completed partially directed acyclic graph (CPDAG), for a CPDAG usually reveals some valuable causal information and can be used to guide experimental studies. 

Existing methods for causal discovery targeting the CPDAG are roughly categorized into constraint-based and score-based methods. 
The former uses statistical tests to find conditional (in)dependence relationships in the data and uses them as constraints to recover the CPDAG that satisfies them. The PC algorithm~\citep{spirtes2000causation} is a well-known exemplar of this approach. 
% Of course, we ourselves know that we started with a score then used GES as a searching scheme, but I think we need a better logic.
%
The latter formulates the task as an optimization problem by assigning a score to each candidate graph and searching for the one with the optimal score. Regarding the search and optimization strategy, many algorithms solve a combinatorial optimization problem by performing a greedy search; on the other hand, starting from \citet{zheng2018dags}, much recent work tackles the problem through a continuous optimization~\citep{yu2019dag,lachapelle2019gradient}. While continuous optimization has advantages in scalability, global convergence is hard to guarantee by using gradient-based algorithms without implausible assumptions such as strong convexity, especially when the model involves neural networks. In contrast, some search algorithms can be shown to achieve global optimality in the  large sample limit even with a relatively sparse search space. One of the best-known procedures of this kind is Greedy Equivalence Search (GES)~\citep{chickering2002optimal}. 

% In terms of scoring, most score functions are born out of restrictive parametric assumptions on the data distribution. Classical examples include the BIC~\citep{schwarz1978estimating} and the BDeu~\citep{geiger1994learning} score. However, when the parametric model is misspecified, which is very common in real data, the optimality of such a score is not guaranteed to reflect the ground truth even in the large sample limit. %Recently Huang et al. (\citeyear{huang2018generalized}) proposed a generalized score function based on kernel regression, which can be used in GES, but despite its other merits, it is not computationally feasible to run such kernel-based methods on big sample sizes.  
\revise{The standard score-based GES algorithm requires a scoring criterion to evaluate each candidate graph. Classical examples include the BIC~\citep{schwarz1978estimating} and the BDeu scores~\citep{geiger1994learning}. However, most score functions are born out of restrictive parametric assumptions on the data distribution which rarely hold for real-world data. When the parametric model is misspecified, which is very common in real data, the optimality of the standard GES with such a score is not guaranteed to reflect the ground truth even in the large sample limit.}

In this paper, we explore a simple strategy to produce a nonparametric GES. We observe that in order to make GES consistent in a nonparametric setting, it is not necessary to design a scoring metric that evaluates graphs as a whole. Instead, it suffices to define a certain criterion to guide the search at each step of the procedure. The approach we consider in this work is to plug in a consistent estimator of a measure of conditional dependence to provide such guidance. The result is a reframed GES algorithm that is more flexible than the standard score-based version and readily lends itself to the nonparametric setting with a general measure of conditional dependence. This avoids potential model misspecification that commonly occurs in score-based methods. On the other hand, although the reframed GES becomes essentially constraint-based, it retains desirable features of the search strategy of GES and performs significantly better in our experiments than paradigmatic constraint-based methods such as PC.

In addition, we propose a measure of conditional dependence based on a characterization of conditional independence from \citet{daudin1980partial} and a novel neural conditional dependence (NCD) estimator which utilizes the expressive power of deep neural networks. 
Many existing nonparametric measures of conditional dependence are based on kernel methods and leverage characterizations in a Reproducing Kernel Hilbert Space (RKHS), e.g., \citet{gretton2005measuring}. However, kernel methods suffer from high computational complexity, which prevents their efficient application to large-scale problems. In contrast, our neural-network-based approach can benefit from a large sample size without a severe compromise in computational time.

% In contrast to existing kernel-based nonparametric measures of conditional dependence that leverage certain characterizations in Reproducing Kernel Hilbert Space (RKHS), e.g., \citet{gretton2005measuring}, our neural network-based estimator enjoys better  

% Most existing nonparametric measures of conditional dependence are based on kernel methods and leverage certain characterization in Reproducing Kernel Hilbert Space (RKHS), e.g., \citet{gretton2005measuring}. However, kernel 

% To overcome the parametric limitation of score-based methods, we consider a scoring approach motivated by the constraint-based methods. According to the Markov condition, each node in the ground-truth causal structure is conditionally independent from its non-descendants given its parental nodes. Therefore, we consider defining a score to characterize such conditional independence. 
% However, traditional GES requires the score to be decomposable, that is, each piece only depends on one node and its parents. Hence, we modify the standard GES procedure to incorporate such conditional independence scores, and call the new algorithm Constraint GES (CGES). 
% For the scoring, traditional statistical tests adopted in the constraint-based methods cannot be fitted into the CGES framework. To this end, we propose a new statistic to measure the conditional independence. 
%Traditional GES can suffer from two limitations. First, it requires the score to be decomposable where each piece only depends on one node and its parents. Second, such score is usually proposed based on an assumed parametric model for the generative process of the node given its parents. 

We highlight our main contributions as follows:
\begin{itemize}
	\item We present a reframing of the GES algorithm that can flexibly incorporate a consistent estimator of a general conditional dependence measure.
	\item We propose a neural conditional dependence (NCD) measure, which utilizes the expressive power of deep neural networks.
	\item We provide theoretical guarantees on the correctness of the reframed GES algorithm and the consistency of the NCD estimator to measure conditional dependence under mild conditions, and demonstrate the empirical advantages of the resulting method in causal discovery. 
\end{itemize}



%=======================================================
\section{Background and Related Work}
\subsection{Preliminaries and Notations}
% notation: dag, pa
% markov, faithful, equivalence class
Let $\cG=(\bV,\bE)$ be a directed acyclic graph (DAG) consisting of nodes $\bV=(X_1,\dots,X_d)$, each of which is a (possibly multi-dimensional) random variable, and directed edges $\bE$ that connect pairs of nodes. Let $\pa^\cG_i$ be the set of parents of node $X_i$. We denote the joint distribution of $\bV$ by $P_\bV$. A basic problem of causal discovery aims at inferring the unknown causal DAG $\cG$ from an independent and identically distributed (i.i.d.) sample from $P_\bV$. In general, we need assumptions relating the DAG $\cG$ and the distribution $P_\bV$ to make the task possible. A principle adopted by all causal discovery methods is the \emph{causal Markov condition}: $P_\bV$ is Markovian with respect to DAG $\cG$, in the sense that every conditional independence statement entailed by $\cG$ according to the standard Markov property of DAGs is true of $P_\bV$. We also assume the commonly adopted \emph{faithfulness assumption}: $P_\bV$ is faithful with respect to DAG $\cG$, in the sense that every conditional independence statement true of $P_\bV$ is entailed by $\cG$.
%, under which, we have the local Markov property, i.e., each variable is independent of its non-descendants given its parents. 
%We say $P_\bV$ is Markovian with respect to DAG $\cG$ if $P_\bV$ satisfies the local Markov property with respect to the DAG $\cG$, i.e., each variable is independent of its non-descendants given its parents. This assumption is known as the causal Markov condition. 
If two DAGs $\cG_1$ and $\cG_2$ entail the same set of conditional independence statements, they are said to be \emph{Markov equivalent}. The set of all DAGs that are Markov equivalent to a DAG $\cG$ is called the \emph{Markov equivalence class} (MEC) of $\cG$, which can be represented by a completed partially directed acyclic graph (CPDAG). 
% 
For random variables $X,Y$ and $Z$, we write $X\indpt Y\mid Z$ to mean that $X$ and $Y$ are conditionally independent given $Z$. 

\subsection{Related Work on Causal Discovery}

% Different scoring functions have been proposed to extend the applicability of the classical GES algorithm. One of the most prominent scores is the kernel generalized variance (KGV)~\citep{bach2002learning}, which allows one to handle nonlinear causal relations and variables with different types and dimensions. Recently,

The point that GES can be recast in the spirit of a constraint-based method has been noted in the literature, most recently by \citet{chickering2020statistically} and most explicitly by \citet{nandy2018}. To our knowledge, however, the idea of running GES without a global scoring metric has not been sufficiently explored. In Nandy et al.'s insightful discussion, for example, they emphatically show how a consistent global score can be constructed from local conditional dependence scores in multivariate Gaussian and nonparanormal settings, and stop short of considering the option of dispensing with global scoring altogether. As we aim to demonstrate in this paper, a reframed GES without global scoring is especially flexible and useful in a nonparametric context.  

%others.
In our experiments, we use a number of state-of-the-art causal discovery algorithms for comparison, in addition to the aforementioned PC and standard GES. %\footnote{Since the main purpose is to make GES more applicable in nonparametric settings, in our comparisons we focus mainly on methods that are designed to handle data for continuous variables generated from fairly complex, nonlinear models, and leave out some important methods designed to learn Bayesian networks for discrete variables, such as \cite{bartlett2017integer}.} 
One of them is the CAM algorithm~\citep{buhlmann2014cam}, which decouples the search for the causal ordering from the selection of parents for each variable, by leveraging an additive modeling assumption. 
\citet{huang2018generalized} propose a generalized score function (GSF) and apply it in the GES algorithm. Specifically, they transform the statistical decision about conditional independence to a model selection problem for a regression task in an RKHS, define a score based on the penalized log-likelihood for the kernel regression, and then use the score to guide local moves in GES. This work is closely related to ours in that both works are motivated by the goal to develop a nonparametric score to guide the local moves of GES. However, our approach differs from GSF in at least two notable aspects. First, by highlighting the sufficiency of designing a local score that enables consistent statistical decisions about conditional independence, we propose a simpler and more flexible way to dispense with parametric assumptions in GES. %when characterizing a conditional independence relation, say, $X\indpt Y\mid Z$, we define a score involving all three random variables rather than the indirect way of regression, which in principle leads to higher efficiency. 
Second, the specific score we propose is based on neural networks rather than kernels and hence enjoys better computational efficiency when scaling to large sample sizes. 
Another earlier work sharing a similar spirit is the kernel generalized variance (KGV)~\citep{bach2002learning}, which is also compared in our experiments. 

Other methods follow  \citet{zheng2018dags}, who reformulate the original combinatorial problem into a continuous optimization problem, named NOTEARS, which is solved using the augmented Lagrangian algorithm. Several follow-up works extend NOTEARS to nonlinear causal models, including DAG-GNN~\citep{yu2019dag}, GraN-DAG~\citep{lachapelle2019gradient}, and \citet{ng2019masked}, all of which utilize neural networks to model the nonlinear causal relations. 
In addition, \citet{Zhu2020Causal,ijcai2021-491} adopt policy gradient to search for a DAG with the optimal score.

Since the main purpose of this work is to make GES more applicable in nonparametric settings, in our comparisons we focus mainly on methods that are designed to handle data for continuous variables generated from fairly complex, nonlinear models, and leave out some important methods designed to learn Bayesian networks for discrete variables, such as \cite{bartlett2017integer}.

\subsection{Related Work on Conditional Independence}
% measure: HSIC, FOCI
% tests: KCI, GCM
Conditional independence plays an important role in many statistics and machine learning problems, ranging from graphical models~\citep{koller2009probabilistic} to invariance learning~\citep{arjovsky2019irm}. 
A number of studies were devoted to characterizing conditional independence or developing conditional independence tests. 
\citet{gretton2005measuring} introduce the Hilbert-Schmidt independence criterion (HSIC), which is extended by \citet{fukumizu2007kernel} to cover conditional independence and used for a conditional independence test. Recently, \citet{azadkia2019simple} propose a surprisingly simple nonparametric measure of conditional dependence based on ranking statistics, which we refer to as the Rank Conditional Dependence (RCD) measure and will use later to illustrate the flexibility of our approach. 
%that has a very simple form, can be computed from data quickly and has asymptotic justification. We adopt this measure in our causal discovery algorithm to show the flexibility of the proposed CGES framework in incorporating various conditional independence measure. 
Other works focus on constructing tests of conditional independence by proposing various test statistics, including the kernel conditional independence test~\citep{Zhang2011KernelbasedCI} and the test based on a generalized covariance measure~\citep{shah2020}, among others. %, where the latter work pointed out that the problem of nonparametric conditional independence testing for continuous random variables is essentially unsolvable without imposing unverifiable assumptions.



%=======================================================
\section{Reframing the GES Algorithm}
%\xw{This section is based on Prof Zhang's notes.}
% introduce a general score notation; assume a consistent estimator; introduce our procedure.
\subsection{Standard GES}
The GES algorithm~\citep{chickering2002optimal} searches over the space of MECs of DAGs, which are represented by CPDAGs. The connectivity in the search space is given by the independence-map (IMAP) relation:  graph $\cG$ is an IMAP of graph $\cG'$ if every conditional independence entailed by $\cG$ is entailed by $\cG'$. The standard GES uses a scoring function that assigns a score to every DAG given data, and uses the score of a representative DAG in a MEC as that for the MEC. The search strategy consists of two phases, a phase of forward equivalence search (FES) followed by a phase of backward equivalence search (BES). In FES, the procedure starts with the empty CPDAG (the one with no edges), and moves at each step to a best-scoring CPDAG with one more adjacency (that is an IMAP of the previous CPDAG), until the score cannot be improved by adding more adjacencies. In BES, the procedure starts with the output from FES, and moves at each step to a best-scoring CPDAG with one fewer adjacency (of which the previous CPDAG is an IMAP), until the score cannot be improved by deleting more adjacencies.

% introduce the operations in GES
We enter some details of FES to highlight the observation that motivates the subsequent reframing. The case of BES is analogous. In FES, each step considers possible insert-one-edge operations on the current CPDAG. Following \citet{chickering2002optimal}, for non-adjacent nodes $X_i$ and $X_j$ in a CPDAG $\cP$, and for any subset $\bT$ of the neighbors of $X_j$ (i.e., nodes that are connected to $X_j$ by undirected edges) that are not adjacent to $X_i$, the $\mathit{Insert}(X_i,X_j,\bT)$ operator modifies $\cP$ to obtain $\cP'$ by inserting the directed edge $X_i\to X_j$, and for each $T\in\bT$, directing the previously undirected edge between $T$ and $X_j$ as $T\to X_j$. 
If the validity condition in \citet[Theorem~15]{chickering2002optimal} is met, then there is a representative DAG $\cG$ in the MEC represented by $\cP$ and a representative DAG $\cG'$ in the MEC represented by $\cP'$, such that $\cG'$ is the result of inserting $X_i\to X_j$ in $\cG$ (which implies that $\cP'$ is an IMAP of $\cP$).

In Chickering's (\citeyear{chickering2002optimal}) proof of the asymptotic correctness of GES under the causal Markov and faithfulness assumptions, the crucial condition is that the ``local'' decision between $\cG$ and $\cG'$ mentioned above asymptotically tracks whether a certain conditional independence relation holds. We make this notion precise in the following definition, in which $\bD$ denotes an i.i.d.\ sample with size $n$ from the joint distribution $P_\bV$ of $\bV$.
\begin{definition}[Independence-tracking decision criterion]\label{def:local_cons}
% 	Let $\bD$ be an i.i.d. sample from the joint distribution $p$ of $\bV$. 
	Let $\cG$ and $\cG'$ be two DAGs over $\bV$ that are exactly the same except that $\cG'$ contains an edge $X_i\to X_j$ that does not appear in $\cG$. A decision criterion (based on data $\bD$) to choose between $\cG$ and $\cG'$ (among other options) is independence-tracking if the following two properties hold in the large sample limit:
\begin{enumerate}[label=(\roman*)]%\vspace{-0.1in}
% \setlength{\itemsep}{2pt}
\item If $X_j\indpt X_i\mid \pa^\cG_j$ (according to $P_\bV$), then the decision criterion favors $\cG$ over $\cG'$.
\item Otherwise, the decision criterion favors $\cG'$ over $\cG$.
\end{enumerate}
\end{definition}

In the standard GES algorithm, a scoring function for DAGs is used to make such local decisions. The induced decision criterion is independence-tracking if the scoring function satisfies the so-called local consistency~\citep[Definition~6]{chickering2002optimal}. Indeed, Definition~\ref{def:local_cons} is a straightforward generalization of the notion of local consistency for scoring functions. The generalization serves to highlight a simple but important observation: the crucial condition for the optimality of GES can be implemented by a (locally consistent) score function for DAGs, but does not necessitate such a function.      
%The optimality of the standard version of GES requires the score to satisfy two properties. Computationally, the score function needs to be decomposable, that is, it can be written as a sum of scores, each of which depends only on one variable and its parents in $\cG$, or in other words, it can be expressed as $S(\cG,\bD)=\sum_{i=1}^d s(X_i,\pa^\cG_i)$ for some function $s$. With this property, scores can be calculated and updated in a ``local'' fashion.
%Statistically, the score needs to be locally consistent as defined below~\citep[Definition~6]{chickering2002optimal}, in which the convention that a smaller score indicates a favored DAG is assumed: 
%\begin{definition}\label{def:local_cons}
%	Let $\cG$ be any DAG, and $\cG'$ be the DAG that results from adding the edge $X_i\to X_j$ (that does not appear in $\cG$). A scoring criterion $S(\cG, \bD)$ is locally consistent if the following two conditions hold in the large sample limit:
%\begin{enumerate}[leftmargin=*,label=(\roman*)]%\vspace{-0.1in}
% \setlength{\itemsep}{2pt}
%\item If $X_j\indpt X_i\mid \pa^\cG_j$ (according to $p$), then $S(\cG,\bD)<S(\cG',\bD)$.
%\item Otherwise, $S(\cG,\bD)>S(\cG',\bD)$.
%\end{enumerate}
%\end{definition}
%The local consistency property basically says that for every pair of DAGs $\cG,\cG'$ as given in Definition~\ref{def:local_cons}, the score function should favor $\cG$ over $\cG'$ in the large sample limit if $X_j\indpt X_i\mid \pa^\cG_j$, and favor $\cG'$ over $\cG$ otherwise. 


\subsection{Reframed GES}
We now describe a simple alternative way to implement an independence-tracking decision criterion for GES, by using any consistent measure of conditional dependence, in the following sense:
%For three random variables $(X,Y,Z)$ following a joint distribution $p_{XYZ}$, let $T(X,Y|Z)$ be a statistic given an i.i.d. sample from $p_{XYZ}$ intended to be a measure of the conditional dependence between $X$ and $Y$ given $Z$. 
%In the edge-addition stage of GES, what we apply at each step is the (valid) operator $Insert(X_i, X_j, \bT)$ whose corresponding local score  $T(X_i,X_j|\pa^{\cG(\bT)}_j)$ is the highest, and we do this until all remaining valid insert operators yield a local score lower than some threshold $\tau$. If this local score satisfies the following consistency, then 

% \begin{definition}\label{def:score_cons}
% % 	Let $\cG$ be any DAG and $X_i, X_j$ are two nodes such that the directed edge $X_i\to X_j$ does not appear in $\cG$. 
% % Let $p$ be a joint distribution of $\bV$. 
% Consider a set of statistics indexed by distinct variables $X, Y\in \bV$ and $\mathbf{Z} \subseteq \bV\backslash \{X, Y\}$, $T_{X, Y|\mathbf{Z}}(\bD)$ (intended to measure the conditional dependence between $X$ and $Y$ given $\mathbf{Z}$). $T_{X, Y|\mathbf{Z}}(\bD)$ is said to be $\tau$-consistent with parameter $\tau>0$ if for every $X, Y\in \bV$ and $\mathbf{Z}\subseteq \bV\backslash \{X, Y\}$, 
% 	the following two conditions hold in the large sample limit:
% \begin{enumerate}[leftmargin=*,label=(\roman*)]%\vspace{-0.1in}
% % \setlength{\itemsep}{2pt}
% \item If $X\indpt Y\mid \mathbf{Z}$ (according to $p$), then $T_{X, Y|\mathbf{Z}}(\bD)<\tau$.
% \item Otherwise, $T_{X, Y|\mathbf{Z}}(\bD)>\tau$.
% \end{enumerate}
% \end{definition}

\begin{definition}[$\tau$-consistency]\label{def:score_cons}
Consider a set of statistics $\cT = \{T_n(X, Y|\mathbf{Z})\mid X, Y\in \bV,\mathbf{Z} \subseteq \bV\backslash \{X, Y\}\}$ (intended to measure conditional dependence) depending on the sample $\bD$ with size $n$. $\cT$ is said to be $\tau$-consistent with parameter $\tau>0$ if for every $X, Y\in \bV$ and $\mathbf{Z}\subseteq \bV\backslash \{X, Y\}$, 
the following two conditions hold in the large sample limit:
\begin{enumerate}[label=(\roman*)]%\vspace{-0.1in}
\item If $X\indpt Y\mid \mathbf{Z}$ (according to $P_\bV$), then $T_n(X, Y|\mathbf{Z})<\tau$.
\item Otherwise, $T_n(X, Y|\mathbf{Z})>\tau$.
\end{enumerate}
\end{definition}

%we can give a necessary condition when a score is consistent as in Def 2: population equivalence + sample consistency. based on this, we can easily show some previous CI measure satisfies Def 2.
For our purpose, the following sufficient condition for the $\tau$-consistency in Definition~\ref{def:score_cons} is useful. All proofs are deferred to Appendix~\ref{app:pf}. 

\begin{proposition}\label{prop:suff_cond}
	%For any random variables $(X,Y,Z)$ following a joint distribution $p_{XYZ}$, 
	Suppose for every $X, Y\in \bV$ and $\mathbf{Z}\subseteq \bV\backslash \{X, Y\}$, $T_*(X,Y|\mathbf{Z})\geq0$ is a quantity depending on $P_\bV$ such that
\begin{equation*}
	T_*(X,Y|\mathbf{Z})=0\text{ if and only if }X\indpt Y\mid \mathbf{Z}.
\end{equation*}
Let $\hat{T}_n(X,Y|\mathbf{Z})$ form a set of statistics indexed by $X, Y\in \bV$ and $\mathbf{Z} \subseteq \bV\backslash \{X, Y\}$. 
% a statistic given an i.i.d. sample with size $n$ from $p$.
If $\hat{T}_n(X,Y|\mathbf{Z})\to T_*(X,Y|\mathbf{Z})$ in probability as $n\to\infty$ for every $X, Y\in \bV$ and $\mathbf{Z}\subseteq \bV\backslash \{X, Y\}$, then there exists $\tau>0$ such that $\{\hat{T}_n(X,Y|\mathbf{Z})\}$ is $\tau$-consistent.
\end{proposition}


This proposition provides a way to construct a $\tau$-consistent set of statistics. One first defines a population quantity that takes the boundary value if and only if the conditional independence in question holds. Then one constructs a consistent estimator for this quantity given an i.i.d. sample. The aforementioned measures including HSIC~\citep{fukumizu2007kernel} and RCD~\citep{azadkia2019simple} were both developed along this line. %Since RCD has a very simple form, can be computed from data quickly and is well justified asymptotically, we adopt this measure in our causal discovery algorithm to demonstrate the flexibility of our reframed GES algorithm in incorporating various conditional independence measure. A brief introduction to RCD can be found in the Appendix. 
%that has a very simple form, can be computed from data quickly and has asymptotic justification. We adopt this measure in our causal discovery algorithm to show the flexibility of the proposed CGES framework in incorporating various conditional independence measure. 
%Because in this paper, we avoid involving kernel methods for the sake of computational complexity and 
In the next section, we will propose a new measure of conditional dependence based on a neural network implementation.

It is worth noting the essential difference between our defined $\tau$-consistent statistics and conditional independence tests. We note that the condition in Proposition~\ref{prop:suff_cond} indicates that when $X\indpt Y\mid \mathbf{Z}$, the statistic $\hat{T}_n(X,Y|\mathbf{Z})$ converges to 0 in probability, i.e., $\hat{T}_n=o_p(1)$. By contrast, in a typical conditional independence test, one usually uses a test statistic that, under the null hypothesis of conditional independence, follows an asymptotic null distribution, which is then used to develop a decision rule. This means that when $X\indpt Y\mid \mathbf{Z}$, the test statistic is stochastically bounded, i.e., $O_p(1)$, but not necessarily $o_p(1)$. Therefore, it is in general non-trivial to define a $\tau$-consistent statistic from a conditional independence test. %and we leave it for future exploration. 

With a $\tau$-consistent $\hat{T}(X,Y|\mathbf{Z})$, it is straightforward to implement an independence-tracking decision criterion to be used in GES. Specifically, at each step in FES, to each (valid) operator $Insert(X_i, X_j, \bT)$ we assign $\hat{T}(X_i,X_j|\pa^\cG_j)$ as its ``local score'' (where $\cG$ is the DAG representing the current CPDAG induced by the operator), and apply the operator with the highest local score (indicating conditional dependence), unless all remaining valid insert operators yield a local score lower than the threshold $\tau$. Similarly, at each step in BES, to each (valid) operator $Delete(X_i, X_j, \mathbf{H})$, we assign $\hat{T}(X_i,X_j|\pa^\cG_j)$ as its local score (where $\cG$ is the DAG representing the CPDAG the operator would produce), and apply the operator with the lowest local score (indicating conditional independence), unless all remaining valid delete operators yield a score greater than $\tau$. The update step of FES is summarized in Algorithm \ref{FESupdate}, and the dual update step of BES is given in Appendix~\ref{app:bes}. 

\begin{algorithm}[t]
\small
\caption{The update step in the reframed FES}
\label{FESupdate}
\textbf{Input}: the current CPDAG $\cP$, sample $\bD$, a list of valid insert operators $\mathbf{INS}$, statistics $\hat{T}(X,Y|\mathbf{Z})$, threshold $\tau$ \\
\textbf{Output}: the next CPDAG $\cP'$
\begin{algorithmic}%[1] %[1] enables line numbers
%\STATE Do some action.
\STATE Set $s=0$ and $I=\texttt{NULL}$.
\FOR{$Insert(X_i, X_j, \bT)\in \mathbf{INS}$}
\STATE Let $\cG$ be the representative DAG of $\cP$ corresponding to $Insert(X_i, X_j, \bT)$.
\STATE Evaluate $Score(X_i, X_j, \bT)=\hat{T}(X_i,X_j|\pa^\cG_j)$. 
\IF {$Score(X_i, X_j, \bT)>s$}
\STATE Let $s=Score(X_i, X_j, \bT)$ and $I=Insert(X_i, X_j, \bT)$.
\ENDIF
\ENDFOR
\IF {$s>\tau$}
\STATE Apply operator $I$ to obtain $\cP'$.
\ELSE
\STATE Keep $\cP'=\cP$ (and terminate FES).
\ENDIF
\STATE \textbf{return} $\cP'$
\end{algorithmic}
\end{algorithm}

We call the GES algorithm with these update steps the \emph{reframed GES}. Unlike the standard GES, this reframed GES does not optimize a global score for MECs. However, by using a suitable local score for choosing operators to apply (or to stop), the local decision criterion remains independence-tracking, and as a result the asymptotic optimality of the reframed GES algorithm is still guaranteed, as stated in the following theorem. 
\begin{theorem}\label{thm:opt_cges}
Under the causal Markov and faithfulness assumptions, the reframed GES procedure using a $\tau$-consistent $\hat{T}(X,Y|\mathbf{Z})$ recovers the MEC of the true graph in the large sample limit. 
\end{theorem}



%=======================================================
\section{Neural Conditional Dependence Measure}
In this section, we propose a novel measure of conditional dependence. 
Let $X$, $Y$, and $Z$ be three random variables taking values in $\bbR^{d_X}$, $\bbR^{d_Y}$, and $\bbR^{d_Z}$, respectively, where $d_X$, $d_Y$, and $d_Z$ are the corresponding dimensions. We assume that their joint distribution is absolutely continuous with respect to Lebesgue measure with density $p_*$ defined on $\bbR^{d_X+d_Y+d_Z}$. The conditional independence between $X$ and $Y$ given $Z$ is defined by $p_*(x,y,z)=p_*(x|z)p_*(y|z)p_*(z)$ for all $x,y,z$ with $p_*(z)>0$~\citep{dawid1979conditional}. 

The following lemma from \citet{daudin1980partial} characterizes the conditional independence, which has given rise to several hypothesis testing methods. %, including the neural conditional dependence measure proposed in this paper. 
Let $L^2_Z$, $L^2_{XZ}$, and $L^2_{YZ}$ be the spaces of square integrable functions of $Z$, $(X,Z)$, and $(Y,Z)$, respectively, e.g., $L^2_{XZ}=\{f:\bbR^{d_X+d_Z}\to\bbR\mid\bbE[f(X,Z)^2]<\infty\}$. 
\begin{lemma}[\citet{daudin1980partial}]\label{lem:daudin}
	The random variables $X$ and $Y$ are conditionally independent given $Z$ if and only if
	\begin{equation*}
		\bbE[f(X,Z)g(Y,Z)]=0,
	\end{equation*}
	for all $f\in L^2_{XZ}$ and $g\in L^2_{YZ}$ such that $\bbE[f(X,Z)|Z]=0$ and $\bbE[g(Y,Z)|Z]=0$.
\end{lemma}

At the population level, given a ground truth density $p_*$, we propose the following measure of conditional dependence between $X$ and $Y$ given $Z$:
\begin{equation}\label{eq:score_pop}\small
	S(X,Y|Z)=\sup_{f,g}\rho^2(f(X,Z)-h^*(Z),g(Y,Z)-l^*(Z))
\end{equation}
where $f\in L^2_{XZ}$ and $g\in L^2_{YZ}$ are test functions, $h^*(Z)=\bbE[f(X,Z)|Z]$, $l^*(Z)=\bbE[g(Y,Z)|Z]$, and $\rho(X_1,X_2)=\mathsf{cov}(X_1,X_2)/\sqrt{\mathsf{var}(X_1)\mathsf{var}(X_2)}$ denotes the Pearson correlation coefficient of two random variables $X_1$ and $X_2$. %We note that the two random variables within the correlation coefficient both have zero means. 
The reason for using the correlation coefficient rather than the covariance is that after normalization by the variances, the characteristic lies in $[-1,1]$. This makes the measure well-defined in a bounded range and the computation of its subsequent sample version numerically stable. 

Based on Lemma~\ref{lem:daudin}, we have the following simple theorem which characterizes the property of the measure \eqref{eq:score_pop} and 
establishes an equivalence condition between the measure and conditional independence. 
\begin{theorem}\label{thm:pop_equiv}
	For all $p_*$, we have $S(X,Y|Z)\in[0,1]$ and $S(X,Y|Z)=0$ if and only if $X\indpt Y\mid Z$.
\end{theorem}

Having defined the measure $S(X,Y|Z)$, we now make the computation tractable. 
We use deep neural network classes to parametrize the test functions $f,g$ and the conditional expectations $h,l$ in \eqref{eq:score_pop}. Formally, we write $f_\theta$, $g_\phi$, $h_\omega$, $l_\psi$, where the subscripts denote the parameters of the corresponding neural networks. We then exploit the approximation %of \eqref{eq:score_pop}
\begin{equation}\label{eq:score_pop_nn}
    \sup_{\theta,\phi}\rho^2(f_\theta(X,Z)-h_{\omega^*}(Z),g_\phi(Y,Z)-l_{\psi^*}(Z)),
\end{equation}
where $h_{\omega^*}(z)=h^*(z)$ and $l_{\psi^*}(z)=l^*(z)$. 
According to the universal approximation theorem of neural networks~\citep{hornik1989multilayer}, equation~\eqref{eq:score_pop_nn} can approximate the true measure \eqref{eq:score_pop} with arbitrary accuracy by choosing the appropriate network architecture. 
Since here we mainly focus on the statistical property of the estimator proposed below, we ignore the small approximation error (i.e., the gap between \eqref{eq:score_pop_nn} and \eqref{eq:score_pop}) in the analysis for simplicity.

% Having defined the population measure $S(X,Y|Z)$, 
Next, we present a consistent estimator of $S(X,Y|Z)$.  
Let $\bD=\{(x_i,y_i,z_i),i=1,\dots,n\}$ be the collection of i.i.d. copies of $(X,Y,Z)\sim p_*$. Our estimator of $S(X,Y|Z)$ is given by
% \begin{equation}\label{eq:score_sam}\small
% \begin{split}
% \hat{S}_n(X,Y|Z)&=\sup_{\theta,\phi}\hat{\rho}^2\big(f_\theta(X,Z)-h_{\hat\omega}(Z),g_\phi(Y,Z)- l_{\hat\psi}(Z)\big)\\
% &:=\sup_{\theta,\phi}\frac{\hat{\mathbb{E}}^2[(f_\theta(X,Z)-h_{\hat\omega}(Z))\cdot(g_\phi(Y,Z)-l_{\hat\psi}(Z))]}{\hat{\mathbb{E}}[f_\theta(X,Z)-h_{\hat\omega}(Z)]^2\cdot\hat{\mathbb{E}}[g_\phi(Y,Z)-l_{\hat\psi}(Z)]^2}
% \end{split}
% \end{equation}
% 
\begin{equation}\label{eq:score_sam}\small
	\hat{S}_n(X,Y|Z)=\sup_{\theta,\phi}\hat{\rho}^2\big(f_\theta(X,Z)-h_{\hat\omega}(Z),g_\phi(Y,Z)- l_{\hat\psi}(Z)\big),
\end{equation}
where $\hat{\rho}$ is the sample correlation coefficient based on data $\bD$, and
\begin{equation}\label{eq:reg_est}\small
\begin{split}
	\hat \omega=\argmin_{\omega}\frac{1}{n}\sum_{i=1}^n (f_\theta(x_i,z_i)-h_\omega(z_i))^2,\\
	\hat \psi=\argmin_{\psi}\frac{1}{n}\sum_{i=1}^n (g_\phi(y_i,z_i)-l_\psi(z_i))^2,
\end{split}
\end{equation}
are the estimators of $\omega^*$ and $\psi^*$.% defined below \eqref{eq:score_pop}. 

% \begin{remark}
% 	We use the correlation coefficient rather than the covariance, that is, the statistic is normalized using the variances. Therefore, our estimator $\hat{S}_n(X,Y|Z)$ always lies in $[0,1]$, which makes the computation numerically stable.
% \end{remark}
\begin{remark}\label{rmk:reg_est}
	The estimators in \eqref{eq:reg_est} based on regression come from the fact that $\bbE[f(X,Z)|Z]=\argmin_{h}\bbE[(f(X,Z)-h(Z))^2]$, which is proved in Appendix~\ref{app:pf}.
\end{remark}

% We use deep neural network classes to parametrize the test functions $f,g$ and the regressors $h,l$. Formally, we write $f_\theta$, $g_\phi$, $h_\omega$, $l_\psi$, where the subscripts denote the parameters of the corresponding neural networks. We exploit the approximation %of \eqref{eq:score_pop}
% \begin{equation}\label{eq:score_pop_nn}
%     \sup_{\theta,\phi}\rho^2(f_\theta(X,Z)-h_{\omega^*}(Z),g_\phi(Y,Z)-l_{\psi^*}(Z)),
% \end{equation}
% where $h_{\omega^*}(z)=h^*(z)$ and $l_{\psi^*}(z)=l^*(z)$. 
% According to the the universal approximation theorem of neural networks~\citep{hornik1989multilayer}, equation~\eqref{eq:score_pop_nn} can approximate the true measure \eqref{eq:score_pop} with arbitrary accuracy. 
% Since we mainly focus on the statistical property of the proposed estimator, we ignore the small approximation error, i.e., the gap between \eqref{eq:score_pop_nn} and \eqref{eq:score_pop}, in the analysis for simplicity. %and regard \eqref{eq:score_pop_nn} as the measure of interest. % sample version也要parametrize

% , which can be justified by the universal approximation theorem of neural networks up to some small approximation errors which are ignored in analysis for simplicity.
We call the proposed estimator \eqref{eq:score_sam} the \emph{neural conditional dependence (NCD)} estimator and its population version \eqref{eq:score_pop_nn} the NCD measure. Since \eqref{eq:score_sam} solves a bilevel optimization problem involving regression problems \eqref{eq:reg_est}, we adopt an alternating gradient descent scheme to obtain the NCD estimator. The procedure is summarized in Algorithm~\ref{alg}, where we alternately update the test functions $f_\theta,g_\phi$ and nonlinear regressors $h_\omega,l_\psi$ for $T_t$ and $T_r$ steps, respectively.

\begin{algorithm}[t]
\small
\caption{Computing the NCD score}
\label{alg}
\textbf{Input}: sample $\bD$, horizon $T_t,T_r$, initial $\theta,\phi,\omega,\psi$\\
%\textbf{Parameter}: Optional list of parameters\\
\textbf{Output}: NCD score
\begin{algorithmic}[1] %[1] enables line numbers
\FOR{$t_t=1,2,\dots, T_t$}
%\STATE Do some action.
\FOR{$t_r=1,2,\dots, T_r$}
\STATE Update $\omega$ by descending $\sum_i\nabla_\omega(f_\theta(x_i,z_i)-h_\omega(z_i))^2$\\
\STATE Update $\psi$ by descending $\sum_i\nabla_\psi(g_\phi(y_i,z_i)-l_\psi(z_i))^2$
\ENDFOR
\STATE Update $\theta,\phi$ by ascending the gradient of \\\ \ \ \ $\hat{\rho}^2(f_\theta(X,Z)-h_\omega(Z),g_\phi(Y,Z)-l_\psi(Z))$
%\STATE Update $g$ by descending the gradient of $\hat{\rho}^2(f-h,g-l)$
\ENDFOR
\STATE Compute $\hat{s}=\hat{\rho}^2(f_\theta(X,Z)-h_\omega(Z),g_\phi(Y,Z)-l_\psi(Z))$ 
\STATE \textbf{return} $\hat{s}$
\end{algorithmic}
% Remark: $\nabla_h$ denotes the gradient with respect to the parameter of $h$. The same holds for $l$.
\end{algorithm}


To study the asymptotic behavior of the proposed estimator, we assume the following regularity conditions, all of which are mild assumptions commonly adopted in the literature. %to guarantee the uniform convergence.
\begin{enumerate}[leftmargin=*,label=\textit{C\arabic*}.]%\vspace{-0.1in}
% \setlength{\itemsep}{2pt}
%\setlength{\parskip}{2pt}
\item The parameter spaces $\Theta$, $\Phi$, $\Omega$, and $\Psi$ of $\theta$, $\phi$, $\omega$, and $\psi$, respectively, are compact.
\item $f_\theta(x,z)$, $g_\phi(y,z)$, $h_\omega(z)$, and $l_\psi(z)$ are continuous with respect to the corresponding parameters and data $x,y,z$.
\item $f_\theta(x,z)$, $g_\phi(y,z)$, $h_\omega(z)$, and $l_\psi(z)$ are dominated square integrable, i.e., there exists a dominating function $F(x,z)$ such that $|f_\theta(x,z)|\leq F(x,z)$ for all $\theta$ and $\bbE[F(X,Z)^2]<\infty$ (and analogously for $g_\phi$, $h_\omega$, and $l_\psi$). 
\item For all $\theta,\phi$, there exist unique $\omega^*(\theta)\in\Omega$ and $\psi^*(\phi)\in\Psi$ such that $h_{\omega^*}(z)=h^*(z)$ and $l_{\psi^*}(z)=l^*(z)$ almost surely, respectively. 
\end{enumerate}
The following theorem establishes the consistency of $\hat{S}_n(X,Y|Z)$ as an estimator of the population measure $S(X,Y|Z)$.
\begin{theorem}\label{thm:nci_cons}
	Under the regularity conditions \textit{C1--C4}, as $n\to\infty$, we have $\hat{S}_n(X,Y|Z)\to S(X,Y|Z)$ in probability.
\end{theorem}

% \begin{remark}
% In implementations, we use neural network classes to represent the test functions $f,g$ and the regressors $h,l$, which can be justified by the universal approximation theorem of neural networks up to some small approximation errors which are ignored in analysis for simplicity. We then call the proposed estimator \eqref{eq:score_sam} neural conditional dependence (NCD) score. Since \eqref{eq:score_sam} is a bilevel optimization problem involving regression \eqref{eq:reg_est}, we adopt an alternating gradient descent scheme to obtain the NCD score. The procedure is summarized in Algorithm~\ref{alg}.
% \end{remark}


% how we use S hat 
% Finally, we describe the causal discovery method based on the NCD estimator.
Finally, we apply the NCD measure to causal discovery through the reframing of GES in the previous section. 
We plug the estimator $\hat{S}_n$ into the reframed GES procedure as the ``local score''. 
Based on Theorems~\ref{thm:pop_equiv} and~\ref{thm:nci_cons}, by applying the sufficient condition in Proposition~\ref{prop:suff_cond}, we know that $\hat{S}_n(X,Y|\mathbf{Z})$ satisfies the $\tau$-consistency in Definition~\ref{def:score_cons} with some $\tau>0$. Then Theorem~\ref{thm:opt_cges} implies the asymptotic correctness of our method to recover the true MEC. 
% The following corollary shows that our score satisfies the score consistency in Definition~\ref{def:score_cons}.

% \begin{corollary}
% 	$\hat{S}_n$
% \end{corollary}

In addition, to demonstrate the flexibility of our reframed GES algorithm in incorporating various conditional dependence measures, we will also test a version of our procedure using the RCD measure recently proposed by \citet{azadkia2019simple}, because it is very easy to compute. The RCD estimator can be shown to satisfy the $\tau$-consistency and hence suits our framework well. For completeness, we provide a description of RCD in Appendix~\ref{app:rcd}. 
%Since RCD has a very simple form, can be computed from data quickly and is well justified asymptotically, we adopt this measure in our causal discovery algorithm to demonstrate the flexibility of our reframed GES algorithm in incorporating various conditional independence measure. 



%=======================================================
\section{Experiments}

\begin{table*}[h]
    % \small
    \centering
    \begin{tabular}{lllllllll}
    \toprule
        Setting & \multicolumn{2}{c}{GP (1k)} & \multicolumn{2}{c}{GP (5k)} & \multicolumn{2}{c}{MULT (1k)} & \multicolumn{2}{c}{MULT (5k)}  \\
        \cmidrule(r){2-3} \cmidrule(r){4-5} \cmidrule(r){6-7} \cmidrule(r){8-9}
        Methods & SHD & F1 score & SHD & F1 score & SHD & F1 score & SHD & F1 score\\ 
        \midrule
        NCD   & \textbf{5.6$\pm$2.5} & \textbf{0.63$\pm$0.14} & \textbf{4.2$\pm$2.3} & \textbf{0.71$\pm$0.14} & 6.2$\pm$2.9 & 0.59$\pm$0.08 & 5.6$\pm$2.4 & 0.60$\pm$0.08  \\
        RCD   & 9.0$\pm$0.7 & 0.41$\pm$0.07 & 8.4$\pm$1.1 & 0.53$\pm$0.08 & 7.4$\pm$2.1 & 0.51$\pm$0.09 & \textbf{3.2$\pm$1.3} & \textbf{0.67$\pm$0.07} \\
        PC & 8.8$\pm$1.6 & 0.36$\pm$0.15 & 7.2$\pm$2.4 & 0.50$\pm$0.16 & 7.6$\pm$1.7 & 0.44$\pm$0.15 & 4.6$\pm$1.8 &0.57$\pm$0.13 \\
        BIC & 7.0$\pm$2.8 & 0.49$\pm$0.20 & 6.0$\pm$2.5 & 0.59$\pm$0.17 & 4.2$\pm$2.9 & 0.65$\pm$0.09 & 4.4$\pm$3.4 & {0.62$\pm$0.11} \\
        KGV & 8.5$\pm$1.1 & 0.37$\pm$0.08 & 7.5$\pm$0.5 & 0.51$\pm$0.06 & 9.0$\pm$1.9 & 0.35$\pm$0.14 & 7.2$\pm$0.7 & 0.47$\pm$0.07\\
        CAM & 6.0$\pm$3.5 & 0.50$\pm$0.26 & 7.2$\pm$3.7 & 0.52$\pm$0.22 & 10.8$\pm$1.8 & 0.09$\pm$0.07 & 11.2$\pm$2.3 & 0.13$\pm$0.15 \\
        NOTEARS & 11.4$\pm$0.9 & 0.06$\pm$0.08 & 11.6$\pm$0.9 &0.06$\pm$0.08 & 24.8$\pm$3.8 &0.36$\pm$0.07 & 23.6$\pm$4.7 &0.37$\pm$0.07 \\
        DAG-GNN & 11.0$\pm$1.7 &0.00$\pm$0.00 & 11.4$\pm$1.8 &0.03$\pm$0.07 & 16.4$\pm$2.6 &0.37$\pm$0.13 & 13.6$\pm$3.4 &0.40$\pm$0.10\\
        GraN-DAG & 10.6$\pm$1.1 &0.05$\pm$0.06 & 12.2$\pm$1.8 & 0.12$\pm$0.04 & 8.6$\pm$2.6 &0.54$\pm$0.12 & 10.2$\pm$1.9 &0.51$\pm$0.08 \\
        GSF & 6.4$\pm$3.5 & 0.55$\pm$0.19 & \multicolumn{2}{c}{>12h}  & \textbf{3.0$\pm$1.1} & \textbf{0.67$\pm$0.06} & \multicolumn{2}{c}{>12h}  \\
        \bottomrule
    \end{tabular}
    \caption{SHD and F1 score on PNL data sets with 10 nodes, expected degree 2, and sample sizes 1000 and 5000.}\label{tab:pnl2_shd_f1}
% \vskip -0.02in
\end{table*}

\begin{table*}[h]
    % \small
    \centering
    \begin{tabular}{lllllllll}
    \toprule
        Setting & \multicolumn{2}{c}{GP (1k)} & \multicolumn{2}{c}{GP (5k)} & \multicolumn{2}{c}{MULT (1k)} & \multicolumn{2}{c}{MULT (5k)}  \\
        \cmidrule(r){2-3} \cmidrule(r){4-5} \cmidrule(r){6-7} \cmidrule(r){8-9}
        Methods & SHD & F1 score & SHD & F1 score & SHD & F1 score & SHD & F1 score\\ 
        \midrule
        NCD   & \textbf{28.4$\pm$3.6} & \textbf{0.55$\pm$0.05} & \textbf{24.6$\pm$3.8} & \textbf{0.58$\pm$0.08} & \textbf{29.2$\pm$4.6} & \textbf{0.52$\pm$0.07} & 29.8$\pm$5.1 & \textbf{0.57$\pm$0.09}  \\
        RCD   & 32.8$\pm$2.2 & 0.39$\pm$0.11 & 32.6$\pm$4.7 & 0.44$\pm$0.10 & 31.4$\pm$4.5 & 0.43$\pm$0.14 & \textbf{27.2$\pm$3.3} & 0.54$\pm$0.06 \\
        PC & 37.6$\pm$1.3 & 0.18$\pm$0.09 & 36.0$\pm$2.7 & 0.26$\pm$0.05 & 36.2$\pm$1.8 &0.23$\pm$0.06 & 34.4$\pm$1.5 & 0.32$\pm$0.08 \\
        BIC & 33.0$\pm$2.1 & 0.45$\pm$0.06 & 30.8$\pm$3.3 & 0.50$\pm$0.09 & 30.8$\pm$5.6 & 0.43$\pm$0.09 & 35.0$\pm$3.9 & 0.39$\pm$0.07 \\
        KGV & 37.8$\pm$0.7 & 0.20$\pm$0.08 & 34.2$\pm$3.9 & 0.33$\pm$0.07 & 37.2$\pm$1.6 & 0.27$\pm$0.02 & 37.4$\pm$2.2 & 0.31$\pm$0.06 \\
        CAM & 33.0$\pm$5.6 & 0.42$\pm$0.13 & 30.6$\pm$3.4 & 0.50$\pm$0.11 & 35.2$\pm$2.8 &0.25$\pm$0.07 & 34.4$\pm$6.5 & 0.31$\pm$0.15 \\
        NOTEARS & 38.8$\pm$1.9 & 0.13$\pm$0.05 & 38.4$\pm$1.8 &0.13$\pm$0.05 & 39.0$\pm$1.6 & 0.33$\pm$0.04 & 39.0$\pm$1.9 & 0.34$\pm$0.07 \\
        DAG-GNN & 39.2$\pm$1.3 &0.03$\pm$0.02 & 39.2$\pm$2.3 &0.05$\pm$0.09 & 37.8$\pm$2.4 &0.26$\pm$0.10 & 39.6$\pm$1.1 &0.25$\pm$0.12 \\
        GraN-DAG & 34.0$\pm$7.9 &0.18$\pm$0.09 & 35.4$\pm$6.9 &0.30$\pm$0.13 & 37.4$\pm$3.2 &0.20$\pm$0.08 & 37.0$\pm$3.5 &0.27$\pm$0.09 \\
        GSF & 34.0$\pm$3.0 & 0.39$\pm$0.05 & \multicolumn{2}{c}{>12h}  & 31.6$\pm$3.2 & 0.38$\pm$0.09 & \multicolumn{2}{c}{>12h} \\
        \bottomrule
    \end{tabular}
    \caption{SHD and F1 score on PNL data sets with 10 nodes, expected degree 8, and sample sizes 1000 and 5000.}\label{tab:pnl8_shd_f1}
% \vskip -0.08in
\end{table*}


% emphasis: CGES performs similarly well for all settings while other methods may perform well for a single setting (where the model could be less mis-specified)

In this section, we compare our proposed method with various existing state-of-the-art causal discovery approaches on both synthetic and pseudo-real data sets.
% 
Baseline methods include score-based methods using BIC~\citep{chickering2002optimal}, KGV~\citep{bach2002learning}, and GSF~\citep{huang2018generalized}; a constraint-based method, PC algorithm~\citep{spirtes2000causation}; a method based on structural causal model, CAM~\citep{buhlmann2014cam}; as well as the emerging methods in the continuous optimization paradigm including NOTEARS~\citep{zheng2018dags}, DAG-GNN~\citep{yu2019dag}, and GraN-DAG~\citep{lachapelle2019gradient}. %All baseline methods were run with the publicly available code from the authors' websites. 
The details of the experimental settings and hyperparameters (including the choice of $\tau$) of baseline methods and ours are given in Appendix~\ref{app:detail}.\footnote{Our code is available at \url{https://github.com/xwshen51/GES-NCD}.}%, including a metric that we propose based on NCD to guide hyperparameter tuning in an unsupervised manner.

The causal discovery performance is evaluated using three metrics: the structural Hamming distance (SHD), the structural interventional distance (SID)~\citep{peters2015structural}, and the F1 score. %Like other score- and constraint-based baseline methods, 
Since our method and many baseline approaches return a CPDAG representing an MEC, both SHD and SID are evaluated between the learned and ground-truth CPDAGs. %, to which the learned CPDAGs/DAGs and ground-truth DAGs belong. 
Then the SHD is the smallest number of edge additions, deletions, and reversals to convert the estimated CPDAG into the true CPDAG. The SID counts the number of pairs $(X_i,X_j)$ such that the interventional distribution $p(x_j|do(X_i=x))$ would be miscalculated if we chose the parent adjustment set from the estimated graph. %A lower SHD and SID indicate a better estimate.
We report the SIDs corresponding to the best and worst DAG in the learned MEC. 
% consists in computing the path between all the pairs of variables, and checks if the causal relationship between the variables is respected.
% Both SHD and SID are computed using the Causal Discovery Toolbox~\citep{kalainathan2020causal}.
The SHD and SID are computed using functions corresponding to CPDAGs in the Causal Discovery Toolbox \citep{kalainathan2020causal}.
The F1 score is defined as the harmonic mean of the precision and the recall. Computing the F1 score involves counting the number of correctly estimated edges. A directed edge in the ground-truth CPDAG is deemed correctly estimated if the learned CPDAG contains exactly the same directed edge, and incorrectly estimated otherwise. An undirected edge in the ground-truth CPDAG is converted to two directed edges in the adjacency matrix. When the learned CPDAG contains exactly the same undirected edge, both converted directed edges are deemed correctly estimated; when it contains only one directed edge or no edge between the two nodes, 1 or 0 edges, respectively, are deemed correctly estimated. %, achieves 1 when both precision and recall are perfect and has the lowest possible value of 0.
In general, a lower SHD or SID and a higher F1 score indicate a better estimate.

% In the evaluation, both SHD and SID are computed using functions corresponding to CPDAGs in the Causal Discovery Toolbox \citep{kalainathan2020causal}. F1 score, which depends on the precision and recall, involves summarizing the number of correctly estimated edges. Directed edges in the ground-truth CPDAG are deemed correctly estimated if the learned CPDAG contains exactly the same directed edge and are deemed incorrectly otherwise. Undirected edges in the ground-truth CPDAG are converted to two directed edges in the adjacency matrix. When the learned CPDAG contains exactly the same undirected edge, both converted directed edges are correctly estimated. One directed edge and no edge in the learned CPDAG are deemed as correctly estimating 1 and 0 edge, respectively.



\subsection{Synthetic Data}



As mentioned in previous sections, when a data set does not satisfy the additive Gaussian noise assumption, many existing methods such as BIC, CAM, NOTEARS, and GraN-DAG suffer from model misspecification and thus may produce misleading results. In contrast, nonparametric methods like GSF and ours in principle will not be affected. Here we consider the well-known post-nonlinear (PNL) causal models~\citep{zhang2012pnl}. A general PNL model expresses each variable $X_i$ as \[X_i=g_{i,2}(g_{i,1}(\pa_i)+N_i),\ i=1,\dots,d,\] where $\pa_i$ contains the direct causes of $X_i$, $N_i$ is the exogenous noise variable, and $g_{i,1}$ and $g_{i,2}$ are nonlinear transformations. 

To synthesize a data set, we first randomly generate a ground-truth DAG $\cG$ following the Erd\H{o}s-R\'enyi (ER) graph model and then generate data following $\cG$ and two types of PNL models that were also considered in \citet{lachapelle2019gradient}. The first one, called \emph{PNL-GP}, samples $g_{i,1}$ independently from a Gaussian process with bandwidth one, takes $g_{i,2}$ as the sigmoid function, and $N_i\sim Laplace(0,b_i)$ with $b_i\sim \cU[0,1]$. All root variables in PNL-GP are sampled from $\cU[-1,1]$. The second one, named \emph{PNL-MULT}, takes $g_{i,1}(x)=\log(sum(x))$ where $sum(x)$ takes the sum of all components of a vector $x$, $g_{i,2}(\cdot)=\exp(\cdot)$, and $N_i\sim|\cN(0,\sigma^2_i)|$ with $\sigma^2_i\sim\cU[0,1]$. All root variables in PNL-MULT are sampled from $\cU[0,2]$. This model is adapted from \citet{zhang2015estimation}.



Tables~\ref{tab:pnl2_shd_f1} and~\ref{tab:pnl8_shd_f1} present the results of SHD and F1 score on sparse and dense graphs with 10 nodes, respectively, where the error bars represent the standard deviations across 5 data sets per setting. The results of SID are basically consistent, which are deferred to Appendix~\ref{app:add_exp} due to the space limit. Additional results on graphs with 20 nodes are also presented in Appendix~\ref{app:add_exp}. We see that in general, the reframed GES algorithm with our own NCD or the adopted RCD (shown in the first two rows of all tables) performs the best across all settings, except in the sparse PNL-MULT data where GSF is the best. The advantages of our methods on the more challenging dense graphs are more significant than those on sparse ones. 
In most cases, NCD outperforms RCD, though RCD produces excellent results on PNL-MULT with a larger sample. From the perspective of implementation, RCD may be favored over NCD in terms of fewer hyperparameters and less computational cost. 
% We also observe that the convergence rate of NCD and RCD vary across different data distributions. Specifically, NCD converges faster on PNL-GP than on PNL-MULT, while RCD converges particularly faster on PNL-MULT.
In addition, our methods exhibit similarly good performances across different ground-truth models, while most other methods tend to perform well on at most one setting, which indicates the robustness of our nonparametric approach against different distributions. 

GSF, as another kernel-based nonparametric score, performs very well on PNL-MULT with a sparse structure, but is less competitive in other settings. 
Note that we only report the results of GSF using 1000 samples, because even for the sparse graph, it takes around 17 hours for a single run with 5000 samples compared to around 19 minutes with 1000 samples. In contrast, our NCD-based method can benefit from a larger sample size while taking similar computational time as with a smaller sample (both within 4 minutes in the sparse case). \revise{In Appendix~\ref{app:add_exp}, we discuss more details regarding the computational time of different methods.} 
KGV leads to inferior performance in all settings. 
The standard GES with the linear-Gaussian BIC score sometimes performs well on PNL-MULT; a possible reason is that when the noise variance $\sigma_i^2\sim\cU[0,1]$ happens to be small, the PNL-MULT model behaves similarly to a linear-Gaussian model, leading to a case with minor misspecification for BIC. 
This may also partly account for the fact that PC performs better on PNL-MULT than on PNL-GP; that is, a PNL-MULT data set can be similar to linear-Gaussian data which would satisfy the model assumption made in the hypothesis testing. 
CAM performs better on PNL-GP than on PNL-MULT and achieves the best SID in one case, as shown in Appendix~\ref{app:add_exp}. 
The continuous optimization methods are inferior on these PNL data sets, which could be explained by their misspecification of the model. %NOTEARS relies on a linear causal model which is far beyond the PNL expression, DAG-GNN 


%\begin{table*}[h]
%\centering
%\subtable[degree=2]{
%\begin{tabular}{lllll}
%\toprule
%\multicolumn{1}{c}{\textbf{Method}} & \multicolumn{1}{c}{\textbf{SHD}} & \multicolumn{1}{c}{\textbf{SID}} & \multicolumn{1}{c}{\textbf{F1 score}} \\\midrule
%\multirow{2}{*}{NCD} &  \\
% &  \\
%\multirow{2}{*}{RCD} &  \\
%&  \\
%\multirow{2}{*}{PC} &  \\
%&  \\
%\multirow{2}{*}{GES(cdt)} &  \\
%&  \\
%%\multirow{2}{*}{CAM} &  \\
%%&  \\
%%\multirow{2}{*}{NOTEARS} &  \\
%% \\
%%\multirow{2}{*}{DAG-GNN} &  \\
%% \\
%%\multirow{2}{*}{GraN-DAG} &  \\
%% \\
%\multirow{2}{*}{GSF} &  \\
% \\
%\bottomrule
%\end{tabular}}
%\subtable[degree=8]{
%\begin{tabular}{lll}
%\toprule
%\multicolumn{1}{c}{\textbf{SHD}} & \multicolumn{1}{c}{\textbf{SID}} & \multicolumn{1}{c}{\textbf{F1 score}} \\\midrule
%\\
% \\
% \\
% \\
% \\
% \\
% \\
% \\
%% \\
%% \\
%% \\
%% \\
%% \\
%% \\
%% \\
%% \\
% \\
% \\
%\bottomrule
%\end{tabular}}
%\caption{Gauss-ANM data with dimension 10. For each method, the values on top and bottom correspond to sample sizes 1000 and 3000 respectively.}
%\end{table*}



%\begin{table*}[h]
%\centering
%\subtable[degree=2]{
%\begin{tabular}{lllll}
%\toprule
%\multicolumn{1}{c}{\textbf{Method}} & \multicolumn{1}{c}{\textbf{SHD}} & \multicolumn{1}{c}{\textbf{SID}} & \multicolumn{1}{c}{\textbf{F1 score}} \\\midrule
%\multirow{2}{*}{NCD} &  \\
% &  \\
%\multirow{2}{*}{RCD} &  \\
%&  \\
%\multirow{2}{*}{PC} &  \\
%&  \\
%\multirow{2}{*}{GES(cdt)} &  \\
%&  \\
%%\multirow{2}{*}{CAM} &  \\
%%&  \\
%%\multirow{2}{*}{NOTEARS} &  \\
%% \\
%%\multirow{2}{*}{DAG-GNN} &  \\
%% \\
%%\multirow{2}{*}{GraN-DAG} &  \\
%% \\
%\multirow{2}{*}{GSF} &  \\
% \\
%\bottomrule
%\end{tabular}}
%\subtable[degree=8]{
%\begin{tabular}{lll}
%\toprule
%\multicolumn{1}{c}{\textbf{SHD}} & \multicolumn{1}{c}{\textbf{SID}} & \multicolumn{1}{c}{\textbf{F1 score}} \\\midrule
%\\
% \\
% \\
% \\
% \\
% \\
% \\
% \\
%% \\
%% \\
%% \\
%% \\
%% \\
%% \\
%% \\
%% \\
% \\
% \\
%\bottomrule
%\end{tabular}}
%\caption{Gauss-ANM data with dimension 20. For each method, the values on top and bottom correspond to sample sizes 1000 and 3000 respectively.}
%\end{table*}



%\begin{table*}[h]
%\centering
%\subtable[GP]{
%\begin{tabular}{lcccc}
%\toprule
%Method & SHD & SID\_l & SID\_u & F1 score \\\midrule
%NCD & \textbf{1.4$\pm$1.9} & \textbf{4.0$\pm$6.5} & \textbf{16.0$\pm$6.1} & \textbf{0.78$\pm$0.13}  \\
%RCD & \textbf{5.4$\pm$6.2} & \textbf{9.0$\pm$14.0} & \textbf{20.4$\pm$12.6} & \textbf{0.60$\pm$0.32} \\
%NOTEARS & 8.6$\pm$4.3 & 22.8$\pm$14.3 & 36.8$\pm$14.6 & 0.37$\pm$0.24 \\
%DAG-GNN & 8.8$\pm$5.5 & 22.0$\pm$15.7 & 40.6$\pm$20.4 & 0.36$\pm$0.18 \\
%GraN-DAG & 8.0$\pm$3.2 & 8.4$\pm$14.5 & 32.2$\pm$21.5 &  0.56$\pm$0.08\\
%CAM & \textbf{2.8$\pm$2.9} & \textbf{4.7$\pm$6.5} & \textbf{19.8$\pm$10.0} & \textbf{0.90$\pm$0.10} \\
%GSF & 4.2$\pm$4.3 &	9.6$\pm$12.1 &	22.0$\pm$14.1&	0.68$\pm$0.15 \\
%GES & 8.6$\pm$5.0 & 13.0$\pm$7.9 & 24.6$\pm$12.3 & 0.53$\pm$0.18 \\
%PC & 6.6$\pm$4.7 & 11.2$\pm$9.7 & 25.8$\pm$12.1 & 0.58$\pm$0.15 \\
%\bottomrule
%\end{tabular}}
%\subtable[MULT]{
%\begin{tabular}{cccc}
%\toprule
%SHD & SID\_l & SID\_u & F1 score \\\midrule
%\textbf{3.8$\pm$1.9} & \textbf{7.0$\pm$5.6} & \textbf{22.4$\pm$8.5} & \textbf{0.69$\pm$0.04} \\
%{6.0$\pm$4.7} & {14.2$\pm$16.8} & {25.6$\pm$13.6} & {0.60$\pm$0.20} \\
%12.6$\pm$2.9 & 20.0$\pm$9.1 & 53.8$\pm$4.8 & 0.19$\pm$0.06
%\\%$\pm$1.8 & 48.2$\pm$3.9 & 48.2$\pm$3.9 & 0.26$\pm$0.10\\ 
%14.2$\pm$6.2 & 31.0$\pm$16.9 & 43.8$\pm$13.5 & 0.28$\pm$0.20 \\
%12.6$\pm$3.0 & 27.6$\pm$3.2& 40.8$\pm$4.5 & 0.21$\pm$0.09 \\
%9.2$\pm$7.3 & 7.0$\pm$10.6 & 37.0$\pm$15.5 & 0.42$\pm$0.18 \\
%6.8$\pm$3.8 &8.6$\pm$11.6 &	23.6$\pm$4.8 &	0.58$\pm$0.10\\
%\textbf{3.4$\pm$2.6} & \textbf{4.8$\pm$7.7} & \textbf{16.6$\pm$3.4} & \textbf{0.72$\pm$0.11} \\
%\textbf{3.8$\pm$0.7} & \textbf{8.4$\pm$2.7} & \textbf{32$\pm$8.7} & \textbf{0.66$\pm$0.05} \\
%\bottomrule
%\end{tabular}}
%\caption{PNL datasets with dimension 10, degree 2. (Grandag data)}
%\end{table*}


% GP	CAM	10$\pm$  23.25$\pm$7.1 &	25.3$\pm$14.5&	62.3$\pm5.5&	0.77$\pm0.14
% DAG-GNN	10	37.6$\pm$0.55& 	72.0$\pm$9.7	&81.60$\pm$6.2&	0.23$\pm0.14
% NOTEARS		38.2$\pm$2.2&	77.4$\pm$8.6&	86.0$\pm$5.3&	0.16$\pm0.0610
% MULT	CAM	10	37.4$\pm$3.0&	76.8$\pm$4.9&	83.4$\pm2.3&	0.21$\pm$0.05


%\begin{table*}[h]
%\centering
%\subtable[GP]{
%\begin{tabular}{lcccc}
%\toprule
%Method & SHD & SID\_l & SID\_u & F1 score \\\midrule
%NCD & \textbf{30.8$\pm$6.3} & \textbf{62.6$\pm$4.5} & \textbf{68.0$\pm$4.0} & \textbf{0.53$\pm$0.09}  \\
%RCD & 35.2$\pm$5.6 & 68.4$\pm$8.8 & 75.2$\pm$4.3 & 0.44$\pm$0.14 \\
%NOTEARS & 38.2$\pm$2.2&77.4$\pm$8.6&86.0$\pm$5.3&0.16$\pm$0.06\\
%DAG-GNN & 37.6$\pm$0.6&72.0$\pm$9.7&81.6$\pm$6.2&0.23$\pm$0.14 \\
%GraN-DAG & 27.0$\pm$5.6&35.2$\pm$13.8&66.8$\pm$5.0&0.69$\pm$0.12 \\
%CAM & 23.3$\pm$7.1 &	25.3$\pm$14.5&	62.3$\pm$5.5&	0.77$\pm$0.14\\
%GSF & 30.6$\pm$7.0 &	52.8$\pm$16.5 &	71.6$\pm$	8.0 &	0.55$\pm$0.16\\
%GES & 35.2$\pm$1.9 & 69.6$\pm$4.5 & 72.2$\pm$2.1 & 0.47$\pm$0.05 \\
%PC & 36.2$\pm$2.6 & 81.4$\pm$2.5 & 83.6$\pm$3.0 & 0.27$\pm$0.08 \\
%\bottomrule
%\end{tabular}}
%\subtable[MULT]{
%\begin{tabular}{cccc}
%\toprule
%SHD & SID\_l & SID\_u & F1 score \\\midrule
%\textbf{29.4$\pm$3.3} & \textbf{54.4$\pm$7.9} & \textbf{69.0$\pm$1.2} & \textbf{0.53$\pm$0.03}  \\
%\textbf{31.4$\pm$5.9} & \textbf{65.2$\pm$10.8} & \textbf{74.0$\pm$11.3} & \textbf{0.40$\pm$0.14} \\
%{36.2$\pm$2.7} & {63.6$\pm$5.4} & {89.8$\pm$0.5} & {0.12$\pm$0.04} \\
%37.4$\pm$1.7 &	72.0$\pm$7.6&86.0$\pm$2.5 &	0.19$\pm$0.08 \\
%37.4$\pm$1.7 & 72.0$\pm$7.6 & 86.0$\pm$2.5 & 0.19$\pm$0.08 \\
%37.4$\pm$3.0&	76.8$\pm$4.9&	83.4$\pm$2.3&	0.21$\pm$0.05 \\
%27.2$\pm$4.8 &	59.6$\pm$11.3 &	74.2$\pm$8.5 & 0.48$\pm$0.10 \\
%33.2$\pm$6.1 & 60.4$\pm$15.1 & 75.0$\pm$10.1 & 0.32$\pm$0.05 \\
%33.2$\pm$1.6 & 63.6$\pm$10.2 & 80.8$\pm$4.8 & 0.30$\pm$0.10 \\
%\bottomrule
%\end{tabular}}
%\caption{PNL datasets with dimension 10, degree 8. (grandag data)}
%\end{table*}

In addition, we evaluate our methods in a multi-dimensional scenario where each node may have more than one dimension. Note that our NCD estimator can be readily applied to the multi-dimensional setup by adjusting the input dimension of the test functions, while the rank-based RCD measure unfortunately cannot be directly applied here. Some of the baseline methods, including CAM, NOTEARS, and GraN-DAG, do not apply to the multi-dimensional case, so we do not compare with them in this setting. 
We use 10 synthetic data sets from \citet{huang2018generalized}, each with 5 nodes and a sample size of 1000. As shown in Table~\ref{tab:multdim}, our approach outperforms the baseline methods in all three metrics. PC, KGV, and GSF are the second-best performing methods in terms of SHD, SID, and F1 score, respectively, though they all give an inferior performance in the other metrics.


\begin{table}%[h]
\centering
% \small
\begin{tabular}{@{}llll@{}}
\toprule
{Method} & \multicolumn{1}{c}{SHD} & \multicolumn{1}{c}{SID} & \multicolumn{1}{c}{F1 score}\\\midrule
NCD & \textbf{2.6$\pm$2.9} & \textbf{[2.0$\pm$3.7, 14.2$\pm$2.8]} & \textbf{0.73$\pm$0.15} \\
PC & 3.0$\pm$1.3 & [7.4$\pm$4.3, 14.9$\pm$3.7] & 0.57$\pm$0.11 \\
KGV & 4.1$\pm$1.6 & [3.7$\pm$3.3, 17.8$\pm$2.8] & 0.58$\pm$0.11 \\
DAG-GNN\hspace{-0.1in} &4.3$\pm$2.3 &	[4.1$\pm$3.9,	15.6$\pm$4.1]&	0.59$\pm$0.27 \\
% GSF\_marg & 2.1$\pm$2.7 & [2.2$\pm$3.4, 15.1$\pm$3.8] & 0.75$\pm$0.14 \\
GSF & 4.7$\pm$3.0 & [4.3$\pm$4.2, 15.6$\pm$3.0] & 0.61$\pm$0.17 \\
BIC & 4.7$\pm$0.9 & [4.8$\pm$3.8, 16.2$\pm$2.7] & 0.57$\pm$0.00 \\
\bottomrule
\end{tabular}
\caption{Results on 10 multi-dimensional data sets.}\label{tab:multdim}
\end{table}


\subsection{Pseudo-real Data}

Although the synthetic data sets from PNL models can expose the model misspecification problem in many existing methods, they differ from the additive noise setup only by the nonlinearity $g_{i,2}$, and hence amount to relatively mild cases of misspecification. In this section, we consider pseudo-real data sampled from the SynTReN generator~\citep{van2006syntren}, for which there is no guarantee at all that any assumed model is correctly specified. We evaluate on the 10 data sets sampled by \citet{lachapelle2019gradient}, each with 20 nodes and a small sample size of 500. %Note that although our proposed conditional dependence measure does not make parametric model assumptions and thus alleviate the misspecification issue, the small sample size makes it a still challenging task.
In addition, we consider a real Bayesian network, the CHILD network (with 20 nodes), and randomly generate 3000 samples following the PNL-GP model introduced in the previous section. 

As shown in Table~\ref{tab:syntren}, on SynTReN, most baseline methods perform poorly, indicating a potentially severe violation of their model assumptions. Our reframed GES with NCD and with RCD obtain the best SHDs. %but not the best SID, while another nonparametric score-based method GSF achieves the best SID. 
The results on SynTReN suggest the potential advantage of nonparametric causal discovery methods in real applications where model misspecification is common and possibly grave. %Moreover, as suggested by experiments on synthetic data, our NCD can benefit from a large sample, whereas kernel methods such as GSF face serious computational challenges when the sample size is big. Therefore, we expect our method to exhibit even more advantages than the kernel-based GSF in large-scale scenarios. 
Moreover, our methods also obtain the best performance on the real graph structure CHILD. Note that on this large data set with 3000 samples,  the kernel-based methods GSF and KGV face serious computational challenges in that they take more than 12 hours for a single run. Therefore, we expect our method to exhibit even more advantages than the kernel-based methods in large-scale scenarios. 
% The results of other metrics are given in Appendix~\ref{app:add_exp}. 


% \begin{table}%[h]
% \centering
% \small
% \begin{tabular}{ccc}
% \toprule
% Method & SHD & SID \\\midrule
% NCD & \textbf{30.0$\pm$5.8} & [147.0$\pm$53.4, 224.3$\pm$56.5] \\
% RCD & \textbf{30.9$\pm$4.8} & [156.6$\pm$52.8, 228.1$\pm$55.3] \\
% GSF & 52.1$\pm$8.9 & \textbf{[67.3$\pm$51.2, 109.4$\pm$43.3]} \\
% % PC(cdt) & 41.5$\pm$3.4 & [137.4$\pm$53.5, 207.5$\pm$55.8] \\
% % GES(cdt) & 66.6$\pm$8.5 & [116.3$\pm$42.2, 132.3$\pm$39.2] \\
% PC & 37.4$\pm$4.1 & [197.2$\pm$63.4, 221.1$\pm$65.5]\\
% % GES(tetrad) & 48.3$\pm$5.8 & [199.9$\pm$54.6, 219.4$\pm$59.5]\\
% BIC & 65.8$\pm$10.8 & [117.9$\pm$46.0, 138.0$\pm$47.5]\\
% KGV & 39.9$\pm$8.0 & [127.2$\pm$47.3, 149.9$\pm$52.1] \\
% CAM & 38.2$\pm$5.3 & [141.5$\pm$49.2,	211.9$\pm$50.3] \\
% NOTEARS & 99.8$\pm$14.4 & [124.2$\pm$82.9,	158.3$\pm$72.9] \\
% DAG-GNN &38.5$\pm$5.1 &	[193.8$\pm$51.9,	203.8$\pm$56.3]\\
% GraN-DAG & 58.7$\pm$10.0	&[188.1$\pm$57.3,	219.7$\pm$62.6] \\
% % GSF(new data) &25.5$\pm$6.7 &[220.7$\pm$78.2, 242.5$\pm$72.2] \\
% \bottomrule
% \end{tabular}
% \caption{SHD and SID on pseudo-real data.}\label{tab:syntren}
% \end{table}

% PNL_10nodes_deg8
%GP	gsf	10	30.6	6.985700	52.8	16.528763	71.6	8.049845
%MULT	gsf	10	27.2	4.816638	59.8	11.189281	70.6	11.781341

\begin{table}%[h]
\centering
% \small
\begin{tabular}{lll}
\toprule
Method & \multicolumn{1}{c}{SynTReN} & \multicolumn{1}{c}{CHILD} \\\midrule
NCD & \textbf{30.0$\pm$5.8} & \textbf{16.8$\pm$3.3} \\
RCD & \textbf{30.9$\pm$4.8} & \textbf{14.0$\pm$5.8} \\
% PC(cdt) & 41.5$\pm$3.4 & [137.4$\pm$53.5, 207.5$\pm$55.8] \\
% GES(cdt) & 66.6$\pm$8.5 & [116.3$\pm$42.2, 132.3$\pm$39.2] \\
PC & 37.4$\pm$4.1 & 22.6$\pm$9.4 \\
% GES(tetrad) & 48.3$\pm$5.8 & [199.9$\pm$54.6, 219.4$\pm$59.5]\\
BIC & 65.8$\pm$10.8 & 32.8$\pm$16.8 \\
% CAM & 38.2$\pm$5.3 & \textbf{7.0$\pm$3.3} \\
NOTEARS & 99.8$\pm$14.4 & 23.6$\pm$1.9 \\
DAG-GNN &38.5$\pm$5.1 &	28.8$\pm$3.2 \\
GraN-DAG & 58.7$\pm$10.0	& {17.2$\pm$2.1} \\
KGV & 39.9$\pm$8.0 & $>$12h \\
GSF & 52.1$\pm$8.9 & $>$12h\\%18.2$\pm$5.7 \\
% GSF(new data) &25.5$\pm$6.7 &[220.7$\pm$78.2, 242.5$\pm$72.2] \\
\bottomrule
\end{tabular}
\caption{SHD on pseudo-real data.}\label{tab:syntren}
\end{table}





%=======================================================
\section{Conclusion}
In this work, we presented a reframed GES algorithm that works with a measure of conditional dependence rather than a scoring metric for graphs. This way, the algorithm is easily applicable in a nonparametric setting with a theoretical guarantee. We also proposed a neural conditional dependence (NCD) measure based on a deep neural network implementation, and established its theoretical properties that make it suitable for the reframed GES. The resulting causal discovery algorithm was shown in our experiments to be superior or competitive in comparison to a number of state-of-the-art methods. It also enjoys a significant advantage over kernel-based nonparametric methods in large-scale settings, since the latter are usually infeasible when the sample size is relatively large. For future work, we plan to explore a continuous optimization formulation of causal discovery based on such nonparametric conditional dependence measures. 

\begin{acknowledgements} % will be removed in pdf for initial submission,
%                         % so you can already fill it to test with the
%                         % ‘accepted’ class option
%    Briefly acknowledge people and organizations here.
%
%    \emph{All} acknowledgements go in this section.
JZ's research was supported in part by the RGC of Hong Kong under GRF13602720 and a start-up fund from HKBU.
The authors thank the anonymous reviewers for their valuable comments and suggestions.
\end{acknowledgements}


\bibliography{uai2022-template}

% \appendix
% NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\end{document}
