%\documentclass[dvipsnames]{uai2024} % for initial submission
\documentclass[accepted, dvipsnames]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

% % If you use natbib package, activate the following three lines:
% \usepackage[round]{natbib}
% \renewcommand{\bibname}{References}
% \renewcommand{\bibsection}{\subsubsection*{\bibname}}

% % If you use BibTeX in apalike style, activate the following line:
% \bibliographystyle{apalike}
% \bibliographystyle{icml2023}
% \setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}} %ICML

% Recommended, but optional, packages for figures and better typesetting:
%\usepackage{microtype}
\usepackage{graphicx}
%\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2023} with \usepackage[nohyperref]{icml2023} above.
\usepackage{hyperref}
% Link appearance for hyperref. `Blue` is a dvipsnames color; the `dvipsnames`
% option is passed through the \documentclass line at the top of this file.
\hypersetup{
    colorlinks=true, % color the link text itself (template note: AISTATS keeps links in black)
    linkcolor=Blue, % internal links (sections, figures, tables, equations)
    filecolor=Blue, % links to local files
    urlcolor=Blue, % external URLs
    citecolor=Blue % citation links (this file loads natbib, not biblatex)
    }

% Highlighted text: inline author notes and revision markup.
%   \kyra{text}     -> purple "[Kyra: text]"
%   \weishen{text}  -> red "[Weishen: text]"
%   \rebuttal{text} -> wraps text that was added or changed during revision.
% The extra pair of braces in each body keeps the \color change local to the
% argument.
\newcommand{\kyra}[1]{{\color{purple} [Kyra: {#1}]}}
\newcommand{\weishen}[1]{{\color{red} [Weishen: {#1}]}}
% \rebuttal is currently set to black, so wrapped text is not visually
% distinguished from the surrounding text.
% NOTE(review): presumably Maroon (see trailing comment) was the highlight color
% used during the rebuttal phase -- confirm before changing.
\newcommand{\rebuttal}[1]{{\color{black}{#1}}} %Maroon
  
% Attempt to make hyperref and algorithmic work together better:
% \theHalgorithm is the representation hyperref uses when building link
% targets for the `algorithm` counter; here it is the plain arabic value.
% The `algorithm` package that provides this counter is loaded further down
% in this preamble (\RequirePackage{algorithm}).
% NOTE(review): presumably inherited from the ICML template -- confirm it is
% still needed with the uai2024 class.
\newcommand{\theHalgorithm}{\arabic{algorithm}}


% If accepted, instead use the following line for the camera-ready submission:
% \usepackage[accepted]{icml2023}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% All theorem-like environments below share the single `theorem` counter,
% which is numbered within sections (so a lemma following Theorem 3.1 in the
% same section is Lemma 3.2).

% amsthm `plain` style: theorem, proposition, lemma, corollary.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% amsthm `definition` style: definition, assumption.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% amsthm `remark` style: remark.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% proofsketch: the amsthm proof environment with the heading "Proof sketch"
% instead of "Proof".
\newenvironment{proofsketch} {\begin{proof}[Proof sketch]} {\end{proof}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% BEGIN JM
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{tikz}
\usetikzlibrary {arrows.meta,arrows,patterns,decorations.pathreplacing,chains,positioning,shapes.geometric,calc} 
\usepackage{pifont}% http://ctan.org/pkg/pifont % for checkmarks, etc

% Restyle every \fbox in the document: frame drawn in gray!50 around a white
% background (the arguments of \fcolorbox are frame color, then fill color).
\renewcommand\fbox{\fcolorbox{gray!50}{white}}

%\pagestyle{plain} % page numbers

\usepackage{minitoc} % TOC for appendix only
% Blank out both the part number and the part name.
% NOTE(review): presumably so that a \part used with minitoc for the
% appendix-only table of contents prints no "Part N" label -- confirm against
% the appendix source.
\renewcommand \thepart{}
\renewcommand \partname{}

\usepackage[T1]{fontenc} %  enables bold small caps

% Table marks built from pifont dingbats; the extra braces keep the color
% change local to the symbol.
\newcommand{\cmark}{{\color{gray!30}\ding{51}}} % checkmark, light gray (gray!30)
\newcommand{\xmark}{{\color{Maroon}\ding{55}}} % x-mark, Maroon (dvipsnames color)

\RequirePackage{algorithm}
\RequirePackage[noend]{algorithmic}

% \RequirePackage{algcompatible}
% JM

% Customize lists with enumitem.
% Only itemize is configured globally here (no space between items, no space
% above the list); enumerate is not covered by this \setlist call, so
% enumerate lists in the body pass their options at each use.
\usepackage{enumitem}
\setlist[itemize]{noitemsep, topsep=0pt}

% Custom math commands.
% \ind: two overlapping \perp symbols (the usual notation for statistical
% independence); \nind: the same symbol struck through with \not.
% Both are for use in math mode.
\newcommand{\ind}{\perp\!\!\!\perp} 
\newcommand{\nind}{\not\!\perp\!\!\!\perp} 
% \lambdav: bold capital Lambda; \ensuremath lets it be used in text or math.
% NOTE(review): the spaces just inside the outer braces are part of the macro
% body, so in text mode they typeset as inter-word space on both sides (they
% are ignored in math mode) -- check call sites before tightening.
\newcommand{\lambdav}[0]{ \ensuremath{\boldsymbol{\Lambda}} }
% \g: calligraphic G with subscript XY\z, i.e., the graph over {X, Y, Z}.
% Its body uses \z, which is defined on the next line; this is fine because a
% \newcommand body is expanded at use, not at definition.
\newcommand{\g}{\mathcal{G}_{XY\z}}
% \z: bold Z, the candidate variable set.
\newcommand{\z}{\mathbf{Z}}

%%%% CUSTOMIZING TABLES
\usepackage{colortbl} % for colored table columns
\usepackage{array} % for multirows
\usepackage{adjustbox}  % for adjusting width of tables to text width
\usepackage{multirow} % vertical text in table
\usepackage{booktabs} %cmid rule under multicolumn

\usepackage{arydshln} % dashed and dotted hlines
\usepackage{hhline}   % for double cmidrules
% Table rule geometry:
%   \dashlinedash / \dashlinegap -- dash length and gap length of the
%                                   arydshln dashed rules;
%   \arrayrulewidth              -- thickness of ordinary array/tabular rules
%                                   (e.g., \hline and vertical rules).
\setlength\dashlinedash{0.2pt}
\setlength\dashlinegap{1.5pt}
\setlength\arrayrulewidth{0.3pt}
% \setbox0\hbox{\tabular{@{}l}\textsc{Relative to exposure $Y$}\endtabular} % for multirows

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% END JM
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

\title{Local Discovery by Partitioning: \\ Polynomial-Time Causal Discovery Around Exposure-Outcome Pairs}

% Add authors
\author[1]{Jacqueline Maasch}
\author[2]{Weishen Pan}
\author[3]{Shantanu Gupta}
\author[1]{Volodymyr Kuleshov}
\author[4]{Kyra Gan}
\author[2]{Fei Wang}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science, Cornell Tech, New York, NY
}
\affil[2]{%
    Department of Population Health Sciences, Weill Cornell Medicine, New York, NY
}
\affil[3]{%
    Machine Learning Department, Carnegie Mellon University, Pittsburgh, PA
  }
\affil[4]{%
    Department of Operations Research and Information Engineering, Cornell Tech, New York, NY
  }


  
  
\begin{document}

\maketitle

\begin{abstract}
    Causal discovery is crucial for causal inference in observational studies, as it can enable the identification of \emph{valid adjustment sets} (VAS) for unbiased effect estimation. However,  global causal discovery is notoriously hard in the nonparametric setting, with exponential time and sample complexity in the worst case. To address this, we propose \emph{local discovery by partitioning} (LDP): a local causal discovery method that is tailored for downstream inference tasks without requiring parametric and pretreatment assumptions. LDP is a constraint-based procedure that returns a VAS for an exposure-outcome pair under latent confounding, given sufficient conditions. The total number of independence tests performed is worst-case quadratic with respect to the cardinality of the variable set. Asymptotic theoretical guarantees are numerically validated on synthetic graphs. Adjustment sets from LDP yield less biased and more precise average treatment effect estimates than baseline discovery algorithms, with LDP outperforming on confounder recall, runtime, and test count for VAS discovery. Notably, LDP ran at least $1300\times$ faster than baselines on a benchmark.
\end{abstract}

%We first establish that this partition is unique, exhaustive, and mutually exclusive, and then show that LDP returns a VAS under suitable conditions and is robust under causal insufficiency.  Moreover, LDP requires a quadratic number of independence tests in the worst case. We numerically illustrate that LDP significantly reduces the runtime and sample complexity, producing an unbiased average treatment effect estimate with lower variance.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% INTRO
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Introduction}

Uncertainty surrounding the true causal structure of observational data is a central challenge in causal inference.
Unbiased causal effect estimation requires that certain variable types are omitted from covariate adjustment (e.g., colliders), while others are retained (e.g., confounders)
\citep{schisterman_overadjustment_2009,lu_revisiting_2021,holmberg_collider_2022}. However, the identification of such variables can be challenging when the structural causal model is unknown, as is often true in practice. When domain knowledge is limited, causal discovery offers a powerful solution by automating the identification of critical variables and providing valid adjustment sets (VAS) for downstream inference.

\input{figure_tex/figure_motivation}

\rebuttal{We consider the set of causal discovery methods that do not require parametric assumptions on the data generating process for the identifiability of causal relations,} increasing their reliability in real-world settings (e.g., healthcare). We therefore restrict our attention to constraint-based discovery, avoiding the standard parametric assumptions of functional causal models \citep{shimizu_linear_2006, hoyer_nonlinear_2008,zhang_identiability_2009,rolland_score_2022,montagna_shortcuts_2023} and restrictive assumptions on variable variances~\citep{gao2020polynomial}. Despite asymptotic guarantees on correctness, the practicality of global constraint-based discovery is limited by the high sample and time complexity of running many conditional independence (CI) tests \citep{aliferis2010local,zhang_kernel-based_2011,schluter2014survey,zarebavani_cupc_2020,hagedorn2022gpu,braun_gpucsl_2022}. As shown in Figures \ref{fig:motive} and \ref{fig:motive_times_tests}–\ref{fig:fci_finite}, classic constraint-based methods like PC and FCI \citep{spirtes_causation_2000} can also display finite sample failure modes for VAS discovery, even with reasonably large sample sizes. %Additionally, these methods cannot infer beyond the Markov equivalence class (MEC) and can fail to unambiguously label causal paths that are relevant for downstream inference, \rebuttal{even with infinite data. We provide such an example in Figure \ref{fig:pc_mec_ambiguous}}.

Increasingly, attention is shifting toward local discovery methods that only learn relationships that are \textit{causally relevant} to target variables of interest \citep{gupta_local_2023, cai2023learning, dai2024local}.
However, existing local discovery methods that are tailored for VAS identification generally require that all variables are \textit{pretreatment} \rebuttal{(i.e., non-descendants of the exposure) \citep{entner_data-driven_2013,cheng_local_2022}. This strong graphical assumption heavily simplifies the automated covariate selection problem, but is difficult to reliably verify in real-world data.}

To address the challenges posed by existing global and local methods, 
we propose \emph{local discovery by partitioning} (LDP): a local causal discovery algorithm designed for downstream inference tasks that neither assumes pretreatment nor requires parametric assumptions on the underlying data generating process. We approach this problem through the lens of \textit{causal partitioning}, where variables are systematically subsetted according to their causal relation to an exposure-outcome pair. LDP returns a VAS under the backdoor criterion in worst-case polynomial time. Once a VAS is identified, conditional exchangeability holds, allowing the user to choose their preferred inference method to obtain unbiased effect estimates. In addition to VAS discovery, LDP identifies other variable types that can facilitate inference (e.g., instrumental variables \citep{imbens_instrumental_2014}) or statistical efficiency (e.g., causes of outcome \citep{brookhart_variable_2006}).

\paragraph{Contributions} We introduce a taxonomy of eight exhaustive and mutually exclusive \textit{causal partitions} that are universal properties of any arbitrary dataset with respect to an exposure-outcome pair. We then propose a polynomial-time procedure for leveraging these partitions to \rebuttal{obtain a VAS under the backdoor criterion}. LDP improves on the practicality of causal discovery in the context of downstream inference, owing to the following properties.
\begin{itemize}[noitemsep,topsep=0pt]
    \item \textit{Time efficiency:} LDP only conducts tests that are needed for learning a VAS. The total number of independence tests performed is worst-case quadratic with respect to the total number of variables, versus exponential for common baselines. On a community benchmark, LDP ran $1400\times$ to $2500\times$ faster than PC.
    \item \textit{Sample efficiency:} The majority of CI tests defined in Algorithm \ref{alg:method} use conditioning sets of size one or two, contributing to more favorable sample efficiency relative to experimental baselines.
    \item \textit{Flexibility:} LDP  does not require parametric
    assumptions over the data generating process and does not assume the magnitude of the exposure-outcome effect (which may be null). We replace the pretreatment assumption with a milder, verifiable condition. 
\end{itemize}

\paragraph{Organization}  The remainder of this paper is organized as follows. Section~\ref{sec:prelim} describes preliminaries for causal graphical modeling, and Section~\ref{sec:partitions} describes the universal partitioning taxonomy.
Section~\ref{sec:alg} introduces LDP and establishes that LDP returns a VAS for the true DAG under causal insufficiency, if a specific CI criterion is passed by at least one variable in the observed data. 
Section~\ref{sec:related_work} compares LDP to existing works that do and do not assume pretreatment. In Section~\ref{sec:experiments}, we numerically evaluate LDP and establish that LDP achieves low runtimes and high sample efficiency when compared with existing causal discovery methods. Results demonstrate that VAS from LDP yield less biased and more precise average treatment effect estimates than baselines. Source code is available on GitHub.\footnote{\href{https://github.com/jmaasch/ldp}{https://github.com/jmaasch/ldp}}  


%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PRELIMINARIES
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Preliminaries}
\label{sec:prelim}

%%%%%%%%%%%%%%%%
% NOTATION
%%%%%%%%%%%%%%%%

Univariate random variables are denoted by uppercase letters (e.g., $X$). Sets or multivariate random variables are denoted by bold uppercase letters (e.g., $\z$), and graphs by calligraphic letters (e.g., $\mathcal{G}$). Let $X,Y, \z$ denote continuous or discrete random variables representing an exposure, an outcome, and a variable set of unknown causal structure, respectively. Let $\g$ be the graph induced by $\{X,Y,\z\}$. Sample sizes are denoted by $n$ and large values are abbreviated (e.g., $n = 1000 \to n = 1\mathsf{k}$).

%%%%%%%%%%%%%%%%
% GRAPHS
%%%%%%%%%%%%%%%%

We restrict our attention to the set of causal graphs that are directed and acyclic.\footnote{Refer to \cite{pearl_causal_2009} for an introduction to graphical models.} We assume the common causal Markov condition and faithfulness \citep{spirtes_causation_2000}.\footnote{This ensures that the CI relations entailed by the joint distribution $p(X,Y,\z)$ precisely match those implied by the Markov condition applied to $\g$ (i.e., $p(X,Y,\z)$ and $\g$ are \textit{faithful} to each other).} We define active and inactive paths in $\g$ based on the concept of $d$-separation.\footnote{\rebuttal{Note that these definitions consider both the directed path and its corresponding undirected path, ignoring directionality.}}

\begin{definition} [$D$-separation, \citealt{spirtes_causation_2000}]
    Nodes $V$ and $V'$ in an arbitrary causal DAG $\mathcal{G}$ are $d$-separated given node set $\mathbf{D}$ (where $V, V' \notin \mathbf{D}$) when there is no undirected path between $V$ and $V'$ that is \textit{active} relative to $\mathbf{D}$. \label{def:d-sep}
\end{definition}

\begin{definition}
    [Active paths, \citealt{spirtes_causation_2000}]  \label{def:active}
    An undirected path is considered \textit{active} relative to a node set $\mathbf{D}$ if every node on this path is active relative to $\mathbf{D}$. A node $V \in \mathcal{G}$ is active on a path relative to $\mathbf{D}$ if
    \begin{enumerate}[noitemsep,topsep=0pt]
        \item $V \notin \mathbf{D}$ and $V$ is not a collider on the path,
        \item $V \in \mathbf{D}$ and $V$ is a collider on the path, or
        \item $V \notin \mathbf{D}$, $V$ is a collider on the path, and at least one of its descendants is in $\mathbf{D}$.
    \end{enumerate}
\end{definition}

\input{figure_tex/figure_triples_one_row}

We take an \textit{inactive} path to be one that does not meet Definition \ref{def:active} (e.g., due to existence of a collider $\notin \mathbf{D}$ on that path). As the definitions of active and inactive are with respect to $\mathbf{D}$, we assume $\mathbf{D} = \emptyset$ unless
otherwise stated. We classify active paths between two nodes $\{Z,Z'\}$ following from Table \ref{tab:path_types}: 1) $Z \rightarrow \cdots \rightarrow Z'$, 2) $Z \leftarrow \cdots \leftarrow Z'$, or 3) $Z \leftarrow \cdots Z'' \cdots \rightarrow Z'$, where $Z''$ denotes a third node. 

We say that causal association flows from exposure $X$ to outcome $Y$ through directed paths $X \rightarrow \cdots \rightarrow Y$. Non-causal association between $X$ and $Y$ due to a common cause also presents as statistical dependency, per Reichenbach’s common cause principle \citep{jonas_peters_elements_2017}. Such common causes lie along \textit{backdoor paths} for $\{X,Y\}$ ($X \leftarrow \cdots Z \cdots \rightarrow Y$).

\begin{definition} [Backdoor path, \citealt{pearl_causal_2009}]
    Any non-causal path between exposure $X$ and outcome $Y$ with an edge pointing into $X$ ($\cdots \to X$).
    % \citep{neal_introduction_2020}.
    %when it is 1) not blocked by a non-collider that \textit{is} conditioned on or 2)  not blocked by a collider that is \textit{not} conditioned on \citep{neal_introduction_2020}.
    \label{def:backdoor_path}
\end{definition}

We define valid adjustment with respect to the \textit{backdoor criterion}, which enables \emph{conditional ignorability} or \emph{conditional exchangeability} for causal effect estimation in observational data: i.e., \textit{confounding bias} is eliminated by achieving conditional independence between exposure $X$ and the potential outcomes of $Y$ \citep{vanderweele_definition_2013}.

\begin{definition} [Valid adjustment under the backdoor criterion, \citealt{pearl_causal_2009}]
Let $\mathbf{A}_{XY}$ be an adjustment set for $\{X,Y\}$ that contains neither $X$ nor $Y$. Then $\mathbf{A}_{XY}$ is \textit{valid} if
\begin{enumerate}[noitemsep,topsep=0pt]
    \item $\mathbf{A}_{XY}$ contains no descendants of $X$ and
    \item $\mathbf{A}_{XY}$ blocks all backdoor paths for $X$ and $Y$.
\end{enumerate}
    \label{def:backdoor_criterion}
\end{definition}


\begin{definition}[Confounder, \citealt{vanderweele_definition_2013}] \label{def:confounder}
    A confounder for a variable pair $\{X,Y\}$ is a variable $Z$ for which there exists a variable set $\mathbf{S}$ (which may be empty) such that the effect of  $X$ on $Y$ is unconfounded given $\{Z, \mathbf{S}\}$ but not given any proper subset of $\{Z, \mathbf{S}\}$. 
\end{definition}

\input{figure_tex/figure_ten_node}


\section{Causal Partitions of $\z$}
\label{sec:partitions}

We approach the problem of local discovery for downstream inference through the lens of \textit{causal partitioning}, where variables are systematically subsetted according to their causal relation to the exposure and outcome. We establish an exhaustive taxonomy of eight disjoint partitions in Table~\ref{tab:partitions}. 
These partitions are not assumptions on the true DAG. \rebuttal{Rather, they are \textit{universal properties} of any ground truth DAG with respect to a chosen exposure-outcome pair.} Thus, a true unique partitioning exists for any \rebuttal{directed acyclic} data generating process (though some partitions may be empty). In this work, we argue that these fundamental properties can be conveniently leveraged for efficient algorithm design. 

In the next theorem, we establish that each variable $Z \in \z$  belongs to a single ground truth partition.

%%%%%%%%%%%%%%%%
% PARTITIONS
%%%%%%%%%%%%%%%%

\begin{theorem}
The eight partitions defined in Table~\ref{tab:partitions} are exhaustive and mutually exclusive, such that any variable $Z$ falls uniquely under one partition category.
\label{theorem:partitions}
\end{theorem}

\textit{Intuition.}\; The intuition behind this taxonomy is reflected in the eight triple graphs in Figure~\ref{fig:triples}, where $X$ is assumed to cause $Y$ and all paths are restricted to length one. These triples are exhaustive and mutually exclusive, and arise from simple enumeration of the three possible relations that one variable can take with respect to another: cause, effect, or neither. We generalize this intuition to the setting of arbitrary cardinality and indirect active paths, where the primitive relations of cause, effect, and neither map to the more complex relational combinations enumerated in Tables \ref{tab:path_types} and \ref{tab:path_grid} (e.g., ancestor, non-ancestor, descendant, and non-descendant).

\vspace{-2mm}

\rebuttal{
\begin{proof}
To prove Theorem \ref{theorem:partitions}, we define every type of active path from a candidate $Z \in \z$ to $X$ or $Y$ that can possibly arise in the ground truth graph (Table \ref{tab:path_types}). These can be direct adjacencies or indirect active paths of arbitrary length. Table \ref{tab:path_grid} expresses every possible combination of path types that can coincide for a single $Z$. The mutual exclusivity of partitions follows from the fact that each cell of Table \ref{tab:path_grid} contains a single partition, such that the pattern of allowable active path types from $Z$ to $X$ and $Y$ is unique for each partition. Exhaustivity follows from the fact that every cell in Table \ref{tab:path_grid} that does not violate acyclicity contains a partition, such that all possible combinations are represented.
\end{proof}
}

\begin{table*}[!t]
    \centering
    \begin{adjustbox}{max width=\linewidth}
    \begin{tabular}
    {
    p{0.3cm} p{16.5cm}
    % p{0.3cm} p{8.5cm}
    }
    \toprule
    \multicolumn{2}{c}{\fontfamily{cmr}\textsc{Exhaustive and Mutually Exclusive Causal Partitions}} \\
    \midrule
        $\z_1$ &  \textit{Confounders and their proxies}: Non-descendants of $X$ that lie on an active backdoor path between $X$ and $Y$ (Definition \ref{def:confounder}), and their proxies (Definition \ref{def:proxy}). \\
        $\z_2$ &  \textit{Colliders and their proxies}: Non-ancestors of $\{X,Y\}$ with at least one active path to $X$ not mediated by $Y$ and at least one active path to $Y$ not mediated by $X$.\\
        $\z_3$ &  \textit{Mediators and their proxies}: Descendants of $X$ that are ancestors of $Y$, and their proxies (Definition \ref{def:proxy}). \\
        $\z_4$ &  Non-descendants of $Y$ that are marginally dependent on $Y$ but marginally independent of $X$  (Definition \ref{def:z4}). \\
        $\z_5$ &  \textit{Instruments and their proxies}: Non-descendants of $X$ whose causal effect on $Y$ is fully mediated by $X$, and that share no confounders with $Y$ (Definitions \ref{def:z5} and \ref{def:proxy}). \\
        $\z_6$ &  Descendants of $Y$ where all active paths shared with $X$ are mediated by $Y$. \\
        $\z_7$ &  Descendants of $X$ where all active paths shared with $Y$ are mediated by $X$. \\
        $\z_8$ &  All nodes that share no active paths with either $X$ or $Y$. \\
    \bottomrule
    \end{tabular}
    \end{adjustbox}
    \caption{\rebuttal{Partitions are formally defined by the path combinations enumerated in Table \ref{tab:path_grid}.}}
    \label{tab:partitions}
\end{table*}

\input{tables_tex/table_path_types}

\input{tables_tex/table_path_grid}

%\vspace{3mm}

Some partitions coincide with existing terminology while others do not. $\z_1$ approximately maps to \textit{confounder} \citep{vanderweele_definition_2013}, $\z_2$ to  \textit{collider}, $\z_3$ to \textit{mediator}, $\z_4$ to \textit{pure prognostic variable} \citep{hahn_feature_2022}, and $\z_5$ to \textit{instrumental variable} \citep{lousdal_introduction_2018}. To our knowledge, $\{\z_6, \z_7, \z_8\}$ do not coincide with existing terms in the causal inference literature. Further attention is given to defining  $\z_4$ and $\z_5$ in Appendix \ref{sec:prelim_appendix}, given their role in the identifiability conditions of LDP (Section \ref{sec:identifiability}). Proxy variables are also further defined in Appendix \ref{sec:prelim_appendix}. When referring to multiple partitions collectively, e.g., $\z_5$ and $\z_7$, we use notation of the form $\z_{5,7}$. When referring to a subpartition that is descended from or adjacent to a specific variable, we use notation of the form $\z_{2 \in de(X)}$ and $\z_{2 \in adj(X)}$, respectively.\footnote{\rebuttal{An example of $\z_{2 \notin de(X)}$ and $\z_{2 \notin adj(X)}$ is $\mathbf{M}_3$ in Fig. \ref{fig:m_butterfly}.}} %Note that in $\g$, partitions can be cardinality greater than or equal to zero.


Within a single partition, there can be arbitrarily many active paths among its members (e.g., $Z_1 \to \cdots \to Z'_1$). 
Across partitions, active paths can exist in arbitrary DAGs as long as they comply with acyclicity and the patterns in Table~\ref{tab:path_grid}.

\begin{definition} [Inter-partition active path]
    Any active path that is shared by at least two partitions, is \textit{not} fully mediated by $X$ and/or $Y$, and complies with acyclicity and the combination of path types allowable in Table \ref{tab:path_grid}. \label{def:inter_paths}
\end{definition}

An example of an inter-partition active path that cannot exist is $Z_4 \to \cdots \to Z_5$, as such a path violates the definitions of these partitions (Table \ref{tab:path_grid}, Appendix \ref{sec:prelim_appendix}).
When we assemble all partitions into a single DAG, reduce active paths with $\{X,Y\}$ to length-1 arrows, and abstract away inter-partition active paths, we obtain the projection in Figure \ref{fig:ten_node_dag}. 


\input{figure_tex/schematic_one_row}
\input{algorithms/alg}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% METHODS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \vspace{-10pt}
\section{Local Discovery by Partitioning} \label{sec:alg}

The pseudocode for LDP is expressed in Algorithm \ref{alg:method}. Proofs of correctness are given in Appendix \ref{sec:partition_correctness}. \rebuttal{Given an exposure-outcome pair $\{X,Y\}$ and variable set $\z$, LDP causally partitions $\z$ in service of identifying a VAS under the backdoor criterion. LDP raises a warning if a VAS is not identifiable, as assessed using the $\z_5$ criterion (Definition \ref{def:z5_criterion}; Line 29 of Algorithm \ref{alg:method}). A visual schematic for the learning process of Algorithm \ref{alg:method} is provided in Figure \ref{fig:schematic_one_row}. Note that the correctness of  certain intermediate results identified by LDP requires more stringent identifiability conditions than VAS discovery, as discussed in Remark \ref{remark:vas_foremost}.}

\rebuttal{
\begin{remark}[LDP is foremost a VAS discovery method, not a partition labeling method]
\label{remark:vas_foremost}
    LDP outputs partition labels as intermediate results en route to identifying a VAS. As intermediate results, LDP labels 1) five unique causal partitions ($\z_1$, $\z_4$, $\z_5$, $\z_7$, and $\z_8$), and 2) a superset $\z_{\textsc{Post}}$, which aggregates the remaining three post-treatment partitions ($\z_2$, $\z_3$, and $\z_6$). While the sufficient conditions for guaranteeing correct partition labels for $\z_4$, $\z_7$, and $\z_8$ are very lax (Theorem \ref{theorem:z4_z8_latent}), sufficient conditions for correctly labeling the remaining partitions are significantly stronger than for VAS discovery (Section \ref{sec:identifiability}). For use of predicted partition labels \textit{outside VAS selection}, we urge caution in interpreting the results. The lack of guaranteed partition label correctness for $\z_1$, $\z_2$, $\z_3$, and $\z_5$ in arbitrary DAGs is a key limitation, and establishing such guarantees is an area for future work.
\end{remark}
}

\rebuttal{\textbf{High-Level Overview} Here, we describe the basic logic of Algorithm \ref{alg:method} in plain English. Note that the partition labels assigned by Algorithm \ref{alg:method} assume that the sufficient conditions described in Section \ref{sec:identifiability} are satisfied.
\begin{enumerate}[noitemsep,topsep=0pt,label={\itshape Step \arabic*},leftmargin=\widthof{[Step 7]}+\labelsep]%align= left]
    \item $\z_8$ discovered with knowledge of $\{X,Y,Z\}$ only. 
    \item $\z_4$ discovered with knowledge of $\{X,Y,Z\}$ only. 
    \item  $\z_{7}$ discovered with knowledge of $\{X,Y,Z\}$ only. $\z_5$ will also be discovered if $|\z_1| = 0$. 
    \item A fraction of $\z_{\textsc{Post}}$ is discovered, providing complete knowledge of $\z_6$ and partial knowledge of $\z_2$ and $\z_3$. This step leverages prior knowledge of $\z_4$ that was obtained programmatically at Step 2. 
    \item $\z_{\textsc{Mix}}$ is temporarily aggregated, providing partial knowledge of $\z_1$, $\z_2$, $\z_3$, and $\z_5$. $\z_{\textsc{Mix}}$ is a transient superset that is used to  differentiate $\z_1$ and $\z_5$ from $\z_{\textsc{Post}}$ in Step 6.
    \item Knowledge of $\z_{\textsc{Post}}$ is complete. $\z_{\textsc{Mix}}$ is fully disaggregated, providing final partition labels for some members and moving others to superset $\z_{1,5}$. 
    At this stage, we also finalize our knowledge of $\z_7$. By Line 19, all members of $\z_5$ have been placed in $\z_{1,5}$. By Line 22, members of $\z_1$ that are adjacent to $Y$ have been uniquely identified.
    \item $\z_1$ and $\z_5$ are fully disentangled. This step tests whether a member of superset $\z_{1,5}$ is marginally dependent on known members of $\z_1$. All previously known members of $\z_1$ are adjacent to $Y$. The members of $\z_1$ that remain to be discovered are those with indirect active paths to $Y$. In any arbitrary DAG, no $Z_5 \in \z_5$ will ever be dependent on a $Z_1 \in \z_1$ that is adjacent to $Y$. However, every member of $\z_1$ is marginally dependent on at least one $Z_1 \in \z_1$ that is adjacent to $Y$.
    \item The algorithm concludes by testing the $\z_5$ criterion, which indicates whether a VAS was discovered and raises a warning when failed.
\end{enumerate}
}

\paragraph{Time Complexity} 
We report big-$O$ complexity in terms of the total number of independence tests performed, as is conventional for constraint-based causal discovery \citep{spirtes_causation_2000, tsamardinos_max-min_2006}. The first for-loop (Steps 1--3) requires a linear number of tests, $O(|\z|)$, where Step 1 caches all marginal test results for every candidate relative to $\{X,Y\}$.
Step 4 requires $O(|\z|^2)$ tests. Step 5 requires $O(|\z|)$ tests and Step 6 requires $O(|\z|^2)$ tests. Step 7 requires no tests, as it uses cached test results. Step 8 requires $O(|\z|)$ tests. Thus, the total number of tests performed is in $O(|\z|^2)$. Empirical results with an oracle corroborate the asymptotic analysis (Figure \ref{fig:test_curve}).

\paragraph{Sample Complexity}  The sample complexity of LDP will be dictated by the user-selected independence test. In lieu of a formal complexity analysis, we provide some statistical intuition for the efficiency of LDP. It is generally assumed that lower-order CI relations (i.e., those with smaller conditioning sets) are inferred more reliably than higher-order relations under finite samples \citep{spirtes_causation_2000}. For example, conditional mutual information (CMI) has been shown to require a number of samples that is exponential in the cardinality of the conditioning set \citep{kubkowski2021gain}. For a conditioning set of size 10 and a power of at least 0.5, CMI can require a sample size of approximately $n = 30\mathsf{k}$ \citep{kubkowski2021gain}. Even if all variables in this conditioning set have just three discrete states, it is possible that only a subset of the $3^{10}$ possible states will be instantiated in sample sizes representative of real-world data \citep{spirtes_causation_2000}. Therefore, LDP was designed under the intuition that lower-order CI tests provide more favorable sample complexity. The maximum conditioning set size for Line 12 is $O(|\z_{1,2,3,5}|)$ and for the $\z_5$ criterion is $O(|\z_1|)$. All other conditioning sets have cardinality one or two. Limited empirical comparisons by sample size (Figures \ref{fig:baselines}, \ref{fig:fci_latent_z5a}, \ref{fig:motive_times_tests}, \ref{fig:pc_fci}) and conditioning set size (Appendix \ref{sec:cond_set_size}) are provided.

%%%%%%%%%%%%%%%%
% ADJUSTMENT SETS
%%%%%%%%%%%%%%%%


\label{sec:cov_sel}

\paragraph{LDP for VAS Selection Under Multiple Criteria} LDP flexibly facilitates VAS selection under multiple popular theoretical criteria (Appendix \ref{sec:covariate_criteria}). As LDP returns $\z_1$, $\z_4$, and $\z_5$, LDP can be used as an automated preprocessing step for VAS selection under the  \textit{common cause criterion} (which retains only $\z_1$), the \textit{disjunctive cause criterion} (which retains $\{\z_1,\z_4,\z_5\}$) \citep{vanderweele_new_2011}, and the \textit{outcome criterion} (which retains $\{\z_1, \z_4\}$) \citep{brookhart_variable_2006}, all of which yield VAS under the backdoor criterion and the \textit{generalized adjustment criterion} \citep{perkovic_complete_2015}. However, we recommend caution if adjusting for $\z_5$, as adjusting for instruments can amplify bias or introduce new bias \citep{pearl_class_2012}. We discuss notions of optimal and minimal adjustment sets in Appendix \ref{sec:covariate_criteria}.


%%%%%%%%%%%%%%%%
% ASSUMPTIONS
%%%%%%%%%%%%%%%%


% \vspace{-5pt}
\subsection{Identifiability Conditions}
\label{sec:identifiability}
% 

Here, we describe two separate sets of sufficient conditions for the identifiability of 1) a VAS for the true DAG and 2) correct partition labels, which are provided as intermediate results by LDP. Notably, we show that a VAS is identifiable under causal insufficiency if a specific CI criterion is met by at least one variable (Line 29 of Algorithm \ref{alg:method}). \rebuttal{All theoretical results are for the asymptotic regime and assume an independence oracle.}

\paragraph{Assumptions}
We assume the causal Markov condition, faithfulness, and acyclicity. 
Variables are not assumed to be exclusively pretreatment  and we do not place sparsity constraints on the true graph. We do not make assumptions about the distributional forms of variables nor the functional forms of their causal relations. %No specific functional causal model is imposed, freeing LDP from the identifiability assumptions of the post-nonlinear additive noise model \citep{zhang_identiability_2009}. 
\rebuttal{While user-specified independence tests might impose their own parametric assumptions, nonparametric tests are recommended when the data generating process is unknown (e.g., \citealt{gretton2005measuring, gretton2007kernel, zhang_kernel-based_2011, runge_conditional_2018}).}

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/time_tests.jpg}
    \caption{Total tests performed under an independence oracle (top) and mean runtime over 100 replicates (bottom) as the cardinality of $\z$ increases, with 95\% confidence intervals in shaded regions. Each DAG resembles Figure \ref{fig:ten_node_dag} with equal cardinality per partition ($[1,10]$). Results are reported for LDP and PC. LDECC and MB-by-MB curves overlapped with PC, with PC outperforming. Exponential, quadratic, $x \log_2(x)$, and linear curves (in tests and milliseconds) serve as comparison. Table \ref{tab:time_tests} reports raw data.}
    \label{fig:test_curve}
\end{figure}

\paragraph{The Exposure-Outcome Pair} 
The only prior knowledge of $\g$ that is required by LDP concerns the exposure-outcome relationship. While the causal effect of $X$ on $Y$ can be of arbitrary strength or null, we assume that 1) $X$ and $Y$ are marginally dependent and 2) $Y$ cannot
be a direct or indirect cause of $X$ due to the acyclicity assumption. All proofs and experiments assume univariate $X$ and $Y$. %The marginal dependence of $X$ and $Y$ is necessary for Step 2 of Algorithm \ref{alg:method}, which relies on the assumption that there exists a path from $X$ to $Y$.

\paragraph{Sufficient Conditions for VAS Identification} A VAS 1) contains no descendants of $X$ and 2) blocks all backdoor paths for $\{X,Y\}$ (Definition \ref{def:backdoor_criterion}). Theorem \ref{theorem:valid_adjustment} shows that the VAS returned by LDP meets both criteria given the following sufficient (but not necessary) graphical conditions.

\rebuttal{
\begin{enumerate}[noitemsep,topsep=0pt, label=C\arabic*]
\item\label{cond:sufficient_2} The existence of at least one observed member of $\z_4$. 
\item\label{cond:sufficient_3} The existence of at least one observed member of $\z_5$, such that all $\z_1$ are marginally independent of at least one observed $Z_5 \in \z_5$. 
\end{enumerate}
}

\ref{cond:sufficient_2} is testable at Line 7 of Algorithm \ref{alg:method} and \ref{cond:sufficient_3} is testable in Steps 7 and 8. \ref{cond:sufficient_2} guarantees that all backdoor paths will be blocked by the conditioning set in Step 5 of Algorithm \ref{alg:method} \rebuttal{($X \cup \z' \setminus Z$)}, which is used to discover $\z_5$.  \ref{cond:sufficient_3} guarantees that LDP identifies the $\z_1$ needed to ensure a VAS, and enables the $\z_5$ criterion to be tested (Definition \ref{def:z5_criterion}). We note that \ref{cond:sufficient_2} and \ref{cond:sufficient_3} can be intuitively checked by reasoning about whether \emph{multiple causes of $X$ and $Y$} exist in the dataset. While the existence of multiple causes does not guarantee that these conditions will hold, the absence of multiple causes indicates that LDP might not be suitable. Verifying \ref{cond:sufficient_2} requires neither \ref{cond:sufficient_3} nor causal sufficiency (Theorem \ref{theorem:z4_z8_latent}) and replaces the strong pretreatment assumption in the covariate selection literature. \ref{cond:sufficient_2} is discussed further in Remark \ref{remark:C2}.

With respect to preventing descendants of $X$ from entering the adjustment set, we present Lemma \ref{lemma:desc_x}.

\begin{lemma} LDP does not place descendants of $X$ in $\z_1$ under Conditions \ref{cond:sufficient_2} and \ref{cond:sufficient_3}. \label{lemma:desc_x}
\end{lemma}

Additionally, LDP provides an internal test that indicates whether an adjustment set blocks all backdoor paths.

\begin{definition}[$\z_5$ criterion] \label{def:z5_criterion}
    \rebuttal{If there exists a $Z_5 \in \z_5$ that is $d$-separable from $Y$ given $X$ and $\z_1$ ($Z_5 \ind Y | X \cup \z_1$), we say that the $\z_5$ criterion is passed.}
\end{definition}

\begin{lemma}[Passing the $\z_5$ criterion is a valid indicator that $\z_1$ blocks all backdoor paths] \label{lemma:z5_indicates_vas}
    If the $\z_5$ criterion is passed, then the $\z_1$ recovered by LDP is asymptotically guaranteed to block all backdoor paths for $X$ and $Y$.
\end{lemma}

\begin{theorem}[LDP returns a VAS for $\{X,Y\}$ under the backdoor criterion] 
    Following from Lemmas \ref{lemma:desc_x} and \ref{lemma:z5_indicates_vas}, 
    if the $\z_5$ criterion is passed, then the $\z_1$ returned by LDP is a VAS for $\{X,Y\}$.
    \label{theorem:valid_adjustment}
\end{theorem}

All proofs are provided in Appendix \ref{sec:adjustment_correctness}. We numerically validate Theorem \ref{theorem:valid_adjustment} in Section \ref{sec:experiments}.


\paragraph{Sufficient Conditions for Correct Partition Labels} \rebuttal{As we show in Theorem \ref{theorem:z4_z8_latent}, Conditions \ref{cond:sufficient_2} and \ref{cond:sufficient_3} are \textit{not} required to correctly label $\z_4$, $\z_7$, and $\z_8$.} 

\rebuttal{
\begin{theorem}
\label{theorem:z4_z8_latent}
Partitions $\z_4$, $\z_7$, and $\z_8$ are guaranteed to be correctly labeled by LDP in random structures without \ref{cond:sufficient_2} and \ref{cond:sufficient_3}, even in the presence of latent confounding.
\end{theorem}

\textit{Intuition.} \; Theorem \ref{theorem:z4_z8_latent} follows from the fact that tests for $\z_4$, $\z_7$, and $\z_8$ rely only on knowledge of $\{X,Y\}$ and candidate $Z$. This is in contrast to Step 5, for example, which relies on access to additional variables in the true graph for correctness. Full proof is provided in Appendix \ref{append:z4_z8_latent}.
}


\rebuttal{We note that LDP correctly labels partitions $\z_1$, $\z_2$, $\z_3$, and $\z_5$ under additional conditions \ref{cond:sufficient_1} and \ref{cond:sufficient_4}. Given sufficient (but not necessary) conditions \ref{cond:sufficient_2}–\ref{cond:sufficient_4}, Theorem~\ref{theorem:correctness} states that LDP correctly labels the causal partitions of $\z$ as intermediate results en route to identifying a VAS (proof in Appendix~\ref{sec:partition_correctness}). Recall that \ref{cond:sufficient_1} and \ref{cond:sufficient_4} are not needed for identifying a VAS, nor partitions $\z_4$, $\z_7$, and $\z_8$. Further, Section \ref{sec:experiments} provides empirical examples where \ref{cond:sufficient_1} and \ref{cond:sufficient_4} are violated with no impact on partition label accuracy for $\z_1$, $\z_2$, $\z_3$, and $\z_5$.}
% Next, we show that LDP correctly labels partitions $\z_1$, $\z_2$, $\z_3$, and $\z_5$ under the following sufficient (but not necessary) conditions: \ref{cond:sufficient_2}, \ref{cond:sufficient_3}, \ref{cond:sufficient_1}, and \ref{cond:sufficient_4}.}

% \rebuttal{
% \begin{enumerate}[noitemsep,topsep=0pt, label=C\arabic*]
% \setcounter{enumi}{2}
% \item\label{cond:sufficient_1} The absence of inter-partition active paths (Def. \ref{def:inter_paths}). 
% \item\label{cond:sufficient_4} Causal sufficiency in $\g$. 
% \end{enumerate}
% }

% \rebuttal{Note that \ref{cond:sufficient_1} and \ref{cond:sufficient_4} are not needed for VAS discovery.} Given \ref{cond:sufficient_1}, $\z_2$ (if any exist in $\g$) will be marginally dependent on $\z_4$ and will be identifiable by LDP. The second statement of \ref{cond:sufficient_3} is trivially satisfied when \ref{cond:sufficient_1} is satisfied (as $\z_5$ shares no active paths with $\z_1$ is this setting) but is significant when \ref{cond:sufficient_1} is violated. We demonstrate robustness of partition label correctness to specific violations of \ref{cond:sufficient_1} in Tables \ref{tab:results_m_butterfly}, \ref{tab:latents}. Correctness under violations of \ref{cond:sufficient_4} is described in Appendix \ref{sec:adjustment_correctness}, Appendix \ref{append:z4_z8_latent}, and Section \ref{sec:results:latent}.

% Given these sufficient (but not necessary) conditions, we obtain Theorem \ref{theorem:correctness}. Proof is provided in Appendix \ref{sec:partition_correctness}.

% \begin{theorem} [Partition correctness of Algorithm \ref{alg:method}] Given \ref{cond:sufficient_2}–\ref{cond:sufficient_4}, Algorithm \ref{alg:method} is guaranteed to output a correct partition of $\z$ as defined in Table~\ref{tab:partitions}.
% \label{theorem:correctness}
% \end{theorem} 

\begin{remark}[Weakening the pretreatment assumption with Condition \ref{cond:sufficient_2}]\label{remark:C2}
We argue that the common pretreatment requirement assumes away the problem of confounder ($\z_1$) and instrument ($\z_5$) identification via \textit{a priori} exclusion of $\{\z_{2 \in de(X)},\z_3,\z_6,\z_7\}$. 
We introduce Condition \ref{cond:sufficient_2} based on the intuition that assuming the presence of at least one verifiable representative from a single partition ($\z_4$) is more moderate than assuming the complete absence of multiple partitions, which may not be verifiable. We argue that \ref{cond:sufficient_2} is a verifiable assumption, as we show that the correct identification of $\z_4$ is robust to latent confounding in arbitrary DAGs (Theorem \ref{theorem:z4_z8_latent}).
Note that \ref{cond:sufficient_2} is \textit{not necessary} when $\z$ contains no colliders (or, more strongly, when $\z$ is pretreatment), nor when the $\z_5$ criterion is passed (indicating that backdoor paths for $\{X,Y\}$ were closed despite failure to test for colliders; Lemma \ref{lemma:z5_indicates_vas}).
\end{remark}

%\rebuttal{\begin{remark}[\textbf{Weakening Causal Sufficiency for Partition Labeling}] \label{remark:weakening_c4_labeling}
%    We empirically demonstrate in Table \ref{tab:latents} that certain forms of latent confounding in $\g$ do not impact LDP's ability to accurately partition, while others do. As shown in Theorem \ref{theorem:z4_z8_latent}, certain tests are robust to causal insufficiency. Thus, it is known that \ref{cond:sufficient_4} is a sufficient but \textit{not necessary} condition for correct partitioning. However, we leave the nontrivial task of fully characterizing partitioning behavior under arbitrary latent confounding to future work.
%\end{remark}}


% \vspace{-10pt}
\section{Comparison to Prior Methods}\label{sec:related_work}

\textbf{Global Causal Discovery$\;$}
While global methods can theoretically identify the partitions of $\z$, LDP was explicitly designed to circumvent common drawbacks. The local approach of LDP avoids costly combinatorial optimization, guaranteeing worst-case polynomial test totals without sparsity constraints (at the expense of graphical assumptions \ref{cond:sufficient_2} and \ref{cond:sufficient_3}). %Existing nonparametric global methods cannot infer beyond the MEC, which can yield ambiguous partition labels even when an oracle is available (Figure \ref{fig:pc_mec_ambiguous}). 
Further, the asymptotic guarantees of nonparametric global discovery can fail even on simple structures under small to moderately large samples, which are common in practice (Figures \ref{fig:motive_times_tests}–\ref{fig:fci_finite}). LDP addresses sample complexity by favoring lower order CI tests relative to global constraint-based methods \citep{spirtes_causation_2000}.


\paragraph{Local Discovery Around Target Variables}
The challenges of global discovery can be mitigated by local methods that infer relevant substructures around a target (or targets) of interest. Most local methods for causal ancestor discovery, confounder discovery, or related tasks impose strong graphical assumptions that require prior knowledge. The most common of these assumptions requires that input variables are non-descendants of the target (e.g., the \emph{pretreatment assumption} in the exposure-outcome context) \citep{de_luna_covariate_2011, entner_data-driven_2013,  haggstrom_covsel_2015, shortreed_outcomeadaptive_2017, tian_evaluating_2018, gultchin_dierentiable_2020, soleymani_causal_2022, shah_finding_2022, cai2023learning}. 
We argue that excluding the existence of
colliders, mediators, and other descendants of the exposure overly simplifies the problem of identifying instruments, confounders, and other variables that are useful for downstream inference. 

% \begin{figure}[!t]
%     \centering
%     \includegraphics[width=0.9\linewidth]{figures/ten_node_results.png}
%     \caption{Partition label accuracy of LDP on a 10-node DAG with one node per partition (Figure \ref{fig:ten_node_dag}). Accuracy is averaged over 100 DAGs (i.e., 800 variables total, excluding exposure-outcome pairs), with 95\% confidence intervals in shaded regions. Independence was determined by chi-square tests for discrete data and Fisher-z for continuous data, both with $\alpha = 0.001$. Tables \ref{tab:results_ten_node_dag} and \ref{tab:results_continuous} report raw data.}
%     \label{fig:ten_node_results}
% \end{figure}

\paragraph{Automated Covariate Selection for Pretreatment $\z$} \citet{entner_data-driven_2013}, \citet{gultchin_dierentiable_2020}, \citet{shah_finding_2022}, \citet{cheng_local_2022}, and \citet{cheng_toward_2023} assume the existence of \textit{anchor} or \textit{auxiliary variables}, which can resemble $\z_1$ or $\z_5$. Thus, this assumption plays a similar role to Condition \ref{cond:sufficient_3} and Lemma \ref{lemma:z5_indicates_vas}. With the exception of \citet{cheng_toward_2023}, these methods require pretreatment $\z$, a strong assumption that we significantly weaken by introducing Condition \ref{cond:sufficient_2}.
While \citet{cheng_local_2022} require that auxiliary variables were identified prior to confounder discovery, LDP discovers the variables needed to satisfy Conditions  \ref{cond:sufficient_2} and \ref{cond:sufficient_3} end-to-end. The continuous optimization 
approach taken by \citet{gultchin_dierentiable_2020} requires parametric assumptions, while LDP does not. 

We should emphasize that these methods are designed for VAS discovery alone, sometimes combined with causal effect estimation end-to-end. LDP, on the other hand, is a \textit{local discovery procedure} for partitioning $\z$ while guaranteeing a VAS. Thus, unlike prior methods, LDP can be used to satisfy multiple covariate selection criteria (Section \ref{sec:cov_sel}) or to assist with tasks beyond valid adjustment (e.g., discovering instruments and their proxies, causes of outcome, etc.).

\paragraph{Automated Covariate Selection for Arbitrary $\z$} Like LDP, concurrent work by \citet{cheng_toward_2023} avoids the pretreatment assumption. However, it is not a local method as it calls an existing variant of FCI \citep{Colombo_2012}, a global discovery algorithm. %with worst-case exponential time complexity. 
Lemma \ref{lemma:z5_indicates_vas} bears similarity to Theorem 1 in \citet{cheng_toward_2023}, in which they prove that an analogous CI relation indicates the existence of VAS in partial ancestral graphs with hidden variables. 


%Overall, LDP requires less prior knowledge over the graph than \citet{entner_data-driven_2013}, \citet{gultchin_dierentiable_2020}, \citet{shah_finding_2022}, and \citet{cheng_local_2022}, which also assume pretreatment.  
 
%{\color{red}Address sparsity assumptions.}
%\paragraph{Comparison to Global Causal Discovery}
%While global causal discovery can theoretically identify the partitions of $\z$, existing methods often present with drawbacks. Aside from the computational costs of combinatorial optimization, many global methods cannot infer beyond the Markov equivalence class (MEC), which can result in ambiguous partition labeling. Further, the asymptotic guarantees of nonparametric global discovery can fail on simple structures under small samples, which are common in practice (Figure \ref{fig:pc_fci}). Failure on real-world data often arises from sample inefficiencies that increase with node count \citep{gultchin_dierentiable_2020}. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% EXPERIMENTS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\vspace{-2mm}
\section{Experimental Results} \label{sec:experiments}

% \paragraph{Experimental Data}
% In this section, 
We numerically validate LDP 
% Methods were numerically validated
on custom synthetic DAGs and the \textsc{Mildew} benchmark from the \texttt{bnlearn} Bayesian Network Repository (Figure \ref{fig:mildew_full}) \citep{scutari_learning_2010}.\footnote{\href{https://www.bnlearn.com/bnrepository/}{https://www.bnlearn.com/bnrepository/}} For all simulated DAGs, structural equations are reported in Tables \ref{tab:sem_discrete} and \ref{tab:sem_continuous} and graphs are visualized in Appendix \ref{sec:custom_dags}. Experimental data generation is described in Appendix \ref{sec:experimental_design_appendix}. We experimentally validate LDP for VAS discovery in causally sufficient $\z$ and in the presence of latent confounding. Additionally, we probe 1) partition label correctness and 2) the quality of adjustment sets ($\mathbf{A}_{XY}$) returned by each method with respect to average treatment effect (ATE) estimation.

%%%%%%%%%%%%%%%%
% BASELINES
%%%%%%%%%%%%%%%%

\begin{figure*}[!t]
    \centering
    %%
    \textsc{Mildew (Figure \ref{fig:mildew_full})} 
    \includegraphics[width=\textwidth]{figures/mildew_baselines.jpg} \\
    \vspace{3mm}
    %%
    \textsc{Linear-Gaussian 10-node DAG (Figure \ref{fig:ten_node_dag})} 
    \includegraphics[width=\textwidth]{figures/ate_results.jpg}
    %%
    \caption{Baselines on \textsc{Mildew} ($|\z| = 31$) and a linear-Gaussian DAG ($|\z| = 8$) (Tables \ref{tab:mildew}, \ref{tab:ate}). Independence was determined with chi-square tests for \textsc{Mildew} ($\alpha = 0.001$) and Fisher-z tests for the linear-Gaussian DAG ($\alpha = 0.01$). Results were averaged over 10 and 100 replicates per sample size for \textsc{Mildew} and the linear-Gaussian DAG, respectively (95\% confidence intervals in shaded regions). Precision and recall for $\z_1$ identification were computed per adjustment set.}
    \label{fig:baselines}
\end{figure*}

\paragraph{Baseline Discovery Methods} All baselines are constraint-based and do not assume pretreatment. PC and FCI are global discovery algorithms with asymptotic theoretical guarantees and worst-case exponential time complexity with respect to node count \citep{spirtes_causation_2000}. Two local methods were also selected for comparison. MB-by-MB \citep{wang_discovering_2014} and Local Discovery Using Eager Collider Checks (LDECC) \citep{gupta_local_2023} take distinct approaches to inferring the local structure around a target node. While MB-by-MB is exponential-time, LDECC is provably polynomial-time for certain categories of graphs and exponential for others. Further description of these algorithms and how they were evaluated is in Appendix \ref{sec:experimental_design_appendix}. To illustrate the strengths and weaknesses of all baselines, VAS were evaluated under the \emph{common cause criterion} (CCC) \citep{guo_confounder_2022} and \emph{disjunctive cause criterion} (DCC) \citep{vanderweele_new_2011} (Appendix \ref{sec:covariate_criteria}).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% RESULTS AND DISCUSSION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \vspace{-2mm}

\label{subsec:empirical_results}

%\label{sec:results_partition_labels}

\paragraph{Partition Accuracy} We measure partition accuracy as the percentage of partition labels that are consistent with ground truth. Results on the 10-node DAG with one variable per partition (Figure \ref{fig:ten_node_dag}) indicate that LDP correctly partitions $\z$ under continuous, discrete, linear, and nonlinear data generating processes (Figure \ref{fig:ten_node_results}, Tables \ref{tab:results_ten_node_dag}, \ref{tab:results_continuous}). Figure \ref{fig:ten_node_results} supports the claim that LDP is agnostic to the strength of the direct effect of $X$ on $Y$, as results are unharmed when $X$ is not adjacent to $Y$. 
Though LDP is not guaranteed to correctly partition when inter-partition active paths exist, 
we demonstrate that LDP is robust to certain violations of this condition (Tables \ref{tab:results_m_butterfly}, \ref{tab:results_17_nodes}, \ref{tab:latents}): LDP provides high partition accuracies on the \textsc{Mildew} benchmark ($\geq 90\%$ accuracy; Figure \ref{fig:ldp_mildew_acc}) and synthetic DAGs when $\z_2$ shares active paths with $\z_4$, $\z_5$, and $\z_6$ (Figures \ref{fig:m_butterfly}, \ref{fig:dag_17}, \ref{fig:latents}).
% % high partition accuracies on the 13-node and 17-node DAGs containing M-structures with paths between $\z_4$, $\z_5$, and $\z_2$ (Figures \ref{fig:m_butterfly}, \ref{fig:dag_17}) 
% empirically demonstrate that LDP is robust to certain kinds of inter-partition active paths (Tables \ref{tab:results_m_butterfly}, \ref{tab:results_17_nodes}). High partition accuracies on \textsc{Mildew} ($\geq 90\%$; Figure \ref{fig:ldp_mildew_acc}) further 
% validate
% % corroborate 
% the ability of LDP to handle certain forms of inter-partition active paths.

% \vspace{-2mm}

\subsection{VAS Under Causal Sufficiency}

We compare adjustment set quality across baselines for two graphs with small $\z_1$ (Figure \ref{fig:baselines}): the \textsc{Mildew} benchmark ($|\z| = 31; |\z_1| = 2$) and a linear-Gaussian DAG ($|\z| = 8; |\z_1| = 1$). Quality was measured in terms of $\z_1$ precision and recall in $\mathbf{A}_{XY}$, percent of $\mathbf{A}_{XY}$ that were valid, total tests performed, runtime, and ATE MSE (when ground truth was available). Results indicate that LDP provides superior sample, statistical, and time efficiency relative to baselines.

LDP outperformed on $\z_1$ recall (Figure \ref{fig:baselines}.A, \ref{fig:baselines}.F top), test count (\ref{fig:baselines}.D), and runtime (\ref{fig:baselines}.E). %Though PC sometimes obtained a greater percentage of valid $\mathbf{A}_{XY}$ for \textsc{Mildew}, LDP achieved higher $\z_1$ precision and recall.
Notably, LDP ran $1400\times$ to $2500\times$ faster than PC across sample sizes for \textsc{Mildew}, with comparable gains relative to local baselines (Table \ref{tab:mildew}).
High $\z_1$ recall for LDP is reflective of its ability to detect $\z_1$ that are not adjacent to either $X$ or $Y$, unlike local baselines. As expected, LDP displayed superior $\z_1$ precision under the CCC but was comparable to other methods when $\z_4$ and $\z_5$ were intentionally retained under the DCC (\ref{fig:baselines}.B, \ref{fig:baselines}.G). Only LDP consistently returned a VAS for the linear-Gaussian DAG under the CCC and DCC (\ref{fig:baselines}.H). Furthermore, $\mathbf{A}_{XY}$ from LDP provided less biased and more precise ATE estimates (\ref{fig:baselines}.I, \ref{fig:baselines}.J). Highly biased ATE estimates using $\mathbf{A}_{XY}$ from PC are linked to a propensity to include extraneous variables (Figure \ref{fig:vas_card}). Low ATE variance for LDP implies favorable statistical efficiency relative to baselines. Further, LDP achieves consistently high VAS quality at smaller sample sizes than baselines, implying greater sample efficiency.

%Note that conditioning on instruments ($\z_5$) can induce or amplify bias \citep{pearl_class_2012}, while conventional wisdom says that adjustment should minimally constrain the exposure and maximally constrain the outcome \citep{runge_necessary_2021}. Adjusting for $\z_5$ constrains $X$, while adjusting for $\z_4$ constrains $Y$ and poses no theoretical problems. Thus, LDP may be at lower risk of overadjustment or unnecessary adjustment than local baselines.

Additionally, we illustrate a known failure mode of LDP partition labeling that still results in VAS (Figure \ref{fig:complex_backdoor}; $|\z| = 14$; $|\z_1| = 7$). In a complex backdoor path, a $Z_1$ adjacent to $Y$ is marginally dependent on a $Z_4$ and will be mislabeled as $\z_{\textsc{Post}}$. Further, a $Z_2$ that is 1) a non-descendant of $X$ and 2) conditionally independent of $\{X,Y\}$ given $\z_1$ is guaranteed to be placed in $\z_1$. Despite these mislabelings, LDP returned a VAS for 99\% of 100 replicates (sample size $n = 5\mathsf{k}$). Figure \ref{fig:complex_backdoor} describes further details.

%\paragraph{Statistical Efficiency} ATE estimate variance served as a measure of statistical efficiency across baselines (Figure \ref{fig:baselines}, bottom). The ATE was estimated using linear regression
% \footnote{\href{https://scikit-learn.org/}{https://scikit-learn.org/}}
% I don't think we need to cite scikit learn here.
%for linear-Gaussian DAGs with a ground truth total effect of $3.75$. LDP returned the highest quality adjustment sets in terms of ATE mean squared error (MSE), confounder recall, and percent valid, with baselines lagging even as sample size increased. LDP generally produced the least biased ATE estimates and lowest ATE variance, and was the only method to achieve unbiased estimates under the DCC. Rising ATE MSE for PC may be explained by the cardinality of $\mathbf{A}_{XY}$ increasing with sample size. %Ground truth $|\mathbf{A}_{XY}| = 1$ under the CCC and 3 under the DCC, which LDP adhered to more closely than baselines. %Low confounder recall LDECC generally failed to retain any covariates in $\mathbf{A}_{XY}$ under the CCC.



\subsection{VAS Discovery With Latent Variables} \label{sec:results:latent}

\input{tables_tex/table_latent_experiments_small}

\begin{figure}[!t]
    \centering
    \fbox{\begin{tabular}{@{}c@{}}
    \includegraphics[height=0.11\textheight]{figures/latents_true.png} \\
    \footnotesize \textit{True ($Z_{5a}$ latent)}
    \end{tabular}}
    %%
    \fbox{\begin{tabular}{@{}c@{}}
    \includegraphics[height=0.11\textheight]{figures/fci_latent_z5a.png} \\
    \footnotesize \textit{FCI ($n = 50\mathsf{k}$)}
    \end{tabular}}
    %\includegraphics[width=0.3\textwidth]{figures/fci_latent_z5a.png}
    \caption{FCI failed to identify a VAS on DAGs with linear causal functions and Bernoulli noise (true $\z_1$ in red; 5/5 replicates consistent with the predicted DAG at right). Additional results for PC and FCI are reported in Figure \ref{fig:fci_finite}.}
    \label{fig:fci_latent_z5a}
\end{figure}

With the $\z_5$ criterion (Line 29 of Algorithm \ref{alg:method}; Definition \ref{def:z5_criterion}), LDP helps the user to manage uncertainty about the quality of the returned adjustment set. To numerically validate the asymptotic guarantees of this criterion and Theorem \ref{theorem:valid_adjustment}, we ran LDP with an independence oracle on causally insufficient structures. The ground truth DAG contained a butterfly structure, M-structure, and inter-partition active paths (Figures \ref{fig:fci_latent_z5a}, \ref{fig:latents}; $|\z| = 18$; $|\z_1| = 4$). All combinations of common causes (up to size three) were iteratively dropped from the observed data. Hidden confounders were in $\z_1$, $\z_4$, and $\z_5$ and induced latent confounding between $\{X,Y\}$, $\{X, \z_1\}$, $\{Y, \z_1\}$, $\{X,\z_2\}$, $\{Y,\z_2\}$, $\{Y,\z_4\}$, and $\{X,\z_5\}$. Partition $\z_2$ shared active paths with $\z_4$, $\z_5$, and $\z_6$. 

For all eight structures with one latent variable, we observed 100\% concordance between LDP passing the $\z_{5}$ criterion, whether a VAS for the true DAG existed in $\z$, and whether LDP returned a VAS. For the 84 structures with two to three latent variables, we saw 100\% concordance between whether the $\z_{5}$ criterion was passed and whether LDP returned a VAS. In all instances (6\%) where a VAS existed in $\z$ and LDP failed to return a VAS, both parents of the M-collider were latent. As expected, such an M-collider was treated as $\z_1$. In all such cases, LDP raised a warning that the $\z_{5}$ criterion was failed and a VAS was not identified. These results suggest that the $\z_{5}$ criterion is a valid indicator for whether the $\z_1$ returned by LDP is a VAS and, further, whether it induces \textit{M-bias} \citep{ding_adjust_2014}.

To probe finite sample performance, we ran LDP on linear categorical instantiations of Figure \ref{fig:latents} ($n = 50\mathsf{k}$; chi-square tests; $\alpha = 0.001$). We tested 100 replicates per latently confounded structure. Among 600 instances for which a VAS of the ground truth DAG existed in $\z$, LDP returned a VAS for 99.5\%  (95\% CI $[99.1,99.9]$; Table \ref{tab:latents_small}). If at least one parent in the M-structure was observed in $\z$, the collider was not placed in $\z_1$ and LDP returned a VAS without M-bias (200/200 replicates).

\rebuttal{In contrast, PC and FCI demonstrate finite sample failure modes on the same causal structure (Figures \ref{fig:fci_latent_z5a}, \ref{fig:fci_finite}). Though PC and FCI return a VAS when provided with an oracle, both algorithms fail to provide VAS for discrete and continuous data samples (Figure \ref{fig:fci_finite}). In particular, these methods display a high false negative rate on true confounder $Z_1$.}
%We ran 5 replicates because the interpretation of FCI for VAS discovery is nontrivial, due to bidirected edges and uncertain directionality introducing ambiguity. Ambiguous interpretation on MECs is a central motivation for this work.

%%%%%%%%%%%%%%%%
% LIMITATIONS
%%%%%%%%%%%%%%%%

%\subsection{Limitations and Future Directions}

\section{Limitations And Future Work} 
The performance of LDP will be constrained by the accuracy, runtime, and sample complexity of the chosen independence test. \rebuttal{While LDP does not make innate parametric assumptions, the user should be cautious if opting for parametric independence tests. We provide asymptotic theoretical guarantees, which future work could extend to probabilistic guarantees under finite samples. %We leave the derivation of necessary conditions for identifiability to future work.

As causal discovery is notoriously impractical in many settings, this work attempts to highlight the benefits of approaches that are tailored for specific use cases. Under a tailored approach, prior knowledge of the problem space can improve performance relative to generalized global discovery (e.g., in time and sample efficiency). 

In this work, we propose a performant local algorithm for VAS discovery, at the expense of Conditions~\ref{cond:sufficient_2} and~\ref{cond:sufficient_3}. These graphical conditions restrict the space over which LDP provides informative results. We hope to see performant local discovery solutions to the covariate selection problem that neither assume pretreatment nor rely on the presence of $\z_4$ and $\z_5$. In particular, the efficient differentiation of confounders, mediators, and colliders in random graphs is a challenging problem that we hope future research will address.}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% CONCLUSION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\section{CONCLUSION}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\section*{Acknowledgements} 

The authors would like to acknowledge support from the NSF Graduate Research Fellowship (author J. Maasch); NSF 1750326, 2212175; and NIH R01AG080991, R01AG076234.


%If using bibtex, please protect capital letters of names and abbreviations in titles, for example, use \{B\}ayesian or \{L\}ipschitz in your .bib file.

\bibliography{references}



%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%

\newpage

%\title{Local Discovery by Partitioning: \\ Polynomial-Time Causal Discovery Around Exposure-Outcome Pairs\\(Supplementary Material)}
%\maketitle

\appendix
\onecolumn

\counterwithin{table}{section}
\counterwithin{figure}{section}

\section*{Appendix}

\input{appendix}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\end{document}




% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
