% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
% \usepackage{siunitx} 
\usepackage{booktabs} 
\usepackage{tikz}
\usepackage{nameref}
\usepackage{zref-xr}
\zxrsetup{toltxlabel}
\zexternaldocument*{su_509-supp}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand\ci{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}

\DeclareMathOperator{\Forbb}{forb}
\DeclareMathOperator{\pa}{pa}
\DeclareMathOperator{\possPa}{posspa}
\DeclareMathOperator{\posspa}{posspa}
\DeclareMathOperator{\de}{de}
\DeclareMathOperator{\possDe}{possde}
\DeclareMathOperator{\possde}{possde}
\DeclareMathOperator{\an}{an}
\DeclareMathOperator{\possAn}{possan}
\DeclareMathOperator{\possan}{possan}
\DeclareMathOperator{\CN}{cn}
\DeclareMathOperator{\possCN}{posscn}
\DeclareMathOperator{\Opt}{O}
\DeclareMathOperator{\Var}{\mathrm{Var}}
\DeclareMathOperator{\Cov}{\mathrm{Cov}}
\DeclareMathOperator{\E}{\mathrm{E}}
\DeclareMathOperator{\vect}{\mathrm{vec}}
\DeclareMathOperator{\vecth}{\mathrm{vech}}
\DeclareMathOperator{\diag}{\mathrm{diag}}

\newcommand{\mpdag}{maximal PDAG}
\newcommand{\Mpdag}{Maximal PDAG}
\newcommand{\MPDAG}{MAXIMAL PDAG}
\newcommand{\pstar}[1][p]{{#1}^{*}}
\newcommand{\g}[1][G]{\mathcal{#1}}
\newcommand{\f}[2][X,Y]{\Forbb(#1,#2)}
\newcommand{\fb}[2][X,Y]{\Forbb(\mathbf{#1},#2)}
\newcommand{\cn}[2][X,Y]{\CN(#1,#2)}
\newcommand{\cnb}[2][X,Y]{\CN(\mathbf{#1},#2)}
\newcommand{\cns}[1]{\CN(\mathbf{X},Y,#1)}
\newcommand{\opts}[1]{\mathbf{O}(\mathbf{X},Y,#1)}
\newcommand{\posscn}[2][X,Y]{\possCN(#1,#2)}
\newcommand{\posscnb}[2][X,Y]{\possCN(\mathbf{#1},#2)}
\newcommand{\opt}[2][X,Y]{\mathbf{O}(#1,#2)}
\newcommand{\optb}[2][X,Y]{\mathbf{O}(\mathbf{#1},#2)}
\newcommand{\vsp}{\vspace{.2cm}}

\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\bepsilon}{\boldsymbol{\epsilon}}
\newcommand{\btau}{\boldsymbol{\tau}}
\newcommand{\bdelta}{\boldsymbol{\delta}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bxi}{\boldsymbol{\xi}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bbX}{\mathbf{X}}
\newcommand{\bY}{\mathbf{Y}}
\newcommand{\bc}{\mathbf{c}}
\newcommand{\bC}{\mathbf{C}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bZ}{\mathbf{Z}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\bV}{\mathbf{V}}
\newcommand{\be}{\mathbf{e}}
\newcommand{\bE}{\mathbf{E}}
\newcommand{\bA}{\mathbf{A}}
\newcommand{\ba}{\mathbf{a}}
\newcommand{\br}{\mathbf{r}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bU}{\mathbf{U}}
\newcommand{\bO}{\mathbf{O}}
\newcommand{\bp}{\mathbf{p}}
\newcommand{\bP}{\mathbf{P}}
\newcommand{\bLambda}{\boldsymbol{\Lambda}}
\newcommand{\bOmega}{\boldsymbol{\Omega}}
\newcommand{\bGamma}{\boldsymbol{\Gamma}}
\newcommand{\bDelta}{\boldsymbol{\Delta}}
\newcommand{\bB}{\mathbf{B}}
\newcommand{\bD}{\mathbf{D}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\bI}{\mathbf{I}}
\newcommand{\bZsf}{\boldsymbol{\mathsf{Z}}}
\newcommand{\Xsf}{\mathsf{X}}
\newcommand{\bPi}{\boldsymbol{\Pi}}

\usepackage{tikz}
\usetikzlibrary{arrows.meta,arrows,patterns}
\usetikzlibrary{shapes,decorations,decorations.pathreplacing,arrows,calc,arrows.meta,fit,positioning}
\usetikzlibrary{arrows.meta,automata,positioning,quotes}
\tikzset{
    directed/.style={-Latex,semithick},
    %state/.style ={circle,draw,minimum width=0.7cm},
    state/.style ={minimum width=0.5cm},
    point/.style = {circle, draw, inner sep=0.04cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=2pt, align=left, sloped}
}

\usepackage{amsmath,amssymb}
\usepackage{amsthm}
\newtheoremstyle{break}
  {\topsep}{\topsep}%
  {\itshape}{}%
  {\bfseries}{}%
  {\newline}{}%
  
\newtheorem{Satz}{Satz}
\theoremstyle{plain}
\newtheorem{Lemma}[Satz]{Lemma}
\newtheorem{Corollary}[Satz]{Corollary}
\newtheorem{Theorem}[Satz]{Theorem}
\newtheorem{Proposition}[Satz]{Proposition}
\newtheorem{Fact}[Satz]{Fact}

\newtheoremstyle{breakdfn}
  {\topsep}{\topsep}%
  {\upshape}{}%
  {\bfseries}{}%
  {\newline}{}%

\theoremstyle{definition}
\newtheorem{Example}[Satz]{Example}
\newtheorem{Assumption}[Satz]{Assumption}
\newtheorem{Definition}[Satz]{Definition}

\theoremstyle{remark}
\newtheorem{Conjecture}[Satz]{Conjecture}
\newtheorem*{Remark}{Remark}

\usepackage{algorithm}
\usepackage[noend]{algpseudocode}

\definecolor{mygreen}{RGB}{0,110,51}

\usepackage{rotating}
\usepackage{afterpage}

\usepackage{multirow}
\usepackage{booktabs}

\title{A Robustness Test for Estimating Total Effects with Covariate Adjustment}

% The standard author block has changed for UAI 2021 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:zehao.su@sund.ku.dk?subject=Your UAI 2022 Paper}{Zehao Su}}
\author[2]{Leonard Henckel}
% Add affiliations after the authors
\affil[1]{%
    Section of Biostatistics\\
    Department of Public Health\\
    University of Copenhagen
}
\affil[2]{%
    Department of Mathematical Sciences\\
    University of Copenhagen
}
  
  \begin{document}
\maketitle

\begin{abstract}
Suppose we want to estimate a total effect with covariate adjustment in a linear structural equation model. We have a causal graph to decide what covariates to adjust for, but are uncertain about the graph. Here, we propose a testing procedure that performs a robustness check on the graph by exploiting the fact that there are multiple valid adjustment sets for the target total effect in the causal graph. If the test rejects, it is a strong indication that we should not rely on the graph. We discuss what mistakes in the graph our testing procedure can detect and which ones it cannot, and develop two strategies for selecting a list of valid adjustment sets for the procedure. 
We also connect our result to the related econometrics literature on coefficient stability tests.
\end{abstract}

\section{Introduction}\label{sec:intro}

Suppose we are interested in estimating the total (causal) effect of a treatment $X$ on an outcome $Y$ from observational data. One popular approach to estimate such an effect is covariate adjustment, also known as adjusting for confounding. Deciding which covariates to adjust for is a difficult problem, but it can be answered precisely if we have knowledge of the underlying causal structure in the form of a graph \citep{pearl2009causality}. In particular, the class of covariate sets we may adjust for has been fully graphically characterised \citep{shpitser2010validity,perkovic16}. We refer to sets in this class as valid adjustment sets.

In some cases there is more than one valid adjustment set, which raises the question how we can exploit this. One approach is to try and select from the available valid adjustment sets the one that provides the most statistically efficient estimator \citep{kuroki2003covariate,rotnitzky2020efficient, witte2020efficient,henckel2019graphical}. 

Another natural approach is to use multiple valid adjustment sets and test whether they all in fact lead to estimators of the same quantity. Such a test would be a simple and targeted robustness check on the causal graph we are relying on. Here we say targeted in the sense that the test would only detect mistakes in the graph that are relevant to our goal of estimating the total effect of interest, which is easier than checking whether the entire graph is correct. 

In the econometrics literature, it is already common practice to estimate the total effect with multiple estimators and then check whether the estimates differ by a large margin \citep[e.g.][]{dikova2019investment,yigezu2021socio,schlegel2021innovation}. This approach is often called testing for coefficient stability \citep{walter2009variable} or simply called a robustness check \citep{lu2014robustness}.

Practitioners often verify coefficient stability in a heuristic manner, but there is also a theoretical literature on the topic, especially for instrumental variable estimators \citep{frank2013would,oster2019unobservable}. 

For covariate adjustment estimators, \cite{lu2014robustness} have proposed a formal test for coefficient stability. Their framework is not based on graphical models and therefore it is harder to decide which adjustment sets to use for their test.
They propose to fix what they call a core of covariates and then create additional adjustment sets by adding what they call non-core covariates to the core. Here, the status of being core or non-core depends on certain conditional independences. In the graphical framework it becomes clear that this approach is too restrictive as, for example, two valid adjustment sets may be disjoint. As a result their approach may consider too few sets which leads to a loss of power.


There exists a more general literature on validation tests for structural equation models. This literature, however, has focused on tests that either validate the entire model \citep{SEM1989,bollen1993confirmatory,spirtes2000causation} or rely on instrumental variables \citep{kirby2009}. Another related literature focuses on identifying valid adjustment sets by relying on an auxiliary variable, typically called an anchor, whose causal relationship to the treatment has to be known from domain knowledge \citep{entner2013data,gultchin2020differentiable,shah2021finding,cheng2022toward}.

In this paper, we adopt the framework of a linear structural equation model compatible with an unknown directed acyclic graph (DAG). We propose a targeted robustness test that, given a pair $(X,Y)$ and a candidate DAG $\g$, tests whether the valid adjustment sets with respect to $(X,Y)$ in $\g$ lead to estimates of the same quantity. 

We first discuss which mistakes in the candidate graph our robustness test has power for and how this depends on the valid adjustment sets we use for the test. We then propose a simple $\chi^2$-test, similar to the one proposed by \citet{lu2014robustness}, although it differs in that we do not require a fixed core of covariates. We show that in general the joint asymptotic covariance matrix of the estimators we wish to compare is degenerate and that we need to know its rank for the test. This problem was acknowledged but not addressed by \citet{lu2014robustness}. 

In response to the problem of the degenerate asymptotic covariance matrix, we propose two strategies. The first is to estimate the rank. This is a difficult statistical problem and may be unstable, especially in small samples. The upside of this approach, however, is that it allows us to use all valid adjustment sets for our test, which maximises its power. 
% We refer to this strategy as $\mathrm{All}$.

The second strategy carefully selects a subset of the valid adjustment sets in a way that ensures the following two properties are likely to hold. First, the asymptotic covariance matrix is not degenerate. Second, we do not lose power completely against mistakes in the graph we had power for when using all valid adjustment sets. These two strategies represent different trade-offs between the stability of our testing procedure and its power to detect mistakes in the candidate graph.


Finally, we investigate with a simulation study how well our testing procedure controls the type-I error rate and how much power it has in finite samples. We do so for both of the strategies we propose, in order to compare and contrast their respective advantages and disadvantages. We also illustrate our testing procedure on a real data problem. All proofs are given in the supplementary materials. An implementation of our testing procedure and the code for our simulation study are made available at \url{https://github.com/zehaosu/RoCA}.


\section{Preliminaries}

We consider a linear structural equation model compatible with a DAG, where nodes represent random variables and edges represent direct effects. We now provide the most important definitions. The remaining definitions are provided in Section~\ref{app:graph} of the supplementary materials.

\textit{Linear structural equation models.}
  Let $\mathcal{G}=(\bV,\bE)$ be a DAG. Then $\bV=(V_1, \dots, V_p)$ follows a \emph{linear structural equation model} compatible with $\mathcal{G}$ if for all $i=1,\dots,p$
  \[V_{i}\gets\sum_{V_{j}\in\pa(V_{i},\mathcal{G})}\alpha_{ij}V_{j}+\epsilon_{i},
  \]
  with edge coefficients $\alpha_{ij}$ and jointly independent errors $\epsilon_{i}$ with zero mean and finite variance. We do not assume that the errors are normally distributed.

\textit{Total effects.}
Consider a pair $(X,Y)$ of random variables. The \emph{total effect} of $X$ on $Y$ is the partial derivative of the expectation $\E(Y\mid do(X=x))$ with respect to $x$. This is the instantaneous change of the average of $Y$ in the world where $X$ is set to $x$ \citep{pearl2009causality}.
In a linear structural equation model, the partial derivative is a constant slope that does not depend on $x$. As a result, the total effect is simply a number $\tau_{yx}$.

\textit{Causal and forbidden nodes.}
Consider two nodes $X$ and $Y$ in a DAG \(\g=(\bV,\bE)\). The \emph{causal nodes} relative to \((X,Y)\) in $\g$, denoted \(\mathrm{cn}(X,Y,\g)\), are all nodes on directed paths from \(X\) to \(Y\), excluding \(X\). The descendants of $X$ in $\g$, denoted $\de(X,\g)$, are all nodes $V$ such that there exists a directed path from $X$ to $V$ in $\g$. The \emph{forbidden nodes} relative to $(X,Y)$ in $\g$, denoted $\mathrm{forb}(X,Y,\g)$, are all nodes that are descendants of causal nodes, including $X$. The \emph{non-forbidden nodes} relative to $(X,Y)$ in $\g$, denoted $\mathrm{nonforb}(X,Y,\g)$, are the nodes in $\mathbf{V}\setminus \mathrm{forb}(X,Y,\g)$.

\textit{Notation for regression coefficients.}
Consider random variables $X$ and $Y$, random vectors $\bZ_1, \dots, \bZ_k$ and the collection of adjustment sets \(\mathcal{Z}=\{\bZ_{1},\dots,\bZ_{k}\}\).
Let \(\beta_{yx.\bz_i}\) indicate the population level regression coefficient of \(X\) in the ordinary least squares regression of \(Y\) on \(X\) and \(\bZ_i\). Let \(\hat{\beta}_{yx.\bz_i}\) denote the corresponding estimator.
Let \(\bbeta_{yx.\mathcal{Z}}\) denote the stacked population regression coefficients \((\beta_{yx.\bz_{1}},\dots,\beta_{yx.\bz_{k}})^{\top}\) and \(\hat{\bbeta}_{yx.\mathcal{Z}}\) the corresponding estimator. Finally, let $\delta_{y\bz_i}=Y - \bbeta_{y\bz_i} \bZ_i$ be the population level residuals for the ordinary least squares regression of $Y$ on $\bZ_i$ and $r_{y\bz_i}$ be the corresponding vector of sample residuals.

\textit{Valid adjustment sets.}
Consider nodes $X$ and $Y$ in a DAG $\g=(\bV,\bE)$. A node set $\bZ$ is a \emph{valid adjustment set} relative to $(X,Y)$ in $\g$ if for all linear structural equation models compatible with $\g$, $\beta_{yx.\bz}=\tau_{yx}$. We say a valid adjustment set $\mathbf{Z}=\{Z_1,\dots,Z_k\}$ is \emph{minimal} if for all $i \in \{1,\dots,k\}$, $\mathbf{Z}\setminus \{Z_i\}$ is not a valid adjustment set. The class of valid adjustment sets has been fully characterised as follows. 

\textit{Adjustment criterion.} \citep{shpitser2010validity,perkovic16}
  A (possibly empty) set $\bZ$ is a valid adjustment set relative to $(X, Y)$ in $\g$ if and only if
  \begin{enumerate}
    \item $\bZ$ contains no node in \(\mathrm{forb}(X,Y,\g)\), and
    \item $\bZ$ blocks all paths between $X$ and $Y$ in $\mathcal{G}$ that are not directed from $X$ to $Y$.
  \end{enumerate}

\textit{d-separation.}
Consider three disjoint node sets $\bX,\bY$ and $\bZ$ in a DAG $\g=(\bV,\bE)$, such that $\mathbf{V}$ follows a linear structural equation model compatible with $\g$. We can read off from $\g$ whether $\bX$ is independent of $\bY$ given $\bZ$ with a graphical criterion called \emph{d-separation} \citep{pearl2009causality} which we define formally in the supplementary materials. We use the notation $\bX \perp_{\g} \bY\mid\bZ$ to denote that $\bX$ is d-separated from $\bY$ given $\bZ$ in $\g$. 

\section{A Targeted Robustness Test for Covariate Adjustment}

\subsection{The Null Hypothesis and its Properties}
\label{sec:setup}
Suppose we wish to estimate the total effect of a treatment $X$ on a response variable $Y$. Let $\g_0$ denote the unknown true underlying causal graph and suppose we have a candidate causal graph $\g$ that describes our understanding of the underlying causal structure but that we are not certain about. We would like to check whether the candidate graph is plausible, so we can rely on it to estimate $\tau_{yx}$ with some confidence. In order to do so, we use $\g$ to identify a collection of valid adjustment sets $\mathcal{Z}=\{\bZ_{1},\dots,\bZ_{k}\}$ with respect to $(X,Y)$ in $\g$. If $\g$ is correct, each of these sets corresponds to a consistent estimator of $\tau_{yx}$, i.e.,
\begin{equation}
    \tau_{yx}=\beta_{yx.\bz_{1}}=\beta_{yx.\bz_{2}}=\cdots=\beta_{yx.\bz_{k}}.
    \label{eqn:overid}
\end{equation}
If $\mathcal{Z}$ consists of more than one set, then equation~\eqref{eqn:overid} imposes an over-identifying constraint on the total effect $\tau_{yx}$. We use this constraint to test the plausibility of the candidate graph $\g$. The more valid adjustment sets $\mathbf{Z}_i$ we use, the more mistakes in $\g$ the test can detect.


It is generally not possible to directly test the constraint from equation~\eqref{eqn:overid} with observational data because we do not know the true total effect $\tau_{yx}$. It is, however, possible to test the relaxed null hypothesis 
\[
H_{0}: \beta_{yx.\bz_{1}}=\beta_{yx.\bz_{2}}=\cdots=\beta_{yx.\bz_{k}}.
\]
Let $H_{0}^{*}$ denote the null hypothesis associated with equation~\eqref{eqn:overid}. As $H_{0}$ holds whenever $H_{0}^{*}$ does, it follows that any test with type-I error rate control for testing $H_{0}$ also has type-I error rate control for testing $H_{0}^{*}$. In addition, any rejection of $H_{0}$ implies a rejection of $H^{*}_{0}$ and, as a result, a rejection of the candidate graph $\g$. It is therefore reasonable to test $H_{0}$ as a proxy for $H_{0}^{*}$.

There is an even more restrictive null hypothesis $H_{0}^{**}: \g = \g_0$. However, as we are interested in estimating one specific total effect, it is not necessary to validate the entire candidate graph \(\g\), and the distinction between $H_0^*$ and $H_{0}^{**}$ is irrelevant for the purposes of this paper.
There are, however, cases where $H_{0}$ holds but $H_{0}^{*}$ does not and in these cases any test for $H_{0}$ will have no power to reject $H_{0}^{*}$. This occurs whenever $H_{0}$ holds but for all $\bZ_i \in \mathcal{Z}$, $\beta_{yx.\mathbf{z}_i} \neq \tau_{yx}$. Whether this is the case depends on the choice of candidate sets $\mathcal{Z}$ and is more likely if $\mathcal{Z}$ contains few sets. In particular, this is impossible if $\mathcal{Z}$ contains even a single valid adjustment set from the true graph $\g_0$. In response, it is natural to use all available valid adjustment sets relative to $(X,Y)$ in $\g$ to maximise the number of sets in $\mathcal{Z}$. However, even for a moderately sized $\g$ the number of valid adjustment sets relative to the pair $(X,Y)$ can be very large, e.g., there are $72$ in the graph $\g_0$ and $96$ in the graph $\g_1$ from Figure \ref{fig:graph}.
This raises the question whether it is possible to select the collection $\mathcal{Z}$ in a way that minimises the risk of having no power for $H_{0}^{*}$, while simultaneously limiting its size.

\begin{figure}[t!]
\centering
    \begin{tikzpicture}[scale=1]
      \node[state] (X) at (0,0) {$X$};
      \node[state] (Y) at (2,0) {$Y$};
      \node[state] (A1) at (0.5,1) {$A_{1}$};
      \node[state] (A2) at (1.5,1) {$A_{2}$};
      \node[state] (B1) at (0.5,-1) {$B_{1}$};
      \node[state] (B2) at (1.5,-1) {$B_{2}$};
      %\node[state] (M) at (1,0) {$M$};
      \node[state] (V) at (-0.5,1) {$V$};
      \node[state] (D) at (-0.5,-1) {$D$};
      \node[state] (R) at (2.5,1) {$R$};
      \node[state] (F) at (2.5,-1) {$F$};
      \path (X) edge[directed] (Y)
            (X) edge[directed] (D)
            (Y) edge[directed] (F)
            (A1) edge[directed] (X)
            (A1) edge[directed] (A2)
            (A2) edge[directed] (Y)
            (B1) edge[directed] (X)
            (B2) edge[directed] (Y)
            (B2) edge[directed] (B1)
            (V) edge[directed] (X)
            (R) edge[directed] (Y);
    \node at (1, -1.75) {$\g_0$};
    \end{tikzpicture}
    \quad
    \begin{tikzpicture}[scale=1]
      \node[state] (X) at (0,0) {$X$};
      \node[state] (Y) at (2,0) {$Y$};
      \node[state] (A1) at (0.5,1) {$A_{1}$};
      \node[state] (A2) at (1.5,1) {$A_{2}$};
      \node[state] (B1) at (0.5,-1) {$B_{1}$};
      \node[state] (B2) at (1.5,-1) {$B_{2}$};
      %\node[state] (M) at (1,0) {$M$};
      \node[state] (V) at (-0.5,1) {$V$};
      \node[state] (D) at (-0.5,-1) {$D$};
      \node[state] (R) at (2.5,1) {$R$};
      \node[state] (F) at (2.5,-1) {$F$};
      \path (X) edge[directed] (Y)
            (X) edge[directed] (D)
            (Y) edge[directed] (F)
            (A1) edge[directed] (X)
            (A1) edge[directed] (A2)
            (A2) edge[directed] (Y)
            (B1) edge[directed] (X)
            (B2) edge[directed] (Y)
           % (B2) edge[directed] (B1)
            (V) edge[directed] (X)
            (R) edge[directed] (Y);
    \node at (1, -1.75) {$\g_{1}$};
    \end{tikzpicture}
    \newline
    \begin{tikzpicture}[scale=1]
      \node[state] (X) at (0,0) {$X$};
      \node[state] (Y) at (2,0) {$Y$};
      \node[state] (A1) at (0.5,1) {$A_{1}$};
      \node[state] (A2) at (1.5,1) {$A_{2}$};
      \node[state] (B1) at (0.5,-1) {$B_{1}$};
      \node[state] (B2) at (1.5,-1) {$B_{2}$};
      %\node[state] (M) at (1,0) {$M$};
      \node[state] (V) at (-0.5,1) {$V$};
      \node[state] (D) at (-0.5,-1) {$D$};
      \node[state] (R) at (2.5,1) {$R$};
      \node[state] (F) at (2.5,-1) {$F$};
      \path (X) edge[directed] (Y)
            (X) edge[directed] (D)
            (Y) edge[directed] (F)
            (A1) edge[directed] (X)
            (A1) edge[directed] (A2)
            (A2) edge[directed] (Y)
            (B1) edge[directed] (X)
            (B2) edge[directed] (Y)
            (B1) edge[directed] (B2)
            (V) edge[directed] (X)
            (R) edge[directed] (Y);
    \node at (1, -1.75) {$\g_{2}$};
    \end{tikzpicture}
    \quad
    \begin{tikzpicture}[scale=1]
      \node[state] (X) at (0,0) {$X$};
      \node[state] (Y) at (2,0) {$Y$};
      \node[state] (A1) at (0.5,1) {$A_{1}$};
      \node[state] (A2) at (1.5,1) {$A_{2}$};
      \node[state] (B1) at (0.5,-1) {$B_{1}$};
      \node[state] (B2) at (1.5,-1) {$B_{2}$};
      \node[state] (V) at (-0.5,1) {$V$};
      \node[state] (D) at (-0.5,-1) {$D$};
      \node[state] (R) at (2.5,1) {$R$};
      \node[state] (F) at (2.5,-1) {$F$};
      \path (X) edge[directed] (Y)
            (X) edge[directed] (D)
            (X) edge[directed] (A1)
            (X) edge[directed] (B1)
            (Y) edge[directed] (F)
            (A1) edge[directed] (A2)
            (A2) edge[directed] (Y)
            (B2) edge[directed] (Y)
            (B1) edge[directed] (B2)
            (V) edge[directed] (X)
            (V) edge[directed,bend left=70] (Y)
            (R) edge[directed] (Y);
    \node at (1, -1.75) {$\g_{3}$};
    \end{tikzpicture}

\caption{Graphs used in Examples \ref{exp:graph}, \ref{exp:degen} and \ref{exp:ppplot}.}
    \label{fig:graph}
\end{figure}
 
We now provide a sufficient condition on $\mathcal{Z}$ under which the problematic case that $H_0$ holds but $H^*_0$ does not is rare, and as a result testing $H_0$ is a good proxy for testing $H^*_0$. 


\begin{Theorem}
	Consider nodes $X$ and $Y$ in a DAG $\g_0=(\mathbf{V},\mathbf{E})$ such that $Y \in \de(X,\g_0)$. Let $\mathcal{Z}=\{\mathbf{Z}_1,\dots,\mathbf{Z}_k\}$ be a collection of node sets in $\g_0$. Suppose there exists a $\mathbf{Z}_i$, such that $\f{\g_0} \cap \mathbf{Z}_i = \emptyset$ and $\mathbf{Z}_i$ is not a valid adjustment set relative to $(X,Y)$ in $\g_0$. Further, suppose that $(\mathbf{V} \setminus \f{\g_0}) \subseteq \bigcup_{j=1}^k \mathbf{Z}_j$. 
    If we sample the edge coefficients and error variances for a linear structural equation model compatible with $\g_0$ from a distribution $P$ such that $P$ is absolutely continuous with respect to the Lebesgue measure, then $P$-almost surely there exists a $\mathbf{Z}_j$ such that $\beta_{yx.\mathbf{z}_i} \neq \beta_{yx.\mathbf{z}_j}.$
\label{thm:existence}
\end{Theorem}

Verifying that the conditions of Theorem~\ref{thm:existence} hold requires knowledge of the true DAG $\g_0$, which we do not have. Nonetheless, the theorem gives two important but also intuitive insights on how to select $\mathcal{Z}$. First, the sets in $\mathcal{Z}$ should cover as many nodes as possible, i.e., ideally all non-forbidden nodes in the candidate graph $\g$. This maximises the chances that $(\mathbf{V} \setminus \f{\g_0}) \subseteq \bigcup_{j=1}^k \mathbf{Z}_j$. Second, we should minimise the number of nodes that appear in all sets $\mathbf{Z}_i \in \mathcal{Z}$, and some of the candidate sets $\mathbf{Z}_i$ should be as small as possible. This maximises the chance that $\f{\g_0} \cap \mathbf{Z}_i = \emptyset$ for at least one $\mathbf{Z}_i \in \mathcal{Z}$. Note that this is very different from the strategy proposed by \citet{lu2014robustness}.

\begin{Example}
    Consider the graphs from Figure \ref{fig:graph} and the linear structural equation model compatible with $\g_0$, where all edge coefficients and error variances equal 1. We are interested in estimating the total effect $\tau_{yx}$, which here is simply the edge coefficient of the edge $X \rightarrow Y$ and therefore $\tau_{yx}=1$ by the path tracing rules for total effects from \citet{wright1934method}.
    
    We now illustrate for three candidate graphs that differ from $\g_0$, whether we can use tests for the null hypothesis $H_{0}$ to detect the mistakes in the candidate graphs and how this depends on the choice of sets $\mathcal{Z}$.
    
    Consider the candidate graph $\g_1$ and the collection \[\mathcal{Z}=\{\{A_{1}\},\{A_{1},A_{2}\}, \{A_{1},A_{2},R\}\}\] of three valid adjustment sets relative to $(X,Y)$ in $\g_{1}$.
    A direct calculation shows that $\bbeta_{yx.\mathcal{Z}}=(1.25, 1.25,1.25)^{\top}$, none of which are equal to the total effect $\tau_{yx}=1$.
    In this case, the null hypothesis $H_{0}$ is true even though the null hypothesis $H_{0}^{*}$ is false, i.e., testing $H_{0}$ will not detect that there is a mistake in the candidate graph.
    However, if we add the set of non-forbidden nodes $\mathbf{Z}_4=\{A_{1},A_{2},B_{1},B_{2},V,D,R\}$ to $\mathcal{Z}$, then $H_{0}$ no longer holds as $\mathbf{Z}_4$ is a valid adjustment set in $\g_0$ and therefore $\beta_{yx.\mathbf{z}_4}=1$. In this case testing $H_{0}$ will detect that there is a mistake in the candidate graph.
    
    Consider now the candidate graph $\g_{2}$. It has exactly the same valid adjustment sets relative to $(X,Y)$ as the true graph $\g_0$. Testing $H_{0}$ will therefore not detect that there is a mistake in the candidate graph, irrespective of the collection of valid adjustment sets. Since the two graphs are equivalent with respect to estimating the total effect $\tau_{yx}$ with covariate adjustment this is not a concern.
    
    Consider now the candidate graph $\g_{3}$.
    All valid adjustment sets relative to $(X,Y)$ in $\g_{3}$ result in estimates of $1.5$. Therefore testing $H_{0}$ will not detect that there is a mistake in the candidate graph, irrespective of the collection of valid adjustment sets.
    Interestingly, in the graph $\g'_3$ equal to $\g_3$ but with the edge $V \rightarrow Y$ removed, we can detect the mistakes by testing $H_{0}$ with, for example, the collection $\mathcal{Z}$ of all valid adjustment sets in $\g'_3$. This is an example of using an instrument to detect omitted variables bias \citep[cf.][]{chen2015exogeneity}, which our test implicitly exploits.
    \label{exp:graph}
  \end{Example}

\subsection{The Test Statistic}
As a preparatory result and for completeness, we first derive the asymptotic distribution of the estimator $\hat{\bbeta}_{yx.\mathcal{Z}}$.

\begin{Lemma}
  Consider a $p$-dimensional random vector $\bV=(V_{1},V_{2},\dots,V_{p})$ that follows a distribution where $\E(V_{\ell}^{4})<\infty$ for all $1\leq \ell\leq p$. 
  Given two random variables $X,Y\in\bV$, let $\mathcal{Z}=\{\bZ_{1},\bZ_{2},\dots,\bZ_{k}\}$, $k \geq 2$, be a collection of random subvectors of $\bV$ that do not contain $X$ or $Y$, and let $\bZ_{i}'=(X,\bZ^{\top}_{i})^{\top}$ for $i=1,2,\dots,k$. 
  Then the random vector $\sqrt{n}(\hat{\bbeta}_{yx.\mathcal{Z}}-\bbeta_{yx.\mathcal{Z}})$ converges in distribution to a multivariate normal random variable with mean zero and covariance matrix
  \begin{equation}
    (\bSigma_{\mathcal{Z}})_{ij}=\dfrac{\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})}{\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})}, \quad 1\leq i, j \leq k,
    \label{eqn:covmat}
  \end{equation}
  where $\delta_{y\bz_{i}'}=Y-\bbeta_{y\bz_{i}'}\bZ_{i}'$ and $\delta_{x\bz_{i}}=X-\bbeta_{x\bz_{i}}\bZ_{i}$.
  \label{lem:normal}
\end{Lemma}


In general the covariance matrix $\bSigma_{\mathcal{Z}}$ of the limiting normal distribution will not be of full rank, that is, the distribution will be degenerate. To illustrate this, we now give an example.



\begin{Example}
    Consider again the DAG $\g_0$ and the linear structural equation model from Example~\ref{exp:graph}. Let  
    \begin{align*}
        \mathcal{Z}=\{\{A_{1},B_{1}\},\{A_{1},A_{2},B_{1}\},&\{A_{1},B_{1},B_{2}\},\\
        &\{A_{1},A_{2},B_{1},B_{2}\}\}.
    \end{align*}
    The asymptotic covariance matrix $\bSigma_{\mathcal{Z}}$ of $\hat{\bbeta}_{yx.\mathcal{Z}}$ is the rank-3 matrix
    \[
    \begin{pmatrix}
    1.75 & 1.25 & 1.5 & 1 \\
    1.25 & 1.25 & 1 & 1 \\
    1.5 & 1 & 1.5 & 1 \\
    1 & 1 & 1 & 1
    \end{pmatrix}.
    \]
    \label{exp:degen}
  \end{Example}

We now reformulate and slightly generalise the null hypothesis $H_{0}$ from Section~\ref{sec:setup} as follows. Consider a pair of random variables $(X,Y)$ and a collection of random vectors $\mathcal{Z}=\{
\mathbf{Z}_1,\dots,\mathbf{Z}_k\}$. Define a contrast matrix $\bGamma \in \mathbb{R}^{(k-1) \times k}$ such that \(\bGamma\mathbf{1}=\mathbf{0}\) and \(\mathrm{rank}(\bGamma)=k-1\) and
consider the null hypothesis: $H_{0}:\bGamma\bbeta_{yx.\mathcal{Z}}=\mathbf{0}$. Based on the joint asymptotic normality of $\hat{\bbeta}_{yx.\mathcal{Z}}$ we now construct an asymptotically $\chi^{2}$-distributed test statistic for this null hypothesis.

\begin{Definition}[Rank-$r$ Moore-Penrose inverse]
    Consider the spectral decomposition of an $l\times l$ positive semidefinite matrix $\bDelta=\mathbf{P}\mathbf{\Lambda}\mathbf{P}^{\top}$, where $\mathbf{P}$ is the orthonormal matrix of eigenvectors and $\bLambda=\mathrm{diag}(\lambda_{1},\lambda_{2},\dots,\lambda_{l})$ with $\lambda_{1}\geq \lambda_{2}\geq \cdots\geq \lambda_{l}$ the ordered eigenvalues of $\bDelta$.
    The rank-$r$ Moore-Penrose inverse of $\bDelta$ is the matrix $\bDelta^{\dagger}_{r}=\mathbf{P}\bLambda^{\dagger}_{r}\mathbf{P}^{\top}$, where $r\leq \mathrm{rank}(\bDelta)$ and $\bLambda^{\dagger}_{r}=\mathrm{diag}(1/\lambda_{1},\dots,1/\lambda_{r},0,\dots,0)$.
  \end{Definition}

\begin{Theorem}
Assume the same conditions as in Lemma~\ref{lem:normal}.
 Let $\bSigma_{\mathcal{Z}}$ be the covariance matrix of the limiting distribution of $\sqrt{n}(\boldsymbol{\hat{\beta}}_{yx.\mathcal{Z}}-\boldsymbol{\beta}_{yx.\mathcal{Z}})$ and given a $(k-1)\times k$ contrast matrix $\mathbf{\Gamma}$, define $\bDelta_{\mathcal{Z}}=\bGamma\bSigma_{\mathcal{Z}}\bGamma^{\top}$. Suppose that $\hat{\bSigma}_{\mathcal{Z}}$ is a consistent estimator of $\bSigma_{\mathcal{Z}}$ and that $\hat{r}$ is a consistent estimator of $\mathrm{rank}(\bDelta_{\mathcal{Z}})=r_{0}, 1\leq r_0\leq k-1$.
Let $\hat{\bDelta}^{\dagger}_{\mathcal{Z},\hat{r}}$ denote the rank-$\hat{r}$ Moore-Penrose inverse of $\hat{\bDelta}_{\mathcal{Z}}=\mathbf{\Gamma}\hat{\bSigma}_{\mathcal{Z}}\mathbf{\Gamma}^{\top}$.
Then under the null hypothesis $H_{0}: 
\mathbf{\Gamma} \bbeta_{yx.\mathcal{Z}} = 0$, the test statistic
\begin{equation}
    T^{2}_{\hat{r}}=n(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})^{\top}\hat{\bDelta}^{\dagger}_{\mathcal{Z},\hat{r}}(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})
    \label{eq:test-statistic}
\end{equation}
converges in distribution to a $\chi^{2}_{r_{0}}$-distributed random variable.
\label{thm:chisquare}
\end{Theorem}

We can estimate $\bSigma_{\mathcal{Z}}$ consistently by plugging in sample residuals for the population level residuals in equation \eqref{eqn:covmat}. We refer to this estimator as the plug-in estimator and denote it $\hat{\bSigma}_{\mathcal{Z}}$.
For a detailed argument, see Lemma~\ref{lem:consistent} in the supplementary materials.
Note also that for simplicity, we only consider the contrast matrix $\bGamma$ with $1$ at the entries $(j,j)$ and $-1$ at the entries $(j,j+1)$ for $j=1,2,\dots,k-1$, and with zeroes at the remaining entries in this paper. 

\subsection{The Degrees of Freedom}
To compute the Moore-Penrose inverse and the degrees of freedom for the test statistic in equation \eqref{eq:test-statistic}, it is necessary to know the rank of $\bDelta_{\mathcal{Z}}$. There are two possible approaches to this problem. The first is to estimate the rank $r_0$ with some estimate $\hat{r}$. 
The second approach is to select the candidate sets $\mathcal{Z}$ in a way that ensures the matrix $\bDelta_{\mathcal{Z}}$ is invertible. We now develop tools for both approaches.

\subsubsection{Estimating the Degrees of Freedom}
\label{sec:rank}
A standard approach to estimating the rank of a matrix from a noisy observation is information criterion based model selection. This is equivalent to conducting sequential hypothesis tests \citep{camba2009statistical} for the possible ranks. 
In order to apply such model selection to the rank estimation of $\bDelta_{\mathcal{Z}}$, we first show that the half-vectorised plug-in estimator $\vecth(\hat{\bDelta}_{\mathcal{Z}})$ based on the plug-in estimator $\hat{\bSigma}_{\mathcal{Z}}$ is asymptotically normal. 
\begin{Proposition}
  Under the same conditions as in Lemma~\ref{lem:normal}, $\sqrt{n}\vecth(\hat{\bDelta}_{\mathcal{Z}}-\bDelta_{\mathcal{Z}}) \overset{d}{\to} \mathrm{N}(\mathbf{0},\bC)$, where $\bC=\bPi\mathbf{F}\bPi^{\top}$ for some positive semidefinite matrix $\mathbf{F}$, with scaling matrix $\bPi=\bE_{l}(\bGamma\otimes\bGamma)\bD_{k}$. Here, $\bE_{l}$ is the $l(l+1)/2\times l^{2}$ elimination matrix, $l=k-1$ and $\bD_{k}$ is the $k^{2}\times k(k+1)/2$ duplication matrix.
  \label{ppn:vech}
  \end{Proposition}
Based on Proposition \ref{ppn:vech} and a consistent estimator $\hat{\bC}$ of the matrix $\bC$, we may construct a rank estimation procedure from the minimum discrepancy function (MDF) test statistic \citep{cragg1997inferring,donald2007rank}, which has the form \begin{equation}
n\min_{\mathrm{rank}(\tilde{\bDelta}_{\mathcal{Z}})\leq r}\vecth(\hat{\bDelta}_{\mathcal{Z}}-\tilde{\bDelta}_{\mathcal{Z}})^{\top}\hat{\bC}^{-1}\vecth(\hat{\bDelta}_{\mathcal{Z}}-\tilde{\bDelta}_{\mathcal{Z}}). \label{eqn:mdf}
\end{equation} 
This procedure, however, has only been shown to be consistent if either $\bC$  is of full rank \citep{cragg1997inferring} or, in slightly adapted form, if the true rank of $\bC$ is known \citep{ratsimalahelo2003strongly}. 

As we cannot estimate the rank of $\bC$ to estimate the rank of $\bDelta_{\mathcal{Z}}$, we instead propose using a simplified estimator based on the MDF statistic from equation \eqref{eqn:mdf} which we call the pseudo-MDF estimator:
\begin{align}
\begin{split}
    \hat{r}=\underset{r\in\{1,\dots,k-1\}}{\mathrm{argmin}}\left\{n\|\vecth(\hat{\bDelta}_{\mathcal{Z}}-\tilde{\bDelta}_{\mathcal{Z},r})\|_{2}^{2} + \right.\\ \left. \log (n)r(k-1-(r-1)/2)\vphantom{n\|\vecth(\hat{\bDelta}-\tilde{\bDelta}_{r})\|_{2}^{2}}\right\},
    \end{split}
    \label{eqn:pseudobic}
  \end{align}
where $\tilde{\bDelta}_{\mathcal{Z},r}$ is the best rank-$r$ reconstruction of $\hat{\bDelta}_{\mathcal{Z}}$ based on spectral decomposition such that $\tilde{\bDelta}_{\mathcal{Z},r}\tilde{\bDelta}_{\mathcal{Z},r}^{\dagger}=\mathbf{I}$.
Note that we effectively assume that the matrix $\hat{\bC}^{-1}$ is the identity matrix.
In doing so, we ignore the covariance structure between the elements of $\hat{\bDelta}_{\mathcal{Z}}$.
Although the elements of $\hat{\bDelta}_{\mathcal{Z}}$ are likely correlated, the pseudo-MDF rank estimate nonetheless works well empirically (see Section \ref{sec:sim}).

\subsubsection{Selecting $\mathcal{Z}$ to Ensure Full Rank}
\label{sec:select Z}
  \begin{algorithm}[t!]
    \begin{algorithmic}[1]
      \State \textbf{Input}: Candidate graph $\g$, vertices $(X,Y)$, data $\mathcal{D}_{n}$, testing strategy $S\in\{\mathrm{All},\mathrm{Min+}\}$
      \State \textbf{Output}: $p$-value
      \If{$S=\mathrm{All}$}
        \State Set $\mathcal{Z}$ as the collection of all valid adjustment sets relative to $(X,Y)$ in $\g$
      \EndIf
      \If{$S=\mathrm{Min+}$}
        \State Set $\mathcal{Z}$ as a pruned collection of all minimal valid adjustment sets relative to $(X,Y)$ in $\g$ plus the set of non-forbidden nodes
      \EndIf
      \For{each adjustment set $\bZ$ in $\mathcal{Z}$}
        \State Get sample regression residuals $\mathbf{r}_{x\bz}$ and $\mathbf{r}_{y\bz'}$ from data $\mathcal{D}_{n}$, where $\mathbf{Z}'=(X,\mathbf{Z})$
      \EndFor
      \State Compute $\hat{\bSigma}_{\mathcal{Z}}$ and $\hat{\bDelta}_{\mathcal{Z}}$ with regression residuals
      \If{$S=\mathrm{All}$}
        \State Estimate optimal rank $\hat{r}$ from $\hat{\bDelta}_{\mathcal{Z}}$ based on (\ref{eqn:pseudobic})
      \EndIf
      \If{$S=\mathrm{Min+}$}
        \State Set $\hat{r}$ as the cardinality of $\mathcal{Z}$ minus one
      \EndIf
      \State Compute test statistic
      \[
      T^{2}_{\mathrm{obs}} = n(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})^{\top}\hat{\bDelta}^{\dagger}_{\mathcal{Z},\hat{r}}(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})
      \]
      \State Calculate $p\text{-value}=1-F(T^{2}_{\mathrm{obs}})$ where $F(\cdot)$ is the cumulative distribution function of $\chi^{2}_{\hat{r}}$
    \end{algorithmic}
    \caption{Testing procedure}
    \label{alg:test}
  \end{algorithm}

Depending on the choice of candidate sets $\mathcal{Z}$, the asymptotic covariance matrix $\bSigma_{\mathcal{Z}}$ may be of full rank. This is, for example, trivially true if there is only one set in $\mathcal{Z}$. Whenever $\bSigma_{\mathcal{Z}}$ is invertible, the matrix $\bDelta_{\mathcal{Z}}$ is also invertible.
We now propose a strategy to select $\mathcal{Z}$, such that $\bSigma_{\mathcal{Z}}$ is likely to be of full rank and that also follows the guidelines derived from Theorem \ref{thm:existence} in Section \ref{sec:setup}.
  \begin{Lemma}
    Consider nodes $X$ and $Y$ in a DAG $\g=(\bV,\bE)$ such that $Y \in \de(X,\g)$.
    Consider a collection of \[\mathcal{Z}=\{\bZ_{1},\dots,\bZ_{k}\}\cup\{\mathrm{nonforb}(X,Y,\g)\}\] where $\bZ_{i}$, $i=1,\dots,k$, are minimal valid adjustment sets relative to $(X,Y)$ in $\g$. If $\bZ_{i}\setminus (\cup_{j\neq i}\bZ_{j}) \neq \emptyset$ for all $i=1,\dots,k$, $\mathrm{nonforb}(X,Y,\g) \setminus \left( \cup_{i} \bZ_{i} \right) \not\perp_{\g} X$ and we sample the edge coefficients and error variances for a linear structural equation model compatible with $\g$ from a distribution $P$, such that $P$ is absolutely continuous with respect to the Lebesgue measure, then the asymptotic covariance matrix $\bSigma_{\mathcal{Z}}$ for the random vector $\boldsymbol{\hat{\beta}}_{yx.\mathcal{Z}}$ is $P$-almost surely of full rank.
    \label{lem:minimal}
  \end{Lemma}


    In general, the collection of all minimal valid adjustment sets will not fulfil the distinct node condition of Lemma \ref{lem:minimal}. It is, however, easy to prune the set of all minimal valid adjustment sets to obtain a subset that fulfils the conditions of Lemma \ref{lem:minimal} and still covers the same set of nodes as the collection of all minimal valid adjustment sets.

\subsection{The testing procedure}

We propose a testing procedure that, given a pair of nodes $(X,Y)$ in a candidate graph $\g$ and a data set, tests whether adjusting for the valid adjustment sets relative to $(X,Y)$ in $\g$ leads to estimators that converge to the same quantity. The procedure uses the test statistic from equation \eqref{eq:test-statistic} and we propose two strategies to select the collection of valid adjustment sets $\mathcal{Z}$. 

The first strategy, which we call $\mathrm{All}$, considers all available valid adjustment sets. This strategy is likely to lead to the best power for the test, but it requires estimating the degrees of freedom for the test statistic's asymptotic distribution. This is a difficult problem (see Section \ref{sec:rank}) and as a result the solution we propose does not have a formal consistency guarantee, although it performs well in practice (see Section \ref{sec:sim}). In addition, computing all valid adjustment sets is very computationally expensive, especially for moderate to large graphs, and as a result this strategy may often not be computationally feasible.

The second strategy, which we call $\mathrm{Min}+$, is to prune the collection of all minimal valid adjustment sets as explained in Section \ref{sec:select Z} and then add the set of all non-forbidden nodes, which under the assumption $Y \in \de(X,\g)$ is always a valid adjustment set (see Lemma \ref{lemma:nonforb} in the supplementary materials). This approach avoids estimating the degrees of freedom but may lead to a loss of power. Note that if $Y \notin \de(X,\g)$ we would need to replace the non-forbidden nodes with another large set such as $\mathrm{Adjust}(X,Y,\g)$ \citep{perkovic16}. However, if $Y \notin \de(X,\g)$ every set that d-separates $X$ and $Y$ is a valid adjustment set and therefore our problem reduces to checking d-separation statements, for which there exists a wide literature on conditional independence tests \citep[e.g.][]{spirtes2000causation}. Therefore, we disregard this case. 

Another major advantage of the $\mathrm{Min}+$ strategy is that it avoids the computationally heavy task of computing all valid adjustment sets. The number of minimal valid adjustment sets is typically much smaller than the number of valid adjustment sets and as a result the polynomial-delay algorithm by \citet{van2014constructing}, which we use to enumerate all minimal valid adjustment sets, is generally quite fast. We verify this in a small simulation study, where the \(\mathrm{Min}+\) strategy ran on sparse graphs of size up to \(5000\) (see Section~\ref{app:sim} in the Supplementary Materials).

We summarise the testing procedure in Algorithm \ref{alg:test}. As discussed in Section \ref{sec:setup}, the test cannot detect all types of mistakes in $\g$, but it nonetheless serves as a simple and targeted robustness check. 

\begin{Example}
To illustrate our testing procedure, we revisit the linear structural equation model from Example~\ref{exp:graph} as well as the true graph and candidate graphs shown in Figure~\ref{fig:graph}. In addition, we also consider the candidate graph $\g^{\prime}_{3}$ which is the graph $\g_{3}$ with the edge $V\to Y$ deleted.
To each candidate graph we apply the testing procedure with both testing strategies (see Algorithm \ref{alg:test}).
Recall that for the candidate graphs $\g_0$ and $\g_{3}$ the null hypothesis is true, while it is false for the candidate graphs $\g_{1}$ and $\g^{\prime}_{3}$. We sample $100$ data sets with $n=25$ observations, $100$ sets with $n=100$ observations, and another $100$ sets with $n=400$ observations from the underlying linear structural equation model and apply our testing procedure to each of these data sets. The resulting $p$-values are shown as probability-probability plots against the standard uniform distribution in Figure~\ref{fig:running-example}. We explain the construction of these plots more thoroughly in Section~\ref{sec:ppplot} of the Supplementary Materials. When the null hypothesis is true, we see that both strategies lead to close to uniform $p$-values, especially when $n>25$. We do not consider $\g_2$ as it is equivalent to $\g_0$ in terms of valid adjustment sets. We also observe reasonable power for $n>25$ and $\g_1$, especially with strategy $\mathrm{All}$. For $\g_3'$ on the other hand the power is mediocre except for $\mathrm{All}$ and $n=400$. 
    \label{exp:ppplot}
\end{Example}

\begin{figure}[t!]
	\centering
	\scalebox{0.425}{
	\includegraphics{fig/running-example-plot.pdf}
	}
	\caption{Probability-probability plots of the $p$-values in Example~\ref{exp:ppplot}.
	Theoretical probabilities are from the cumulative distribution function of a standard uniform distribution.
	Rows: test strategies.
	Columns: sample sizes for the test.}
	\label{fig:running-example}
\end{figure}

\section{Simulations}\label{sec:sim}


We investigate the finite sample performance of the testing procedure from Algorithm~\ref{alg:test} in a simulation study.
The study is structured as follows. We randomly generate \(50\) DAGs for each graph size from \(\{10,15\}\). The expected neighbourhood size is sampled uniformly from \(\{2,3,4,5\}\) for each graph. Then, for each DAG $\g_0$ we randomly generate a compatible linear structural equation model by (i) sampling the edge coefficients uniformly from the interval \([-2,-0.1]\cup[0.1,2]\), (ii) sampling the error distribution uniformly to either be normal, \(t\), uniform or logistic for all errors and (iii) uniformly sampling scale parameters which depend on the error distribution such that the error variances are in the interval \([0.4,1.6]\). We randomly choose a pair of nodes \((X,Y)\) such that \(Y\in\mathrm{de}(X,\mathcal{G}_{0})\) and that there exist at least two valid adjustment sets relative to \((X,Y)\) in \(\g_0\).

For each true DAG $\g_0$ we then sample $40$ data sets from the corresponding linear structural equation model. The sample size $m$ is \(100\) for half of these data sets, and \(400\) for the other half. With each of these data sets we estimate a causal graph $\g$ using either the Greedy Equivalence Search (GES) algorithm \citep{Chickering02} if the errors of the linear structural equation model are normal, or the Linear Non-Gaussian Acyclic Models (LiNGAM) algorithm \citep{shimizu2006linear} otherwise. We do this to generate a large number of plausible candidate causal graphs for our testing procedure. We use the sample sizes $100$ and $400$ to ensure that some of the candidate graphs contain more errors and some fewer. We refer to the candidate graphs that were generated using the sample size $100$ as low accuracy candidate graphs and to those that were generated with the sample size $400$ as high accuracy ones.

\begin{figure}
    \centering
    \scalebox{0.6}{
    \begin{tikzpicture}[scale=1.1]
      \node at (-4,0) {Underlying model};
      \node at (-4,-1.5) {Candidate graphs};
      \node at (-4,-3) {$p$-values};
      \node (g0) at (0,0) {$\mathcal{G}_{0}$};
      \node (g1) at (-2,-1.5) {$\mathcal{G}_{1}$};
      \node (g2) at (-0.5,-1.5) {$\mathcal{G}_{2}$};
      \node at (0.5,-1.5) {$\cdots$};
      \node (g3) at (2,-1.5) {$\mathcal{G}_{20}$};
      \node (p1) at (-2,-3) {$p^{(1)}$};
      \node (p2) at (-1,-3) {$p^{(2)}$};
      \node at (0,-3) {$\cdots$};
      \node (p3) at (1,-3) {$p^{(100)}$};
      \path (g0) edge[directed] node[pos=0.25,left] {\small$D_{m,1}$} (g1);
      \path (g0) edge[directed] node[pos=0.5,right] {\small$D_{m,2}$} (g2);
      \path (g0) edge[directed] node[pos=0.25,right] {\small$D_{m,20}$} (g3);
      \path (g2) edge[directed] node[pos=0.25,left] {\small$D_{n,1}$} (p1);
      \path (g2) edge[directed] node[pos=0.5,right] {\small$D_{n,2}$} (p2);
      \path (g2) edge[directed] node[pos=0.25,right] {\small$D_{n,100}$} (p3);
      \draw [-latex] (3,-3) -- (3,0.2) node[left] {};
      \draw [-latex] (3,-3) -- (6.2,-3) node[below] {};
      \fill[color=gray!20] (3,-3) -- (3.05,-2.7) -- (3.1,-2.4) -- 
            (3.15,-2.1) -- (3.25,-1.8) -- (3.35,-1.5) -- 
            (3.6,-1.2) -- (3.7,-0.9) -- (4.1,-0.6) -- 
            (4.6,-0.3) -- (5.5,0) -- (6,0) -- (6,-3) -- cycle;
     \foreach \Point in {(3,-3), (3.05,-2.7), (3.1,-2.4), 
            (3.15,-2.1), (3.25,-1.8), (3.35,-1.5), 
            (3.6,-1.2), (3.7,-0.9), (4.1,-0.6), 
            (4.6,-0.3), (5.5,0), (6,0)}{
    \fill \Point circle[radius=1pt];
        }
      \draw (3,-3) -- (3.05,-3) -- (3.05,-2.7) -- (3.1,-2.7) -- (3.1,-2.4) -- (3.15,-2.4) --
            (3.15,-2.1) -- (3.25,-2.1) -- (3.25,-1.8) -- (3.35,-1.8) -- (3.35,-1.5) -- (3.6,-1.5) --
            (3.6,-1.2) -- (3.7,-1.2) -- (3.7,-0.9) -- (4.1,-0.9) -- (4.1,-0.6) -- (4.6,-0.6) --
            (4.6,-0.3) -- (5.5,-0.3) -- (5.5,0) -- (6,0);
        \draw[dotted,semithick] (3,-3) -- (6,0);
        \draw[dotted,semithick] (3,0) -- (6,0);
      \node at (4.8,-1.8) {AUC};
      \node at (4.5,-3.5) {prob.-prob. plot};
      \draw [decorate,decoration={brace,amplitude=3pt,mirror},semithick] (-2,-3.3) -- (1,-3.3);
      \draw [-latex,semithick] (-0.5,-3.4) -- (-0.5,-3.5) -- (3.5,-3.5);
    \end{tikzpicture}}
    \caption{An illustration of the double simulation scheme for the simulation study and an illustration of the AUC metric.}
    \label{fig:double_sim}
\end{figure}

For each candidate graph $\g$ and each sample size $n\in\{50,100,200,400\}$ we do the following procedure.
We sample an additional $100$ data sets with sample size $n$ from the corresponding true linear structural equation model. Given these data sets, the pair $(X,Y)$ and the candidate graph $\g$, we apply Algorithm~\ref{alg:test} using both strategies for graphs with \(10\) or \(15\) nodes. To measure the performance of our testing procedure we then compute the area under the curve (AUC) of the probability-probability plot of the \(100\) $p$-values against the uniform distribution. This means that in total we obtain $8$ AUCs for each candidate graph, i.e., $4$ for each testing strategy and $2$ for each sample size. Figure~\ref{fig:double_sim} gives an illustration of the layered simulation scheme. We give further details for the design of the simulation study in Section \ref{app:setup} of the Supplementary Materials.

Figure~\ref{fig:double-sim} is an ensemble of violin plots of the AUCs from the simulation study. As we have access to the true graph, we can decide for each candidate graph and testing strategy, whether the null hypothesis for the test, i.e. $H_0$, is true or false and plot these cases separately. Figure~\ref{fig:double-sim} shows that as the sample size of the data set used for our testing procedure increases, the AUCs in the cases where the null hypothesis is true become centred around $0.5$. This indicates that both strategies control the type-I error rate asymptotically.
There are, however, very large and small AUCs when we use the strategy $S=\mathrm{All}$ with small sample sizes and for the more accurate candidate graphs. This is likely due to the rank estimation step required for this strategy and indicates that as expected the strategy $\mathrm{Min}+$ is more stable than $\mathrm{All}$.

\begin{figure}[t!]
	\centering
	\scalebox{0.45}{
	\includegraphics{fig/double-sim-boxplot-final.pdf}
	}
	\caption{Violin plots (layered with boxplots) of the areas under the curve (AUC) from the simulation study.
	The AUCs are grouped by sample size for the testing procedure (first row) and the expected accuracy of the candidate graph (second row).}
	\label{fig:double-sim}
\end{figure}

In the cases where the null hypothesis is false, i.e., the candidate graph contains a mistake that the test can detect, the AUCs have a cluster close to $1$ which is especially pronounced for the larger sample sizes. This indicates that our testing procedure has good power in many cases. Unsurprisingly, the AUCs are smaller for the candidate graphs with fewer errors, since it is more difficult to detect that there is a mistake in an almost correct graph than in a glaringly incorrect one. Nonetheless, the AUCs remain respectable and there continue to be AUCs close to $1$.
In general, the AUCs for strategy $\mathrm{All}$ are larger than those for $\mathrm{Min}+$, although this gain is obtained at the price of a loss in stability.

\begin{table}[ht!]
    \centering
    {\scriptsize
    \begin{tabular}{cccccc}
    \toprule
        Cand. graph & & \multicolumn{2}{c}{Null (\(H_{0}\))} & \multicolumn{2}{c}{Alternative ($\neg H_0$)} \\
        \cmidrule(lr){3-4} \cmidrule(lr){5-6}
        accuracy & $n$ & $S=\mathrm{Min+}$ & $S=\mathrm{All}$ & $S=\mathrm{Min+}$ & $S=\mathrm{All}$ \\
        \midrule
        \multirow{4}{*}{Low} & 50 & 0.0753 & 0.0903 & 0.5570 & 0.7396\\
        & 100 & 0.0634 & 0.0626 & 0.6352 & 0.7880\\
        & 200 & 0.0540 & 0.0510 & 0.7132 & 0.8341\\
        & 400 & 0.0494 & 0.0468 & 0.7887 & 0.8812\\
        \midrule
        \multirow{4}{*}{High} & 50 & 0.0781 & 0.0887 & 0.1543 & 0.1697\\
        & 100 & 0.0631 & 0.0585 & 0.2026 & 0.2094\\
        & 200 & 0.0559 & 0.0499 & 0.2838 & 0.3010\\
        & 400 & 0.0540 & 0.0471 & 0.3838 & 0.4152\\
        \bottomrule
    \end{tabular}
    }
    \caption{Proportion of hypotheses rejected at level $0.05$ in the simulation study.}
    \label{tab:my_label}
\end{table}

The AUCs we consider do not capture the behaviour of our testing procedure fully, so we also calculate the proportion of tests rejected at level $0.05$ among all tests performed in the simulation study as an additional metric. The results are given in Table \ref{tab:my_label}. They indicate that for both strategies our testing procedure controls the type-I error rate asymptotically and at the same time has good power for the alternative. 

Note that for conciseness we have only analysed the performance of our procedure for testing the test null $H_0$ and not the stricter \(H_{0}^{*}\). However, since $H_0$ is in fact the null hypothesis our procedure is formally testing, the performance does not differ meaningfully between the two cases that (i) \(H_{0}^{*}\) holds and that (ii) \(H_{0}^{*}\) does not hold but \(H_{0}\) does. We verify this in Section~\ref{app:sim} of the Supplementary Materials. 
We also investigate how often the problematic case in which \(H_{0}\) holds but \(H_{0}^{*}\) does not arises in our simulation study, i.e., the case in which the testing procedure has no power to detect a meaningful mistake in the candidate graph: it never occurs in more than $15 \%$ of the cases where \(H_{0}\) holds, although the actual percentage is much lower for some settings of our simulation study (see Table~\ref{tab:double-sim} in the Supplementary Materials).

We also investigate the performance of our testing procedure in an additional simulation study with graphs of size \(20\), \(40\) and \(80\). Here, we only consider the \(\mathrm{Min}+\) strategy as the \(\mathrm{All}\) strategy is too computationally expensive.
Due to space constraints we provide the results in Section~\ref{app:sim} of the Supplementary Materials, but they do not differ meaningfully from the results for the smaller graphs.

\section{Real data example}\label{sec:RD}
We apply our testing procedure to the single cell data collected for the investigation of human primary naïve CD$4^{+}$ T cell signalling networks by \citet{sachs2005causal}.
This data set consists of measurements from a total of $9$ experimental conditions.
We will only use the data from the observational regime, which corresponds to the experimental setting with reagent anti-CD3/CD28.
This subset of the data consists of $853$ measurements of $11$ phosphorylated proteins and phospholipids.
The observational data is thought to be consistent with the conventionally accepted molecular interaction network (also called the consensus graph, $\g_{\mathrm{Consensus}}$, Figure~\ref{fig:sc_graph} left). We use the alternative graph proposed by \citet{sachs2005causal} ($\g_{\mathrm{Sachs}}$, Figure~\ref{fig:sc_graph} right) to evaluate the results of the analysis.


We consider $\g_{\mathrm{Consensus}}$ as the candidate graph $\g$. We extract all pairs of nodes $(X,Y)$ that satisfy $Y\in\mathrm{de}(X,\g)$. There are $36$ such node pairs in the graph.
For every pair, we apply our testing procedure with strategy $\mathrm{All}$ to the log-transformed and centred observational data.
After a Bonferroni correction only the $p$-values for the pairs (PKA, Erk) and (PKA, Akt) are significant at the $0.05$ level ($1.19\times 10^{-14}$ and $4.91\times 10^{-14}$).

\begin{figure}[t!]
\centering
	\scalebox{0.5}{
	\input{fig/sc_graph.tex}
	}
	\caption{Causal DAGs representing intracellular signalling network among human primary naïve CD$4^{+}$ T cells.}
	\label{fig:sc_graph}
\end{figure} 

We now take a closer look at these two node pairs.
The collection $\mathcal{Z}$ of all valid adjustment sets relative to (PKA, Erk) in the consensus graph consists of $419$ sets.
The test rejects the null hypothesis that these adjustment sets lead to estimates of the same quantity.
To illustrate a potential error in $\g_{\mathrm{Consensus}}$, we consider the valid adjustment sets $\emptyset$ and \{PLCg, PIP2, PIP3, Akt, PKC, p38, JNK\}.
If we consider the alternative graph $\g_{\mathrm{Sachs}}$ as a more appropriate representation of the true data generating mechanism, the rejection is justifiable. The covariate Akt is a forbidden node in $\g_{\mathrm{Sachs}}$ because it opens a collider path PKA $\to$ Akt $\gets$ Erk. On the other hand, the empty set is also a valid adjustment set in $\g_{\mathrm{Sachs}}$. A similar argument applies to the pair (PKA, Akt).
The collection $\mathcal{Z}$ relative to this node pair also has a size of $419$, among which we can look at adjustment sets $\emptyset$ and \{Raf, Mek, PLCg, PIP2, PIP3, Erk, PKC, p38, JNK\}. Using Erk is problematic as it is a forbidden node in $\g_{\mathrm{Sachs}}$ because it blocks the causal path PKA $\to$ Erk $\to$ Akt. This indicates that in both cases our testing procedure is detecting a mistake in the consensus graph.

Our testing procedure produces rank estimates of $\bDelta_{\mathcal{Z}}$ mostly at $1$ ($30$ out of $36$ cases), even though the size of $\mathcal{Z}$ goes up to $419$. This illustrates how a large number of adjustment sets does not necessarily mean a large number of effective over-identifying constraints on the total effect for the test. It is unsurprising that our testing procedure with the $\mathrm{Min}+$ strategy detects the same two pairs of nodes as problematic ($p$-values $7.04\times 10^{-15}$ and $7.35\times 10^{-15}$).

\section{Conclusion and discussion}\label{sec:diss}
In this paper, we propose a robustness test that checks whether it is reasonable to use a candidate causal graph to estimate a total effect of interest with covariate adjustment. This is a useful model validation tool for practitioners who wish to estimate a total effect with covariate adjustment and rely on causal graphs obtained from domain knowledge.

We develop our testing procedure assuming that the candidate graph is a DAG. In applications with unmeasured confounding between the covariates, it is more natural to assume that the candidate graph is an acyclic directed mixed graph (ADMG) with bi-directed edges that represent error correlations induced by the presence of unmeasured confounding. If the candidate ADMG contains at least two valid adjustment sets, it is also possible to apply our testing procedure in this setting with one limitation. The set $\mathrm{nonforb}(X,Y,\g)$ may not be a valid adjustment set and as a result the strategy $\mathrm{Min}+$ fails. We believe it is possible to adapt $\mathrm{Min}+$ to an ADMG by replacing $\mathrm{nonforb}(X,Y,\g)$ with a suitable alternative large valid adjustment set but we leave this for future research.

Another interesting idea for future research is that in general, given a valid adjustment set and a forbidden node, adding the node to the set should change the limit of the resulting estimator. It may be possible to exploit this in order to devise a testing procedure similar to the one proposed in this paper but that also exploits the information contained in the forbidden nodes of the candidate causal graph.

\begin{acknowledgements} 
    We thank Milan Kuzmanovic for proposing the idea for Lemma \ref{lem:minimal}.
    We also thank Vi Thanh Pham, Nicola Gnecco and Jonas Peters for feedback and insightful discussions. LH was supported by a research grant (18968) from VILLUM FONDEN.
\end{acknowledgements}

\bibliography{su_509.bib}

\end{document}