% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
% \usepackage{siunitx} 
\usepackage{booktabs} 
\usepackage{tikz}
\usepackage{nameref}
\usepackage{zref-xr}
\zxrsetup{toltxlabel}
\zexternaldocument*{su_509}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets the two mandatory arguments in reverse
% order with the separator in between, i.e. <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \ci: (conditional) independence relation symbol. \mathpalette hands the
% current math style to \independenT as #1 and \perp as #2; \independenT then
% overlays two copies of the symbol, the second shifted right by 2mu, and
% gives the result relation spacing via \mathrel.
\newcommand\ci{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}

% Upright operator names for graphical sets. The appendix text uses pa, an and
% de for parents, ancestors and descendants; the "poss" variants print the same
% names prefixed by "poss".
% NOTE(review): \Forbb prints "forb" (presumably the forbidden set) and \CN
% prints "cn"; their definitions are not given in this file -- confirm against
% the main paper.
\DeclareMathOperator{\Forbb}{forb}
\DeclareMathOperator{\pa}{pa}
% \possPa and \posspa are two spellings of the same operator ("posspa");
% likewise \possDe/\possde and \possAn/\possan below.
\DeclareMathOperator{\possPa}{posspa}
\DeclareMathOperator{\posspa}{posspa}
\DeclareMathOperator{\de}{de}
\DeclareMathOperator{\possDe}{possde}
\DeclareMathOperator{\possde}{possde}
\DeclareMathOperator{\an}{an}
\DeclareMathOperator{\possAn}{possan}
\DeclareMathOperator{\possan}{possan}
\DeclareMathOperator{\CN}{cn}
\DeclareMathOperator{\possCN}{posscn}
\DeclareMathOperator{\Opt}{O}
% Probability and matrix operators: variance, covariance, expectation,
% vectorisation (vec), half-vectorisation (vech) and diag.
\DeclareMathOperator{\Var}{\mathrm{Var}}
\DeclareMathOperator{\Cov}{\mathrm{Cov}}
\DeclareMathOperator{\E}{\mathrm{E}}
\DeclareMathOperator{\vect}{\mathrm{vec}}
\DeclareMathOperator{\vecth}{\mathrm{vech}}
\DeclareMathOperator{\diag}{\mathrm{diag}}

% Text shortcuts for "maximal PDAG" in lower-case, capitalised and upper-case form.
\newcommand{\mpdag}{maximal PDAG}
\newcommand{\Mpdag}{Maximal PDAG}
\newcommand{\MPDAG}{MAXIMAL PDAG}
% \pstar[<p>]: the symbol <p> (default p) with a superscript star.
\newcommand{\pstar}[1][p]{{#1}^{*}}
% \g[<G>]: calligraphic graph symbol (default \mathcal{G}). A following
% subscript attaches to the expansion, so \g_0 prints \mathcal{G}_0.
\newcommand{\g}[1][G]{\mathcal{#1}}
% \f[<X,Y>]{<graph>}: forb(<X,Y>,<graph>); \fb is the same with the optional
% argument wrapped in \mathbf. The optional argument defaults to "X,Y".
\newcommand{\f}[2][X,Y]{\Forbb(#1,#2)}
\newcommand{\fb}[2][X,Y]{\Forbb(\mathbf{#1},#2)}
% \cn / \cnb: cn(<X,Y>,<graph>), plain and with bold optional argument.
\newcommand{\cn}[2][X,Y]{\CN(#1,#2)}
\newcommand{\cnb}[2][X,Y]{\CN(\mathbf{#1},#2)}
% \cns{<graph>} and \opts{<graph>}: fixed arguments bold X and plain Y.
\newcommand{\cns}[1]{\CN(\mathbf{X},Y,#1)}
\newcommand{\opts}[1]{\mathbf{O}(\mathbf{X},Y,#1)}
% \posscn / \posscnb: posscn(<X,Y>,<graph>); these wrap the operator \possCN.
\newcommand{\posscn}[2][X,Y]{\possCN(#1,#2)}
\newcommand{\posscnb}[2][X,Y]{\possCN(\mathbf{#1},#2)}
% \opt / \optb: bold O(<X,Y>,<graph>), plain and with bold optional argument.
\newcommand{\opt}[2][X,Y]{\mathbf{O}(#1,#2)}
\newcommand{\optb}[2][X,Y]{\mathbf{O}(\mathbf{#1},#2)}
% \vsp: fixed vertical skip of 0.2cm.
\newcommand{\vsp}{\vspace{.2cm}}

% Bold-symbol shorthands of the form \b<name>: Greek letters are set with
% \boldsymbol, Latin letters with \mathbf. In the appendix, bold capitals
% denote sets or vectors of variables (e.g. \bV is the node set).
\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\bepsilon}{\boldsymbol{\epsilon}}
\newcommand{\btau}{\boldsymbol{\tau}}
\newcommand{\bdelta}{\boldsymbol{\delta}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bxi}{\boldsymbol{\xi}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\bx}{\mathbf{x}}
% \bX and \bbX expand to the same symbol.
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bbX}{\mathbf{X}}
\newcommand{\bY}{\mathbf{Y}}
\newcommand{\bc}{\mathbf{c}}
\newcommand{\bC}{\mathbf{C}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bZ}{\mathbf{Z}}
\newcommand{\bK}{\mathbf{K}}
\newcommand{\bW}{\mathbf{W}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\bV}{\mathbf{V}}
\newcommand{\be}{\mathbf{e}}
\newcommand{\bE}{\mathbf{E}}
\newcommand{\bA}{\mathbf{A}}
\newcommand{\ba}{\mathbf{a}}
\newcommand{\br}{\mathbf{r}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bU}{\mathbf{U}}
\newcommand{\bO}{\mathbf{O}}
\newcommand{\bp}{\mathbf{p}}
\newcommand{\bP}{\mathbf{P}}
\newcommand{\bLambda}{\boldsymbol{\Lambda}}
\newcommand{\bOmega}{\boldsymbol{\Omega}}
\newcommand{\bGamma}{\boldsymbol{\Gamma}}
\newcommand{\bDelta}{\boldsymbol{\Delta}}
\newcommand{\bB}{\mathbf{B}}
\newcommand{\bD}{\mathbf{D}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\bI}{\mathbf{I}}
% Sans-serif symbols for samples: \Xsf is a sans-serif X (sample vector of a
% scalar variable), \bZsf a bold sans-serif Z (sample matrix of a vector
% variable), as used in the proofs.
\newcommand{\bZsf}{\boldsymbol{\mathsf{Z}}}
\newcommand{\Xsf}{\mathsf{X}}
\newcommand{\bPi}{\boldsymbol{\Pi}}

% NOTE(review): tikz is already loaded earlier in this preamble, and several
% libraries below (arrows.meta, arrows, positioning) are requested more than
% once; repeated loading is harmless but redundant.
\usepackage{tikz}
\usetikzlibrary{arrows.meta,arrows,patterns}
\usetikzlibrary{shapes,decorations,decorations.pathreplacing,arrows,calc,arrows.meta,fit,positioning}
\usetikzlibrary{arrows.meta,automata,positioning,quotes}
% Shared TikZ styles for graph drawings.
\tikzset{
    % directed edge: Latex arrow tip at the end, semithick line
    directed/.style={-Latex,semithick},
    %state/.style ={circle,draw,minimum width=0.7cm},
    % node style with a minimum width only (the circled variant above is disabled)
    state/.style ={minimum width=0.5cm},
    % small filled circular node without text
    point/.style = {circle, draw, inner sep=0.04cm,fill,node contents={}},
    % bidirected edge: Latex tips at both ends, dashed line
    bidirected/.style={Latex-Latex,dashed},
    % edge label: small inner separation, left-aligned, rotated along the edge
    el/.style = {inner sep=2pt, align=left, sloped}
}

% Theorem-like environments (amsthm). All numbered environments below share
% the single counter of the base environment Satz.
\usepackage{amsmath,amssymb}
\usepackage{amsthm}
% Style "break": italic body, bold head, line break after the head.
% NOTE(review): this style is defined but not selected by any \theoremstyle
% call in the visible part of the file.
\newtheoremstyle{break}
  {\topsep}{\topsep}%
  {\itshape}{}%
  {\bfseries}{}%
  {\newline}{}%
  
\newtheorem{Satz}{Satz}
% Statements set in the standard "plain" style.
\theoremstyle{plain}
\newtheorem{Lemma}[Satz]{Lemma}
\newtheorem{Corollary}[Satz]{Corollary}
\newtheorem{Theorem}[Satz]{Theorem}
\newtheorem{Proposition}[Satz]{Proposition}
\newtheorem{Fact}[Satz]{Fact}

% Style "breakdfn": like "break" but with an upright body.
% NOTE(review): also not selected anywhere in the visible part of the file.
\newtheoremstyle{breakdfn}
  {\topsep}{\topsep}%
  {\upshape}{}%
  {\bfseries}{}%
  {\newline}{}%

% Environments set in the standard "definition" style.
\theoremstyle{definition}
\newtheorem{Example}[Satz]{Example}
\newtheorem{Assumption}[Satz]{Assumption}
\newtheorem{Definition}[Satz]{Definition}

% Environments set in the standard "remark" style; Remark is unnumbered.
\theoremstyle{remark}
\newtheorem{Conjecture}[Satz]{Conjecture}
\newtheorem*{Remark}{Remark}

\usepackage{algorithm}
\usepackage[noend]{algpseudocode}

\definecolor{mygreen}{RGB}{0,110,51}

\usepackage{rotating}
\usepackage{afterpage}

\usepackage{multirow}
\usepackage{booktabs}

\title{A Robustness Test for Estimating Total Effects with Covariate Adjustment (Supplementary Materials)}

% The standard author block has changed for UAI 2021 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:zehao.su@sund.ku.dk?subject=Your UAI 2022 Paper}{Zehao Su}}
\author[2]{Leonard Henckel}
% Add affiliations after the authors
\affil[1]{%
    Section of Biostatistics\\
    Department of Public Health\\
    University of Copenhagen
}
\affil[2]{%
    Department of Mathematical Sciences\\
    University of Copenhagen
}
  
\begin{document}
\onecolumn
\maketitle
\appendix
\section{Graphical preliminaries}
\label{app:graph}

\paragraph{Graphs}
A graph $\mathcal{G}=(\bV,\bE)$ is a tuple of a node set $\bV$ and an edge set $\bE$. We consider simple directed graphs where there is at most one edge between any pair of vertices and the edges are of the form $\rightarrow$.
\paragraph{Walks, paths and cycles}
  Two vertices are adjacent if there is an edge between them.
  A \emph{walk} between $X$ and $Y$ is a sequence of vertices $(X,\dots,Y)$ such that successive vertices are adjacent.
  A \emph{path} between $X$ and $Y$ is a walk between $X$ and $Y$ where all vertices are distinct.
  A \emph{directed path} from $X$ to $Y$ is a path between $X$ and $Y$ where all the edges point towards $Y$.
  A \emph{cycle} is a path $(X,Z,\dots,Y)$ plus an edge between $Y$ and $X$.
  A \emph{directed cycle} is a directed path $(X,Z,\dots,Y)$ from $X$ to $Y$ plus an edge $Y\to X$. Given a path $p=(V_1,\dots,V_k)$, let $p(V_i,V_j)$, $i<j$, denote the path segment from $V_i$ to $V_j$ and let $-p=(V_k,\dots,V_1)$. Given two paths $p=(V_1,\dots,V_k)$ and $q=(V_k,\dots,V_m)$, let $p \oplus q = (V_1,\dots,V_k,\dots,V_m)$. We call any node $V_i$, $1<i<k$, on a path $p=(V_1,\dots,V_k)$ such that $V_{i-1}\rightarrow V_i \leftarrow V_{i+1}$ a collider on $p$, and any node on $p$ that is not a collider on $p$ a non-collider on $p$.

\paragraph{DAG}
  A \emph{directed acyclic graph} (DAG) is a directed graph without directed cycles.

\paragraph{Parents, children, ancestors and descendants}
  If $X\to Y$, then $X$ is a parent of $Y$ and $Y$ is a child of $X$.
  If there is a directed path from $X$ to $Y$, then $X$ is an ancestor of $Y$ and $Y$ is a descendant of $X$.
  Any node is an ancestor and a descendant of itself.
  For any node $X\in\bV$, the sets of parents, children, ancestors and descendants of $X$ in $\mathcal{G}$ are denoted by $\mathrm{pa}(X,\mathcal{G})$, $\mathrm{ch}(X,\mathcal{G})$, $\mathrm{an}(X,\mathcal{G})$ and $\mathrm{de}(X,\mathcal{G})$, respectively.
This definition applies disjunctively to sets of nodes. For example, the parents of the set of vertices $\bX$ are defined as $\mathrm{pa}(\bX,\mathcal{G})=\cup_{X\in\bX}\mathrm{pa}(X,\mathcal{G})$. The non-descendants of $\bX$ are $\mathrm{nonde}(\bX,\mathcal{G})=\bV\setminus\mathrm{de}(\bX,\mathcal{G})$.

\paragraph{$d$-separation}
  A path $p$ between $X$ and $Y$ is blocked by a set $\bZ$ if at least one of the following conditions holds:
  \begin{enumerate}[label=(\roman*)]
    \item There is a non-collider on $p$ that is in $\bZ$;
    \item There is a collider on $p$ such that none of its descendants (including the collider itself) is in $\bZ$.
  \end{enumerate}
  A path that is not blocked is said to be open. If all paths between $X\in\bX$ and $Y\in\bY$ are blocked by $\bZ$, then $\bX$ and $\bY$ are \emph{$d$-separated} by $\bZ$, denoted by $\bX\perp_{\mathcal{G}}\bY\mid\bZ$. Otherwise, they are said to be \emph{$d$-connected} by $\bZ$.
  
  \paragraph{Faithfulness}
Consider a DAG $\g=(\bV,\bE)$ such that $\bV$ follows a linear structural equation model compatible with $\g$. If, for all disjoint subsets $\bX,\bY$ and $\bZ$ of $\bV$, conditional independence of $\bX$ and $\bY$ given $\bZ$ implies $\bX\perp_{\mathcal{G}}\bY\mid\bZ$, then we say that the distribution of $\bV$ is faithful to $\g$.

\section{Proofs}
\subsection{Proof of Theorem~\ref{thm:existence}}
\begin{proof}
  Consider the set $\bZ_{i}$. By assumption, $\Forbb(X,Y,\mathcal{G}_0)\cap\bZ_{i}=\emptyset$ and, nonetheless, $\bZ_{i}$ is not a valid adjustment set. Thus, there must exist a non-causal path $p$ from $X$ to $Y$ in $\mathcal{G}_0$ that is open given $\bZ_{i}$. Suppose that $p$ is of the form $X \rightarrow C \leftarrow Y$. By assumption $Y \in \de(X,\g_0)$ and therefore $Y \in \f{\g_0}$ which in turn implies that $\de(Y,\g_0) \subseteq \f{\g_0}$. As a result, $C \in \f{\g_0}$. But as $\mathbf{Z}_i \cap \f{\g_0} = \emptyset$ it follows that $\de(C,\g_0) \cap \mathbf{Z}_i =\emptyset$, which contradicts our assumption that $p$ is open given $\mathbf{Z}_i$. We can therefore assume that $p$ is not of the form $X \rightarrow C \leftarrow Y$ which implies that $p$ must contain at least one non-collider. If every non-collider on $p$ is in $\Forbb(X,Y,\mathcal{G}_0)$, it follows that every node on $p$ is in $\Forbb(X,Y,\mathcal{G}_0)$. But this contradicts our assumption that $p$ is non-causal and open given $\bZ_{i}$. We can therefore conclude that $p$ contains at least one non-collider that is not in $\Forbb(X,Y,\mathcal{G}_0)$. But as $(\bV\setminus\Forbb(X,Y,\mathcal{G}_0))\subseteq\cup_{j=1}^{k}\bZ_{j}$ by assumption, $p$ must be blocked by some set $\bZ_{j}$.

  Consider the potential colliders $C_{1},\dots,C_{m}$ on $p$. As $p$ is open given $\bZ_{i}$, for each collider $C_{k}$ there must exist a causal path $q_{k}$ from $C_{k}$ to some node in $\bZ_{i}$, where we choose $q_{k}$ to be the shortest possible such path. If any two of the $q_{k}$ intersect, drop the longer of the two paths. If any $q_{k}$ contains $X$, replace $p$ with $-q_{k}(X,C_{k})\oplus p(C_{k},Y)$ and repeat our argument. Consider now the following linear structural equation model: set the edge coefficients of all edges that are neither on $p$ nor on one of the remaining paths $q_{1},\dots,q_{m'}$ to $0$. The resulting model is clearly compatible with $\mathcal{G}$ but also with a pruned graph $\mathcal{G}'$, in which we drop all edges with edge coefficient $0$. Clearly, in $\mathcal{G}'$ the path $p$ is still open given $\bZ_{i}$ and closed given $\bZ_{j}$. Furthermore, $p$ is the only path from $X$ to $Y$ in $\mathcal{G}'$, and as a result, we can conclude that $\beta_{yx.\bz_{i}}\neq 0$ and $\beta_{yx.\bz_{j}}= 0$. We have therefore shown that there exists a linear structural equation model compatible with $\g$, such that $\beta_{yx.\mathbf{z}_i} - \beta_{yx.\mathbf{z}_j} \neq 0$.

 Consider now the term $\beta_{yx.\mathbf{z}_i} - \beta_{yx.\mathbf{z}_j}$ as a function in the edge coefficients and error variances from the underlying linear structural equation model. By the same arguments as given in Section 13.3 of \citet{spirtes2000causation} the function $\beta_{yx.\mathbf{z}_i} - \beta_{yx.\mathbf{z}_j}$ is equivalent to a polynomial in the edge coefficients and error variances of the linear structural equation model. As we have shown that there exists one linear structural equation model such that $\beta_{yx.\mathbf{z}_i} - \beta_{yx.\mathbf{z}_j}\neq 0$, this polynomial is non-trivial. Our claim then follows from the fact that the zero set of non-trivial polynomials has Lebesgue measure $0$.
\end{proof}

\subsection{Proof of Lemma~\ref{lem:normal}}
\begin{Lemma}[Orthogonality between covariates and regression residual, {[\citealp{buja2019models}]}]
  In a least squares regression of $X$ on $\bZ$, the minimiser of the optimisation problem $\min_{\bbeta}\E(X-\bZ^{\top}\bbeta)^{2}$ is the population regression coefficient $\bbeta_{x\bz}=\bSigma_{\bz\bz}^{-1}\bSigma_{\bz x}$. The residual $\delta_{x\bz}=X-\bZ^{\top}\bbeta_{x\bz}$ is orthogonal to $\bZ$, i.e., $\E(\bZ\delta_{x\bz})=\mathbf{0}$.
  \label{lem:orthogonal}
\end{Lemma}

Unless specified otherwise, sans-serif letters denote random samples for scalar random variables. For example, $\Xsf=(X_{1},X_{2},\dots,X_{n})^{\top}$ is an $n$-dimensional vector containing $n$ i.i.d. copies of $X$.
Bold sans-serif letters denote random samples for vector random variables.
For example, $\bZsf=(\bZ_{1},\dots,\bZ_{n})^{\top}$ is an $n\times p$ matrix where each row is i.i.d. as $\bZ\in\mathbb{R}^{p}$.

\begin{Lemma}[Regression error representation of OLS coefficients, {[\citealp{buja2019models}]}]
  The difference between sample and population regression coefficient of $X$ from regressing $Y$ on $\bZ'=(X,\bZ^{\top})^{\top}$ is
  \[
    \hat{\beta}_{yx.\bz}-\beta_{yx.\bz}=\frac{\langle\br_{x\bz},\bdelta_{y\bz'}\rangle}{\|\br_{x\bz}\|^{2}},
  \]
  where $\br_{x\bz}=\Xsf-\bZsf\hat{\bbeta}_{x\bz}$ is the vector of sample residuals from regressing $\Xsf$ on $\bZsf$ and $\bdelta_{y\bz'}$ is the vector of population residuals from regressing $Y$ on $\bZ'$.
  \label{fct:partial}
\end{Lemma}

\begin{proof}[Proof of Lemma~\ref{lem:normal}]
  The proof is inspired by the results in Appendix E.5 of \citet{buja2019models}. We first observe from Lemma \ref{fct:partial} that for every set $\bZ_{i}$, $i=1,2,\dots,k$,
  \begin{equation}
    n^{1/2}(\hat{\beta}_{yx.\bz_{i}}-\beta_{yx.\bz_{i}}) = \dfrac{n^{-1/2}\langle\br_{x\bz_{i}},\bdelta_{y\bz_{i}'}\rangle}{n^{-1}\|\br_{x\bz_{i}}\|^{2}},
    \label{eqn:residual}
  \end{equation}
  where $\br_{x\bz_{i}}=\Xsf-\bZsf_{i}\hat{\bbeta}_{x\bz_{i}}$ is the vector of sample residuals from regressing $\Xsf$ on $\bZsf_{i}$.
  
  \noindent\emph{Numerator of \eqref{eqn:residual}.}
  \begin{align*}
  n^{-1/2}\langle\br_{x\bz_{i}},\bdelta_{y\bz_{i}'}\rangle
    &= n^{-1/2}\langle\Xsf-\bZsf_{i}\hat{\bbeta}_{x\bz_{i}},\bdelta_{y\bz_{i}'}\rangle \\
    &= n^{-1/2}\langle\bdelta_{x\bz_{i}}-\bZsf_{i}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}}),\bdelta_{y\bz_{i}'}\rangle \\
    &= n^{-1/2}\langle\bdelta_{x\bz_{i}},\bdelta_{y\bz_{i}'}\rangle - n^{-1/2}\langle\bZsf_{i}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}}),\bdelta_{y\bz_{i}'}\rangle.
  \end{align*}
  For the second term on the last line it holds that 
  \begin{align*}
    n^{-1/2}\langle\bZsf_{i}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}}),\bdelta_{y\bz_{i}'}\rangle &= \left(n^{-1}\bdelta_{y\bz_{i}'}^{\top}\bZsf_{i}\right)\cdot n^{1/2}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}}) \\
    &= o_{p}(1)\cdot O_{p}(1)=o_{p}(1),
  \end{align*}
  since $\E(\delta_{y\bz_{i}'}\bZ_{i})=\mathbf{0}$ by Lemma~\ref{lem:orthogonal} and $n^{1/2}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}})$ converges in distribution to a multivariate normal random variable by the central limit theorem, which is appropriate since by assumption, the fourth moments of $\bV$ are finite.
  
  \noindent\emph{Denominator of \eqref{eqn:residual}.}

  Using the convention that the hat matrix $\mathbf{H}_{n}=\bZsf_{i}(\bZsf_{i}^{\top}\bZsf_{i})^{-1}\bZsf_{i}^{\top}$, the average squared sample residuals
  \begin{align*}
    n^{-1}\|\br_{x\bz_{i}}\|^{2} &= n^{-1}\Xsf^{\top}(\mathbf{I}-\mathbf{H}_{n})\Xsf \\
    &= n^{-1}\|\Xsf\|^{2}-\left(n^{-1}\Xsf^{\top}\bZsf_{i}\right)\left(n^{-1}\bZsf_{i}^{\top}\bZsf_{i}\right)^{-1}\left(n^{-1}\bZsf_{i}^{\top}\Xsf\right) \\
    &\overset{p}{\to} \E(X^{2})-\E(X\bZ_{i}^{\top})[\E(\bZ_{i}\bZ_{i}^{\top})]^{-1}\E(\bZ_{i} X) \\
    &= \E(X^{2})-\E(X\bZ_{i}^{\top}\bbeta_{x\bz_{i}}) \\
    &= \E(X-\bZ_{i}^{\top}\bbeta_{x\bz_{i}})^{2} = \E(\delta_{x\bz_{i}}^{2}).
  \end{align*}
  The second to last step follows because $\E[\bZ_{i}(X-\bZ_{i}^{\top}\bbeta_{x\bz_{i}})]=\E(\bZ_{i}\delta_{x\bz_{i}})=\mathbf{0}$ by Lemma~\ref{lem:orthogonal}.

  We are now ready to present the asymptotic joint normality of $\hat{\bbeta}_{yx.\mathcal{Z}}$.
  Since $\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'})=\E[(X-\bZ_{i}^{\top}{\bbeta_{x\bz_{i}}})\delta_{y\bz_{i}'}]=0$, together with the fact that the fourth moments of $\bV$ are finite, we can apply the multivariate central limit theorem to conclude that
  \[
    \left(n^{-1/2}\langle\bdelta_{x\bz_{1}},\bdelta_{y\bz_{1}'}\rangle,\dots,n^{-1/2}\langle\bdelta_{x\bz_{k}},\bdelta_{y\bz_{k}'}\rangle\right)\overset{d}{\to}\mathrm{N}(\mathbf{0},\mathbf{\Psi})
  \]
  where the entries have the form $\mathbf{\Psi}_{ij}=\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})$ for all $1\leq i, j\leq k$.
  Therefore, the random vector
  \begin{align*}
    \begin{pmatrix}
      n^{-1/2}\langle\br_{x\bz_{1}},\bdelta_{y\bz_{1}'}\rangle \\
      n^{-1/2}\langle\br_{x\bz_{2}},\bdelta_{y\bz_{2}'}\rangle \\
      \vdots \\
      n^{-1/2}\langle\br_{x\bz_{k}},\bdelta_{y\bz_{k}'}\rangle
    \end{pmatrix} &=
    \begin{pmatrix}
      n^{-1/2}\langle\bdelta_{x\bz_{1}},\bdelta_{y\bz_{1}'}\rangle \\
      n^{-1/2}\langle\bdelta_{x\bz_{2}},\bdelta_{y\bz_{2}'}\rangle \\
      \vdots \\
      n^{-1/2}\langle\bdelta_{x\bz_{k}},\bdelta_{y\bz_{k}'}\rangle
    \end{pmatrix} -
    \begin{pmatrix}
      n^{-1/2}\langle\bZsf_{1}(\hat{\bbeta}_{x\bz_{1}}-\bbeta_{x\bz_{1}}),\bdelta_{y\bz_{1}'}\rangle \\
      n^{-1/2}\langle\bZsf_{2}(\hat{\bbeta}_{x\bz_{2}}-\bbeta_{x\bz_{2}}),\bdelta_{y\bz_{2}'}\rangle \\
      \vdots \\
      n^{-1/2}\langle\bZsf_{k}(\hat{\bbeta}_{x\bz_{k}}-\bbeta_{x\bz_{k}}),\bdelta_{y\bz_{k}'}\rangle
    \end{pmatrix}
    \\
    &\overset{d}{\to}\mathrm{N}(\mathbf{0},\mathbf{\Psi}),
  \end{align*}
  due to the fact that the second vector converges in distribution to a vector of zeroes.
    Based on the discussion of the denominator term, we can conclude that
  \[
    n^{-1}\mathrm{diag}(\|\br_{x\bz_{1}}\|^{2},\|\br_{x\bz_{2}}\|^{2},\dots,\|\br_{x\bz_{k}}\|^{2})\overset{p}{\to}\mathrm{diag}(\E(\delta_{x\bz_{1}}^{2}),\E(\delta_{x\bz_{2}}^{2}),\dots,\E(\delta_{x\bz_{k}}^{2}))=\mathbf{\Upsilon}.
  \]
  The target quantity can then be written as
  \begin{align*}
  n^{1/2}(\hat{\bbeta}_{yx.\mathcal{Z}}-\bbeta_{yx.\mathcal{Z}}) &=
  \begin{pmatrix}
    n^{-1}\|\br_{x\bz_{1}}\|^{2} & 0 & \cdots & 0 \\
    0 & n^{-1}\|\br_{x\bz_{2}}\|^{2} & \cdots & 0 \\
    \vdots & \vdots & \ddots & \vdots \\
    0 & 0 & \cdots & n^{-1}\|\br_{x\bz_{k}}\|^{2}
  \end{pmatrix}^{-1} \begin{pmatrix}
    n^{-1/2}\langle\br_{x\bz_{1}},\bdelta_{y\bz_{1}'}\rangle \\
    n^{-1/2}\langle\br_{x\bz_{2}},\bdelta_{y\bz_{2}'}\rangle \\
    \vdots \\
    n^{-1/2}\langle\br_{x\bz_{k}},\bdelta_{y\bz_{k}'}\rangle
  \end{pmatrix} \\
  &\overset{d}{\to}\mathrm{N}(\mathbf{0},\bSigma_{\mathcal{Z}}),
\end{align*}
  where the convergence follows from Slutsky's Theorem, and the asymptotic covariance matrix $\bSigma_{\mathcal{Z}}=\mathbf{\Upsilon}^{-1}\mathbf{\Psi}\mathbf{\Upsilon}^{-1}$ is as specified in the lemma statement.
\end{proof}

\begin{Remark}
  If $\bZ_{1},\dots,\bZ_{k}$ are all valid adjustment sets relative to $(X,Y)$ in $\g$ for a linear structural equation model compatible with a DAG $\g$, we can simplify the diagonal terms $\mathbf{\Psi}_{ii}=\E(\delta_{x\bz_{i}}^{2}\delta_{y\bz_{i}'}^{2})=\E(\delta_{x\bz_{i}}^{2})\E(\delta_{y\bz_{i}'}^{2})$ due to the independence between $\delta_{x\bz_{i}}$ and $\delta_{y\bz_{i}'}$ (see the proof of Proposition 3.1 in the Supplement of \citeauthor{henckel2019graphical} [\citeyear{henckel2019graphical}]).
  Therefore, the corresponding terms are $\bSigma_{\mathcal{Z},ii}=\E(\delta_{y\bz_{i}'}^{2})/\E(\delta_{x\bz_{i}}^{2})$.
  It can also be shown that $\hat{\beta}_{yx.\bz}$ is root-$n$ consistent for the total effect $\tau_{yx}$ for any valid adjustment set $\bZ$ \citep{nandy2017estimating}.
    In this case, in order to apply the central limit theorem separately on every entry of $(n^{-1/2}\langle\bdelta_{x\bz_{1}},\bdelta_{y\bz_{1}'}\rangle,\dots,n^{-1/2}\langle\bdelta_{x\bz_{k}},\bdelta_{y\bz_{k}'}\rangle)^{\top}$, we only need the  finite variance assumption for the error terms $\bepsilon$ of the linear structural equation model.
    In such a model, both $\delta_{x\bz_{i}}$ and $\delta_{y\bz_{i}'}$ can be expressed as linear functions of the error terms, say $\btheta_{i}^{\top}\bepsilon$ and $\bxi_{i}^{\top}\bepsilon$. Furthermore,
  \begin{align}
    \Var(\delta_{x\bz_{i}}\delta_{y\bz_{i}'})&=\E(\delta_{x\bz_{i}}^{2}\delta_{y\bz_{i}'}^{2})-[\underbrace{\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'})}_{=0}]^{2} \nonumber \\
    &= \E(\delta_{x\bz_{i}}^{2})\E(\delta_{y\bz_{i}'}^{2}) = \E(\btheta_{i}^{\top}\bepsilon)^{2}\E(\bxi_{i}^{\top}\bepsilon)^{2}, \label{eqn:2nd-moment}
  \end{align}
  due to the independence between $\delta_{x\bz_{i}}$ and $\delta_{y\bz_{i}'}$. The order of each $\epsilon_{v_i}$ term cannot be larger than $2$ in \eqref{eqn:2nd-moment} for all $V_{i}\in\bV$. Therefore, $\Var(\delta_{x\bz_{i}}\delta_{y\bz_{i}'})$ is finite for all $\bZ_{i}$ whenever $\E(\epsilon_{v_i}^{2})<\infty$ for all $V_i \in \bV$.
\end{Remark}

We also show that a consistent estimator of the covariance matrix can be obtained by plugging in the sample residuals.

\begin{Lemma}[Consistency of plug-in estimator of $\bSigma_{\mathcal{Z}}$]
    Consider the setting in Lemma~\ref{lem:normal}. The plug-in estimator $\hat{\bSigma}_{\mathcal{Z}}$ of $\bSigma_{\mathcal{Z}}$ with entries
    \[
      \hat{\bSigma}_{\mathcal{Z},ij} = \frac{n\sum_{s=1}^{n}r_{x\bz_{i},s}\cdot r_{y\bz_{i}',s}\cdot r_{x\bz_{j},s}\cdot r_{y\bz_{j}',s}}{\|\br_{x\bz_{i}}\|^{2}\|\br_{x\bz_{j}}\|^{2}},
    \]
    for all $1\leq i, j \leq k$, is consistent.
    \label{lem:consistent}
  \end{Lemma}
\begin{proof}[Proof of Lemma~\ref{lem:consistent}]
  Consider
  \[
     \hat{\bSigma}_{\mathcal{Z},ij}=\frac{n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}\cdot r_{y\bz_{i}',s}\cdot r_{x\bz_{j},s}\cdot r_{y\bz_{j}',s}}{n^{-1}\|\br_{x\bz_{i}}\|^{2}n^{-1}\|\br_{x\bz_{j}}\|^{2}}.
  \]
  The denominator converges in probability to $\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})$ by the proof of Lemma~\ref{lem:normal}.
  The numerator can be written as
  \begin{align*}
    n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}r_{y\bz_{i}',s}r_{x\bz_{j},s}r_{y\bz_{j}',s} &= n^{-1}\sum_{s=1}^{n}\left[(\delta_{x\bz_{i},s}-\bZ_{i,s}^{\top}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}}))(\delta_{y\bz_{i}',s}-\bZ_{i,s}^{'\top}(\hat{\bbeta}_{y\bz_{i}'}-\bbeta_{y\bz_{i}'}))\right. \\
    &\qquad\qquad\qquad\left.(\delta_{x\bz_{j},s}-\bZ_{j,s}^{\top}(\hat{\bbeta}_{x\bz_{j}}-\bbeta_{x\bz_{j}}))(\delta_{y\bz_{j}',s}-\bZ_{j,s}^{'\top}(\hat{\bbeta}_{y\bz_{j}'}-\bbeta_{y\bz_{j}'}))\right] \\
    &= n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\delta_{x\bz_{j},s}\delta_{y\bz_{j}',s} + R,
  \end{align*}
  where the remainder term $R$ contains the rest of the products from the expansion: $1$ product with no $\delta$-term, $4$ products with $1$ $\delta$-term, $6$ products with $2$ $\delta$-terms and $4$ products with $3$ $\delta$-terms. Below we will show that the remainder term $R\overset{p}{\to}0$, and it follows that the numerator converges in probability to $\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})$. By the continuous mapping theorem, $\hat{\bSigma}_{\mathcal{Z},ij}\overset{p}{\to}\bSigma_{\mathcal{Z},ij}$ follows.

  We will discuss one case from each category, as the results can be shown similarly for other products in the same category.
  The use of parentheses in the subscript denotes a particular entry of a vector.
  For example, $Z_{i(t),s}$ is the $t$-th entry of the $s$-th observation $\bZ_{i,s}$ and $\hat{\beta}_{x\bz_{i}(t)}$ is the $t$-th entry of the vector $\hat{\bbeta}_{x\bz_{i}}$.
  With the finite fourth moment assumption on $\bV$, $\hat{\beta}_{x\bz_{i}(t)}\overset{p}{\to}\beta_{x\bz_{i}(t)}$ and $\hat{\beta}_{y\bz_{i}'(t)}\overset{p}{\to}\beta_{y\bz_{i}'(t)}$ for any $\bZ_{i}$ and $1\leq t\leq |\bZ_{i}|$.

  \noindent\emph{No $\delta$-term.}
  \begin{align*}
    &\phantom{{}=\;}n^{-1}\sum_{s=1}^{n}\bZ_{i,s}^{\top}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}})\bZ_{i,s}^{'\top}(\hat{\bbeta}_{y\bz_{i}'}-\bbeta_{y\bz_{i}'})\bZ_{j,s}^{\top}(\hat{\bbeta}_{x\bz_{j}}-\bbeta_{x\bz_{j}})\bZ_{j,s}^{'\top}(\hat{\bbeta}_{y\bz_{j}'}-\bbeta_{y\bz_{j}'}) \\
    &= \sum_{t,u,v,w}\left(n^{-1}\sum_{s=1}^{n}Z_{i(t),s}Z'_{i(u),s}Z_{j(v),s}Z'_{j(w),s}\right)(\hat{\beta}_{x\bz_{i}(t)}-\beta_{x\bz_{i}(t)})(\hat{\beta}_{y\bz_{i}'(u)}-\beta_{y\bz_{i}'(u)}) \\
    &\qquad\qquad\qquad\qquad(\hat{\beta}_{x\bz_{j}(v)}-\beta_{x\bz_{j}(v)})(\hat{\beta}_{y\bz_{j}'(w)}-\beta_{y\bz_{j}'(w)}) \\
    &\overset{p}{\to} \sum_{t,u,v,w} \mathrm{const}\cdot 0\cdot 0\cdot 0\cdot 0 \\
    &= 0,
  \end{align*}
  where $1\leq t\leq |\bZ_{i}|,1\leq u\leq |\bZ'_{i}|,1\leq v\leq |\bZ_{j}|,1\leq w\leq |\bZ'_{j}|$, the constant term $\E(Z_{i(t)}Z'_{i(u)}Z_{j(v)}Z'_{j(w)})$ exists due to the finite fourth moment assumption on $\bV$.

  \noindent\emph{One $\delta$-term.}
  \begin{align*}
    &\phantom{{}=\;} -n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\bZ_{i,s}^{'\top}(\hat{\bbeta}_{y\bz_{i}'}-\bbeta_{y\bz_{i}'})\bZ_{j,s}^{\top}(\hat{\bbeta}_{x\bz_{j}}-\bbeta_{x\bz_{j}})\bZ_{j,s}^{'\top}(\hat{\bbeta}_{y\bz_{j}'}-\bbeta_{y\bz_{j}'}) \\
    &= -\sum_{u,v,w}\left(n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}Z'_{i(u),s}Z_{j(v),s}Z'_{j(w),s}\right)(\hat{\beta}_{y\bz_{i}'(u)}-\beta_{y\bz_{i}'(u)})(\hat{\beta}_{x\bz_{j}(v)}-\beta_{x\bz_{j}(v)})(\hat{\beta}_{y\bz_{j}'(w)}-\beta_{y\bz_{j}'(w)}) \\
    &\overset{p}{\to} \sum_{u,v,w} \mathrm{const}\cdot 0\cdot 0\cdot 0 \\
    &= 0,
  \end{align*}
  where $1\leq u\leq |\bZ'_{i}|,1\leq v\leq |\bZ_{j}|,1\leq w\leq |\bZ'_{j}|$, the constant term $\E(\delta_{x\bz_{i}}Z'_{i(u)}Z_{j(v)}Z'_{j(w)})$ exists due to the finite fourth moment assumption on $\bV$.

  \noindent\emph{Two $\delta$-terms.}
  \begin{align*}
    &\phantom{{}=\;}n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\bZ_{j,s}^{\top}(\hat{\bbeta}_{x\bz_{j}}-\bbeta_{x\bz_{j}})\bZ_{j,s}^{'\top}(\hat{\bbeta}_{y\bz_{j}'}-\bbeta_{y\bz_{j}'}) \\
    &= \sum_{v,w}\left(n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}Z_{j(v),s}Z'_{j(w),s}\right)(\hat{\beta}_{x\bz_{j}(v)}-\beta_{x\bz_{j}(v)})(\hat{\beta}_{y\bz_{j}'(w)}-\beta_{y\bz_{j}'(w)}) \\
    &\overset{p}{\to} \sum_{v,w} \mathrm{const}\cdot 0\cdot 0 \\
    &= 0,
  \end{align*}
  where $1\leq v\leq |\bZ_{j}|,1\leq w\leq |\bZ'_{j}|$, the constant term $\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}Z_{j(v)}Z'_{j(w)})$ exists due to the finite fourth moment assumption on $\bV$.

  \noindent\emph{Three $\delta$-terms.}
  \begin{align*}
    &\phantom{{}=\;}-n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\delta_{x\bz_{j},s}\bZ_{j,s}^{'\top}(\hat{\bbeta}_{y\bz_{j}'}-\bbeta_{y\bz_{j}'}) \\
    &= -\sum_{w}\left(n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\delta_{x\bz_{j},s}Z'_{j(w),s}\right)(\hat{\beta}_{y\bz_{j}'(w)}-\beta_{y\bz_{j}'(w)}) \\
    &\overset{p}{\to} \sum_{w} \mathrm{const}\cdot 0 \\
    &= 0,
  \end{align*}
  where $1\leq w\leq |\bZ'_{j}|$, the constant term $\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}Z'_{j(w)})$ exists due to the finite fourth moment assumption on $\bV$.
\end{proof}

\begin{Remark}
  If the $\bZ_{i}$'s are valid adjustment sets, the diagonal terms simplify to $(\hat{\bSigma}_{\mathcal{Z}})_{ii}=\|\br_{y\bz_{i}'}\|^{2}_{2}/\|\br_{x\bz_{i}}\|^{2}_{2}$, and their convergence follows by the proof of Lemma~\ref{lem:normal} on the denominator.
\end{Remark}


\subsection{Proof of Proposition~\ref{ppn:vech}}
\begin{proof}
The proof aims to show that the half-vectorisation of the asymptotic covariance matrix estimator $\hat{\bSigma}_{\mathcal{Z}}$, centred at its true value $\bSigma_{\mathcal{Z}}$ and scaled by $n^{1/2}$, converges in distribution to a zero-mean normal distribution.

For the $(i,j)$-th entry, we write
\begin{align*}
  n^{1/2}(\hat{\bSigma}_{\mathcal{Z},ij}-\bSigma_{\mathcal{Z},ij}) &= \frac{n^{-1/2}\sum_{s=1}^{n}r_{x\bz_{i},s}r_{y\bz_{i}',s}r_{x\bz_{j},s}r_{y\bz_{j}',s}}{n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}^{2}n^{-1}\sum_{s=1}^{n}r_{x\bz_{j},s}^{2}}- \frac{n^{1/2}\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})}{\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})} \\
  &= \frac{N}{\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}^{2}n^{-1}\sum_{s=1}^{n}r_{x\bz_{j},s}^{2}}.
\end{align*}
The numerator $N$ of the expression above is expanded as
\begin{align*}
  &\quad \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})n^{-1/2}\sum_{s=1}^{n}r_{x\bz_{i},s}r_{y\bz_{i}',s}r_{x\bz_{j},s}r_{y\bz_{j}',s}-\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})n^{1/2}n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}^{2}n^{-1}\sum_{s=1}^{n}r_{x\bz_{j},s}^{2} \\
  &= \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})n^{-1/2}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\delta_{x\bz_{j},s}\delta_{y\bz_{j}',s}-\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})n^{1/2}n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}^{2}n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{j},s}^{2} + R,
\end{align*}
where $R$ collects the remainder term resulting from replacing the sample residuals with population residuals.

We now subtract and add back the expected squared population residuals from the average squared population residuals.
That is,
\begin{align*}
  N &= \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})n^{-1/2}\sum_{s=1}^{n}\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\delta_{x\bz_{j},s}\delta_{y\bz_{j}',s}\\
  &\qquad-\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})n^{1/2}\left(n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{i},s}^{2}-\E(\delta_{x\bz_{i}}^{2})\right)\left(n^{-1}\sum_{s=1}^{n}\delta_{x\bz_{j},s}^{2}-\E(\delta_{x\bz_{j}}^{2})\right) \\
  &\qquad - \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})n^{-1/2}\sum_{s=1}^{n}\left(\E(\delta_{x\bz_{i}}^{2})\delta_{x\bz_{j},s}^{2}+\E(\delta_{x\bz_{j}}^{2})\delta_{x\bz_{i},s}^{2}\right)\\
  &\qquad+ \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})n^{1/2}\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2}) + R \\
  &= n^{-1/2}\sum_{s=1}^{n}\left[\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})\delta_{x\bz_{i},s}\delta_{y\bz_{i}',s}\delta_{x\bz_{j},s}\delta_{y\bz_{j}',s} \right.\\
  &\qquad- \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})\left(\E(\delta_{x\bz_{i}}^{2})\delta_{x\bz_{j},s}^{2}+\E(\delta_{x\bz_{j}}^{2})\delta_{x\bz_{i},s}^{2}\right)\\
  &\qquad \left.+ \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})\E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})\right] + R'.
\end{align*}
The first term converges to a zero-mean normal distribution by the central limit theorem and the finite fourth moment assumption on $\bV$.
The remainder term $R=o_{p}(1)$ by analogous arguments to the ones used in the proof of Lemma~\ref{lem:consistent}.
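The term containing the product of the two centred sample means is $n^{1/2}\cdot O_{p}(n^{-1/2})\cdot O_{p}(n^{-1/2})=o_{p}(1)$ by the central limit theorem, so it disappears asymptotically; together with $R=o_{p}(1)$ this entails that $R'=o_{p}(1)$.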

The asymptotic covariance between two entries in $\vecth(\hat{\bSigma}_{\mathcal{Z}})$ is
\[
  a.\Cov(n^{1/2}(\hat{\bSigma}_{\mathcal{Z},ij}-\bSigma_{\mathcal{Z},ij}),n^{1/2}(\hat{\bSigma}_{\mathcal{Z},kl}-\bSigma_{\mathcal{Z},kl})) = \frac{\gamma_{ij,kl}}{\omega_{ij,kl}},
\]
where
\begin{align*}
  \gamma_{ij,kl} &:= \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})\E(\delta_{x\bz_{k}}^{2})\E(\delta_{x\bz_{l}}^{2})\Cov(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'},\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \\
  &\quad - \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})\E(\delta_{x\bz_{k}}^{2})\E(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \Cov(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'},\delta_{x\bz_{l}}^{2}) \\
  &\quad - \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{j}}^{2})\E(\delta_{x\bz_{l}}^{2})\E(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \Cov(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'},\delta_{x\bz_{k}}^{2}) \\
  &\quad - \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{k}}^{2})\E(\delta_{x\bz_{l}}^{2})\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'}) \Cov(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'},\delta_{x\bz_{j}}^{2}) \\
  &\quad - \E(\delta_{x\bz_{j}}^{2})\E(\delta_{x\bz_{k}}^{2})\E(\delta_{x\bz_{l}}^{2})\E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'}) \Cov(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'},\delta_{x\bz_{i}}^{2}) \\
  &\quad + \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})\E(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{k}}^{2})\Cov(\delta_{x\bz_{j}}^{2},\delta_{x\bz_{l}}^{2}) \\
  &\quad + \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})\E(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \E(\delta_{x\bz_{i}}^{2})\E(\delta_{x\bz_{l}}^{2})\Cov(\delta_{x\bz_{j}}^{2},\delta_{x\bz_{k}}^{2}) \\
  &\quad + \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})\E(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \E(\delta_{x\bz_{j}}^{2})\E(\delta_{x\bz_{k}}^{2})\Cov(\delta_{x\bz_{i}}^{2},\delta_{x\bz_{l}}^{2}) \\
  &\quad + \E(\delta_{x\bz_{i}}\delta_{y\bz_{i}'}\delta_{x\bz_{j}}\delta_{y\bz_{j}'})\E(\delta_{x\bz_{k}}\delta_{y\bz_{k}'}\delta_{x\bz_{l}}\delta_{y\bz_{l}'}) \E(\delta_{x\bz_{j}}^{2})\E(\delta_{x\bz_{l}}^{2})\Cov(\delta_{x\bz_{i}}^{2},\delta_{x\bz_{k}}^{2}) \textrm{ and} \\
  \omega_{ij,kl} &:= [\E(\delta_{x\bz_{i}}^{2})]^{2}[\E(\delta_{x\bz_{j}}^{2})]^{2}[\E(\delta_{x\bz_{k}}^{2})]^{2}[\E(\delta_{x\bz_{l}}^{2})]^{2}.
\end{align*}

Analogous to the proof of Lemma~\ref{lem:normal}, the joint normality follows by the multivariate Central Limit Theorem, which we can apply due to Slutsky's Theorem and the assumption that the fourth moments of the errors are finite.

Define a deterministic mapping for subscript $\mathbf{g}(a)=(ij)$, $a=1,2,\dots,k(k+1)/2$ such that it maps the $a$-th element of $\mathrm{vech}(\bSigma_{\mathcal{Z}})$ to the $(i,j)$-th entry of $\bSigma_{\mathcal{Z}}$.
The asymptotic covariance matrix $\mathbf{F}$ of $\mathrm{vech}(\hat{\bSigma}_{\mathcal{Z}})$ is a $k(k+1)/2\times k(k+1)/2$ matrix whose entries are related to the expression of $\omega_{\cdot,\cdot}$ and $\gamma_{\cdot,\cdot}$ by the mapping $\mathbf{g}(\cdot)$ such that
\[
    \mathbf{F}_{ab}=\frac{\gamma_{\mathbf{g}(a),\mathbf{g}(b)}}{\omega_{\mathbf{g}(a),\mathbf{g}(b)}},
\]
for $1\leq a,b\leq k(k+1)/2$.
The asymptotic covariance matrix $\mathbf{C}$ of $\mathrm{vech}(\hat{\bDelta}_{\mathcal{Z}})$ follows from the linear relationship $\mathrm{vech}({\bDelta}_{\mathcal{Z}})=\mathbf{\Pi}\mathrm{vech}({\bSigma}_{\mathcal{Z}})$.
\end{proof}
\begin{Remark}
Again we discuss the special situation where the $\bZ_{i}$'s are valid adjustment sets.
In this case, the diagonal terms
  \begin{align*}
  n^{1/2}(\hat{\bSigma}_{\mathcal{Z},ii}-\bSigma_{\mathcal{Z},ii})&= n^{1/2}\left(\frac{n^{-1}\sum_{s=1}^{n}r_{y\bz_{i}',s}^{2}}{n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}^{2}} - \frac{\E(\delta_{y\bz_{i}'}^{2})}{\E(\delta_{x\bz_{i}}^{2})}\right) \\
  &= \frac{n^{-1/2}\sum_{s=1}^{n}[\E(\delta_{x\bz_{i}}^{2})r_{y\bz_{i}',s}^{2}-\E(\delta_{y\bz_{i}'}^{2})r_{x\bz_{i},s}^{2}]}{\E(\delta_{x\bz_{i}}^{2})n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}^{2}}.
\end{align*}
The numerator
\begin{align*}
  n^{-1/2}\sum_{s=1}^{n}[\E(\delta_{x\bz_{i}}^{2})r_{y\bz_{i}',s}^{2}-\E(\delta_{y\bz_{i}'}^{2})r_{x\bz_{i},s}^{2}] &=n^{-1/2}\sum_{s=1}^{n}\left[\E(\delta_{x\bz_{i}}^{2})(\delta_{y\bz_{i}',s}-\bZ_{i,s}^{'\top}(\hat{\bbeta}_{y\bz_{i}'}-\bbeta_{y\bz_{i}'}))^{2}\right.\\
  &\qquad\qquad\qquad-\left.\E(\delta_{y\bz_{i}'}^{2})(\delta_{x\bz_{i},s}-\bZ_{i,s}^{\top}(\hat{\bbeta}_{x\bz_{i}}-\bbeta_{x\bz_{i}}))^{2}\right] \\
  &= n^{-1/2}\sum_{s=1}^{n}[\E(\delta_{x\bz_{i}}^{2})\delta_{y\bz_{i}',s}^{2}-\E(\delta_{y\bz_{i}'}^{2})\delta_{x\bz_{i},s}^{2}] + R \\
  &\overset{d}{\to}\mathrm{N}(0,[\E(\delta_{x\bz_{i}}^{2})]^{2}\Var(\delta_{y\bz_{i}'}^{2})+[\E(\delta_{y\bz_{i}'}^{2})]^{2}\Var(\delta_{x\bz_{i}}^{2})),
\end{align*}
where we can apply the central limit theorem to the first term because $\E(\E(\delta_{x\bz_{i}}^{2})\delta_{y\bz_{i}',s}^{2}-\E(\delta_{y\bz_{i}'}^{2})\delta_{x\bz_{i},s}^{2})=0$ and $\Var(\E(\delta_{x\bz_{i}}^{2})\delta_{y\bz_{i}',s}^{2}-\E(\delta_{y\bz_{i}'}^{2})\delta_{x\bz_{i},s}^{2}) = [\E(\delta_{x\bz_{i}}^{2})]^{2}\Var(\delta_{y\bz_{i}'}^{2})+[\E(\delta_{y\bz_{i}'}^{2})]^{2}\Var(\delta_{x\bz_{i}}^{2})$.
The remainder term $R=o_{p}(1)$ by arguments analogous to those used in the proof of Lemma~\ref{lem:consistent}.
Similarly, the denominator $\E(\delta_{x\bz_{i}}^{2})n^{-1}\sum_{s=1}^{n}r_{x\bz_{i},s}^{2}$ converges in probability to $[\E(\delta_{x\bz_{i}}^{2})]^{2}$.
Then by Slutsky's Theorem,
\[
  n^{1/2}(\hat{\bSigma}_{\mathcal{Z},ii}-\bSigma_{\mathcal{Z},ii})\overset{d}{\to}\mathrm{N}\left(0,\frac{[\E(\delta_{x\bz_{i}}^{2})]^{2}\Var(\delta_{y\bz_{i}'}^{2})+[\E(\delta_{y\bz_{i}'}^{2})]^{2}\Var(\delta_{x\bz_{i}}^{2})}{[\E(\delta_{x\bz_{i}}^{2})]^{4}}\right).
\]
\end{Remark}

\subsection{Proof of Theorem \ref{thm:chisquare}}

\begin{proof}[Proof of Theorem \ref{thm:chisquare}]
  Lemma~\ref{lem:normal} states that $n^{1/2}(\hat{\bbeta}_{yx.\mathcal{Z}}-\bbeta_{yx.\mathcal{Z}})$ is asymptotically normal. We first show that to quantify the degrees of freedom of a Wald-type statistic, one only needs to look at the rank of covariance matrix $\bDelta_{\mathcal{Z}}=\bGamma\bSigma_{\mathcal{Z}}\bGamma^{\top}$.

  Suppose $\mathrm{rank}(\bDelta_{\mathcal{Z}})=r_{0}\leq l$ where $l=k-1$. Consider the eigendecomposition of $\bDelta_{\mathcal{Z}}=\mathbf{Q}\mathbf{\Phi}\mathbf{Q}^{\top}$, where $\mathbf{Q}=(\mathbf{q}_{1}\ \cdots\ \mathbf{q}_{l})$ is the orthonormal matrix containing the eigenvectors of $\bDelta_{\mathcal{Z}}$, and $\mathbf{\Phi}=\mathrm{diag}(\phi_{1},\dots,\phi_{l})$ with eigenvalues $\phi_{1}\geq\cdots\geq\phi_{r_{0}}>\phi_{r_{0}+1}=\cdots=\phi_{l}=0$. It can be verified that the (unique) Moore-Penrose inverse of $\bDelta_{\mathcal{Z}}$ is defined as
  \[
    \bDelta_{\mathcal{Z}}^{\dagger}=\sum_{s=1}^{r_{0}}\phi_{s}^{-1}\mathbf{q}_{s}\mathbf{q}_{s}^{\top},
  \]
  because of the positive semidefiniteness of $\bDelta_{\mathcal{Z}}$.
  Under $H_{0}:\bGamma\bbeta_{yx.\mathcal{Z}}=\mathbf{0}$, we have $n^{1/2}\bGamma\hat{\bbeta}_{yx.\mathcal{Z}}\overset{d}{\to}\mathbf{G}\sim\mathrm{N}(\mathbf{0},\bDelta_{\mathcal{Z}})$.
  For all $1\leq s\neq t\leq r_{0}$, $\Cov(\mathbf{q}_{s}^{\top}\mathbf{G},\mathbf{q}_{t}^{\top}\mathbf{G})=\mathbf{q}_{s}^{\top}\bDelta_{\mathcal{Z}}\mathbf{q}_{t}=0$. By joint normality of $\mathbf{G}$, $\mathbf{q}_{s}^{\top}\mathbf{G}$ and $\mathbf{q}_{t}^{\top}\mathbf{G}$ are independent. Moreover, since $\mathbf{q}_{s}^{\top}\mathbf{G}\sim\mathrm{N}(0,\phi_{s})$,
  \begin{align}
    n(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})^{\top}\bDelta_{\mathcal{Z}}^{\dagger}(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}}) &=\sum_{s=1}^{r_{0}}\phi_{s}^{-1}(\mathbf{q}_{s}^{\top}n^{1/2}\bGamma\hat{\bbeta}_{yx.\mathcal{Z}})^{2} \nonumber\\
    &\overset{d}{\to}\sum_{s=1}^{r_{0}}\phi_{s}^{-1}(\mathbf{q}_{s}^{\top}\mathbf{G})^{2}\sim\chi^{2}_{r_{0}}.\label{eqn:conv-chisq}
  \end{align}
  
  The consistency of $\hat{r}$, i.e., $\lim_{n\to\infty}\mathbb{P}(|\hat{r}-r_{0}|<\epsilon)=1$, $\forall\epsilon>0$, implies that $\lim_{n\to\infty}\mathbb{P}(\hat{r}=r_{0})=1$ when taking $\epsilon<1$, since both $\hat{r}$ and $r_{0}$ are integer-valued.

  Since $\hat{\bDelta}_{\mathcal{Z}}$ is positive semidefinite, its spectral decomposition is $\hat{\mathbf{P}}\hat{\bLambda}\hat{\mathbf{P}}^{\top}$, where $\hat{\bLambda}=\diag(\hat{\lambda}_{1},\dots,\hat{\lambda}_{k})$ with $\hat{\lambda}_{1}\geq\dots\geq\hat{\lambda}_{k}\geq 0$. The rank-$\hat{r}$ spectral approximation of $\hat{\bDelta}_{\mathcal{Z}}$ is then $\hat{\mathbf{P}}\hat{\bLambda}_{\hat{r}}\hat{\mathbf{P}}^{\top}$, where $\hat{\bLambda}_{\hat{r}}=\mathrm{diag}(\hat{\lambda}_{\hat{r},1},\dots,\hat{\lambda}_{\hat{r},\hat{r}},0,\dots,0)$.
  Following Weyl's inequality \citep{stewart1998perturbation} and Proposition~\ref{ppn:vech}, we have $\hat{\bLambda}\overset{p}{\to}\bLambda$ since the asymptotic covariance matrix of $\mathrm{vech}(\hat{\bDelta}_{\mathcal{Z}})$ is finite. We now show that $\hat{\bLambda}_{\hat{r}}\overset{p}{\to}\bLambda$.
  For any $\ell\in\{1,\dots,k\}$,
  \begin{align*}
    &\ \ \phantom{=}\lim_{n\to\infty}\mathbb{P}(|\hat{\lambda}_{\hat{r},\ell}-\hat{\lambda}_{\ell}|<\epsilon) \\
    &=
    \lim_{n\to\infty}\mathbb{P}(|\hat{\lambda}_{\hat{r},\ell}-\hat{\lambda}_{\ell}|<\epsilon\mid \hat{r}= r_{0})\mathbb{P}(\hat{r}= r_{0}) \\
    &\quad +\lim_{n\to\infty}\mathbb{P}(|\hat{\lambda}_{\hat{r},\ell}-\hat{\lambda}_{\ell}|<\epsilon\mid \hat{r}\neq r_{0})\mathbb{P}(\hat{r}\neq r_{0}) \\
    &= \lim_{n\to\infty}\mathbb{P}(|\hat{\lambda}_{r_{0},\ell}-\hat{\lambda}_{\ell}|<\epsilon\mid \hat{r}= r_{0}).
  \end{align*}
  If $\ell \leq r_{0}$, $\hat{\lambda}_{r_{0},\ell}=\hat{\lambda}_{\ell}$ and $\mathbb{P}(|\hat{\lambda}_{r_{0},\ell}-\hat{\lambda}_{\ell}|<\epsilon\mid \hat{r}= r_{0})=1$.
  Otherwise if $\ell > r_{0}$, $\lim_{n\to\infty}\mathbb{P}(|\hat{\lambda}_{r_{0},\ell}-\hat{\lambda}_{\ell}|<\epsilon\mid \hat{r}= r_{0})=\lim_{n\to\infty}\mathbb{P}(|\hat{\lambda}_{\ell}|<\epsilon\mid \hat{r}= r_{0})=1$ because $\hat{\lambda}_{\ell}\overset{p}{\to}\lambda_{\ell}=0$.
  Hence, $\hat{\lambda}_{\hat{r},\ell}\overset{p}{\to}\hat{\lambda}_{\ell}$ for all $\ell$.
  Since all entries of $\hat{\bP}$ are bounded by $1$, $\hat{\bDelta}_{\mathcal{Z},\hat{r}}-\hat{\bDelta}_{\mathcal{Z}}=\hat{\bP}(\hat{\bLambda}_{\hat{r}}-\hat{\bLambda})\hat{\bP}^{\top}\overset{p}{\to} 0$. Then $\hat{\bDelta}_{\mathcal{Z},\hat{r}}\overset{p}{\to}\bDelta_{\mathcal{Z}}$ by consistency of $\hat{\bDelta}_{\mathcal{Z}}$.

  The rank of $\hat{\bDelta}_{\mathcal{Z},\hat{r}}$ is equal to $\hat{r}$ by construction. With the condition that $\mathbb{P}(\mathrm{rank}(\hat{\bDelta}_{\mathcal{Z},\hat{r}})=\mathrm{rank}(\bDelta_{\mathcal{Z}}))\to 1$, it follows from Theorem 2 in \citet{andrews1987asymptotic} that $\hat{\bDelta}_{\mathcal{Z},\hat{r}}^{\dagger}\overset{p}{\to}\bDelta_{\mathcal{Z}}^{\dagger}$.
  By Slutsky's theorem, the convergence in distribution in \eqref{eqn:conv-chisq} still holds if we use a consistent estimator $\hat{\bDelta}_{\mathcal{Z},\hat{r}}^{\dagger}$ of $\bDelta_{\mathcal{Z}}^{\dagger}$ instead.
  Therefore, $n(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})^{\top}\hat{\bDelta}_{\mathcal{Z},\hat{r}}^{\dagger}(\mathbf{\Gamma}\hat{\bbeta}_{yx.\mathcal{Z}})\overset{d}{\to}\chi^{2}_{r_{0}}$.
\end{proof}

\subsection{Proof of Lemma~\ref{lem:minimal}}

\begin{Lemma}[Modified Lemma D.1 in \citeauthor{henckel2019graphical} {[\citeyear{henckel2019graphical}]}]
  Consider a causal DAG $\g=(\bV,\bE)$ such that $X,Y\in \bV$ and that $\bZ\subset\bV\setminus\{X,Y\}$ is a valid adjustment set relative to $(X,Y)$ in $\g$. Given a partition $\bZ=\bZ_{1}\cup\bZ_{2}$, if $X\perp_{\mathcal{G}}\bZ_{1}\mid \bZ_{2}$, then $\bZ_{2}$ is a valid adjustment set relative to $(X,Y)$ in $\g$.
  \label{lem:partition}
\end{Lemma}

\begin{Theorem}[\citealp{spirtes1995directed}]
  Consider a DAG $\mathcal{G}$ containing $X$, $Y$ and $\bZ$, where $X\neq Y$ and $\bZ$ does not contain $X$ or $Y$. Then $X$ is $d$-separated from $Y$ given $\bZ$ if and only if the partial correlation coefficient $\rho_{xy.\bz}=0$ for all linear structural equation models compatible with $\mathcal{G}$.
  \label{thm:partial}
\end{Theorem}

\begin{Corollary}
    Consider nodes $X$ and $Y$, and a set $\bZ$ in a DAG $\g$.
  Then $X$ is $d$-separated from $Y$ given $\bZ$ if and only if $\beta_{yx.\bz}=0$ for some linear structural equation model compatible with and faithful to $\g$.
  \label{cor:coefficient}
\end{Corollary}

\begin{Lemma}
  Consider a causal DAG $\g=(\bV,\bE)$ and let $\bV$ follow a linear structural equation model compatible with $\g$.
  Let $\boldsymbol{\epsilon}=\{\epsilon_{v_{1}},\epsilon_{v_{2}},\dots,\epsilon_{v_{p}}\}$ be the set of independent errors from the linear structural equation model, where $p$ is the number of nodes in $\g$.
  Given two nodes $X,Y\in\bV$ such that $Y\in\mathrm{de}(X,\g)$ and any valid adjustment set $\bZ$ relative to $(X,Y)$ in $\g$, the population regression residual $\delta_{y\bz'}$ is a linear combination of the error terms $\boldsymbol{\epsilon}$, in which the coefficient of $\epsilon_{y}$ is $1$.
  \label{lem:epsy}
\end{Lemma}
\begin{proof}
  We refer to the proof of Lemma B.4 in \citet{henckel2019graphical}.
  The residual $\delta_{y\bz'}$ can be written as a linear combination of errors.
  In particular, the coefficient of $\epsilon_{y}$ is
  \[
    \tau_{yy}-\sum_{N\in\mathrm{de}(Y,\mathcal{G})\cap\bZ'}\beta_{yn.\bz'_{-n}}\tau_{ny}.
  \]
  Since $\bZ$ is a valid adjustment set relative to $(X,Y)$ in $\g$, it cannot contain descendants of $Y$, which are forbidden nodes.
  Then the set $\mathrm{de}(Y,\mathcal{G})\cap\bZ'$ is empty, because $X\notin\mathrm{de}(Y,\mathcal{G})$.
  The result is immediate using the convention that $\tau_{yy}=1$.
\end{proof}

We are now ready to present the proof of Lemma~\ref{lem:minimal}.

\begin{proof}[Proof of Lemma~\ref{lem:minimal}]
Consider a linear structural equation model that is faithful to $\g$.
  We will first only consider the minimal valid adjustment sets $\bZ_{1},\dots,\bZ_{k}$ in the collection $\mathcal{Z}$.
  The first step of the proof is to show that the regression residuals $(\delta_{x\bz_{1}},\dots,\delta_{x\bz_{k}})$ cannot be linearly dependent.
  Suppose on the contrary that there is a linear combination $\ell=\sum_{i}\alpha_{i}\delta_{x\bz_{i}}$ such that $\ell=0$ for some $\alpha_{1},\dots,\alpha_{k}$ not all equal to $0$.
  Without loss of generality, suppose that $\alpha_{1}\neq 0$.
  Consider the first minimal valid adjustment set $\bZ_{1}$. 
  It contains at least one unique node $N\notin\cup_{2\leq j\leq k}\bZ_{j}$.
  We can thus write $\delta_{x\bz_{1}}=X-\bbeta_{x\bz_{1}}^{\top}\bZ_{1}$, where $\bbeta_{x\bz_{1}}$ is the population OLS regression coefficient of $X$ on $\bZ_{1}$.
  Since $\bZ_{1}$ is a minimal adjustment set, node $N$ is $d$-connected with $X$ in $\g$ given $\bZ_{1}\setminus\{N\}$ by Lemma~\ref{lem:partition}.
  It follows from Corollary~\ref{cor:coefficient} that the regression coefficient $\beta_{xn.\bz_{1,-n}}$ of $N$ in $\bbeta_{x\bz_{1}}$ cannot be zero.
  In this case, expanding $\delta_{x\bz_{i}}$ into $X-\bbeta_{x\bz_{i}}^{\top}\bZ_{i}$ and rearranging the terms, the equation $\ell=0$ can be expressed equivalently as
  \begin{equation}
    N=\frac{1}{\alpha_{1}\beta_{xn.\bz_{1,-n}}}\left[\alpha_{1}\left(X-\sum_{V\in \bZ_{1}\setminus\{N\}}\beta_{xv.\bz_{1,-v}}V\right) + \sum_{i\neq 1}\alpha_{i}(X-\bbeta_{x\bz_{i}}^{\top}\bZ_{i})\right]=\sum_{V\neq N}\gamma_{v}V,\label{eqn:poly}
  \end{equation}
  where $\gamma_{v}=-(\alpha_{1}\beta_{xn.\bz_{1,-n}})^{-1}\left(\sum_{i}\alpha_{i}I(V\in\bZ_{i})\beta_{xv.\bz_{i,-v}}\right)$ for $V\neq X$ and $\gamma_{x}=(\alpha_{1}\beta_{xn.\bz_{1,-n}})^{-1}\sum_{i}\alpha_{i}$.
  Equation (\ref{eqn:poly}) cannot hold due to the fact that the covariance matrix of $\bV$ is non-singular.
  Therefore, we conclude that $\ell\neq 0$ when $\alpha_{1}\neq 0$.
  On the other hand, when $\alpha_{1}=0$, the argument above can be repeated for the minimal adjustment set $\bZ_{2}$ with $\alpha_{2}\neq 0$, and so on up to $\bZ_{k}$ with $\alpha_{k}\neq 0$.
  Since the linear combination $\ell$ cannot evaluate to zero whenever $\alpha_{i}\neq 0$ for any $i\in\{1,\dots,k\}$, the inequality $\ell\neq 0$ holds generally for all $\alpha_{i}$'s not all equal to zero.

  The second step is to show that the regression residual products $(\delta_{x\bz_{1}}\delta_{y\bz_{1}'},\dots,\delta_{x\bz_{k}}\delta_{y\bz_{k}'})$ cannot be linearly dependent either.
  Lemma~\ref{lem:epsy} states that each $\delta_{y\bz'_{i}}$ contains the error term $\epsilon_{y}$.
  For any valid adjustment set $\bZ_{i}$, $\delta_{y\bz_{i}'}\ci \delta_{x\bz_{i}}$ (see proof of Proposition 3.1 in Supplement from \citeauthor{henckel2019graphical} [\citeyear{henckel2019graphical}]).
  Therefore, $\delta_{x\bz_{i}}$, when written in the form of error terms only, cannot contain $\epsilon_{y}$.
  Consider now another linear combination $\ell^{*}=\sum_{i}\xi_{i}\delta_{y\bz_{i}'}\delta_{x\bz_{i}}$.
  Suppose that $\ell^{*}=0$ for some $\xi_{i}$'s not all equal to $0$.
  We can expand $\delta_{y\bz_{i}'}$ into $\epsilon_{y}$ plus some linear combination of the other errors.
  Singling out the terms involving $\epsilon_{y}$ in $\ell^{*}$, we have that
  \begin{equation}
    \epsilon_{y}\sum_{i}\xi_{i}\delta_{x\bz_{i}}=0,
  \end{equation}
  since $\ell^{*}=0$ and $\epsilon_{y}$ is independent from the other errors.
  Due to the non-degeneracy of $\epsilon_{y}$, the linear combination $\sum_{i}\xi_{i}\delta_{x\bz_{i}}$ must evaluate to $0$ for some $\xi_{i}$'s not all equal to $0$.
  However, this is impossible by the linear independence of the $\delta_{x\bz_{i}}$'s shown in the first step, and we have reached a contradiction.
  
  Following the proof of Lemma~\ref{lem:normal}, the asymptotic covariance matrix $\mathbf{\Psi}$ is precisely the covariance matrix of $(\delta_{x\bz_{1}}\delta_{y\bz_{1}'},\dots,\delta_{x\bz_{k}}\delta_{y\bz_{k}'})^{\top}$, which is non-singular due to linear independence among $\delta_{x\bz_{i}}\delta_{y\bz_{i}'}$'s.
  Hence, the corresponding asymptotic covariance matrix $\bSigma_{\mathcal{Z}\setminus\mathrm{nonforb}(X,Y,\g)}$ also has full rank.
  
  Now we consider the set of non-forbidden nodes. Let $\mathbf{N}=\mathrm{nonforb}(X,Y,\g)$. The $d$-connection condition of a unique node $N\in\mathbf{N}$ and faithfulness ensure a non-zero coefficient in front of $N$ in $\delta_{x\mathbf{n}}$.
  Since $\mathrm{nonforb}(X,Y,\g)$ is a valid adjustment set relative to $(X,Y)$ in $\g$, we can repeat the argument above and conclude that the enlarged asymptotic covariance matrix $\bSigma_{\mathcal{Z}}$ is also non-singular.
  
  When the edge coefficients and the error variances in the linear structural equation model are sampled from an absolutely continuous distribution $P$ with respect to the Lebesgue measure, the model is faithful with probability $1$ \citep{spirtes2000causation}.
  Therefore, since we showed that $\bSigma_{\mathcal{Z}}$ is invertible for all faithful models, our claim follows.
\end{proof}

\subsection{Lemma~\ref{lemma:nonforb} and its proof}

\begin{Lemma}
Consider nodes $X$ and $Y$ in a DAG $\g$ such that $Y \in \de(X,\g)$. Then $\mathrm{nonforb}(X,Y,\g)$ is a valid adjustment set relative to $(X,Y)$ in $\g$.
\label{lemma:nonforb}
\end{Lemma}

\begin{proof}
Obviously, $\mathrm{nonforb}(X,Y,\g)$ does not contain any forbidden nodes, so it only remains to show that it blocks all paths from $X$ to $Y$ that are not directed. Note first that the only possible path from $X$ to $Y$ that does not contain a non-collider is $X \rightarrow C \leftarrow Y$. By assumption $\de(Y,\g)\subseteq \f{\g}$ and therefore this path is blocked by $\mathrm{nonforb}(X,Y,\g)$. Let $p$ be any other path from $X$ to $Y$ that is not directed. It must therefore contain at least one non-collider. If any non-collider on $p$ is in $\mathrm{nonforb}(X,Y,\g)$, $p$ is blocked, so suppose this is not the case, i.e., all non-colliders on $p$ are in $\f{\g}$. Any collider on $p$ must be a descendant of a non-collider on $p$ and is therefore also in $\f{\g}$. In this case $p$ is again blocked given $\mathrm{nonforb}(X,Y,\g)$ and therefore we can assume that $p$ does not contain any colliders and is therefore of the form $X \leftarrow \dots \leftarrow F \rightarrow \dots \rightarrow Y$. But any node in $\f{\g}$ that is not $X$ is a descendant of $X$ and therefore $F=X$ or we would have a violation of the acyclicity assumption. But then $p$ is a directed path, which contradicts our starting assumption for $p$.
\end{proof}

\section{Simulation setup}

\label{sec: app_sim}

\subsection{Simulation in Example~\ref{exp:ppplot}}
\label{sec:ppplot}
The definition of the probability-probability plot that we employ in Example~\ref{exp:ppplot} is described as follows. Given a sample of p-values $p_{1},p_{2},\dots,p_{R}$, we sort them in increasing order: $p_{(1)},\dots,p_{(R)}$. Then we apply the empirical distribution function to get the empirical probabilities $\hat{P}_{(j)}$ for $j=1,\dots,R$, i.e., $\hat{P}_{(j)}=\sum_{i=1}^{R}I(p_{(i)}\leq p_{(j)})/R$. These are simply $j/R$ assuming no ties. Since we wish to compare the sample to the standard uniform distribution, whose cumulative distribution function is $F(t)=t$ for $t\in[0,1]$, we compute the theoretical probabilities $P_{(j)}=F(p_{(j)})=p_{(j)}$. The plot is finally obtained by plotting $\hat{P}_{(j)}$ against $P_{(j)}$.

\subsection{Simulation in Section~\ref{sec:sim}}
\label{app:setup}
\paragraph{True graph}
We generate causal DAGs as Erdős–Rényi random graphs. There are in total \(50\) DAGs with \(10\) nodes and \(50\) DAGs with \(15\) nodes. The expected neighbourhood size for each DAG is drawn uniformly from $\{2,3,4,5\}$, with the function \textsf{randDAG} in \textsf{R} package \textsf{pcalg} \citep{kalisch2012causal}.


\paragraph{Linear structural equation model}
For our compatible linear structural equation model we sample edge coefficients uniformly from $[-2,-0.1]\cup[0.1,2]$.
We then draw an error distribution uniformly from one of four distributions: normal, uniform, $t$, or logistic. Note that we use the same error distribution for all errors in the model. We then sample variances for each error in our model as follows. The variance parameter of the normal errors is sampled uniformly from $0.5$ to $1.5$. The location parameter of the uniform errors symmetric around zero is sampled uniformly from $1.2$ to $2.1$. The $t$-errors are sampled from a $t$-distribution with $5$ degrees of freedom and then scaled by $\sqrt{3/5}$ times the square root of a uniformly sampled number from $0.5$ to $1.5$. The scale parameter of the logistic errors centred around zero is sampled uniformly from $0.4$ to $0.7$. By sampling our parameters this way we ensure that the variances are approximately in the interval from $0.4$ to $1.6$.

\paragraph{The pair $(X,Y)$}
The node $X$ is randomly drawn from the true DAG $\g_0$, where we weight each node in $\g_0$ by the number of its descendants minus 1. 
Once $X$ is fixed, we sample $Y$ uniformly from the set $\mathrm{de}(X,\mathcal{G}_{0})\setminus \{X\}$.
The sampling procedure is repeated until there are at least two valid adjustment sets relative to the selected pair $(X,Y)$ in the completed partially directed acyclic graph (CPDAG) of \(\g_0\).

\paragraph{Causal structure learning}
We use causal structure learning algorithms to generate large numbers of reasonable candidate graphs for our test procedure.
If the error distribution is normal, we apply Greedy Equivalence Search (GES; \citealp{Chickering02}) to estimate a completed partially directed acyclic graph (CPDAG). Note that the adjustment criterion also applies to CPDAGs.
Otherwise, we apply LiNGAM \citep{shimizu2014lingam} and estimate a DAG.
We use the functions \textsf{ges} and \textsf{lingam} from \textsf{R} package \textsf{pcalg} with default options \citep{kalisch2012causal}.


\paragraph{Untestable cases}
If there is only one or no adjustment set in the candidate graph $\g$, the proposed test cannot be performed so we discard these cases.
If $Y\notin\mathrm{de}(X,\g)$ the valid adjustment sets are simply those sets that d-separate $X$ from $Y$. As there is a large literature on conditional independence tests which are more suitable here than our test procedure, we discard this case. 
If the rank of $\bSigma_{\mathcal{Z}}$ is estimated to be $1$, there is no effective over-identifying constraint for our test procedure, so we discard these cases as well.

\paragraph{AUC calculation}
Recall that for each candidate graph and sample size for testing \(n\), we perform our test $100$ times. We plot the probability-probability plot between the corresponding $100$ $p$-values and the standard uniform distribution.
We compute the area under the curve (AUC) of this curve with the function \textsf{auc} from \textsf{R} package \textsf{MESS} \citep{ekstrom2020mess}.

\paragraph{Determining whether null hypothesis is true}
For every estimated graph and test strategy, we check using the true linear structural equation model whether the null hypothesis $H_0$ is true or false by computing the population level regression coefficients and checking whether they are all equal.

\paragraph{Version control} The simulation studies were conducted using \textsf{R} version 4.1.1.

\subsection{Extra simulation results}
\label{app:sim}

Figure~\ref{fig:double-sim-extra} and Figure~\ref{fig:double-sim-three} show additional plots of the AUCs from the simulation study. In Figure~\ref{fig:double-sim-extra} the AUCs are grouped by error distribution of the linear structural equation model, graph size of the true graph and expected neighbourhood size of the true graph, respectively. In Figure~\ref{fig:double-sim-extra} they are additionally grouped by the sample size used for testing and the candidate graph accuracy. The plots show that of the three parameters only the error distribution seems to have an impact on the performance of our testing procedure. This is likely due to the fact that in cases with normally distributed errors we can only learn a CPDAG, which contains fewer valid adjustment sets than a DAG.

\begin{figure}
    \centering
    \includegraphics[scale=0.6]{fig/double-sim-boxplot-final-extra.pdf}
    \caption{Extra violin plots (layered with boxplots) of AUCs from the simulation study.}
    \label{fig:double-sim-extra}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[scale=0.6]{fig/double-sim-boxplot-final-three.pdf}
    \caption{Extra violin plots (layered with boxplots) of AUCs from the simulation study, partitioning \(H_{0}\) into \(H_{0}^*\) and \(\neg H_{0}^* \wedge H_{0}\).   }
    \label{fig:double-sim-three}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[scale=0.6]{fig/double-sim-boxplot-minplus-final-three.pdf}
    \caption{Violin plots (layered with boxplots) of AUCs from the simulation study using only the \(\mathrm{Min}+\) strategy, partitioning \(H_{0}\) into \(H_{0}^*\) and \(\neg H_{0}^* \wedge H_{0}\).   }
    \label{fig:double-sim-minplus-three}
  \end{figure}
  
Table~\ref{tab:double-sim} summarises the proportions of candidate graphs (and strategies) where the null hypothesis $H_{0}^{*}$ is true, the null hypothesis $H_{0}^{*}$ is false but the actual test null hypothesis $H_{0}$ is true and both are false, respectively.
Unsurprisingly, $H_{0}^{*}$ is true more often for the high accuracy candidate graphs.
We can also see that the strategy $S=\mathrm{All}$ always results in a higher proportion of cases where $H_{0}$ is false when compared to $S=\mathrm{Min}+$, which is due to the fact that $S=\mathrm{Min}+$ considers a subset of the adjustment sets that $S=\mathrm{All}$ considers.
The problematic cases where $\neg H_{0}^{*}\wedge H_{0}$ generally occur in around $10\%$ of the cases, and interestingly are more common for the larger graphs than for the smaller graphs.

\begin{table}[t!]
\setlength{\aboverulesep}{1pt}
\setlength{\belowrulesep}{1pt}
    \centering
    \small
    \begin{tabular}{p{2.5cm}ccccccc}
    \toprule
     \multirow{2}{*}{Factor} & Strategy & \multicolumn{3}{c}{$S=\mathrm{Min}+$} & \multicolumn{3}{c}{$S=\mathrm{All}$} \\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5} \cmidrule(lr){6-8}
     & Hypothesis & $H_{0}^{*}$ & $\neg H_{0}^{*}\wedge H_{0}$ & $\neg H_{0}$ & $H_{0}^{*}$ & $\neg H_{0}^{*}\wedge H_{0}$ & $\neg H_{0}$ \\
    \midrule
    \multirow{2}{*}{\parbox{2.5cm}{Expected graph\newline accuracy}} & Low & 42.82 & 1.98 & 55.20 & 36.14 & 1.98 & 61.88\\
    \cmidrule(lr){2-8}
    & High & 85.64 & 5.48 & 8.88 & 84.71 & 5.27 & 10.02\\
    \midrule
    \multirow{2}{*}{\parbox{2.5cm}{Graph\newline size}} & 10 & 81.12 & 3.08 & 15.80 & 77.73 & 3.08 & 19.19\\
    \cmidrule(lr){2-8}
    & 15 & 55.24 & 7.46 & 37.30 & 54.31 & 6.99 & 38.69\\
    \midrule
    \multirow{4}{*}{\parbox{2cm}{Neighbourhood\newline size}} & 2 & 92.18 & 0.00 & 7.82 & 88.83 & 0.00 & 11.17\\
    \cmidrule(lr){2-8}
    & 3 & 93.31 & 1.49 & 5.20 & 88.61 & 1.24 & 10.15\\
    \cmidrule(lr){2-8}
    & 4 & 63.14 & 6.34 & 30.53 & 62.56 & 6.34 & 31.10\\
    \cmidrule(lr){2-8}
    & 5 & 54.52 & 7.47 & 38.01 & 52.49 & 7.24 & 40.27\\
    \bottomrule
    \end{tabular}
    \caption{Percentage of true hypotheses in the simulation normalised within each combination of factor and strategy.}
    \label{tab:double-sim}
\end{table}

\begin{table}[ht!]
    \centering
    {\small
    \begin{tabular}{cccccccc}
    \toprule
        Cand. graph & & \multicolumn{2}{c}{\(H_{0}^{*}\)} &
        \multicolumn{2}{c}{\(\neg H_{0}^{*} \wedge H_{0}\)} & \multicolumn{2}{c}{\(\neg H_{0}\)} \\
        \cmidrule(lr){3-4} \cmidrule(lr){5-6} \cmidrule(lr){7-8}
        accuracy & $n$ & $S=\mathrm{Min+}$ & $S=\mathrm{All}$ & $S=\mathrm{Min+}$ & $S=\mathrm{All}$ & $S=\mathrm{Min+}$ & $S=\mathrm{All}$ \\
        \midrule
        \multirow{4}{*}{Low} & 50 & 0.0751 & 0.0909 & 0.0788 & 0.0759 & 0.5570 & 0.7396\\
        & 100 & 0.0636 & 0.0636 & 0.0587 & 0.0385 & 0.6352 & 0.7880\\
        & 200 & 0.0543 & 0.0510 & 0.0488 & 0.0516 & 0.7132 & 0.8341\\
        & 400 & 0.0493 & 0.0466 & 0.0525 & 0.0503 & 0.7887 & 0.8812\\
        \midrule
        \multirow{4}{*}{High} & 50 & 0.0786 & 0.0897 & 0.0711 & 0.0712 & 0.1543 & 0.1697\\
        & 100 & 0.0634 & 0.0587 & 0.0585 & 0.0557 & 0.2026 & 0.2094\\
        & 200 & 0.0559 & 0.0500 & 0.0558 & 0.0476 & 0.2838 & 0.3010\\
        & 400 & 0.0543 & 0.0471 & 0.0492 & 0.0475 & 0.3838 & 0.4152\\
        \bottomrule
    \end{tabular}
    }
    \caption{Proportion of hypotheses rejected at level $0.05$ in the simulation study.}
    \label{tab:three-hypotheses}
\end{table}

\paragraph{Minimal adjustment sets in large sparse graphs}
We ran a small simulation to demonstrate the scalability of the algorithm for minimal adjustment sets proposed by \citet{van2014constructing}.
We simulated Erdős–Rényi graphs with graph size \(100,250,500,1000,2500,5000\) and expected neighbourhood size \(2,3,4,5\).
For each combination above, we generated \(10\) DAGs.
For each DAG, we selected the pair of \((X,Y)\) nodes in the same way as in the main simulation described in Section~\ref{sec:sim}.
We then ran the algorithm to extract minimal adjustment sets relative to \((X,Y)\) and performed the rest of the testing procedure according to Algorithm~\ref{alg:test}.
We allowed up to one hour on each DAG to finish the computation of minimal adjustment sets, and for the graph sizes \(100,250,500,1000,2500,5000\), the percentages of completed algorithm runs were \(100\%, 57.5\%, 92.5\%, 100\%, 95\%, 35\%\), respectively.
The results suggest that the extraction of minimal adjustment sets is possible even for graphs with sizes in the order of \(1000\)s.
We also noted, however, that the space required to store the adjustment sets can also exceed the \(4\) GB RAM allocated. 

\paragraph{\(\mathrm{Min}+\) strategy-only simulation on larger graphs}

We conducted another simulation on graphs of size \(20, 40\) and \(80\) with precisely the same setup as the simulation in Section~\ref{sec:sim} using only the \(\mathrm{Min}+\) strategy. As the \(\mathrm{Min}+\) strategy is computationally much faster than the \(\mathrm{All}\) strategy, we were able to increase the graph sizes while keeping the other configurations unchanged.
It is worth pointing out that attempting to run the simulation on graphs of \(20\) nodes with the \(\mathrm{All}\) strategy in the same setup almost always exceeded the one-hour timeout.
Figure~\ref{fig:double-sim-minplus-three} contains violin plots of AUCs framed by different parameters used in the simulation and coloured by their respective true hypotheses.
The results are very similar to what we saw in the simulation in Section~\ref{sec:sim}.
The small bulks around AUC \(0.25\) to \(0.3\) for \(\neg H_{0}^{*} \wedge H_{0}\) in Figure~\ref{fig:double-sim-minplus-three} are due to a specific DAG and structural equation model where our procedure was very conservative.
One particular simulated graph of size \(80\) was not included in the plots due to memory overflow during the computation of the minimal adjustment sets, which indicated that for graphs larger than $80$, memory might have to be taken into account.

\bibliography{su_509}

\end{document}