%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{amsfonts}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz, pgf} % nice language for creating drawings and diagrams
\usetikzlibrary{automata,positioning}
\usepackage{tikzsymbols}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{xr}
\usepackage{soul}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<x>}{<y>} typesets <y><sep><x>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Upright operator names for graph-theoretic sets. In this file \pa is used as
% \pa_\G(D) and \mb as \mb_\G(V_i), which the text glosses as the Markov
% blanket. The remaining names presumably follow the usual graphical-model
% abbreviations -- TODO confirm against the main paper.
\DeclareMathOperator{\pre}{pre}
\DeclareMathOperator{\pa}{pa}
\DeclareMathOperator{\ch}{ch}
\DeclareMathOperator{\de}{de}
\DeclareMathOperator{\nd}{nd}
\DeclareMathOperator{\an}{an}
\DeclareMathOperator{\sib}{sib}
\DeclareMathOperator{\dis}{dis}
\DeclareMathOperator{\mb}{mb}
% \doo is the "do" of interventions; the text writes it as \doo(a), \doo(m).
\DeclareMathOperator{\doo}{do}
% Starred declarations: sub/superscripts are set as limits in display math.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
% Edge symbols for describing graphs inline: right/left directed arrows and a
% bidirected arrow, each forced to black.
\DeclareMathOperator{\diedgeright}{\textcolor{black}{{\rightarrow}}}
\DeclareMathOperator{\diedgeleft}{\textcolor{black}{{\leftarrow}}}
\DeclareMathOperator{\biedge}{\textcolor{black}{{\leftrightarrow}}}
% \circright: a circle followed by a right arrow, pulled together by 0.15cm.
% The text uses it to mean "a directed or bidirected edge".
\DeclareMathOperator{\circright}{{{\circ\hspace{-0.15cm}\rightarrow}}}
% \christoffel{a}{b}{c} -> \Gamma^{ab}_{c}; \rc{x} -> x in angle brackets;
% \trace{x} -> trace(x). \ensuremath makes them usable in text or math mode.
\newcommand{\christoffel}[3]{\ensuremath{\Gamma^{#1#2}_{#3}}}
\newcommand{\rc}[1]{\ensuremath{\langle #1 \rangle}}
\newcommand{\trace}[1]{\ensuremath{\text{trace}(#1)}}
% \red{...}: shorthand for \textcolor{red}{...} (the argument is picked up by
% \textcolor after expansion).
\newcommand{\red}{\textcolor{red}}
% \E: blackboard-bold E, used for expectations such as \E[Y|\doo(a)].
\newcommand{\E}{\mathbb{E}}
% \ci: two \perp symbols overlapped with negative thin spaces; the
% independence symbol, as in Z \ci Y (negated in the text as \not\ci).
\newcommand\ci{\perp\!\!\!\perp}
% \G: calligraphic G, used in the text to denote an ADMG/CADMG.
\newcommand{\G}{{\mathcal G}}
% \I: blackboard-bold I (not used in the part of the file reviewed here).
\newcommand{\I}{{\mathbb I}}
% theorem, lemma, remark, definition
\usepackage{amsthm}
% Numbered environments declared before any \theoremstyle call in this file,
% so they use whatever style is current at this point. Each has its own
% independent counter.
% NOTE(review): both `cor' and `corollary' (below) print as "Corollary" but
% are numbered differently -- confirm which one the paper intends to use.
\newtheorem{cor}{Corollary}
\newtheorem{prop}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{theorem}{Theorem}
% `corollary' is numbered within `theorem' (optional trailing argument).
\newtheorem{corollary}{Corollary}[theorem]
% Unnumbered (starred) environments, declared in the remark style.
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\newtheorem*{claim}{Claim}
% Numbered definitions, declared in the definition style.
\theoremstyle{definition}
\newtheorem{definition}{Definition}

\allowdisplaybreaks


\title{On Testability of the Front-Door Model via Verma  Constraints: Appendix}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<rb17@williams.edu>?Subject=Your UAI 2022 paper}{Rohit~Bhattacharya}{}}
\author[2]{Razieh~Nabi}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    Williams College\\
    Williamstown, Massachusetts, USA
}
\affil[2]{%
    Department of Biostatistics and Bioinformatics\\
    Emory University\\
   	Atlanta, Georgia, USA
}

\externaldocument{bhattacharya_335}
  
 \begin{document}
 
 \appendix
 \onecolumn
\maketitle


\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}

\vspace{-1em}
The supplementary materials are organized as follows. Appendix~\ref{app:verma-comms} provides additional commentary on assumptions used in the testability criterion, while Appendix~\ref{app:list} lists all ADMGs that satisfy the criterion. Appendices~\ref{app:np-tests}, \ref{app:baseline}, \ref{app:nested}, and~\ref{app:sims} provide additional notes on non-parametric Verma tests, models with additional baseline covariates, the nested Markov factorization, and the simulated datasets, respectively. Appendix~\ref{app:proofs} contains proofs of the main results.

\section{On The True Mediation And Relevance Assumptions}
\label{app:verma-comms}

% Figure: three graphs over the nodes Z, A, M, Y laid out left to right, one
% per violated assumption (see the caption). Blue edges are directed, red
% edges with the <-> option are bidirected.
\begin{figure}[h]
	\begin{center}
		\scalebox{0.75}{
			\begin{tikzpicture}[>=stealth, node distance=1.9cm]
				% 'square' style: declared here but not applied to any node in this figure.
				\tikzstyle{square} = [draw, thick, minimum size=1.0mm, inner sep=3pt]
				
				% Panel (a): Z -> A, M -> Y, A <-> Y; there is no A -> M edge.
				\begin{scope}[xshift=0cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(m) edge[blue] (y)
					(a) edge[red, <->, bend left] (y) 
					
					% Panel label, positioned relative to node A; the node name (b) is reused in every panel.
					node[below of=a, xshift=1cm, yshift=1cm] (b) {(a)} ;
				\end{scope}
				
				% Panel (b): A -> M, M -> Y, A <-> Y, plus a direct A -> Y edge; Z has no edges.
				\begin{scope}[xshift=8cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					(a) edge[blue, bend right] (y)
					
					node[below of=a, xshift=1cm, yshift=1cm] (b) {(b)} ;
				\end{scope}
				
				
				% Panel (c): Z -> A, A -> M, M -> Y, with the bidirected edge between A and M (not A and Y).
				\begin{scope}[xshift=16cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (m)
					
					node[below of=a, xshift=1cm, yshift=1cm] (b) {(c)} ;
				\end{scope}
				
			\end{tikzpicture}
		}
	\end{center}
	% Negative skip pulls the caption up towards the drawing.
	\vspace{-0.5cm}
	\caption{ (a) A model that does not satisfy $M$ being a mediator between $A$ and $Y$; (b) A model that does not satisfy $Z\not\ci A$; (c) A model that does not satisfy $Z \not\ci Y | A, M$.}
	\label{fig:trivial-verma}
\end{figure}

Assumptions (A1) (true mediation) and (A2) (relevance of the anchor) help ensure that any Verma constraint detected between the anchor $Z$ and outcome $Y$  is ``non-trivial,'' i.e., is not implied by an ordinary independence constraint. We first show why this is important by way of example, and then show that no ordinary constraint exists between $Z$ and $Y$ in distributions satisfying (A1) and (A2). Before proceeding, we also briefly note that (A1) is a relatively weak assumption in that it relies on some minimal background knowledge about how the treatment affects the outcome, while (A2) is testable.

Consider the ADMG in Fig.~\ref{fig:trivial-verma}(a). Here, $M$ does not satisfy assumption (A1). It can be checked via m-separation that any distribution that nested factorizes with respect to this ADMG would encode the ordinary independence $Z \ci Y.$ Trivially, this constraint would also hold in the post-intervention distribution $p(Z, A, M, Y)/p(M|A, Z).$ Fig.~\ref{fig:trivial-verma}(b) demonstrates why accidentally testing for trivial Verma constraints such as this one may lead to incorrect conclusions about identifiability of the target $\E[Y|\doo(a)].$ In Fig.~\ref{fig:trivial-verma}(b), we have that $Z \ci A,$ so assumption (A2) is not satisfied. There exist no paths between $Z$ and $Y,$ leading to a trivial Verma constraint implied by the ordinary independence $Z \ci Y.$ Relying on a test of this trivial constraint would also mislead the analyst into thinking $\E[Y|\doo(a)]$ is identified, when in fact, the front-door conditions are not met due to the $A\diedgeright Y$ edge. Finally, consider the ADMG in Fig.~\ref{fig:trivial-verma}(c). Distributions factorizing according to this graph satisfy $Z \ci Y \mid A, M,$ which violates assumption (A2). These distributions will also exhibit the same trivial Verma constraint, and here too, relying on such a test would lead to incorrect conclusions on identifiability. The following lemma confirms that under the given assumptions such trivial constraints do not arise.

\begin{lemma}
	\label{lem:non-trivial}
	Distributions $p(Z, A, M, Y)$ satisfying assumptions (A1) and (A2) do not entail any ordinary independence constraints between $Z$ and $Y.$
\end{lemma}
\begin{proof}
	Given (A1), $p(Z, A, M, Y)$ must nested factorize wrt an ADMG, say $\G$, containing the path $A \diedgeright M \diedgeright Y.$ Given that $Z \not\ci A$ and that $Z$ is not a causal consequence of $A$ due to (A2), $\G$ also contains either $Z \diedgeright A$ or $Z\biedge A$ or both. This implies the existence of an unblocked path $Z\diedgeright A \diedgeright M \diedgeright Y$ and/or a similar one that starts with $Z \biedge A.$ These paths become blocked when we condition on $A$ and $M.$ However, since by assumption (A2) we have that $Z \not\ci Y \mid A, M,$ at least one of the following paths consisting of colliders exists in $\G$ (we use the notation $\circright$ below to mean a directed or bidirected edge):
	\begin{itemize}
		\item The edge $Z \circright Y$ exists in $\G$ (trivially a collider path.)
		\item A path $Z \circright A \biedge Y$ exists in $\G$ where $A$ is a collider.
		\item A path $Z \circright M \biedge Y$ exists in $\G$ where $M$ is a collider.
		\item A path $Z \circright A \biedge M \biedge Y$ and/or $Z \circright M \biedge A \biedge Y$ exists in $\G$ where both $A$ and $M$ are colliders.
	\end{itemize}

	Note that conditioning on a descendant of a collider also opens the collider; however, given just these 4 variables, this situation arises only when we condition on $M$ with $A$ being the collider (a case which is already covered above.) An inducing path between two variables $P$ and $Q$ is defined as a path (comprised of any combination of directed and bidirected edges) between them such that every intermediate node is a collider and a causal ancestor of either $P$ or $Q.$ If such a path exists, then it is known that $P$ and $Q$ are not m-separable given any subset of other variables, i.e., there is no ordinary independence constraint between them \citep{verma1990equivalence}. Here, any of the above paths would form inducing paths ($Z\circright Y$ is trivially an inducing path), as by assumption (A1) both $A$ and $M$ are causal ancestors of $Y.$ Hence, there are no ordinary conditional independence constraints between $Z$ and $Y$ implied in distributions $p(Z, A, M, Y)$ that satisfy assumptions (A1) and (A2).
\end{proof}


\section{Exhaustive List of ADMGs Satisfying The Testable Criterion}
\label{app:list}

Fig.~\ref{fig:all-admgs-with-verma} is an exhaustive list of all ADMGs that satisfy the Verma restriction in Theorem~\ref{thm:fd}. Importantly, all of these ADMGs also satisfy the front-door criterion (or one of its extensions), which permits identification of $\E[Y|\doo(a)].$ The ADMGs in Fig.~\ref{fig:all-admgs-with-verma}(a-c) are ones where the anchor $Z$ also satisfies the instrumental variable conditions. A version of Fig.~\ref{fig:all-admgs-with-verma} appears in \citet{shpitser2014introduction} as the conjectured list of ADMGs that satisfy a Verma restriction which distinguishes them from a supermodel where the $Z \diedgeright Y$ edge is present. This conjecture was based on empirical comparisons of Bayesian Information Criterion scores of all 4-variable ADMGs using a parameterization of the nested Markov model for discrete data \citep{evans2014markovian}.

% Figure: eight panels (a)-(h) in a two-column grid (xshift 0cm / 11cm, rows
% 3cm apart). Every panel contains A -> M, M -> Y and A <-> Y; the panels
% differ only in the edges leaving Z. Blue = directed, red <-> = bidirected.
\begin{figure*}[h]
	\begin{center}
		\scalebox{0.7}{
			\begin{tikzpicture}[>=stealth, node distance=2cm]
				% 'square' style: declared here but not applied to any node in this figure.
				\tikzstyle{square} = [draw, thick, minimum size=1.0mm, inner sep=3pt]
				
				% Panel (a): Z -> A.
				\begin{scope}[xshift=0cm]
					\path[->, very thick]
					node[] (label) {(a)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (b): Z <-> A.
				\begin{scope}[xshift=11cm]
					\path[->, very thick]
					node[] (label) {(b)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[<->, red] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (c): both Z -> A and Z <-> A.
				\begin{scope}[xshift=0cm, yshift=-3cm]
					\path[->, very thick]
					node[] (label) {(c)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[<->, red, bend left] (a)
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (d): Z -> A and Z -> M.
				\begin{scope}[xshift=11cm, yshift=-3cm]
					\path[->, very thick]
					node[] (label) {(d)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(z) edge[blue, bend right] (m)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (e): Z -> A and Z <-> M.
				\begin{scope}[xshift=0cm, yshift=-6cm]
					\path[->, very thick]
					node[] (label) {(e)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(z) edge[red, <->, bend right] (m)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (f): Z <-> A and Z -> M.
				\begin{scope}[xshift=11cm, yshift=-6cm]
					\path[->, very thick]
					node[] (label) {(f)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[red, <->] (a)
					(z) edge[blue, bend right] (m)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (g): Z -> A, Z <-> A and Z -> M.
				\begin{scope}[xshift=0cm, yshift=-9cm]
					\path[->, very thick]
					node[] (label) {(g)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(z) edge[red, <->, bend left] (a)
					(z) edge[blue, bend right] (m)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				% Panel (h): Z -> A, Z <-> M and Z -> M.
				\begin{scope}[xshift=11cm, yshift=-9cm]
					\path[->, very thick]
					node[] (label) {(h)}
					node[right of=label, xshift=-1cm] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					
					(z) edge[blue] (a)
					(z) edge[red, <->, bend left] (m)
					(z) edge[blue, bend right] (m)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(a) edge[<->, red, bend left] (y)
					;
				\end{scope}
				
				
			\end{tikzpicture}
		}
	\end{center}
	\caption{An exhaustive list of all ADMGs that satisfy the Verma constraint $Z \ci Y$ in $p(Z, A, M, Y)/p(M|A, Z)$.}
	\label{fig:all-admgs-with-verma}
\end{figure*}

\clearpage

\section{Additional Details On Non-Parametric Tests}
\label{app:np-tests}

In this section we provide additional details on how a non-parametric Verma test can be performed using our methods. Assume we are given a dataset ${\cal S}_n$ with $n$ i.i.d.\ samples of data, and any appropriate non-parametric test $\tau(Z, Y, M)$ that can be used to test an ordinary conditional independence $Z \ci Y \mid M.$ The idea for a non-parametric Verma test is simple: Verma constraints resemble ordinary conditional independencies albeit in identified post-intervention distributions. Thus, if we are able to generate a pseudo-dataset ${\cal S}^p_{(\cdot)}$ that resembles the post-intervention distribution in which the desired dormant conditional independence holds, then $\tau(Z, Y, M)$ can easily be applied to ${\cal S}^p_{(\cdot)}$ rather than ${\cal S}_n.$ More concretely, to use the dual weights for a non-parametric Verma test we would proceed as follows:

\begin{enumerate}
	\setlength{\itemsep}{0.25cm}
	\item Compute $q^d(M|A, Z) \equiv p(M|A, Z)/p(M|A=a, Z)$ for each row of data $i=1, \dots, n$ using suitable machine learning methods, e.g., kernel regressions or random forests.
	\item Create a pseudo-dataset ${\cal S}^p_{n/2}$ that mimics the post-intervention distribution $p(Z, M, Y \mid \doo(a))$ by drawing $n/2$ samples with replacement from ${\cal S}_n$, where each row $i$ has an (unnormalized) probability of being sampled given by the dual weights $1/q^d(M=m_i|A=a_i, Z=z_i).$
	\item Apply $\tau(Z, Y, M)$ to ${\cal S}^p_{n/2}$ with some pre-specified significance level $\alpha.$
\end{enumerate}

Note that some of the statistical inefficiency in our tests comes from resampling only half the data from the original dataset when performing the Verma test. This is the simplest sampling scheme proposed by \citet{thams2021statistical}, but the authors have also proposed more complex adaptive schemes that may improve performance; see their paper for details. A non-parametric test using primal weights would proceed in exactly the same way except we would fit the appropriate models to generate the primal weights $1/\widetilde{q}(A \mid Y, Z, M).$

\clearpage

\section{Models With Baseline Covariates}
\label{app:baseline}

% Figure: two panels. (a) a graph over Z, A, M, Y with an extra node C that
% has a directed edge into each of them; (b) the same graph with M replaced
% by a boxed lower-case m and no edges drawn into it.
\begin{figure}[h]
	\begin{center}
		\scalebox{0.7}{
			\begin{tikzpicture}[>=stealth, node distance=2cm]
				% 'square' style: draws a box around a node; applied to the node m in panel (b).
				\tikzstyle{square} = [draw, thick, minimum size=1.0mm, inner sep=3pt]
				
				% Panel (a): Z -> A, A -> M, M -> Y, Z -> M, A <-> Y, and C -> {Z, A, M, Y}.
				\begin{scope}[xshift=0cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					node[above right of=a, xshift=-0.5cm] (c) {$C$}
					node[below right of=a, xshift=-0.5cm] (label) {\large (a)\  $p(C, Z, A, M, Y)$}
					
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(z) edge[blue, bend right] (m)
					(a) edge[red, <->, bend left] (y) 
					(c) edge[blue, bend right] (z)
					(c) edge[blue, bend right] (a)
					(c) edge[blue, bend left] (m)
					(c) edge[blue, bend left] (y)
					;
				\end{scope}
			
				% Panel (b): Z -> A, m -> Y, A <-> Y, and C -> {Z, A, Y}; the edges into M from panel (a) are absent.
				\begin{scope}[xshift=9cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a, square] (m) {$m$}
					node[right of=m] (y) {$Y$}
					node[above right of=a, xshift=-0.5cm] (c) {$C$}
					
					(z) edge[blue] (a)
					(m) edge[blue] (y)
					(a) edge[red, <->, bend left] (y) 
					(c) edge[blue, bend right] (z)
					(c) edge[blue, bend right] (a)
					(c) edge[blue, bend left] (y)
					node[below right of=a, xshift=-0.5cm] (label) {\large (b) \ $p(C, Z, A, Y\mid \doo(m))$}
					;
				\end{scope}
					
			\end{tikzpicture}
		
		}
	\end{center}
	% Negative skip pulls the caption up towards the drawing.
	\vspace{-0.25cm}
	\caption{ (a) A modification of the model in Fig.~\ref{fig:intro}(b) to include baseline covariates $C;$ (b) CADMG showing that the Verma constraint still exists as a conditional independence $Z \ci Y \mid C$ in $p(C, Z, A, Y \mid \doo(m)).$}
	\label{fig:baseline-covariates}
\end{figure}

In this section we make explicit how our framework extends to settings with baseline  covariates. Consider the ADMG in Fig.~\ref{fig:baseline-covariates}(a) that is a modification of Fig.~\ref{fig:intro}(b) to include baseline covariates $C$ that point to all other variables $Z, A, M, Y.$ We show that the inclusion of additional covariates requires only small modifications to the underlying theory  in the main paper.

First, notice that there is still no ordinary conditional independence between $Z$ and $Y$ implied by Fig.~\ref{fig:baseline-covariates}(a). Further, the post-intervention distribution $p(C, Z, A, Y \mid \doo(m))$ is identified as $p(C, Z, A, Y, M=m) / p(M=m|A, Z, C).$ This post-intervention distribution factorizes according to the CADMG shown in Fig.~\ref{fig:baseline-covariates}(b) from which we can read off the dormant conditional independence $Z \ci Y \mid C$ in $p(C, Z, A, Y \mid \doo(m)).$ That is, the Verma constraint between the anchor and the outcome is now a \emph{conditional} rather than marginal independence in the post-intervention distribution, and the distribution itself is obtained using a propensity score for $M$ that  includes $C$ in the conditioning set. The identifying functional for the post-intervention distribution where we intervene on $A$ is also very similar: $p(C, Z, M, Y \mid \doo(a)) = p(C, Z) \times p(M | A=a, Z, C) \times \sum_A p(A| C, Z)\times p(Y|Z, A, M, C).$

Based on the above observations, the modifications to the theory are as follows. Assumptions (A1) and (A3) remain the same, however, the relevance assumption (A2) now includes $C$ in the conditioning set (similar to the relevance assumption in conditional IV models.) That is, (A2) is modified to read: $Z$ is a covariate that is \emph{not} a causal consequence of $A$ such that $Z\not\ci A \mid C$ and $Z \not\ci Y \mid A, M, C.$ As mentioned earlier, the Verma constraint that allows us to test that $A$ has no bidirected path to its children is then the dormant \emph{conditional} independence $Z \ci Y \mid C$ in $p(Z, A, M, Y, C)/p(M | A, Z, C).$ The tests proposed in Section~\ref{sec:finite-sample} are modified appropriately by defining $\widetilde{q}(A|Y, Z, M, C)$ and $q^d(M|A, Z, C)$ as:
\begin{align*}
	\widetilde{q}(A \mid Y, Z, M, C) &\equiv \frac{p(A\mid Z, C)\times p(Y \mid A, M, Z, C)}{\sum_A p(A\mid Z, C)\times p(Y \mid A, M, Z, C)} \\
	q^d(M \mid A, Z, C) &\equiv \frac{p(M \mid A, Z, C)}{p(M \mid A=a, Z, C)}.
\end{align*}
The primal and dual weights are then used to test $Z \ci Y \mid M, C$ in $p(C, Z, A, M, Y)/\widetilde{q}(A|Y, Z, M, C)$ and $Z \ci Y \mid M, C$ in $p(C, Z, A, M, Y)/q^d(M|A, Z, C)$. The same weights can be re-used for causal effect estimation as before.

Hence, the proposed framework does not exclude the possibility of including additional baseline covariates. Similar to conditional IV models, the inclusion of such covariates can be beneficial from an identification standpoint by reducing the possibility of unmeasured confounding on the causal path from $A \diedgeright M \diedgeright Y.$ The addition of these covariates may also yield additional statistical efficiency in the test and downstream causal effect estimation.

\clearpage


\section{Nested Markov Factorization of ADMGs}
\label{app:nested}

The nested Markov factorization of $p(V)$ with respect to an ADMG $\G(V)$ is defined using conditional ADMGs (CADMGs) derived from $\G(V)$ and kernel objects derived from $p(V)$ via a \textit{fixing} operation \citep{richardson2017nested}. The fixing operation can be causally interpreted as an application of the g-formula on a single variable to a given graph-kernel pair to obtain another graph-kernel pair corresponding to a conditional ADMG and a post-intervention distribution that factorizes according to it. We define the relevant concepts below.

\emph{Conditional ADMG (CADMG)} -- A CADMG $\G(V,W)$ is an ADMG whose vertices can be partitioned into random variables $V$ and fixed/intervened variables $W.$ Only outgoing directed edges may be adjacent to variables in $W.$ For any random variable $V_i \in V$ in a CADMG $\G(V, W)$, the usual definitions of genealogical relations, such as parents and descendants, extend naturally by allowing for the inclusion of fixed variables into these sets. However, bidirected connected components, a.k.a.\ districts, of a CADMG ${\cal G}(V,W)$ are only defined for elements of $V$.

\emph{Kernel} -- A kernel $q_V(V \mid W)$ is a mapping from values in $W$ to normalized densities over $V$. That is, a kernel acts in most respects like an ordinary conditional distribution. In particular, given a kernel $q_V(V \mid W)$ and a subset ${X}\subseteq {V}$, marginalization and conditioning are defined in the usual way as $q_V({X} \mid {W}) \equiv \sum_{{V} \setminus {X}} \ q_V(V \mid W)$ and $q_V({V}\setminus {X} \mid {X}, {W}) \equiv {q_V({V} \mid {W})}/{q_V({X} \mid {W})}$, respectively.

\emph{Fixing operation for a single variable} -- Fixability of a single variable and the graphical and probabilistic operations of fixing it are defined as follows: 
\begin{itemize}
	\setlength{\itemsep}{0.25cm} 
	
	\item \emph{Fixability} --  A vertex $V_i \in V$ is said to be \emph{fixable} in $\G(V,W)$ if  $V_i \rightarrow \ldots \rightarrow V_j$ and $V_i \leftrightarrow \ldots \leftrightarrow V_j$ do not both exist in $\G$, for any $V_j \in V \setminus V_i$. 
	
	\item \emph{Graphical operation of fixing} -- 
	The graphical operation of fixing $V_i,$ denoted by $\phi_{V_i}(\G),$ yields a new CADMG $\G(V \setminus V_i, W \cup V_i)$ where all edges with arrowheads into $V_i$ are removed, and $V_i$ is fixed to a particular value $v_i.$  All other edges in $\G$ are kept the same. 
	
	\item  \emph{Probabilistic operation of fixing} -- 
	Given a kernel $q_V(V\mid W)$, the associated CADMG $\G(V,W),$ and $V_i \in V$, the corresponding probabilistic operation of fixing $V_i$, denoted by $\phi_{V_i}(q_V;\G),$ yields a new kernel defined as follows:
	\begin{align}
		\phi_{V_i}(q_V; \G) \equiv q_{V\setminus V_i}(V \setminus V_i \mid W \cup V_i) \equiv \frac{q_{V}(V \mid W)}{q_V(V_i \mid \mb_\G(V_i), W)},
		\label{eq:ordinary_fixing} 
	\end{align}
	where $\mb_\G(V_i)$ denotes the Markov blanket of $V_i$, which consists of all vertices in the district of $V_i$ and the parents of the district of $V_i$ (excluding $V_i$ itself.)
\end{itemize}  

\emph{Fixing operation for a set of variables} -- The above definitions can be extended to a set of vertices $S$ as follows: 
\begin{itemize}
	\setlength{\itemsep}{0.25cm} 
	
	\item \emph{Fixability} -- A set $S \subseteq V$ is said to be fixable if there exists an ordering $(S_1, \dots, S_p)$ such that $S_1$ is fixable in $\G,$ $S_2$ is fixable in $\phi_{S_1}(\G),$ and so on. Such an ordering is said to form a valid fixing sequence for $S$ and we denote it by $\sigma_S$. 
	\item \emph{Graphical operation} -- It is known that any two valid fixing sequences for $S$ yield the same CADMG, so we denote this CADMG by $\phi_S(\G(V,W)).$  %${ V} \setminus { S}$ is said to be \emph{reachable} in $\G$. 	
	\item \emph{Probabilistic operation} -- $\phi_{\sigma_{ S}}(q_V; \G)$ is defined via the usual function composition to yield operators that fix all elements in ${S}$ in the order given by $\sigma_{S}$.
\end{itemize}

%The last concept we need for  defining the nested Markov model of an ADMG is the notion of an \emph{intrinsic} set. A set $D$ is said to be \emph{intrinsic} in $\G(V)$ if $V\setminus D$ is fixable in $\G$\footnote{That is, from a causal perspective, $q_D(D\mid \pa_\G(D)))$ is identified from $p(V)$ via sequential application of the g-formula.} and $D$ forms a single district in $\phi_{V\setminus D}(\G)$. %if ${\cal G}_{ D}$ has a single district, where ${\cal G}_{D}$ is the induced subgraph where we keep all vertices in ${D}$ and edges whose endpoints are in ${D}$. %We define $\phi_{\sigma_{S}}(\G)$ and $\phi_{\sigma_{ S}}(q_{\bf V}; \G)$ via the usual function composition to yield operators that fix all elements in ${S}$ in the order given by  $\sigma_{S}$. %given by $\sigma_{\bf S}$.
%

\emph{Intrinsic set} -- A set $D$ is said to be \emph{intrinsic} in $\G(V)$ if $V\setminus D$ is fixable in $\G$\footnote{That is, from a causal perspective, $q_D(D\mid \pa_\G(D))$ is identified from $p(V)$ via sequential applications of the g-formula.} and $\phi_{V\setminus D}(\G)$ contains a single district.

Finally, a distribution $p(V)$ is said to satisfy the nested Markov factorization wrt an ADMG $\G(V)$ if there exists a set of kernels $q_D(D \mid \pa_\G(D))$, one for every $D$ intrinsic in $\G(V)$, s.t. for every fixable set $S$ and every valid fixing sequence $\sigma_S,$
\begin{align}
	\phi_{\sigma_S}(p(V);\G) = \prod_{D \in {\cal D}(\phi_S(\G))} q_D(D \mid \pa_{\G}(D)),  %\hspace{1cm} {\small \textit{(Nested Markov factorization)}}
	\label{eq:nested_Markov_model}
\end{align}%
where ${\cal D}(\phi_S(\G))$ denotes the set of all districts in the CADMG $\phi_S(\G).$ The nested Markov factorization asserts that every kernel that can be derived via a valid sequence of fixing satisfies the district factorization with respect to the CADMG obtained by this sequence, and each of the kernels appearing in the factorization corresponds to intrinsic sets. 

%\clearpage

\section{Details On Simulated Data}
\label{app:sims}

% Figure: four panels (a)-(d) over Z, A, M, Y. Nodes named U_{..} and their
% outgoing edges are drawn at reduced opacity; red <-> edges are bidirected;
% dashed edges appear only in panels (c) and (d).
\begin{figure}[h]
	\begin{center}
		\scalebox{0.6}{
			\begin{tikzpicture}[>=stealth, node distance=1.9cm]
				% 'square' style: declared here but not applied to any node in this figure.
				\tikzstyle{square} = [draw, thick, minimum size=1.0mm, inner sep=3pt]
				
				% Panel (a): Z -> A, A -> M, M -> Y, Z -> M, A <-> Y; U_{1,2} -> A and U_{1,2} -> Y.
				\begin{scope}[xshift=0cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					node[above of=m, yshift=-0.75cm, opacity=0.5] (u12) {$U_{1, 2}$}
					
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(z) edge[blue, bend right] (m)
					(a) edge[red, <->, bend left] (y) 
					(u12) edge[blue, opacity=0.4, bend right] (a)
					(u12) edge[blue, opacity=0.4, bend left] (y)
					
					% Panel label, positioned relative to node A; the node name (b) is reused in every panel.
					node[below of=a, xshift=1cm, yshift=0cm] (b) {(a)} ;
				\end{scope}
				
				% Panel (b): as (a) but Z <-> M replaces Z -> M, with a second faded node U_{3,4} -> Z and U_{3,4} -> M.
				\begin{scope}[xshift=7cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					node[below of=a, yshift=0.75cm, opacity=0.5] (u34) {$U_{3, 4}$}
					node[above of=m, yshift=-0.75cm, opacity=0.5] (u12) {$U_{1, 2}$}
					
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(z) edge[red, <->, bend right] (m)
					(a) edge[red, <->, bend left] (y) 
					(u12) edge[blue, opacity=0.4, bend right] (a)
					(u12) edge[blue, opacity=0.4, bend left] (y)
					(u34) edge[blue, opacity=0.4, bend left] (z)
					(u34) edge[blue, opacity=0.4, bend right] (m)
					
					node[below of=a, xshift=1cm, yshift=0cm] (b) {(b)} ;
				\end{scope}
				
				
				% Panel (c): as (a) plus U_{1,2} -> M and dashed bidirected edges A <-> M and M <-> Y.
				\begin{scope}[xshift=14cm]
					\path[->, very thick]
					node[] (z) {$Z$}
					node[right of=z] (a) {$A$} 
					node[right of=a] (m) {$M$}
					node[right of=m] (y) {$Y$}
					node[above of=m, yshift=-0.75cm, opacity=0.5] (u12) {$U_{1, 2}$}
					
					(z) edge[blue] (a)
					(a) edge[blue] (m)
					(m) edge[blue] (y)
					(z) edge[blue, bend right=55] (m)
					(a) edge[red, <->, bend left] (y) 
					(u12) edge[blue, opacity=0.4, bend right] (a)
					(u12) edge[blue, opacity=0.4, bend left] (y)
					(u12) edge[blue, opacity=0.4] (m)
					(a) edge[red, bend right, dashed, <->] (m)
					(m) edge[red, bend right, dashed, <->] (y)
					
					node[below of=a, xshift=1cm, yshift=0cm] (b) {(c)} ;
				\end{scope}
			
			% Panel (d): as (a) plus a dashed directed edge A -> Y.
			\begin{scope}[xshift=21cm]
				\path[->, very thick]
				node[] (z) {$Z$}
				node[right of=z] (a) {$A$} 
				node[right of=a] (m) {$M$}
				node[right of=m] (y) {$Y$}
				node[above of=m, yshift=-0.75cm, opacity=0.5] (u12) {$U_{1, 2}$}
				
				(z) edge[blue] (a)
				(a) edge[blue] (m)
				(m) edge[blue] (y)
				(z) edge[blue, bend right] (m)
				(a) edge[red, <->, bend left] (y) 
				(u12) edge[blue, opacity=0.4, bend right] (a)
				(u12) edge[blue, opacity=0.4, bend left] (y)
				(a) edge[blue, bend right, dashed] (y)
				
				node[below of=a, xshift=1cm, yshift=0cm] (b) {(d)} ;
			\end{scope}
				
			\end{tikzpicture}
		}
	\end{center}
	% Negative skip pulls the caption up towards the drawing.
	\vspace{-0.5cm}
	\caption{ Data for the simulations are generated according to: (a, b) Two hidden variable causal DAGs and the corresponding ADMGs that satisfy the front-door assumptions; (c, d) Two hidden variable causal DAGs and the corresponding ADMGs that do not satisfy the front-door assumptions due to additional confounding and a direct effect of $A$ on $Y$ respectively. }
	\label{fig:sims-parametric-verma}
\end{figure}

The causal graphs used to perform experiments in Task (i) of Section~\ref{sec:experiments} are shown in Fig.~\ref{fig:sims-parametric-verma} -- underlying hidden variables are depicted with lower opacity to highlight the structure of the ADMG. The first two graphs are ones in which $A$ has no bidirected path to its children and the Verma constraint $Z \ci Y$ in $p(Z, A, Y \mid \doo(m))$ holds; the latter two are ones in which there is additional confounding or violation of the exclusion restriction, which also coincide with the absence of any non-parametric equality constraint between $Z$ and $Y.$ 
We first describe generating data according to the hidden variable causal model in Fig.~\ref{fig:sims-parametric-verma}(a) whose observed margin would nested Markov factorize wrt the ADMG shown in the same figure. Each coefficient $\beta_{(\cdot)}$ appearing in any of the equations below is generated randomly from a uniform distribution $\text{Uniform}(1, 2).$
\begin{align*}
	&U_1 \sim \text{Uniform}(-1, 1) \\
	&p(U_2=1) = \text{expit}(0.5) \\
	& p(Z=1) = \text{expit}(0.5) \\
	& p(A=1 \mid Z, U_1, U_2) = \text{expit}(-0.5 + \beta_{ZA}Z - \beta_{U_1A}U_1 + \beta_{U_2A}U_2) \\
	& p(M=1 \mid Z, A) = \text{expit}(-0.5 -  \beta_{ZM}Z + \beta_{AM}A)\\
	& Y \mid U_1, U_2, M = \beta_{U_1Y} U_1 + \beta_{U_2Y}U_2 - \beta_{MY}M + \text{Normal}(0, 1).
\end{align*}
%
Similarly, data generation for Fig.~\ref{fig:sims-parametric-verma}(b) is performed as follows:
\begin{align*}
	&U_1 \sim \text{Uniform}(-1, 1) \\
	&U_3 \sim \text{Uniform}(-1, 1) \\
	&p(U_2=1) = \text{expit}(0.5) \\
	&p(U_4=1) = \text{expit}(0.5) \\
	&p(Z=1 \mid U_3, U_4) = \text{expit}(0.5 + \beta_{U_3Z}U_3 - \beta_{U_4Z}U_4) \\
	&p(A=1 \mid Z, U_1, U_2) = \text{expit}(-0.5 + \beta_{ZA}Z - \beta_{U_1A}U_1 + \beta_{U_2A}U_2) \\
	&p(M=1 \mid U_3, U_4, A) = \text{expit}(-0.5 - \beta_{U_3M}U_3 + \beta_{U_4M}U_4 + \beta_{AM}A) \\
	&Y \mid U_1, U_2, M = \beta_{U_1Y} U_1 + \beta_{U_2Y}U_2 - \beta_{MY}M + \text{Normal}(0, 1).
\end{align*}
%
Data generation for Fig.~\ref{fig:sims-parametric-verma}(c) is similar to Fig.~\ref{fig:sims-parametric-verma}(a) except the equation for $M$ is modified to include $U_{1, 2}$ as follows,
\begin{align*}
	p(M=1 \mid Z, A, U_1, U_2) = \text{expit}(-0.5 - \beta_{ZM}Z + \beta_{AM}A + \beta_{U_1M}U_1 - \beta_{U_2M}U_2).
\end{align*}
%
Data generation for Fig.~\ref{fig:sims-parametric-verma}(d) is also similar to Fig.~\ref{fig:sims-parametric-verma}(a) except the equation for $Y$ is modified to include $A$ as follows, 
\begin{align*}
	Y \mid U_1, U_2, M, A = \beta_{U_1Y}U_1 + \beta_{U_2Y}U_2  - \beta_{MY}M - \beta_{AY}A +  \text{Normal}(0, 1).
\end{align*}
%
For non-linear data generating processes, the equations for $M$ are modified in all the graphs in Fig.~\ref{fig:sims-parametric-verma} to add an interaction term between $Z$ and $A.$ Since we are using the dual weights estimated via machine learning techniques to perform the front-door test and compute the causal effect, it suffices to make this relation non-linear -- non-linearity in other portions of the data generating process would not affect the computation of the dual weights due to variational independence.

Finally, for experiments comparing IV and front-door estimation, the graph used to generate data in a manner that satisfies the front-door but not IV assumptions is Fig.~\ref{fig:sims-parametric-verma}(a), and so the data generating mechanism remains the same. To generate data for a graph satisfying both assumptions, we use a subgraph where the edge $Z \diedgeright M$ is absent, corresponding to dropping $Z$ from the equation for $M$ as follows: $p(M=1 \mid A) = \text{expit}(-1 + \beta_{AM}A).$
%\begin{align*}
%	p(M=1 \mid A) = \text{expit}(-1 + \beta_{AM}A).
%\end{align*}

\clearpage

\section{Proofs} 
\label{app:proofs} 

{\bf Theorem~\ref{thm:fd}: }
\begin{proof}
	Per Lemma~\ref{lem:non-trivial} if $p(Z, A, M, Y)$ satisfies assumptions (A1-A3), the Verma constraint is non-trivial in the following sense: There is no ordinary independence constraint  between $Z$ and $Y$ in the observed data distribution and  the independence $Z \ci Y$ arises only in the post-intervention distribution $p(Z, A, Y | \doo(m)).$  Under Verma faithfulness, distributions that satisfy this constraint must nested Markov factorize wrt an ADMG that permits identification of $p(Z, A, Y | \doo(m)),$ and where $Z$ and $Y$ are m-separated in the resulting CADMG obtained by removal of edges into $M.$ Clearly $Z$ and $Y$ are not adjacent (via a directed or bidirected edge) in any such ADMG $\G$. We now show that the existence of any structure such that $A$ has a bidirected path to one of its children would contradict the aforementioned conditions.
	
	From results in \cite{tian2002general}, we know that $p(Z, A, Y \mid \doo(m))$ is identified from an observed data distribution nested Markov relative to $\G$ if and only if there is no bidirected path from $M$ to $Y$ in $\G.$ Given the existence of $A \diedgeright M \diedgeright Y$ due to assumption (A1), the presence of $M \biedge Y$ in $\G$ then contradicts identifiability of $p(Z, A, Y | \doo(m)).$ From the discussion in Appendix~\ref{app:verma-comms}, the relevance assumption is not satisfied if only $A \biedge M$ exists in $\G$ (see Fig.~\ref{fig:trivial-verma}(c) for an example.) Hence, $A \biedge M$ must be paired with either $M \biedge Y,$ which we know contradicts identifiability, or $A \biedge Y.$ When paired with the latter we get the bidirected path $M \biedge A \biedge Y$ which also contradicts identifiability of $p(Z, A, Y | \doo(m)).$ Hence, in ADMGs encoding the Verma constraint, the only bidirected edge permitted between $A, M,$ and $Y$ is $A \biedge Y.$ In fact, $A\biedge Y$ must be present to satisfy assumption (A2). This is why the edge is compelled in all ADMGs in the pattern Fig.~\ref{fig:pattern-admgs}(a). 
	
	The presence of $A\diedgeright Y$ in $\G$ does not necessarily contradict identifiability of $p(Z, A, Y | \doo(m)).$ However, it  contradicts m-separability between $Z$ and $Y$ in the resulting CADMG due to the presence of the chain $Z\diedgeright A \diedgeright Y$ which must be blocked by conditioning on $A,$ but doing so opens a collider path $Z \circright A \biedge Y$ ($\circright$ depicts a directed edge or bidirected edge; at least one of these must be present in $\G$ due to the relevance assumption A2.) So far we have shown that if the Verma constraint holds, the corresponding ADMG $\G$ is one in which (i) $Z$ and $Y$ are not adjacent, (ii) the only bidirected edge present between the variables $A, M,$ and $Y$ is $A \biedge Y,$  and (iii) $A \diedgeright Y$ is not present.
	
	The only remaining possibility for the existence of a bidirected path from $A$ to its child $M$ then is if we have \emph{both} $Z\biedge A$ and $Z \biedge M$ in $\G.$ This too produces a contradiction, as these edges complete another bidirected path  $M \biedge Z \biedge A \biedge Y$ resulting in non-identifiability of $p(Z, A, Y | \doo(m)).$ Hence,  the distribution factorizes according to some ADMG $\G$ that has no bidirected path from $A$ to any of its children (and the choice of valid ADMGs are given by the pattern in Fig.~\ref{fig:pattern-admgs}(a).)
	
		
\end{proof}

{\bf Lemma~\ref{lem:pattern-id}: } 

\begin{proof}
	In any valid ADMG derived from the pattern in Fig.~\ref{fig:pattern-admgs}(a), we know that $A$ does not have any bidirected path to any of its children. Let $D_A$ represent the district of $A$ in any such ADMG. Per results in \cite{tian2002general}, the post-intervention distribution $p(V \setminus A | \doo(a))$ is identified as
	\begin{align*}
		p(V) \times \frac{\sum_A q_{D_A}(D_A \mid \pa_\G(D_A))}{q_{D_A}(D_A \mid \pa_\G(D_A))}.
	\end{align*}
	In our case, this implies $p(Z, M, Y | \doo(a)) = p(Z, A, M, Y)/\tilde{q}(A|Z, M, Y)\vert_{A=a},$ where $\tilde{q}(\cdot) \equiv \frac{q_{D_A}(D_A \mid \pa_\G(D_A))}{\sum_A q_{D_A}(D_A \mid \pa_\G(D_A))}.$ 
	Hence, to show that the post-intervention distribution is identified by the same functional in any ADMG in Fig.~\ref{fig:pattern-admgs}(a), it suffices to show that $\frac{q_{D_A}(D_A \mid \pa_\G(D_A))}{\sum_A q_{D_A}(D_A \mid \pa_\G(D_A))}$ is the same in all such ADMGs. There are only two cases: the first where $D_A = \{Z, A, Y\}$ and second where $D_A=\{A, Y\}$ ($M$ cannot be in $D_A$ by the pre-condition that $A$ has no bidirected path to its children.) In the first case, $q_{D_A}(D_A \mid \pa_\G(D_A)) \equiv p(Z, A, Y \mid \doo(m))  = p(Z)\times p(A\mid Z)\times p(Y\mid A, M, Z).$ Therefore,
	\begin{align*}
		\frac{q_{D_A}(D_A \mid \pa_\G(D_A))}{\sum_A q_{D_A}(D_A \mid \pa_\G(D_A))} = \frac{p(Z)\times p(A\mid Z)\times p(Y\mid A, M, Z)}{\sum_A p(Z)\times p(A\mid Z)\times p(Y\mid A, M, Z)} = \frac{p(A\mid Z)\times p(Y\mid A, M, Z)}{\sum_A p(A\mid Z)\times p(Y\mid A, M, Z)}.
	\end{align*}
	In the second case when $D_A = \{A, Y\}$ we have already seen in Section~\ref{sec:prelims} that $q_{D_A}(D_A \mid \pa_\G(D_A)) = p(A\mid Z) \times p(Y \mid A, M, Z).$ The result immediately follows that in both scenarios the conditional kernels are the same. That is, regardless of the specific edges incorporated from the pattern in Fig.~\ref{fig:pattern-admgs}(a), the functional for $p(Z, M, Y \mid \doo(a))$ and hence $\E[Y \mid \doo(a)]$ remains the same. 
\end{proof} 

\clearpage

\bibliography{bhattacharya_335}


\end{document}


