% \documentclass[twoside]{article}

 % \usepackage{aistats2025}
% % If your paper is accepted, change the options for the package
% % aistats2025 as follows:
% %
% %\usepackage[accepted]{aistats2025}
% %
% % This option will print headings for the title of your paper and
% % headings for the authors names, plus a copyright note at the end of
% % the first column of the first page.

% % If you set papersize explicitly, activate the following three lines:
% %\special{papersize = 8.5in, 11in}
% %\setlength{\pdfpageheight}{11in}
% %\setlength{\pdfpagewidth}{8.5in}

% % If you use natbib package, activate the following three lines:
% %\usepackage[round]{natbib}
% %\renewcommand{\bibname}{References}
% %\renewcommand{\bibsection}{\subsubsection*{\bibname}}
% \usepackage{amsmath,amsthm,enumitem,nicefrac,bbm,verbatim,amssymb,amsfonts,amscd, graphicx,algpseudocode,hyperref,float,mathtools,bm,xcolor,appendix,subcaption}
% \usepackage[utf8]{inputenc}
% \usepackage[T1]{fontenc}
% \usepackage[ruled, lined, linesnumbered, commentsnumbered, longend]{algorithm2e}
% % If you use BibTeX in apalike style, activate the following line:
% %\bibliographystyle{apalike}

% \usepackage[round]{natbib}
% \renewcommand{\bibname}{References}
% \renewcommand{\bibsection}{\subsubsection*{\bibname}}
% 
% \usepackage{amsmath,amsthm,enumitem,nicefrac,bbm,verbatim,amssymb,amsfonts,amscd, graphicx,algpseudocode,hyperref,float,mathtools,bm,xcolor,appendix,subcaption}
% % \usepackage[utf8]{inputenc}
% % \usepackage[T1]{fontenc}
%  \input{glodef}
%  \input{locdef}
%  \begin{document}

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}

% Supplementary material: To improve readability, you must use a single-column format for the supplementary material.
%\onecolumn
% \ifdefined\SINGLE
% \else
% \title{Supplementary Material for Revisiting the Berkeley Admissions Data: Statistical Tests for Causal Hypotheses}
% \maketitle\appendix 
% %\titlespacing{\section}{0pt}{*0}{*0} % Adjust values as needed
% \fi

\section{Additional Preliminaries}
\begin{definition}[Twin SCM]
    Let $\model = \Paren{V,W,\cX,f,P}$ be an SCM. The twinning operation maps $\model$ to the \textbf{twin SCM}
    \begin{equation*}
        \model^{\text{twin}} \triangleq \Paren{V \cup V',W,\cX_V \times \cX_{V'} \times \cX_W,\tilde{f},P}
    \end{equation*}
    where $V' = \left \lbrace v': v \in V \right \rbrace$ is a disjoint copy of $V$ and the causal mechanism $\tilde{f}:\cX_V \times \cX_{V'} \times \cX_W \mapsto \cX_V \times \cX_{V'}$ is given by $\tilde{f}\Paren{x_V,x_{V'},x_W} = \Paren{f(x_V,x_W),f(x_{V'},x_W)}$.
\end{definition}

\begin{definition}[Solution function]
Let $\model = \Paren{\enop, \exrv, \spc, f,P}$ be an acyclic SCM and $C \subseteq \enop$. A \textbf{solution function} of $\model$ with respect to $C$ is a measurable mapping $g_C : \spc_{\enop \backslash C} \times \spc_{\exrv} \mapsto \spc_{C}$ that satisfies the structural equations for $C$, i.e., for all $x_{\enop \backslash C} \in \spc_{\enop \backslash C}$ and $P(X_{\exrv})$-a.a.\ $x_{\exrv} \in \spc_{\exrv}$,
\begin{equation*}
    g_{C}\Paren{x_{\enop\backslash C},x_{\exrv}} = f_{C}\Paren{x_{\enop \backslash C},g_{C}\Paren{x_{\enop\backslash C},x_{\exrv}},x_{\exrv}}.
\end{equation*}
\end{definition}

\begin{definition}[Markov kernels]
    Let $\mathcal{T}$ and $\mathcal{W}$ be measurable spaces. A Markov kernel is defined as a measurable map $K : \mathcal{T} \mapsto \mathcal{P}\Paren{\mathcal{W}}$ where $\mathcal{P}\Paren{\mathcal{W}}$ is defined as the space of probability measures on $\mathcal{W}$.
\end{definition} 

\section{IV Inequalities Expressed as Markov Kernels}\label{app:iv}
For \eqref{eq:iv} to be well defined, we required that $P_{\model}(Z=z) > 0$ for all $z$ for any $\model \in \modeliv$ (see Definition~\ref{def:IVmodelclass}). In this section, we relax this requirement by noting that, in fact, IV inequalities are more appropriately expressed in terms of $P_{\model}\Paren{X,Y \mid \doop{Z}}$. 
\begin{lemma}\label{lem:iv_mk}
Let $\modelivrelax \triangleq \left\lbrace \model: G(\model) \text{ is a subgraph of Figure }\ref{fig:iv} \right \rbrace$. For any $\model \in \modelivrelax$, 
\begin{equation}\label{eq:iv_MK}
     \max_{x} \sum_{y} \max_{z} P_{\model}\Paren{X=x,Y=y\mid \doop{Z=z}} \leq 1. 
\end{equation}
\end{lemma}
\begin{proof}
    Since
    \begin{align*}
        P_{\model}(X=x,Y=y \mid \doop{Z=z}) &= P_{\model}(f_X(z,U)=x,f_Y(x,U)=y) \\
        &\le P_{\model}(f_Y(x,U)=y) = P_{\model}(Y=y \mid \doop{X=x})
    \end{align*}
    for all $x$, $y$, and $z$, we have
\begin{equation}
    \max_{x} \sum_{y} \max_{z} P_{\model}\Paren{X=x,Y=y\mid \doop{Z=z}} \leq \max_{x} \sum_{y} P_{\model}(Y=y \mid \doop{X=x}) = 1.
\end{equation}
\end{proof}
Note that \eqref{eq:iv_MK} is defined even when $ \exists z \in \cZ$ such that $P(Z=z) = 0$. In contrast, positivity must be assumed in \eqref{eq:iv} for the terms to be well-defined. Further, if positivity is assumed, then $\modelivrelax = \modeliv$ and \eqref{eq:iv_MK}  is identical to \eqref{eq:iv}. 

 
\section{Proofs for Section 3}
\subsection{Nested Fairness Notions: Without Confounding}\label{app:nested-nocf}
\unconfnested*
\begin{proof}
\bm{$\nullgraphunconf = \nullctrfunconf$}: We first show that $\nullgraphunconf \subseteq \nullctrfunconf$. $\model \in \nullgraphunconf$ implies $\forall \ldept$, $f_A(s,d,U_A)$ is constant in $\lsex$ $P$-a.s. Therefore, for all $\ldept, \lsex$, \begin{equation}\label{eq:nocf-ctrf}
P_{\model}\Paren{f_A(s,d,U_A)=f_A(S,d,U_A)}=1.
\end{equation}
Therefore, $\model \in \nullctrfunconf$. For the converse, $\model \in \nullctrfunconf$ implies \eqref{eq:nocf-ctrf}. For $s\neq s'$, and all $d$,
\begin{equation*}
P_{\model}\Paren{f_A(s,d,U_A)=f_A(S,d,U_A)}=P_{\model}\Paren{f_A(s,d,U_A)=f_A(s',d,U_A)}P_{\model}\Paren{S=s'} +P_{\model}\Paren{S=s}.
\end{equation*}
From \eqref{eq:nocf-ctrf} if $P_{\model}\Paren{S=s'} > 0$, we conclude $P_{\model}\Paren{f_A(s,d,U_A)=f_A(s',d,U_A)} = 1$. If $P_{\model}\Paren{S=s'} = 0$, since \eqref{eq:nocf-ctrf} holds for $s'$, i.e., for all $d,s'$, $P_{\model}\Paren{f_A(s',d,U_A)=f_A(S,d,U_A)}=1$, we have $$P_{\model}\Paren{f_A(s',d,U_A)=f_A(s,d,U_A)} = 1.$$ Therefore, $\model \in \nullgraphunconf$.

\bm{$\nullctrfunconf \subset \nullinterunconf$}: For $\model \in \nullctrfunconf$, \eqref{eq:nocf-ctrf} implies $P_{\model}\Paren{f_A(s,d,U_A)} = P_{\model}\Paren{f_A(S,d,U_A)}$ for all $d,s$, implying $\model \in \nullinterunconf$. Since Example~\ref{ex:unconfexample} belongs to  $\nullinterunconf \backslash \nullctrfunconf$, the inclusion is strict.

\bm{$\nullinterunconf \subset \nullobsunconf$}: For $\model \in \nullinterunconf$, $P_{\model}\Paren{A=1 \mid \doop{D=d},\doop{S=s}}$ is constant in $s$. Consider a pair $\lsex,\ldept,$ such that, for $\model \in \nullinterunconf$, $P_{\model}\Paren{\lsex,\ldept}>0$. Then 
\begin{equation}\label{eq:inter-obs-nocf}
P_{\model}\Paren{A=1 \mid \doop{D=d},\doop{S=s}} = P_{\model}\Paren{A=1 \mid D=d,S=s}.
\end{equation}
Note that, if, for $\lsex' \neq \lsex$, $P_{\model}\Paren{\lsex',\ldept}=0$, then $P_{\model}\Paren{A=1 \mid D=d,S=s} = P_{\model}\Paren{A=1 \mid D=d}$. If instead, $P_{\model}\Paren{\lsex',\ldept}>0$, then from \eqref{eq:inter-obs-nocf} for $S=\lsex'$, we have that $$P_{\model}\Paren{A=1 \mid D=d,S=s} =P_{\model}\Paren{A=1 \mid D=d,S=s'} = P_{\model}\Paren{A=1 \mid D=d}.$$ Therefore, we conclude that $\model \in \nullobsunconf$ implying $\nullinterunconf \subseteq \nullobsunconf$. Since the SCM in Example~\ref{ex:posexample} lies in $\nullobsunconf \backslash \nullinterunconf$, $\nullinterunconf \subset \nullobsunconf$.

If for all $s,d$, $P_{\model}\Paren{s,d} > 0$, then for $\model \in \nullobsunconf$, $$P_{\model}\Paren{A=1 \mid D=d,S=s} =P_{\model}\Paren{A=1 \mid \doop{D=d},\doop{S=s}}$$ is constant in $s$ and equal to $P_{\model}\Paren{A=1 \mid \doop{D=d}}$. This implies $\model \in \nullinterunconf$. Therefore, if for all $s,d$, $P_{\model}\Paren{s,d} > 0$, then $\nullinterunconf = \nullobsunconf$. 


% \begin{align*}
%     P_{\model}\Paren{A=1 \mid \doop{D=d}} = \sum_s P_{\model}\Paren{A=1 \mid \doop{D=d},S=s} P_{\model}\Paren{S=s \mid \doop{D=d}}
% \end{align*}

\end{proof}

\subsection{Equivalence of Tests Without Confounding}\label{app:equiv-nocf}

\unconfequiv*

\begin{proof}
    From Lemma~\ref{lem:notion_equiv}, $\distgraphunconf = \distctrfunconf \subseteq \distinterunconf \subseteq \distobsunconf$. Therefore, it suffices to prove that $\distobsunconf \subseteq \distgraphunconf$. For every $P_{\model} \in \distobsunconf$, $$P_{\model}\Paren{\outcome, \sex,\dept} = P_{\model}\Paren{\sex} \otimes P_{\model}\Paren{\dept \mid \sex} \otimes P_{\model}\Paren{\outcome \mid \dept}.$$ Hence, there exists $\tilde{\model} \in \nullgraphunconf$ such that $P_{\model}\Paren{A,D,S} = P_{\tilde{\model}}\Paren{A,D,S}$, i.e., $P_{\model} \in \distgraphunconf$. Therefore, $\distobsunconf \subseteq \distgraphunconf$, and all four sets coincide.
\end{proof}

\section{Proofs for Section 4}
\subsection{Sharpness of IV inequalities}\label{app:ivsharp}
\begin{figure}[th]
     \centering
            \begin{tikzpicture}
            \tikzstyle{vertex}=[circle,fill=none,draw=black,minimum size=17pt,inner sep=0pt]
\node[vertex] (Z) at (0,0) {$Z$};
\node[vertex][fill=lightgray] (U) at (0,1) {$U_Z$};
\node[vertex] (Y) at (3,0) {$Y$};
\node[vertex] (X) at (1.5,0) {$X$};
\node[vertex][fill=lightgray] (R) at (2.2,1) {$R$};
%\node[vertex] (S') at (1,-0.5) {$S'$};
\path (Z) edge (X);
\path (X) edge (Y);
\path (U) edge (Z);
\path (R) edge (X);
\path (R) edge (Y);

            \end{tikzpicture}
        \caption{Response-function parameterization of $M \in \modeliv$} 
        \label{fig:proof_iv}
        \end{figure}

% \begin{theorem}[Theorem~\ref{thm:iv_tight} restated]
% Let $X,Y,Z$ be random variables defined on $\cX,\cY,\cZ$ respectively, with $|\cX| = n\geq 2, |\cY|=2, |\cZ| =2$ where $n \in \NN$. Let the set of conditional distributions that satisfy the instrumental-variable inequalities \eqref{eq:iv} be defined as $\distiv \triangleq \left \lbrace P(X,Y|Z) : P(X,Y|Z)\text{ satisfies }\eqref{eq:iv} \right \rbrace$. Define the set of conditional observational distributions of $\model \in \modeliv$ as $\distmodeliv \triangleq \left \lbrace P_{\model}(X,Y|Z) : \model \in \modeliv \right \rbrace.$ Then $\distiv = \distmodeliv.$
% \end{theorem}
\ivtight*

\begin{proof}We prove a more general statement that includes Theorem~\ref{thm:iv_tight} as a special case.

\begin{lemma}\label{lem:mk_iv_tight}
 Let $X,Y,Z$ be discrete random variables defined on $\cX,\cY,\cZ$ respectively, with $|\cX| = n\geq 2, |\cY|=2, |\cZ| =2$. Define $\mkiv \triangleq \left \lbrace K(X,Y \mid Z) : K(X,Y \mid Z)\text{ satisfies }\eqref{eq:iv_MK} \right \rbrace$. Define $\mkmodeliv \triangleq \left \lbrace P_{\model}\Paren{X,Y \mid \doop{Z}} : \model \in \modelivrelax \right \rbrace.$ Then $\mkiv = \mkmodeliv.$   
\end{lemma}

Note that $\distiv = \left \lbrace P(Z): \forall z, P(Z=z)>0  \right \rbrace \otimes \mkiv$ since assuming positivity, \eqref{eq:iv} is identical to \eqref{eq:iv_MK}. Further, $\distmodeliv = \left\{ P_{\model}(Z): \model \in \modeliv \right\} \otimes \mkmodeliv$ since assuming positivity, $\modelivrelax = \modeliv$ and $P_{\model}\Paren{X,Y \mid \doop{Z}} = P_{\model}\Paren{X,Y \mid Z}$ for $\model \in \modeliv$. Since the first factors are identical, Theorem~\ref{thm:iv_tight} follows from Lemma~\ref{lem:mk_iv_tight}.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:mk_iv_tight}]



% We first note that $\distiv = \distivpos \hspace{2pt}\dot{\cup} \hspace{2pt} \distivzero$ where
% \begin{align}
% \distivpos &\triangleq \left \lbrace P(X,Y,Z) : \forall z, P(Z=z) > 0 \text{ and } P(X,Y \mid Z) \text{ satisfies } \eqref{eq:iv} \right \rbrace, \label{eq:distivpos} \\
% \distivzero &\triangleq  \left \lbrace P(X,Y,Z) : \exists z \text{ s.t. } P(Z=z) = 0 \right \rbrace, \label{eq:distivzero}
% \end{align}
%  where the latter holds because $|\cZ|=2$ implies that if $P(Z=z) = 0$ for some $z$, then \eqref{eq:iv} imposes no additional constraints on $P(X,Y|Z=z')$ where $z' \neq z$. 
% Similarly we can express $\distmodeliv = \distmodelivpos \hspace{2pt}\dot{\cup} \hspace{2pt} \distmodelivzero$ where 
% \begin{align}
% \distmodelivpos &\triangleq \left \lbrace P_{\model}(X,Y,Z) : \model \in \modeliv \text{ and }\forall z, P_{\model}(Z=z) > 0 \right \rbrace, \label{eq:distmodelivpos} \\
% \distmodelivzero &\triangleq  \left \lbrace P_{\model}(X,Y,Z) : \model \in \modeliv \text{ and } \exists z \text{ s.t. } P_{\model}(Z=z) = 0 \right \rbrace.\label{eq:distmodelivzero}
% \end{align}
%  Any distribution in $\distmodelivzero$ lies in $\distivzero$. Conversely, any distribution in $\distivzero$ is also generated by a $\model \in \modeliv$ where $P_{\model}(Z=z) = 0$ implying that $\distivzero = \distmodelivzero$. We now show that $\distivpos = \distmodelivpos$.
% Since $P(X,Y,Z) = P(Z) \otimes P(X,Y \mid Z)$, we express $\distivpos$ and $\distmodelivpos$ as 
% \begin{align}
%     \distivpos &= \left \lbrace P(Z): \forall z, P(Z=z) > 0 \right \rbrace \times \left \lbrace P(X,Y \mid Z): P(X,Y \mid Z) \text{ satisfies } \eqref{eq:iv} \right \rbrace, \label{eq:prodivpos} \\
%     \distmodelivpos &= \left \lbrace P_{\model}(Z): \model \in \modeliv \text{ and } \forall z, P(Z=z) > 0 \right \rbrace \times \left \lbrace P_{\model}(X,Y \mid Z): \model \in \modeliv \right \rbrace.\label{eq:prodmodelivpos}
% \end{align}
% Since the IV inequalities only concern $P(X,Y \mid Z)$ and being in $\modeliv$ does not pose restrictions on $P_{\model}(Z)$, the first factors in the above expressions are identical. Therefore, we restrict attention to proving that the second factors are equal. In the following, we denote them as $\distivposcnd$ and $\distmodelivposcnd$, respectively.

For $\model \in \modelivrelax$, the response-function parameterization yields a counterfactually equivalent SCM \cite[Section 8.4]{ForreMooij25} $\tilde{\model} = (\enop,\tilde{\exrv},\tilde{\spc},\tilde{f},\tilde{P})$, where $\enop = \left \lbrace Z,X,Y \right \rbrace, \tilde{\exrv} = \left \lbrace \response, U_Z\right \rbrace, \tilde{\spc} = \spc_{\enop}\times\spc_{\tilde{\exrv}}, \tilde{f} = \Paren{\tilde{f}_{Z}, \tilde{f}_{X}, \tilde{f}_{Y}}$ where we define $\spc_{\response}, \tilde{f},\tilde{P}$ through the function $\Phi: \spc_{\exrv} \mapsto \spc_{\tilde{\exrv}}$ where
\begin{align*}
    \spc_{\response} &\triangleq \cX^{\cZ} \times \cY^{\cX},\\
    \forall u_Z,u_X,u_Y,u, \Phi\Paren{u_Z,u_X,u_Y,u} &\triangleq \Paren{\Paren{z \mapsto f_X(z,u,u_X),x \mapsto f_Y(x,u,u_Y)},u_Z},\\
    \forall u_Z, \tilde{f}_{Z}(u_Z) &\triangleq f_{Z}(u_Z),\\
    \forall \respfunc, z, \tilde{f}_{X}\Paren{\respfunc,z} &\triangleq \respfunc_1\Paren{z}, \\
    \forall \respfunc, x, \tilde{f}_{Y}\Paren{\respfunc,x} &\triangleq \respfunc_2\Paren{x},
    \end{align*}
where $\respfunc = \Paren{\respfunc_1,\respfunc_2}$ and $\tilde{P}$ is the push-forward distribution $\Phi_{*}(P)$.
Note that $\spc_{\response}$ is a discrete space, $\response$ a discrete random variable, and $\tilde{P}(\response)$  a discrete distribution over $\spc_{\response}$.

Under the response-function parameterization, only $\tilde{P}(R)$ is a parameter. We will consider $\tilde{P}(R)$ to be an element of $\mathbb{R}^{n_X^{n_Z} n_Y^{n_X}}$
where $\#\cX = n_X, \#\cZ = n_Z,\#\cY = n_Y$ and  $$\cK_{\tilde{\mathbb{M}}_{\text{IV}+}} \triangleq \left \lbrace P_{\tilde{\model}}(X,Y \mid \doop{Z}): \model \in \modelivrelax \right \rbrace$$ to be a subset of $\mathbb{R}^{n_X n_Y n_Z}$. Note that because of the counterfactual equivalence of the response-function parameterization, which in turn implies interventional equivalence, $\cK_{\tilde{\mathbb{M}}_{\text{IV}+}} = \mkmodeliv$. From Lemma~\ref{lem:iv_mk}, $\mkmodeliv \subseteq \mkiv$. 

 % Given that $\cP_{\tilde{\mathbb{M}}_{IV}}$ is the image under a linear mapping of the probability simplex in $\mathbb{R}^{n_X^{n_Z} n_Y^{n_X}}$, it is a convex polyhedral set.
To show the converse, we show that each extreme point of $\mkiv$ is obtained by a point in $\mkmodeliv$. We enumerate all extreme points of $\mkiv$ in Lemma~\ref{lem:extremepointsiv}. We show that each such extreme point is obtained by the following response-function. Choose $x, x' \in \cX, y,y' \in \cY$ with $y=y'$ if $x=x'$. Then any response function satisfying 
\begin{align*}
    r_1(z) &= \begin{cases} 
    x \quad z=0 \\
x' \quad z=1
    \end{cases}\\
      r_2(\tilde{x}) &= \begin{cases} 
    y \quad \tilde{x}=x \\
y' \quad \tilde{x}=x'\\
\text{arbitrary} \quad \text{otherwise}
    \end{cases}
\end{align*}
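yields, when $\tilde{P}(\response)$ is the point mass at that response function, the extreme point $\delta_{x,y|0} + \delta_{x',y'|1}$; ranging over all such choices of $x,x',y,y'$ gives all extreme points of $\mkiv$. Since $\mkmodeliv$ is the image of the probability simplex over $\spc_{\response}$ under a linear map, it is convex, and since $\mkiv$ is a bounded polyhedral set, it is the convex hull of its extreme points. Therefore, $\mkmodeliv \supseteq \mkiv$, implying $\mkmodeliv = \mkiv$. 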

\end{proof}

\begin{lemma}\label{lem:extremepointsiv}
    Consider the real vector space $\mathbb{R}^{n_X n_Y n_Z}$ spanned by the canonical basis vectors $$\left \lbrace \delta_{x,y|z}: x \in \cX, y \in \cY, z \in \cZ\right \rbrace$$
    where $\delta_{x,y|z}$ denotes a unit vector of length $n_Xn_Yn_Z$ where all entries except the one at $(x,y,z)$ are zero. For $n_Y=n_Z=2, n_X = n\geq 2$, $\mkiv$ considered as a subset of this vector space is a polyhedral set with extreme points 
\begin{equation*}
   \EE = \left \lbrace \delta_{x,y|0} + \delta_{x',y'|1}: x,x' \in \cX; y,y' \in \cY : x\neq x' \right \rbrace \cup \left \lbrace \delta_{x,y|0} + \delta_{x,y|1}: x \in \cX, y\in \cY \right \rbrace.
\end{equation*}
\end{lemma}
\begin{proof}
Consider $\mkiv$ to be a subset of $\mathbb{R}^{n_X n_Y n_Z}$ where each element of $\mkiv$ is represented as $\left \{ K\Paren{X=x, Y=y \mid Z=z} \right \}_{x \in \cX, y \in \cY, z \in \cZ}$ and satisfies
\begin{align*}
\forall x \in \cX : K\Paren{X=x,Y=0|Z=0} + K\Paren{X=x,Y=1|Z=1} &\leq 1,\\
\forall x \in \cX : K\Paren{X=x,Y=0|Z=1} + K\Paren{X=x,Y=1|Z=0} &\leq 1,\\
\forall x \in \cX, \forall y \in \cY, \forall z \in \cZ : K\Paren{X=x,Y=y|Z=z} &\geq 0,\\
\forall z \in \cZ : \sum_{x,y}K\Paren{X=x,Y=y|Z=z} &=1.
\end{align*}
  An elementary result in convex geometry states that a point is an extreme point of a polyhedral set (defined by a set of linear (in)equality constraints) in the vector space $\mathbb{R}^m$ if and only if it is a feasible point and there exists a subset $A$ of $m$ constraints that are active at that point and linearly independent.
For the case at hand, for a point to be an extreme point of $\mkiv$, it has to satisfy the above $6n+2$ constraints (feasibility) and additionally, a subset of $n_Xn_Yn_Z = 4n$ of them must be active and linearly independent.
The two normalization constraints are equality constraints and hence must be active at any feasible point.

We first show that all points in $\EE$ are extreme points of $\mkiv$.
First, choose $x,x' \in \cX$ with $x \neq x'$ and choose $y,y' \in \cY$.
It can be verified that $\delta_{x,y|0} + \delta_{x',y'|1}$ satisfies the IV inequalities with $2$ out of the $2n$ IV inequalities being active.
Further, $4n-2$ non-negativity constraints and both normalization constraints are active.
For the subset $A$ of active constraints we can take e.g.\ both active IV inequalities together with the $4n - 2$ active nonnegativity constraints; one can check that these are linearly independent.
Second, choose $x \in \cX, y \in \cY$. 
It can be verified that $\delta_{x,y|0} + \delta_{x,y|1}$ satisfies the IV inequalities with $2$ out of the $2n$ IV inequalities being active.
Further, $4n-2$ non-negativity constraints and both normalization constraints are active.
For the subset $A$ of active constraints we can take e.g.\ both active IV inequalities together with the $4n - 2$ active nonnegativity constraints; one can check that these are linearly independent.

Finally, we check whether $\EE$ exhausts all extreme points.
Pick a feasible point $b \in \mathbb{R}^{n_X n_Y n_Z}$.
We will refer to the indices $i$ with $b_i \neq 0$ ($b_i = 0$) as the ``non-zero (zero) entries of $b$'', and to the indices $j$ in an active constraint $a^T b = 1$ with $a_j \neq 0$ as the ``active entries of the constraint''. 
For a set of entries of $b$ and a set of active entries of one or more active constraints, we refer to their intersection as their ``overlap''.
We also call the set of entries of $b$ corresponding to $K (X, Y | Z = z)$ for a fixed $z$ a ``stratum''.

Suppose $b$ is an extreme point of $\mkiv$.
Then at least $4n - 2$ of the $6n$ inequality constraints must be active.
Hence if exactly $r$ IV inequalities are active at $b$, then we need at least $4n - 2 - r$ active nonnegativity constraints at $b$, or in other words, $b$ must have at least $4n - 2 - r$ zero entries.
Because both strata of $b$ need to be normalized, $b$ can have at most $4n - 2$ zero entries.
The number of zero entries in $b$ that can overlap with the active entries of the active IV inequalities is upper bounded by $r$; indeed, otherwise there would be an active IV inequality for which both its active entries are zero entries of $b$, contradicting that this IV inequality is active.

We will show that the only possibilities for $b$ are the extreme points that we already identified, proceeding case by case.
  \begin{enumerate}
    \item $r > 2$.
      This yields a contradiction with the normalization constraints.
      Indeed, each entry of $b$ appears in exactly one IV inequality, and each IV inequality contains exactly two active entries (one for each stratum).
      Hence, the sum of those entries of $b$ that correspond with active entries of active IV constraints must be exactly $r$.
      However, by the normalization constraints, the sum over \emph{all} entries of $b$ must be 2.
      This implies that $r \le 2$.
    \item $r = 2$.
      Then $b$ must contain at least $4n-4$ zero entries, of which at most two can overlap with the active entries of the active IV inequalities.
      \begin{enumerate}
        \item If there are no such overlaps, then all other entries of $b$ must be zero entries.
          This means that we need the $4n-4$ nonnegativity constraints corresponding to those other entries in the subset $A$, as well as the two normalization and inequality constraints; no other active constraints exist that could be added to $A$.
          But then the constraints in $A$ would not be linearly independent.
          Indeed, the following linear dependence between the active constraints is obtained: adding the coefficient vectors of the two active IV inequalities and those of all active non-negativity constraints yields the same result as adding the coefficient vectors of the two normalization constraints.
          So we arrive at a contradiction.
        \item 
          If there is at least one overlap, 
          then $b_j=1$ with $j$ the other active entry in the active IV inequality with the overlap.
          This implies $2n-1$ zeroes in the stratum of $j$.
          Consider now the other active IV inequality. 
          Since $b_j=1$, the active entry corresponding to that stratum must be a zero entry of $b$. 
          Hence, $b_k=1$ for $k$ the other active entry in this IV inequality.
          This means that $b$ must be of the form $\delta_{x_1,x_2|0} + \delta_{x_1',x_2'|1}$, with the two non-zero entries corresponding to active entries of two different IV inequalities, and hence we recover the extreme points already identified.
      \end{enumerate}
    \item $r = 1$.
      Then $b$ must contain at least $4n-3$ zero entries, of which at most one can overlap with  the active entries of active IV inequalities.

      Because of the normalization constraints, we need at least one non-zero entry in both strata.
      This means that there must be a stratum $x_3$ with exactly one non-zero entry, and then $b$ must be of the form
      $b = \delta_{x_1,x_2|x_3} + \gamma \delta_{x_1',x_2'|x_3'} + (1-\gamma) \delta_{x_1'',x_2''|x_3'}$ with $\gamma \in (0,1]$ and $x_3' \ne x_3$.
      This means that the active IV inequality must be the one that has active entry $(x_1,x_2|x_3)$.
      Its other active entry is then $(x_1,1-x_2|x_3')$, and this must be an overlapping zero entry of $b$.

      If $\gamma = 1$ then this would also activate the IV inequality that contains active entry $(x_1',x_2'|x_3')$.
      Since the only active IV inequality must also contain active entry $(x_1,x_2|x_3)$, this gives a contradiction, as the two coefficients of $b$ at the active entries sum to 2 instead of 1.
      Hence, $\gamma \in (0,1)$, and $b$ contains exactly $4n-3$ zero entries.

      So the active constraints consist of 1 active IV inequality, 2 active normalization constraints and $4n-3$ active nonnegativity constraints.
      However, these are not linearly independent.
      Indeed: subtracting the coefficient vectors of the $2n-1$ nonnegativity constraints in stratum $x_3$ from the coefficient vector of the normalization constraint of that stratum, and adding the coefficient vector of the nonnegativity constraint corresponding to $(x_1,1-x_2|x_3')$ gives a vector that is identical to the coefficient vector of the active IV inequality.

      So we have arrived at a contradiction.
    \item $r = 0$.
      Then $b$ must contain at least $4n-2$ zero entries.
      Because of the normalization constraints, $b$ needs exactly one non-zero entry in each stratum, and its value must be 1.
      This will activate at least one IV inequality. 
      Contradiction. \qedhere
  \end{enumerate}
\end{proof}

\begin{comment}
\begin{proof}
    Consider $\mkiv$ to be a subset of $\mathbb{R}^{n_X n_Y n_Z}$ where each element of $\mkiv$ is represented as $\left \{ K\Paren{X=x, Y=y \mid Z=z} \right \}_{x \in \cX, y \in \cY, z \in \cZ}$ and satisfies
\begin{align*}
\forall x \in \cX : K\Paren{X=x,Y=0|Z=0} + K\Paren{X=x,Y=1|Z=1} &\leq 1,\\
\forall x \in \cX : K\Paren{X=x,Y=0|Z=1} + K\Paren{X=x,Y=1|Z=0} &\leq 1,\\
\forall x \in \cX, \forall y \in \cY, \forall z \in \cZ : K\Paren{X=x,Y=y|Z=z} &\geq 0,\\
\forall z \in \cZ : \sum_{x,y}K\Paren{X=x,Y=y|Z=z} &=1.
\end{align*}
For a point to be an extreme point of $\mkiv$, it has to satisfy the above $6n+2$ constraints and additionally, at least $n_Xn_Yn_Z = 4n$ of them must be active and linearly independent. The normalization constraints have to be satisfied and therefore, active, which leaves us with enumerating all feasible points for which at least $4n-2$ of the $6n$ inequalities are active and linearly independent.

We first show that all points in $\EE$ are extreme points of $\mkiv$.
Choose $x,x' \in \cX$ with $x \neq x'$ and choose $y,y' \in \cY$. It can be verified that $\delta_{x,y|0} + \delta_{x',y'|1}$ satisfies the IV inequalities with $2$ out of the $2n$ IV inequalities being active. Further, $4n-2$ non-negativity constraints are active. All active constraints are linearly independent. 
Choose $x \in \cX, y \in \cY$. It can be verified that $\delta_{x,y|0} + \delta_{x,y|1}$ satisfies the IV inequalities with $2$ out of the $2n$ IV inequalities being active. Further, $4n-2$ non-negativity constraints are active. All active constraints are linearly independent. 

Finally, we check whether $\EE$ exhausts all extreme points. Pick a feasible point $b \in  \mathbb{R}^{n_X n_Y n_Z}$. We will refer to the indices $i$ with $b_i \neq 0$ ($b_i = 0$) as the “non-zero (zero) entries of $b$”, and to the indices $j$ in an active IV inequality constraint $a^\top b = 1$ with $a_j \neq 0$ as the “active entries of the IV inequality”, and say that two such entries overlap if $i = j$. We also call the set of entries of $b$ corresponding to $K (X, Y | Z = z)$ a “stratum”. We proceed case by case.


1. More than two IV inequalities are active in $b$. This yields a contradiction with the normalization constraints.

2. Exactly two IV inequalities are active in $b$. Then $b$ must contain at least $4n- 4$ zero entries, of which at most two can overlap with the active IV inequality entries. a) If there are no overlaps, then $b$ must be $0$ in all other entries, but then the active constraints would not be linearly independent, since the sum of the active normalization constraints equals equals the sum of active IV constraints and the active nonnegativity constraints.
b) If at least one of these overlaps, then $b_j = 1$ with $j$ the other active entry in the active IV inequality with the overlap. This implies $2n-1$ zeroes in the stratum of $j$, so in total we have identified $2n$ zeroes in $b$. We need $2n- 4$ more active constraints. Consider the other IV inequality. Since $b_j = 1$, the active entry corresponding to that stratum must be a zero entry of $b$. Hence, $b_k = 1$ for $k$ the other active entry in this IV inequality. Thus this gives us the extreme points already identified.

3. Exactly one IV inequality is active in $b$. Then we need to pick $4n - 3$ zero entries corresponding to active nonnegativity constraints.
a) Suppose none of these entries overlaps. Then we have $4n- 2$ locations for these zeros. We can pick one nonzero entry that’s not an active IV inequality entry. The normalization constraint implies $b_j = 1$ for one of the two active IV inequality entries, and hence $b_k = 0$ for the other one. Contradiction.
b) Suppose exactly one of these entries overlaps. We need $2n- 2$ more active constraints. Only two possible non-zero entries for $b$ remain. In order for the active constraints to span the entire vector space, at least one of these should be a zero entry. This can be checked by observing that if the two possible non-zero entries of $b$ are positive, then the $4n$ active constraints cannot express the vector that is $1$ in all entries except the two positive entries where it is $0$. Therefore, if at least one of the two possible non-zero entries of $b$ is zero, then $b_j = 1$ for the other entry, which activates another IV inequality. Contradiction.
c) If there are two overlapping zeroes the IV inequality couldn’t be active.

4. Zero IV inequalities are active in $b$. Then $b$ must contain at least $4n - 2$ zero entries. Because of the normalization constraints, we need one non-zero entry in both strata, and it must actually be a $1$. This will activate at least one IV inequality. Contradiction.
\end{proof}
\end{comment}

\subsection{Nested Fairness Notions: With Confounding}\label{app:nested-cf}
\begin{proposition}\label{prop:cfnotions}
    \begin{equation*}
    \nullgraphrelax = \nullctrfrelax \subset \nullinterrelax, 
    \end{equation*}
     \begin{equation*}
    \nullgraph = \nullctrf \subset \nullinter. 
    \end{equation*}
\end{proposition}

\begin{proof}
If $\model \in \modelsedgerelax$, then 
\begin{equation}
    A^{\doop{S=s,D=d}} = f_{\outcome}\Paren{s,d,U_A,U}, \hspace{5mm}A^{\doop{D=d}} = f_{\outcome}\Paren{S,d,U_A,U}.
\end{equation}

For $\model \in \nullgraphrelax$, since $\sex$ is not a parent of $\outcome$, for all $\ldept, f_{A}(s,d,U_A,U)$ is constant in $s$ $P$-a.s. This implies that for all $d,s,s'$, 
\begin{equation*}
P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A,U} = f_{\outcome}\Paren{s',d,U_A,U}} = 1.
\end{equation*}
Therefore, for all $s,d$,
\begin{equation}\label{eq:ctrf_causalmechanism}
P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A,U} = f_{\outcome}\Paren{S,d,U_A,U}} = 1,
\end{equation}
implying that $\model \in \nullctrfrelax$. For the converse, $\model \in \nullctrfrelax$ implies \eqref{eq:ctrf_causalmechanism}. For $s\neq s'$, and all $d$,
\begin{align*}
&P_{\model}\Paren{f_A(s,d,U_A,U)=f_A(S,d,U_A,U)}\\
&=P_{\model}\Paren{f_A(s,d,U_A,U)=f_A(s',d,U_A,U)}P_{\model}\Paren{S=s'} +P_{\model}\Paren{S=s}.
\end{align*}
If $P_{\model}\Paren{S=s'} > 0$, we conclude $P_{\model}\Paren{f_A(s,d,U_A,U)=f_A(s',d,U_A,U)} = 1$. If $P_{\model}\Paren{S=s'} = 0$, since \eqref{eq:ctrf_causalmechanism} holds for $s'$, we have $P_{\model}\Paren{f_A(s,d,U_A,U)=f_A(s',d,U_A,U)} = 1$. Therefore, $\model \in \nullgraphrelax$. Therefore, $\nullgraphrelax = \nullctrfrelax$. Further, $\nullgraph = \nullctrf$.

We now prove that $\nullctrfrelax \subseteq \nullinterrelax$.
% \begin{align*}
%      P_{M}\Paren{\outcome \mid \doop{D=d}} &= \sum_{s} P_{M}(S=s \mid \doop{D=d}) P_{M}\Paren{\outcome \mid \doop{D=d},S=s} \\
%      &\stackrel{(a)}{=}\sum_{s} P_{M}(S=s) P_{M}\Paren{\outcome \mid \doop{D=d},S=s}\\
%      &\stackrel{(b)}{=} \sum_{s} P_{M}(S=s) P_{M}\Paren{\outcome \mid \doop{D=d,S=s}}
% \end{align*}
%    where $(a)$ and $(b)$ follow from Rule 2 of do-calculus. Therefore, the interventional notion of fairness criteria is equivalent to 
% \begin{equation}
%     P_{M}\Paren{\outcome \mid \doop{D=d,S=s}} = P_{M}\Paren{\outcome \mid \doop{D=d,S=s'}}
% \end{equation}
% for $s \neq s'$. This implies that 
For $\model \in \nullctrfrelax$, \eqref{eq:ctrf_causalmechanism} holds. Therefore, for all $\lsex,\ldept$,
\begin{equation}
P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A,U}} = P_{\model}\Paren{f_{\outcome}\Paren{S,d,U_A,U}}.
\end{equation}   
Therefore, $\nullctrfrelax  \subseteq \nullinterrelax$ and consequently $\nullctrf  \subseteq \nullinter$. Note that Example~\ref{ex:unconfexample} lies in $\modelsedgerelax$ (and in $\modelsedge$ since $P_{\model}(S=s)>0$ for all $s$) for any $U$ that is independent of $U_S,U_D,U_A$. Further, $P_{\model}\Paren{A=1 \mid \doop{S=s},\doop{D=d}} = 0.5 = P_{\model}\Paren{A=1 \mid \doop{D=d}}$; however, $S$ is a parent of $A$. Therefore, $\nullctrfrelax \subset \nullinterrelax$ and $\nullctrf \subset \nullinter$.

% \begin{equation}
%     A^{\doop{S=s,D=d}} = f_{\outcome}\Paren{s,d,U_A,U}, A^{\doop{D=d}} = f_{\outcome}\Paren{S,d,U_A,U}.
% \end{equation}

% Therefore, for $s \neq s'$,
% \begin{equation*}
%     P_M(A^{\doop{S=s,D=d}} = A^{\doop{D=d}}) = \Pr\Paren{ f_{\outcome}\Paren{s,d,U_A,U} = f_{\outcome}\Paren{s',d,U_A,U}}, 
% \end{equation*}

\end{proof}

\subsection{Equivalence of Statistical Tests}\label{app:equiv-cf}

 \begin{figure}[th]
     \centering
            \begin{tikzpicture}
            \tikzstyle{vertex}=[circle,fill=none,draw=black,minimum size=17pt,inner sep=0pt]
\node[vertex] (Z) at (0,0) {$S$};
\node[vertex][fill=lightgray] (U) at (0,1) {$U_S$};
\node[vertex] (Y) at (2.5,0) {$A$};
\node[vertex] (X) at (1.3,1) {$D$};
\node[vertex][fill=lightgray] (R) at (2.4,0.95) {$R$};
%\node[vertex] (S') at (1,-0.5) {$S'$};
\path (Z) edge (X);
\path (X) edge (Y);
\path (U) edge (Z);
\path (R) edge (X);
\path (R) edge (Y);
\path (Z) edge (Y);

            \end{tikzpicture}
        \caption{Response-function parameterization of $M \in \modelsedgerelax$} 
        \label{fig:proof_conf}
        \end{figure}

\confequiv*

\begin{proof}As in Section~\ref{app:ivsharp}, we prove a more general statement, Lemma~\ref{lem:mk_equivalence}, that includes Theorem~\ref{thm:equivalence} as a special case.
We define analogues of $\modelsedge, \nullnotion$ that remove the positivity assumption, $P_{\model}(S=s)>0$ for all $s$, as $\modelsedgerelax, \nullnotionrelax$, respectively (where we use `notion' as a placeholder for `graph', `ctrf' and `inter').
\begin{lemma}\label{lem:mk_equivalence}Let 
\begin{align*}
\mkgraph &\triangleq \left \lbrace P_{\model}\Paren{\dept,\outcome \mid \doop{\sex}} : \model \in \nullgraphrelax \right \rbrace, \\
\mkinter &\triangleq \left \lbrace P_{\model}\Paren{\dept,\outcome \mid \doop{\sex}} : \model \in \nullinterrelax \right \rbrace, \\
\mkctrf &\triangleq \left \lbrace P_{\model}\Paren{\dept,\outcome \mid \doop{\sex}} : \model \in \nullctrfrelax \right \rbrace.
\end{align*}
Then $\mkinter = \mkctrf = \mkgraph = \mkiv,$ where $\mkiv$ is defined in Lemma~\ref{lem:mk_iv_tight}.
\end{lemma}

Note that $\distiv = \left \lbrace P(Z): \forall z, P(Z=z)>0  \right \rbrace \otimes \mkiv$ since assuming positivity, \eqref{eq:iv} is identical to \eqref{eq:iv_MK}. Further, $\distnotion = \left\{ P_{\model}(\sex):\model \in \nullnotion  \right\} \otimes \mknotion$ since assuming positivity, $\modelsedge = \modelsedgerelax$ and $P_{\model}\Paren{\dept,\outcome \mid \doop{\sex}} = P_{\model}\Paren{\dept,\outcome \mid \sex}$ for $\model \in \modelsedge$. Since the first factors are identical, Theorem~\ref{thm:equivalence} follows from Lemma~\ref{lem:mk_equivalence}.
\end{proof}


% \begin{theorem}[Theorem~\ref{thm:equivalence} restated]
% Let 
% \begin{align*}
% \distgraph &\triangleq \left \lbrace P_{\model}\Paren{\outcome,\dept | \sex} : \model \in \nullgraph \right \rbrace, \\
% \distinter &\triangleq \left \lbrace P_{\model}\Paren{\outcome,\dept| \sex} : \model \in \nullinter \right \rbrace, \\
% \distctrf &\triangleq \left \lbrace P_{\model}\Paren{\outcome,\dept| \sex} : \model \in \nullctrf \right \rbrace,
% \end{align*}
% Then
%      $$\distinter = \distctrf = \distgraph = \distiv,$$
%      where $\distiv$ is defined in Theorem~\ref{thm:iv_tight}.
% \end{theorem}
\input{proof_equivalence.tex}

\subsection{Relaxing the assumption of no confounding between $S$ and $D$}\label{app:SDconf}

In this section, we prove that Theorem~\ref{thm:iv_tight} and Theorem~\ref{thm:equivalence} hold when $\modeliv$ and $\modelsedge$ are expanded by allowing for confounding between $Z$ and $X$, and between $\sex$ and $\dept$, respectively. Denote the corresponding expanded models by $\modelivZX$ and $\modelsedgeSD$, respectively, where the former has structural equations of the form $Z = f_Z(U_Z,U_{ZX}), X=f_{X}(Z,U_X,U_{ZX},U_{XY}),Y= f_Y(X,U_Y,U_{XY})$ where $U_Z,U_X,U_Y,U_{ZX},U_{XY}$ are independent exogenous random variables, and the latter has structural equations of the form $S = f_S(U_S,U_{SD}), D=f_{D}(S,U_D,U_{SD},U_{DA}),A= f_A(S,D,U_A,U_{DA})$ where $U_S,U_D,U_A,U_{SD},U_{DA}$ are independent exogenous random variables. The corresponding expanded causal null hypotheses for the fairness notions are denoted by $H^0_{\text{cf-notion}+SD}$, where we use `notion' as a placeholder for `graph', `ctrf' and `inter'.

For $\model \in \modelivZX$, pick $U \sim \text{Unif}\left[0,1\right]$ and a deterministic map $h\colon \mathcal{Z} \times \left[0,1\right] \to \mathcal{U}_{ZX}$ such that $U_{ZX} = h(Z,U)$ a.s. Note that such an $h$ always exists for any random variable $U_{ZX}$ taking values in a standard measurable space (see e.g.\ \cite[Corollary 2.7.7]{ForreMooij25}). Define $\tilde{\model} = (V=\{\tilde{Z},\tilde{X},\tilde{Y} \}, W = \{ U_{\tilde{Z}},U_{\tilde{X}},U,U_{\tilde{Y}},U_{\tilde{XY}}\}, \cX = \cX_{V} \times \cX_W, \tilde{f} = (f_{\tilde{Z}},f_{\tilde{X}},f_{\tilde{Y}}),\tilde{P})$ where a) $\forall u_{\tilde{Z}}, f_{\tilde{Z}}(u_{\tilde{Z}}) \triangleq u_{\tilde{Z}}$, b) $\forall u_{\tilde{X}}, u, u_{\tilde{XY}}, \tilde{z}, f_{\tilde{X}}(u_{\tilde{X}},u,u_{\tilde{XY}},\tilde{z}) \triangleq f_{X}(\tilde{z},u_{\tilde{X}},h(\tilde{z},u),u_{\tilde{XY}})$, c) $\forall u_{\tilde{Y}},u_{\tilde{XY}}, \tilde{x}, f_{\tilde{Y}}(u_{\tilde{Y}},u_{\tilde{XY}}, \tilde{x}) \triangleq f_{Y}(\tilde{x},u_{\tilde{Y}},u_{\tilde{XY}})$, and $\tilde{P} = P_{\model}(Z) \otimes P_X \otimes \text{Unif}\left[0,1\right] \otimes P_{Y} \otimes P_{XY}$ where $P_X, P_Y, P_{XY}, P_Z, P_{ZX}$ are marginals of $P$ over $U_X, U_Y, U_{XY}, U_Z, U_{ZX}$ respectively.  Note that $P_{\model}(Z)$ is the pushforward of $P_Z \otimes P_{ZX}$ through $f_Z$ thus making $\tilde{P}$ a product distribution over the exogenous random variables. By the above construction, $\tilde{M} \in \modeliv$. For every $ \model \in \modelivZX$, $P_{\model}(X,Y,Z) = P_{\model}(Z) \times P_{\model}(X,Y \mid Z) = P_{\tilde{\model}}(\tilde{Z}) \times P_{\tilde{\model}}(\tilde{X},\tilde{Y} \mid \doop{\tilde{Z}}) \in \distmodeliv$. Therefore, $\{ P_{M}(Z,X,Y): M \in \modelivZX \} = \distmodeliv$. Using a similar argument that replaces the labels $Z,X,Y$ by $S,D,A$, $\{ P_{M}(S,D,A): M \in H^{0}_{\text{cf-notion}+SD}\} = \distnotion$.
This implies Theorem~\ref{thm:iv_tight} and Theorem~\ref{thm:equivalence} hold for $\modelivZX$ and $\modelsedgeSD$ respectively.

\section{Comparison With Existing Notions}
\subsection{Without Confounding}
\subsubsection{Counterfactual Fairness and Demographic Parity}\label{app:kusnerctrfdemo}
%The counterfactual fairness notion of \cite{KusnerLRS17} implies demographic parity for the Berkeley example without allowing for confounding.

We restate the counterfactual notion of fairness from \citet{KusnerLRS17} for the Berkeley example below.
\begin{definition}[Counterfactual Fairness \citep{KusnerLRS17}]\label{def:ctrfkusner}
$\model \in \modelsunconfedge$ is fair if for all $\lsex,\ldept$, $P_{\model}\Paren{s,d}>0$ implies
% \begin{equation*}
% P_{\model}\Paren{\outcome^{\doop{\sex=\lsex'}}\mid \dept = \ldept, \sex= \lsex} = P_{\model}\Paren{\outcome \mid \dept = \ldept, \sex=\lsex}.
% \end{equation*}
\begin{equation*}
P_{\model}\Paren{\outcome^{\doop{\sex=\lsex'}}\mid \dept = \ldept, \sex= \lsex} = P_{\model}\Paren{\outcome^{\doop{\sex=\lsex}}\mid \dept = \ldept, \sex= \lsex}
\end{equation*}
for $s' \neq s$.
\end{definition}
The counterfactual fairness notion of \citet{KusnerLRS17} implies demographic parity for the Berkeley example without allowing for confounding.
\begin{proposition}\label{prop:ctrf-demo-nocf}
        If $\model \in \modelsunconfedge$ is counterfactually fair according to Definition~\ref{def:ctrfkusner}, then $P_{\model}$ satisfies demographic parity, i.e., for all $s,s'$ such that $P_{\model}(s), P_{\model}(s') >0$, $$P_{\model}\Paren{A=1|S=s} = P_{\model}\Paren{A=1|S=s'}.$$
\end{proposition}
    
\begin{proof}
\begin{figure}[t]
     \centering
            \begin{tikzpicture}
            \tikzstyle{vertex}=[circle,fill=none,draw=black,minimum size=17pt,inner sep=0pt]
\node[vertex] (S) at (0,0) {$S$};
\node[vertex][fill=lightgray] (U_S) at (0,-1) {$U_S$};
\node[vertex] (S') at (0,-2) {$S'$};
\node[vertex] (A) at (3,0) {$A$};
\node[vertex] (A') at (3,-2) {$A'$};
\node[vertex] (D) at (1.5,1.5) {$D$};
\node[vertex] (D') at (1.5,-3.5) {$D'$};
\node[vertex][fill=lightgray] (U_D) at (1.5,-1) {$U_D$};
\node[vertex][fill=lightgray] (U_A) at (3,-1) {$U_A$};
%\node[vertex][fill=lightgray] (R) at (2.2,1) {$R$};
%\node[vertex][fill=lightgray] (U_S) at 
%\node[vertex] (S') at (1,-0.5) {$S'$};
\path (S) edge (D);
\path (D) edge (A);
\path (S) edge (A);
\path (S') edge (A');
\path (S') edge (D');
\path (D') edge (A');
\path (U_S) edge (S);
\path (U_D) edge (D);
\path (U_D) edge (D');
\path (U_A) edge (A);
\path (U_A) edge (A');
%\path (R) edge (X);
%\path (R) edge (Y);

            \end{tikzpicture}
        \caption{Causal graph of twin network $(\model^{\text{twin }})^{\doop{S'=s'}}$} 
        \label{fig:twin_net_kusner}
        \end{figure}
    The right-hand side in Definition~\ref{def:ctrfkusner} is $P_{\model}\Paren{A \mid D=d,S=s}$. Therefore, for $s,d$ such that $P_{\model}(s,d) >0$, counterfactual fairness implies that 
    \begin{equation*}
    P_{\model}\Paren{\outcome^{\doop{\sex=\lsex'}}, \dept = \ldept \mid \sex= \lsex} = P_{\model}\Paren{\outcome, \dept = \ldept \mid \sex=\lsex}.
    \end{equation*}
    Marginalizing $\dept$, 
       \begin{equation}\label{eq:marginalDctrf}
    P_{\model}\Paren{\outcome^{\doop{\sex=\lsex'}}\mid \sex= \lsex} = P_{\model}\Paren{\outcome\mid \sex=\lsex}.
    \end{equation}
By Rule $2$ of do-calculus, if $P_{\model}(S=s)>0$, 
\begin{align*}
    P_{\model}\Paren{\outcome\mid \sex=\lsex} &= P_{\model}\Paren{\outcome\mid \doop{\sex=\lsex}}, \\
    &\stackrel{(a)}{=} P_{\model}\Paren{\outcome^{\doop{\sex=\lsex'}}\mid \sex= \lsex}, \\
    &\stackrel{(b)}{=} P_{\model^{\text{twin}}}\Paren{\outcome'\mid \doop{\sex'=\lsex'},\sex= \lsex}, \\
    &\stackrel{(c)}{=} P_{\model}\Paren{\outcome \mid \doop{\sex=\lsex'}}, \\
    &\stackrel{(d)}{=} P_{\model}\Paren{\outcome \mid \sex=\lsex'}.
\end{align*}
    where $(a)$ follows from \eqref{eq:marginalDctrf}, $(b)$ follows from expressing the counterfactual $P_{\model}\Paren{\outcome^{\doop{\sex=\lsex'}}\mid \sex= \lsex}$ in the twin network model, $(c)$ follows from the twin network, and $(d)$ follows from Rule $2$ of do-calculus since $P_{\model}(s')>0$. Therefore, counterfactual fairness implies demographic parity. 
\end{proof}
Note that demographic parity falls prey to Simpson's paradox in the Berkeley example. The above result shows that a valid test for demographic parity is a valid test for \citet{KusnerLRS17}'s counterfactual fairness notion for the assumed model class, $\modelsunconfedge$.
% \subsection{Natural Direct Effect (NDE)}
% The NDE is given by 
% \begin{equation} 
% P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \dept^{\doop{\sex=\lsex}}}}=1} - P_{\model}\Paren{\outcome^{\doop{\sex=s}}=1},
% \end{equation}

% \begin{proposition}\label{prop:NDE}
% If $S \indep A \mid D$, then NDE is $0$. 
% \end{proposition}
% \begin{proof}
%     By Pearl's mediation formula \citep{Pearl01}, the NDE can be written as 
% \begin{equation*}
%     \sum_{d} \Paren{P_{\model}\Paren{\outcome = 1 \mid \sex = \lsex', \dept = \ldept} - P_{\model}\Paren{\outcome = 1 \mid \sex = \lsex, \dept = \ldept}}  P_{M}\Paren{\dept = \ldept \mid \sex = \lsex}
% \end{equation*}
% Note that, when $A \indep S \mid D$, the above expression evaluates to $0$. Further, this shows that the NDE is a weighted average of the difference in admission-rate-discrepancy over the departments.
% \end{proof}

% For simplicity, we prove that for $\model \in \modelsunconfedge$,  $P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \dept^{\doop{\sex}}}}=1}$ is equivalent to a certain interventional query in a SCM $\model'$ where $\model'$ is such that along with the endogenous variables mentioned above, a copy of $S$ is also included which mediates the direct effect of $S$ on $A$. Denote this new endogenous variable by $\formsex$. The causal mechanism of $S'$ is $S' \triangleq f_{S'}(S) = S$ and the causal mechanism of $A$ is $A \triangleq f_{A}(S',D,U_A)$. We now show that 

% \begin{equation}
% P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \dept^{\doop{\sex=\lsex}}}}=1} = P_{\model'}\Paren{\outcome = 1 \mid \doop{\formsex = \lsex'}, \sex = \lsex}. 
% \end{equation}   

% Note that 
% \begin{align*}
%     P_{\model'}\Paren{\outcome = 1 \mid \doop{\formsex = \lsex'}, \sex = \lsex} &= P_{\model'}\Paren{f_{A}\Paren{s',D^{\doop{\sex=\lsex}},U_A}=1} \\
%     &= P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \dept^{\doop{\sex=\lsex}}}}=1}. 
% \end{align*}
% Further, $P_{\model}\Paren{\outcome^{\doop{\sex=s}}=1} = P_{\model}\Paren{\outcome \mid S=s}$ by Rule 2 of do-calculus which can further be written as 
% \begin{equation*}
%     P_{\model}\Paren{\outcome \mid S=s} = \sum_{d} P_{\model}\Paren{\outcome = 1 \mid \sex = \lsex, \dept = \ldept} P_{M}\Paren{\dept = \ldept \mid \sex = \lsex}.
% \end{equation*}

% \begin{align*}
%     P_{\model'}\Paren{\outcome = 1 \mid \doop{\formsex = \lsex'}, \sex = \lsex}  &= \sum_{d} P_{\model'}\Paren{\outcome = 1 \mid \doop{\formsex = \lsex'}, \sex = \lsex,\dept = \ldept} P_{M'}\Paren{\dept = \ldept \mid \doop{\formsex = \lsex'}, \sex = \lsex}, \\
%     &\stackrel{(a)}{=} \sum_{d} P_{\model'}\Paren{\outcome = 1 \mid \doop{\formsex = \lsex'}, \dept = \ldept} P_{M'}\Paren{\dept = \ldept \mid \sex = \lsex}, \\
%     &\stackrel{(b)}{=}\sum_{d} P_{\model'}\Paren{\outcome = 1 \mid \formsex = \lsex', \dept = \ldept} P_{M'}\Paren{\dept = \ldept \mid \sex = \lsex},\\
%     &\stackrel{(c)}{=}\sum_{d} P_{\model}\Paren{\outcome = 1 \mid \sex = \lsex', \dept = \ldept} P_{M}\Paren{\dept = \ldept \mid \sex = \lsex}.
% \end{align*}
% where $(a)$ follows from Rule 3 of do-calculus and $A \indep S \mid D$ in the intervened SCM, $(b)$ follows from Rule 2 of do-calculus, and $(c)$ follows from $\formsex$ being a copy of $\sex$. Therefore, 
\subsubsection{Path-dependent Counterfactual Fairness}\label{app:kusnerpathnocf}
We next show that testing the path-dependent counterfactual fairness notion given in the appendix of \citet{KusnerLRS17} coincides with a conditional independence test $A \indep S \mid D$. 

\begin{definition}[Path-dependent Counterfactual Fairness \citep{KusnerLRS17}]\label{def:pdctrfkusner}
$\model \in \modelsunconfedge$ is fair if for all $\lsex,\ldept,$ $P_{\model}\Paren{s,d}>0$ implies
\begin{equation*}
P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex} = P_{\model}\Paren{\outcome^{\doop{\sex = \lsex, \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex}
\end{equation*}
for $s' \neq s$.
\end{definition}

\begin{proposition}\label{prop:pathwise}
$$\left \lbrace P_{\model}(A,D,S): \model \in \modelsunconfedge \text{ satisfies path-dependent counterfactual fairness} \right \rbrace = \distobsunconf. $$
% 
\end{proposition}
\begin{proof}
\begin{figure}[th]
     \centering
            \begin{tikzpicture}
            \tikzstyle{vertex}=[circle,fill=none,draw=black,minimum size=17pt,inner sep=0pt]
\node[vertex] (S) at (0,0) {$S$};
\node[vertex][fill=lightgray] (U_S) at (0,-1) {$U_S$};
\node[vertex] (S') at (0,-2) {$S'$};
\node[vertex] (A) at (3,0) {$A$};
\node[vertex] (A') at (3,-2) {$A'$};
\node[vertex] (D) at (1.5,1.5) {$D$};
\node[vertex] (D') at (1.5,-3.5) {$D'$};
\node[vertex][fill=lightgray] (U_D) at (1.5,-1) {$U_D$};
\node[vertex][fill=lightgray] (U_A) at (3,-1) {$U_A$};
%\node[vertex][fill=lightgray] (R) at (2.2,1) {$R$};
%\node[vertex][fill=lightgray] (U_S) at 
%\node[vertex] (S') at (1,-0.5) {$S'$};
\path (S) edge (D);
\path (D) edge (A);
\path (S) edge (A);
\path (S') edge (A');
%\path (S') edge (D');
\path (D') edge (A');
\path (U_S) edge (S);
\path (U_D) edge (D);
%\path (U_D) edge (D');
\path (U_A) edge (A);
\path (U_A) edge (A');
%\path (R) edge (X);
%\path (R) edge (Y);
            \end{tikzpicture}
        \caption{Causal graph of twin network $(\model^{\text{twin }})^{\doop{S'=s',D'=d}}$ for $\model \in \modelsunconfedge$} 
        \label{fig:twin_net_pathwise}
        \end{figure}
 We first show that if $\model$ satisfies the path-dependent counterfactual fairness notion then $\model \in \nullobsunconf$.  The path-dependent counterfactual fairness notion implies that for all $s,d$ such that $P_{\model}(s,d)>0$, 
\begin{equation*}
P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex} = P_{\model}\Paren{\outcome = 1\mid \dept = \ldept, \sex=\lsex}.
\end{equation*}
If for $s' \neq s$, $P_{\model}(s',d)>0$, then we simplify $ P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex}$ using the twin network in Figure~\ref{fig:twin_net_pathwise}.
\begin{align*}
P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex} &= P_{\model^{\text{twin}}}\Paren{\outcome'=1 \mid \doop{\dept' = \ldept, \sex' = \lsex'}, \dept = \ldept, \sex=\lsex} \\
    &\stackrel{(a)}{=} P_{\model^{\text{twin}}}\Paren{\outcome'=1 \mid \doop{\dept' = \ldept, \sex' = \lsex'}}, \\
    &\stackrel{(b)}{=} P_{\model}\Paren{\outcome=1 \mid \dept = \ldept, \sex = \lsex'},
\end{align*}
where $(a)$ follows since $S,D \indep A'$ in the intervened twinned SCM and $(b)$ follows from the twinned SCM. This implies that $$P_{\model}\Paren{\outcome=1 \mid \dept = \ldept, \sex = \lsex} = P_{\model}\Paren{\outcome=1 \mid \dept = \ldept, \sex = \lsex'} = P_{\model}\Paren{\outcome=1 \mid \dept = \ldept}.$$ If instead, $P_{\model}(s',d)=0$, then still $P_{\model}\Paren{A=1 \mid D=d, S=s} = P_{\model}\Paren{A=1 \mid D=d}$. Therefore, $\model \in \nullobsunconf$.
% For the counterfactual notion of fairness as defined in \eqref{eq:nullctrf}, for $\model \in \modelsunconfedge$,
% \begin{equation}
%     A^{\doop{S=s,D=d}} = f_{\outcome}\Paren{s,d,U_A}, A^{\doop{D=d}} = f_{\outcome}\Paren{S,d,U_A}.
% \end{equation}
% If $P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A} = f_{\outcome}\Paren{S,d,U_A}}=1,$ then assuming $P_{\model}\Paren{S=s'} \geq 0$,

% \begin{align*}
%     P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A} = f_{\outcome}\Paren{S,d,U_A}} &= P_{\model}\Paren{S=s'}P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A} = f_{\outcome}\Paren{s',d,U_A}}  \\
% &+P_{\model}\Paren{S=s}P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A} = f_{\outcome}\Paren{s,d,U_A}}, \\
%     &=  P_{\model}\Paren{S=s} + P_{\model}\Paren{S=s'}P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A} = f_{\outcome}\Paren{s',d,U_A}} = 1.
% \end{align*}
% Therefore, $P_{\model}\Paren{f_{\outcome}\Paren{s,d,U_A} = f_{\outcome}\Paren{s',d,U_A}}$. This implies that $S$ is not a parent of $A$ in $G(M)$. Since this is identical to the graphical notion of fairness, the counterfactual notion of fairness also is tested by the conditional independence $\sex \indep \outcome \mid \dept.$
Clearly, if $\model \in \nullgraphunconf$, path-dependent counterfactual fairness is satisfied. However, note that Example~\ref{ex:unconfexample} satisfies path-dependent counterfactual fairness but does not belong to $\nullgraphunconf$. The conclusion follows from Theorem~\ref{thm:unconf_test_equiv}.
\end{proof}

\subsection{With Confounding}
In this section, we compare the statistical tests that result from the NDE, on which the notions of \citet{NabiShpitser18} and \citet{Chiappa19} are based, and from \citet{KusnerLRS17}'s counterfactual fairness and path-dependent counterfactual fairness notions, when confounding is allowed.

\subsubsection{NDE} With confounding between the mediator and the outcome, \citet{KaufmanKMGP05} obtain bounds on the NDE for the all-binary variable case by the linear programming approach of \citet{BalkePearl97}. The resulting bounds are implied by the IV inequalities but not equivalent to them, which gives us a strictly weaker test than the IV inequalities. For completeness, we present the bounds below. The lower and upper bounds for $NDE(A;0 \rightarrow 1)$ are

\begin{align*}
   &\max \left.\begin{cases}
        P(A=0 \mid S=0) - 1, \\
        P(A=0, D=0 \mid S=0) - P(A=1, D=1 \mid S=0) + P(A=1, D=0 \mid S=1) - 1, \\
        P(A=0, D=1 \mid S=0) - P(A=1, D=0 \mid S=0) + P(A=1, D=1 \mid S=1) - 1
    \end{cases} \right\}, \\
    &\min \left.\begin{cases}
        1- P(A=1 \mid S=0), \\
        1+P(A=0, D=1 \mid S=0) - P(A=1, D=0 \mid S=0) - P(A=0, D=0 \mid S=1), \\
        1+P(A=0, D=0 \mid S=0) - P(A=1, D=1 \mid S=0) - P(A=0, D=1 \mid S=1)
    \end{cases} \right\}.
\end{align*}

The lower and upper bounds for $NDE(A;1 \rightarrow 0)$ are

\begin{align*}
   &\max \left.\begin{cases}
        P(A=0 \mid S=1) - 1, \\
        P(A=1, D=0 \mid S=0) - P(A=1, D=1 \mid S=1) + P(A=0, D=0 \mid S=1) - 1, \\
        P(A=1, D=1 \mid S=0) - P(A=1, D=0 \mid S=1) + P(A=0, D=1 \mid S=1) - 1
    \end{cases} \right\}, \\
    &\min \left.\begin{cases}
        1- P(A=1 \mid S=1), \\
        1+P(A=0, D=0 \mid S=1) - P(A=0, D=1 \mid S=0) - P(A=1, D=1 \mid S=1), \\
        1+P(A=0, D=1 \mid S=1) - P(A=0, D=0 \mid S=0) - P(A=1, D=0 \mid S=1)
    \end{cases} \right\}.
\end{align*}
Equating the NDE to $0$ gives us a strictly larger null hypothesis than the one obtained from the IV inequalities. This implies that the resulting statistical test is strictly weaker.


\subsubsection{Counterfactual Fairness \citep{KusnerLRS17}} \label{app:ctrfkusner-cf}
% Unlike the case without allowing for confounding, the counterfactual and path-dependent counterfactual fairness notions induce the same set of observational distributions and those are precisely $\distiv$. 
% \begin{proposition}\label{prop:ctrf-fair-equiv}
%  If $\model \in \modelsedge$ satisfies counterfactual fairness and $P_{\model}(S=s)>0$ for all $s$, then $P_M(A,D|S) \in \distgraph = \distiv$. In addition, $P_{\model}$ satisfies demographic parity.
%  \end{proposition}
% \begin{proof}
% We closely follow the proof of Proposition~\ref{prop:cf-ctrf-pathwise} with some modifications. We show that a model $\model \in \modelsedge$ that satisfies the counterfactual fairness notion is observationally equivalent to a model in $\nullgraph$. This implies that the set of observational distributions of models that satisfy the counterfactual notion of fairness is described by $\distgraph$. 

% If $\model \in \modelsedge$ satisfies counterfactual fairness, then for all $s,d$, such that $P_{\model}(s,d)>0$
% \begin{equation*}
% P_{\model}\Paren{\outcome^{\doop{\sex = \lsex'}}=1 \mid \dept = \ldept, \sex=\lsex} = P_{\model}\Paren{\outcome^{\doop{\sex = \lsex}}=1 \mid \dept = \ldept, \sex=\lsex}
% \end{equation*}
% for $s' \neq s$. Note that the right-hand side above is $P_{\model}\Paren{A=1 \mid D=d, S=s}$.
% The counterfactual $P_{\model}\Paren{\outcome^{\doop{\sex = \lsex'}}=1 \mid \dept = \ldept, \sex=\lsex}$ is given by the push-forward of $P(U_A,U,D\mid D=d,S=s) = \delta_d(D) \times P(U_A,U\mid D=d,S=s)$ through $f_{A}(s',D,U_A,U)$. Because of the independence on the value of $s'$, the same holds for the function $$\bar{f}_{A}\Paren{d,U_A,U} = \frac{1}{2}\Paren{f_{A}(0,d,U_A,U) + f_{A}(1,d,U_A,U)}.$$ Consider an SCM, $\bar{\model}$ that is identical to $\model$ except for the causal mechanism of $A$ being $\bar{f}_{A}$. 
% Clearly, $\bar{\model} \in \nullgraph$ and $P_M(S,D) = P_{\bar{M}}(S,D)$. By the above argument, for all $s,d$ such that $P_{\model}(s,d) >0$, we have $P_{\model}\Paren{\outcome = 1\mid \dept = \ldept, \sex=\lsex} = P_{\bar{\model}}\Paren{\outcome = 1\mid \dept = \ldept, \sex=\lsex}$. This implies that if for all $s$, $P_{\model}(S=s)>0$, then $P_{\model}\Paren{A,D \mid S} \in \distgraph = \distiv$. The proof of Proposition~\ref{prop:ctrf-demo-nocf} holds even for $\modelsedge$.
% \end{proof}
The proof of Proposition~\ref{prop:ctrf-demo-nocf} also holds when confounding is allowed since the implications of the do-calculus rules in the proof hold even in the twin network with confounding. 
% \begin{proposition}\label{prop:ctrf-fair-equiv}
%  If $\model \in \modelsedge$ satisfies counterfactual fairness then $P_{\model}$ satisfies demographic parity.
%  \end{proposition}

% \begin{proof}
%     The proof of Proposition 
% \end{proof}

\subsubsection{Path-dependent Counterfactual Fairness \citep{KusnerLRS17}} \label{app:pathwise-cf}

 \begin{proposition}\label{prop:cf-ctrf-pathwise}
If $\model \in \modelsedge$ satisfies path-dependent counterfactual fairness then $P_M(D,A,S) \in \distgraph = \distiv$. If $M \in \nullgraph$, then $\model$ satisfies path-dependent counterfactual fairness.
 \end{proposition}
\begin{proof}
We show that a model $\model \in \modelsedge$ that satisfies the path-dependent counterfactual fairness notion is observationally equivalent to a model in $\nullgraph$. This implies that the set of observational distributions of models that satisfy the path-dependent counterfactual notion of fairness is described by $\distgraph$.

If $\model \in \modelsedge$ satisfies path-dependent counterfactual fairness, then for all $s,d$ such that $P_{\model}\Paren{s,d}>0$,
\begin{equation*}
P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex} = P_{\model}\Paren{\outcome^{\doop{\sex = \lsex, \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex}
\end{equation*}
for $s' \neq s$. Note that the right-hand side above is $P_{\model}\Paren{\outcome = 1\mid \dept = \ldept, \sex=\lsex}$. The counterfactual $P_{\model}\Paren{\outcome^{\doop{\sex = \lsex', \dept = \ldept}}=1 \mid \dept = \ldept, \sex=\lsex}$ is given by the push-forward of $P(U_A,U \mid D=d,S=s)$ through $f_{A}(s',d,U_A,U)$. Because this push-forward does not depend on the value of $s'$, the same holds for the function $$\bar{f}_{A}\Paren{d,U_A,U} = \frac{1}{2}\Paren{f_{A}(0,d,U_A,U) + f_{A}(1,d,U_A,U)}.$$ Consider an SCM, $\bar{\model}$, that is identical to $\model$ except for the causal mechanism of $A$ being $\bar{f}_{A}$.
Clearly, $\bar{\model} \in \nullgraph$ and $P_M(S,D) = P_{\bar{M}}(S,D)$. By the above argument, for all $s,d$ such that $P_{\model}(s,d)>0$, we have $P_{\model}\Paren{\outcome = 1\mid \dept = \ldept, \sex=\lsex} = P_{\bar{\model}}\Paren{\outcome = 1\mid \dept = \ldept, \sex=\lsex}$. This implies that $P_{\model}(D,A,S) \in \distgraph = \distiv$. 
\begin{figure}[th]
     \centering
            \begin{tikzpicture}
            \tikzstyle{vertex}=[circle,fill=none,draw=black,minimum size=17pt,inner sep=0pt]
\node[vertex] (S) at (0,0) {$S$};
\node[vertex][fill=lightgray] (U_S) at (0,-1) {$U_S$};
\node[vertex] (S') at (0,-2) {$S'$};
\node[vertex] (A) at (3,0) {$A$};
\node[vertex] (A') at (3,-2) {$A'$};
\node[vertex] (D) at (1.5,1.5) {$D$};
\node[vertex] (D') at (1.5,-3.5) {$D'$};
\node[vertex][fill=lightgray] (U_D) at (1.5,-1) {$U_D$};
\node[vertex][fill=lightgray] (U_A) at (3,-1) {$U_A$};
\node[vertex][fill=lightgray] (U) at (5.5,-1) {$U$};

%\node[vertex][fill=lightgray] (R) at (2.2,1) {$R$};
%\node[vertex][fill=lightgray] (U_S) at 
%\node[vertex] (S') at (1,-0.5) {$S'$};
\path (S) edge (D);
\path (D) edge (A);
%\path (S) edge (A);
%\path (S') edge (A');
%\path (S') edge (D');
\path (D') edge (A');
\path (U_S) edge (S);
\path (U_D) edge (D);
%\path (U_D) edge (D');
\path (U_A) edge (A);
\path (U_A) edge (A');
\path (U) edge (A');
\path (U) edge (A);
\path (U) edge (D);
\path (U) edge (D');
%\path (R) edge (X);
%\path (R) edge (Y);
            \end{tikzpicture}
        \caption{Causal graph of twin network $(\model^{\text{twin }})^{\doop{S'=s',D'=d}}$ for $\model \in \nullgraph$} 
        \label{fig:twin_net_pathwise_cf}
        \end{figure}
Conversely, if $\model \in \nullgraph$, then from the twin network of $\model$ in Figure~\ref{fig:twin_net_pathwise_cf}, clearly $A' \indep S' \mid S,D$ and therefore, path-dependent counterfactual fairness is satisfied.   

\end{proof}
Therefore, for the assumed model class, a valid statistical test for path-dependent counterfactual fairness is also a valid test for the IV inequalities from Theorem~\ref{thm:iv_tight} and vice versa. 


\section{Additional Results for Bayesian Testing Procedure: Cum-laude Dataset and Prior Sensitivity}\label{app:bayes}

We consider the dataset from \citet{Bol23} that contains data from $5239$ PhD students in the Netherlands studying at a large Dutch university from 2011--2021. \citet{Bol23} observed a bias in the percentage of `cum-laude' distinctions awarded to male PhD students ($6.57 \%$) versus female PhD students ($3.68 \%$).

As in the Berkeley example, there is data on the sex of the student, their academic field and whether they were awarded cum-laude. Unlike the Berkeley example, there are more covariates that measure additional information, including the sex composition of the dissertation committee and the sex composition of the supervisory team that includes the promoters and co-promoters. For the current analysis, we do not take these covariates into account and only analyze the dataset with respect to sex, academic field and award outcome.

As reported by \citet{Bol23}, unlike the Berkeley dataset, the bias between female and male cum-laude award rates does not vanish when conditioned on department. Therefore, with the assumption of no confounding between the academic field choice and the cum-laude award outcome, the conclusion of the conditional independence test implies that the data generating mechanism is unfair when assuming that no latent unprotected mediators between sex and cum-laude award rates exist. Allowing for confounding requires us to use the Bayesian testing procedure proposed in Section~\ref{sec:bayesiantest} for the IV inequalities. We have $|\cX| = |\cX_{\dept}| = 6$, $|\cY| = |\cX_{\outcome}|=2$, $|\cZ| = |\cX_{\sex}|=2$. We choose a flat Dirichlet prior over parameters ($\theta = \left \lbrace P(d,a,s): d \in \cX_D, a \in \cX_A, s \in \cX_S\right \rbrace$) in both models $\MM_0, \MM_1$, i.e., for $i=0,1$, $\pi(\theta \mid \MM_i) = c_i \text{Dir}\Paren{1,1,\dots,1}$ where $c_i$ is a normalizing constant. The counts from the data $R_1, R_2, \dots, R_m$ are used to obtain the posterior, $P(\theta \mid R_1, R_2, \dots, R_m)$, which is also a Dirichlet distribution. Using $n=10^6$ samples, we observe no violations of the IV inequalities. Therefore, the confidence interval for the posterior probability of the cum-laude data satisfying the IV inequalities is $\left[1-3.69\times 10^{-6},1\right]$. Hence, when allowing for confounding, one arrives at the conclusion that, given the available data and restricting the analysis to only three variables, the fairness of the data-generating mechanism is undecidable.

\textbf{Prior Sensitivity: }
For both the Berkeley dataset and the Bol dataset, the final confidence interval could in principle depend on the choice of the prior. We presented the analysis with a flat Dirichlet prior for both datasets. We find, however, that the lower limit of the confidence interval does not change as we vary the parameter $\alpha$ of a symmetric Dirichlet prior $\text{Dir}\Paren{\alpha, \alpha, \cdots, \alpha}$ over the interval $\left[ 10^{-2}, 10^{5} \right]$.

\textbf{Frequentist Test of \citet{WangRR17}: } The frequentist test of \citet{WangRR17} converts every IV inequality into a one-sided association test for a $2\times2$ contingency table. Specifically, for fixed $d,a$, an IV inequality of the form 
\[\Pr\Paren{D=d,A=a \mid S=1} + \Pr\Paren{D=d,A=1-a \mid S=0} \leq 1\] is transformed into \[\gamma^{d,a} \leq 0,\] where
$\gamma^{d,a} \triangleq \Pr\Paren{Q^{d,a}=1\mid S=1} - \Pr\Paren{Q^{d,a}=1\mid S=0}$ and \[Q^{d,a} = \begin{cases}
    \bm{1}\left[D=d,A=a\right] & \text{if }S=1, \\
    1-\bm{1}\left[D=d,A=1-a\right] & \text{if } S=0.
\end{cases}\]
Note that $Q^{d,a}$ and $S$ are binary random variables and $\gamma^{d,a}=0$ if and only if $Q^{d,a} \indep S$. Further, $\gamma^{d,a} \in [-1,1]$ for all $d,a$.

Since the direction of the one-sided test matters, we check the sign of the difference of conditional probabilities using maximum likelihood (ML) estimates and then conduct Pearson's chi-square test for independence. For the Berkeley data, the ML estimates of $\gamma^{d,a}$ were negative (less than $-0.6$) for all $d,a$, and the independence tests rejected the null hypothesis of independence with a reported p-value of $0.0$ (i.e., below the smallest positive number representable in double-precision floating-point format, approximately $5\times10^{-324}$). Since the estimates are negative and independence is rejected, we take this as strong evidence that $\gamma^{d,a} < 0$, so the null hypothesis of $\gamma^{d,a} \leq 0$ is not rejected. As noted in \citet{WangRR17}, although we test for multiple IV inequalities, the Bonferroni correction is $1/2$ and does not scale with the number of IV inequalities.
% \section{Bayes Factor Uncertainty Quantification}
% In \cite{HeckDavisStrober19}, uncertainty quantification techniques for the Bayesian test using the encompassing prior method are provided. We follow those, namely a sampling-based approach that reports the standard deviation over $R$ trials. Over $10$ trials, the standard deviation was obtained to be less than $1e-6$. \cite{KlugkistLH10} perform simulations to test prior sensitivity and conclude that the uniform prior performs well without any apriori knowledge.
% \end{document}
