%\documentclass{uai2024} % for initial submission

\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage[capitalize]{cleveref}
\usepackage{tikz}
\usetikzlibrary{fit,arrows,shapes.arrows,shapes.geometric,shapes.multipart,decorations.pathmorphing,positioning,shapes.swigs,}
\RequirePackage{amsthm,amsmath,amsfonts,amssymb}
\usepackage[ruled,noend,linesnumbered]{algorithm2e} 
\usepackage{subcaption}
\usepackage{thmtools}
\usepackage{cases}
\usepackage{physics}

% \ARef: reference helper taking up to two stars and one label.
%   \ARef{<label>}    ->  \cref{<label>}
%   \ARef*{<label>}   ->  \cref{<label>}, line~\ref{<label>}
%   \ARef**{<label>}  ->  line~\ref{<label>}
% The same label is passed to both \cref and \ref.
\NewDocumentCommand{\ARef}{ s s m }{%
    \IfBooleanTF{#2}{%
        % Second star present: the \cref part (and the separating comma) is dropped.
        \IfBooleanT{#1}{line~\ref{#3}}%
    }{%
        \cref{#3}%
        \IfBooleanT{#1}{, line~\ref{#3}}%
    }%
}

% algorithm2e keyword/comment macros.
% \FAIL is declared with an empty display text.
% NOTE(review): confirm the empty text is intentional (a visible word such as
% "FAIL" may have been meant).
\SetKw{FAIL}{}
% \Comment{...} wraps its argument in the delimiters "/* " and " */".
\SetKwComment{Comment}{/* }{ */}

% \ci: doubled \perp symbol (presumably the independence relation -- confirm
% against its uses in the body).  The two glyphs are wrapped in \mathrel so
% that they form a single relation atom: the symbol gets ordinary relation
% spacing as a unit and TeX cannot break an inline formula between the two
% \perp glyphs.
\newcommand\ci{\mathrel{\perp\!\!\!\perp}}
% Theorem-like environments.  No shared-counter optional argument is given, so
% thm and lem are numbered independently of each other.  The commented-out
% lines are alternatives that are currently disabled.
%\theoremstyle{plain}
\newtheorem{thm}{Theorem}
% \newtheorem{cor}{Corollary}%[Theorem]
\newtheorem{lem}{Lemma}
% \newtheorem{prop}{Proposition}
%\newenvironment{proof}{\paragraph*{Proof}}{\hfill$\square$}

% Per-author annotation macros: each typesets its argument in a fixed color
% (the color change is confined to the inner brace group).
\newcommand{\ilya}[1]{{\color{red!70!blue} #1}}
\newcommand{\amir}[1]{{\color{red} #1}}
\newcommand{\jaron}[1]{{\color{brown} #1}}
% ulem supplies \sout; the normalem option keeps \emph at its usual meaning.
\usepackage[normalem]{ulem}
% \stkout{...}: strike-through that works in both modes.  In math mode the
% argument is re-entered as math inside \text{\sout{...}}; in text mode \sout
% is applied directly.
\newcommand{\stkout}[1]{\ifmmode\text{\sout{\ensuremath{#1}}}\else\sout{#1}\fi}
% \tildep{X}: typesets X followed by a prime (X').
% NOTE(review): the name suggests a tilde was used at some point -- confirm
% that the prime is the intended notation.
 \newcommand{\tildep}[1]{#1'} 

% Further theorem-like environments.  The \theoremstyle lines here are
% commented out, and each environment is declared with its own counter.
% \theoremstyle{definition}
\newtheorem{dfn}{Definition}
\newtheorem{rmk}{Remark}
% \newtheorem*{dfn*}{Definition}
\newtheorem{assumption}{Assumption}

% \theoremstyle{remark}
% \newtheorem{rmk}{Remark}
\newtheorem{ex}{Example}
% cleveref singular/plural names for the environments declared above.
% No \crefname is given here for rmk or ex.
% NOTE(review): algocf is presumably the counter used by algorithm2e's
% algorithm environment -- confirm.
\crefname{assumption}{Assumption}{Assumptions}
\crefname{thm}{Theorem}{Theorems}
\crefname{dfn}{Definition}{Definitions}
\crefname{lem}{Lemma}{Lemmas}
\crefname{algocf}{Algorithm}{Algorithms}

% \newcommand{\cmark}{\ding{51}}%
% \newcommand{\xmark}{\ding{55}}%
% \G: calligraphic G, used in the body as the symbol for a graph.
\newcommand{\G}{\mathcal{G}}
% \1: bold-face digit 1.
\newcommand{\1}{\mathbf{1}}

% Upright math operators for graph relations (parents, children, ancestors,
% descendants, non-descendants, districts, closure, ...).
% Naming caveat: for the "strict" variants the macro name and the printed text
% differ -- \pas prints "spa", \ans prints "san", \des prints "sde" (the body
% text denotes strict versions by a prepended "s"); \barr prints "barren".
% \E prints a blackboard-bold E and \doo prints "do"; note that the body text
% visible in this part of the file writes \mathbb{E} and \text{do} directly.
\DeclareMathOperator{\sterile}{sterile}
\DeclareMathOperator{\pa}{pa}
\DeclareMathOperator{\ch}{ch}
\DeclareMathOperator{\sib}{sib}
\DeclareMathOperator{\pre}{pre}
\DeclareMathOperator{\pas}{spa}
\DeclareMathOperator{\an}{an}
\DeclareMathOperator{\ans}{san}
\DeclareMathOperator{\de}{de}
\DeclareMathOperator{\nde}{nde}
\DeclareMathOperator{\nd}{nd}
\DeclareMathOperator{\des}{sde}
\DeclareMathOperator{\barr}{barren}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\doo}{do}
\DeclareMathOperator{\dis}{dis}
\DeclareMathOperator{\mb}{mb}
\DeclareMathOperator{\cl}{cl}


% \restatemarker: displayed factorization of the kernel
% q_V(D' | spa_G(D')) restricted to S=s, typeset differently in the main text
% and in the appendix.
% Main-text version: a three-line align* display, with the factor
% p(D' \cap nd(spa(D'))) written first, followed by the product over V_i.
% NOTE(review): the align* environment sits inside \textup{...}; confirm this
% wrapping is intended.
\newcommand{\restatemarker}{\textup{
\begin{align*}
    &q_{\vec{V}}(\tildep{\vec{D}} \mid \pas_\G (\vec{D}')) \mid_{S=s} \\
    &=  p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})))  \times \\
    &\Big( 
    \prod_{V_i \in \prec} p(V_i \mid \pre_{\prec}(V_i), \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \pas(\tildep{\vec{D}}), S=s)  \Big)
\end{align*}
} \ignorespaces}
% Appendix version: a hook that runs after \appendix redefines \restatemarker
% as a single \[ ... \] display containing the same two factors, with the
% product written first and p(D' \cap nd(spa(D'))) last.
\AddToHook{cmd/appendix/after}{\renewcommand{\restatemarker}{\[
    q_{\vec{V}}(\tildep{\vec{D}} \mid \pas_\G (\vec{D}')) \mid_{S=s} = \prod_{V_i \in \prec} 
    p(V_i \mid \pre_{\prec}(V_i), \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \pas(\tildep{\vec{D}}), S=s)  
    p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})))\]}}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{A General Identification Algorithm For Data Fusion Problems Under Systematic Selection}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jaron2005@gmail.com>?Subject=Your UAI 2024 paper}{Jaron J. R. Lee}{}}
\author[2]{AmirEmad Ghassami}
\author[1]{Ilya Shpitser}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    Johns Hopkins University\\
    Baltimore, Maryland, USA
}
% Affiliation 2.  The last address line carries no trailing \\, matching
% \affil[1]; a line break after the final line has nothing to separate and can
% leave a spurious empty line at the end of the affiliation block.
\affil[2]{%
    Department of Mathematics and Statistics\\
    Boston University\\
    Boston, Massachusetts, USA
}
 
\begin{document}
\maketitle

\begin{abstract}
  Identification of causal effects can be hampered by confounding, selection bias, and other complications. 
  Data fusion is one approach to addressing these difficulties, through the inclusion of auxiliary data on the population of interest. 
  Such data may measure a different set of variables, or be obtained under different experimental or observational conditions than the primary dataset. 
  In particular, selection of experimental units into different datasets may be systematic; similar difficulties are encountered in missing data problems.
  However, existing methods for combining datasets either do not consider this issue, or assume simple selection mechanisms. 
  In this paper, we propose a general approach, based on graphical causal models, for causal inference from data on the same population that is obtained
  under different experimental conditions. Our framework allows both arbitrary unobserved confounding, and arbitrary selection processes into different experimental regimes in our data.
  We describe how systematic selection processes may be organized into a hierarchy similar to censoring processes in missing data: selected completely at random, selected at random, and selected not at random. 
  Finally, we provide a novel general identification algorithm for interventional distributions in this setting. 
  % This is the abstract for this article.
  % It should give a self-contained single-paragraph summary of the article's contents, including context, results, and conclusions.
  % Avoid citations; but if you do, you must give essentially the whole reference.
  % For example: This whole paper is devoted to praising É. Š. Åland von Vèreweg's most recent book (“Utopia's government formation problems during the last millenium”, Springevier Publishers, 2016).
  % Also, do not put mathematical notation and abbreviations in your abstract; be descriptive.
  % So not “we solve \(x^2+A xy+y^2\), where \(A\) is an RV”, but “we solve quadratic equations in two unknowns in which a single coefficient is a random variable”.
  % The reason is that mathematical notation will not display correctly when the abstract is reused on the proceedings website, for example, and that one should not assume the abstract's reader knows the abbreviation.
  % Of course the same remarks hold for your paper's title.
\end{abstract}

\section{Introduction}\label{sec:intro}
Understanding causality is important for actionable insights. Traditionally, causality has been inferred through the use of randomized experiments, where an experimenter randomly assigns a variable $A$ to values corresponding to treatment or control, and measures the \emph{causal effect} of these assignments on an outcome $Y$. However, such randomized experiments can be expensive, ethically fraught, or otherwise not possible to implement. % [CITATION].

Given access to observational data,
cause-effect relationships may 
be quantified via causal effects, which aim to predict what would have happened had a randomized experiment been (hypothetically) performed.
A fundamental problem in causal inference is to
estimate causal effects given access only to observational data
and a
causal model.

Causal effects may only be consistently estimated from observed data if they are \emph{identified}, that is, if they 
can be uniquely expressed as a function of the observed data distribution 
under the assumptions encoded by a causal model.
Sound and complete identification algorithms for causal effects have been developed using the formalism of graphical causal models
\citep{shpitserIdentificationJointInterventional2006,huangCompletenessIdentifiabilityAlgorithm2008}. 

If the causal effect of interest is not identified, more assumptions may be imposed on the causal model, or informative conclusions about the causal effect may be obtained by deriving bounds.  Alternatively, the primary dataset may be augmented with one or more informative secondary datasets, in what is termed \emph{data fusion}. In this approach, an analyst has access to multiple datasets %from the same environment,
on the same population.  These datasets may represent observational data on variables of interest, or represent results of randomized experiments, potentially ones that randomize treatments other than the treatments of primary interest. 
The task is to check if the desired causal effect can be computed from this collection of datasets.  Sound and complete algorithms have been developed for this problem under the assumption that the collection of datasets is given
\citep{leeGeneralIdentifiabilityArbitrary2022,kivvaRevisitingGeneralIdentifiability2022}. 

However, it is possible (indeed likely) that units are assigned to different datasets systematically, which is a possibility not considered by previous approaches. %via a systematic
%and therefore nonignorable 
%selection process.
Consider the %example
problem introduced by \cite{atheyCombiningExperimentalObservational2020},
%with graphical representation in
and further explored by
\cite{ghassamiCombiningExperimentalObservational2022},
where
%They want 
%In this problem,
the goal is
to estimate the effect of class size on %eighth-grade 
student test scores in New York, using two datasets. The first dataset is collected from the %New York
public school system, in which class size depends on observed and unobserved covariates %such as zip code, socioeconomic status etc., 
that create confounding for the relationship between class size and test scores. The second dataset is from a randomized experiment studying the effect of class size conducted in a different population at a different point in time. As is typical in data fusion, the desired causal effect is not identified in either of the datasets separately, and indeed the datasets measure differing sets of variables. %The authors note that
%The causal effects estimated in each dataset separately produce significantly different results. To explain this, the authors point out that the datasets likely represent different subsets of the overall student population, \amir{I do not agree with this. First of all, observational dataset has unobserved confounders. So we cannot even estimate causal effect from that data. Secondly, the experimental data has unobserved outcome. So, again we cannot even estimate causal effect from that data. Change in the distribution of students' covariates is the least of our concerns in that setting}
However, the authors note that there are significantly different covariate distributions between the two. In other words, there is systematic selection determining which dataset a particular student ends up in. %, depending on their covariates.
%underlying populations of the datasets are different,
Therefore, appropriate adjustments are needed before causal conclusions obtained from the two subpopulations can be compared -- or combined into a single conclusion on the overall %student
population.

% - an observational dataset, and an experimental dataset randomizing $A$. A set of pre-treatment covariates $\vec{C}$ are measured, and unmeasured confounding is also present. Importantly, in this setting %they
% it is %suspected
% reasonable to assume that students are systematically assigned to the two datasets based on observed
% %and unobserved
% pre-treatment covariates. In such instances, algorithms that assume this never happens can return biased results.

%\ilya{[Use the Amir/Athey example.]}
%For instance, consider data collected from a hospital, in which there are two subpopulations: the normal ward, in which standard care is provided, and %a special ward
%the intensive care unit, where an experiment on critical care practices is being conducted. In the data fusion parlance, these represent an observational dataset, and an experimental dataset where care practices are intervened upon. However, patients at this hospital are not themselves randomly assigned -- indeed, they are %able to
%selected either into ward or the ICU based on severity of their condition.
%which ward they wish to enter, based on their preferences. Thus, it may come to be that the patients in the normal ward have significantly different characteristics than those in the special ward. In such instances, the prior data fusion algorithms may not hold.

%The previously described systematic selection is well-known in the missing data literature, under the hierarchy %originally
%proposed in \cite{rubinInferenceMissingData1976}, where %there
%variables may be missing completely at random (MCAR), missing at random (MAR), or missing not at random (MNAR).  \amir{this does not deserve to be a paragraph.}
% Target parameters that feature MCAR and MAR variables are identified, while the question of identification is significantly more involved in MNAR models.  For a review of identification theory in missing data models using
% graphical models, %ing formalism,
% see \citep{bhattacharya19mid,bhattacharya20completeness}.

%However, identification under MNAR is much more involved.
%These problems have been studied from a graphical model perspective, and identification approaches differ significantly depending on the exact type of mechanism present. In particular, the missingness indicator is explicitly represented in such graphs, and the parents (observed and unobserved) of the indicators influences the whether a particular target is identified. 

%In this work,
Our contributions are as follows. We define a novel graphical %structural
causal model %specifically adapted for
for representing such data fusion problems,
%under selection,
where %we explicitly represent
the selection mechanism %in the graph
is represented as a random variable that potentially depends on other variables in the problem in complex ways.
%We allow the selector to potentially be influenced by both observed and unobserved variables.
We use this representation to show that systematic selection exhibits a hierarchy similar to the missing data hierarchy, where selection could occur completely at random (SCAR), at random (SAR), or not at random (SNAR).  Next, we show that applying a sound and complete algorithm that corrects for systematic selection followed by a sound and complete algorithm that corrects for confounding cannot be complete in settings where both difficulties occur together.  Finally, we propose a general algorithm that aims to correct for systematic selection and confounding at once.

% that is complete under certain conditions.


%Conceptually, this is equivalent to the result of a randomized experiment, in which the treatment is randomly assigned by an experimenter, and the outcome distribution is measured. While such experiments remain the gold standard for causality in many fields, running them can be prohibitive for a variety of reasons [CITATION]. Sound and complete algorithms have been developed to compute these results from observational data \cite{shpitserIdentificationJointInterventional2006,huangCompletenessIdentifiabilityAlgorithm2008a}. 

%Recently, there has been growing interest in identification and

%Once a causal effect is identified, estimation strategies that accept any graph can be applied, including parametric \cite{evansSmoothIdentifiableSupermodels2019,drtonComputingMaximumLikelihood2008} and semiparametric \cite{bhattacharyaSemiparametricInferenceCausal2020} methods.

\section{Background and Notation}\label{sec:background}

We use upper case Roman letters to denote random variables, e.g., $A$, and vector notation for sets thereof, e.g., $\vec{A}$.  Values are lowercase letters, e.g., $a$, and sets of values are vectored lower case letters, e.g., $\vec{a}$.  For a random variable $Z$, its domain is denoted as $\mathfrak{X}_{Z}$,  and domains of sets $\vec{Z}$ as $\mathfrak{X}_{\vec{Z}}$. We denote the positive part of the domain (i.e., where $p(Z) > 0$) as $\mathfrak{X}^+_Z$. Given a subset $\vec{A} \subseteq \vec{B}$ of variables, and values 
%assignment
$\vec{b}$ of $\vec{B}$, we denote by $\vec{b}_{\vec{A}}$ the subset of values of $\vec{b}$ pertaining to variables in $\vec{A}$.

We use the framework of graphical causal modeling in this paper.
%, which is formulated using graph theory \amir{not true, IMHO}.
Specifically, we will consider acyclic directed mixed graphs (ADMGs) which contain directed ($\to$) and bidirected ($\leftrightarrow$) edges and no directed cycles, and a special case of ADMGs -- directed acyclic graphs (DAGs) which contain only directed edges ($\to$).
%We consider graphs with directed edges ($\to$), representing direct causation, and bidirected edges ($\leftrightarrow$), representing latent confounding. Graphs of directed edges are called directed acyclic graphs (DAGs) while graphs with both types of edges are called acyclic directed mixed graphs (ADMGs).
We employ the standard genealogical definitions for parents, ancestors, descendants, and districts of a variable $X$, as: $\pa_\G (X) = \{Y \mid Y \to X\}$, $\an_\G (X) = \{Y \mid Y \to \ldots \to X\} \cup \{X\}$, $\de_\G (X) = \{Y \mid X \to \ldots \to Y\} \cup \{X\}$, $\dis_\G(X) = \{Y \mid X \leftrightarrow \ldots \leftrightarrow Y\} \cup \{X\}$. %\amir{Y not X?}
In addition, we will define $\nd_{\G}(X)$ as the set of non-descendants of $X$, which is all vertices other than those in $\de_{\G}(X)$. 
These definitions apply disjunctively over sets -- e.g., for set $\vec{Z}$, $\pa_\G(\vec{Z}) = \cup_{Z \in \vec{Z}} \pa_\G (Z)$.
Strict versions of these definitions exclude variables in the argument, and are denoted with prepended $s$ -- e.g., for set $\vec{Z}$, $\pas_\G(\vec{Z}) = \pa_\G (\vec{Z}) \setminus \vec{Z}$. We denote districts of a graph as ${\cal D}(\G)$. Districts form a partition of vertices in a graph. Given a graph $\G$ with vertex set $\vec{V}$ and $\vec{Z} \subseteq \vec{V}$, define the induced subgraph $\G_{\vec{Z}}$ of $\G$ to be the graph containing vertices $\vec{Z}$ and edges in $\G$ only among elements in $\vec{Z}$. 
%\amir{does not deserve to be a paragraph}

%We denote sequences of (sets of) random variables by subscripted indices $Z_i$, and we denote values of such sequences by lower case Roman letters $z_i$.
%and domains of sets of variables are denoted as $\mathfrak{X}_Z \equiv \otimes_{i} \mathfrak{X}_{Z_i}$. 

We will consider structural causal models (SCMs), which are associated with DAGs.  Given a DAG $\G$ with vertices $\vec{V}$ representing observed variables, we assume each variable $V \in \vec{V}$ is determined via an invariant mechanism called a structural equation: $f_V : \mathfrak{X}_{\pa_{\G}(V) \cup \{ \epsilon_V \}} \to 
\mathfrak{X}_V$, where $\epsilon_V$ is an exogenous unobserved random variable associated with $V$ representing the random noise in the system.  We will assume that all $\epsilon_V$ are mutually independent.  Some authors denote such an SCM as a non-parametric structural equation model with independent errors (NPSEM-IE) \citep{thomas13swig}.

%We denote induced subgraphs with respect to set $\vec{W}$ as $\G_{\vec{W}}$.

A causal model encodes responses of variables to the intervention operation, where structural equations of a set of variables $\vec{A}$ are replaced by constants $\vec{a}$.  This operation is denoted by $\text{do}(\vec{a})$ in \cite{pearl09causality}.  A random variable response of variable $Y$ to an intervention $\text{do}(\vec{a})$ may also be written as a \emph{potential outcome} $Y(\vec{a})$. % [ref].
Potential outcomes encode causal relationships in the sense that they allow representation of outcomes in hypothetical randomized controlled trials. %\amir{PO variables do this not interventions. intervention is an actual act, it is not hypothetical} 
For example, the average causal effect $\mathbb{E}[Y(a) - Y(a')]$ represents the difference in outcome response, on the mean difference scale, of two experimental groups, where a treatment $A$ is set to an active ($a$) or control ($a'$) value.

Since potential outcomes represent hypothetical changes in a causal system,
%\amir{PO variables do this not interventions. intervention is an actual act, it is not hypothetical}
responses to hypothetical interventions are not always available.  An important task in causal inference is \emph{identification,} ensuring that interventional distributions $p(\vec{Y}(\vec{a})) = p(\vec{Y} | \text{do}(\vec{a}))$ are functions of the available distributions (classically, the observed data distribution  $p(\vec{V})$). %\amir{in our case, of the available distributions, not necessarily the observational dist} 

It is well known that if all variables $\vec{V}$ in an SCM with independent errors are observed, every interventional distribution $p(\vec{V} \setminus \vec{A} | \text{do}(\vec{a}))$ is identified via the truncated DAG factorization known as the \emph{g-formula} \citep{robins86new}:
{\small
\begin{align*}
p(\vec{V} \setminus \vec{A} \mid \text{do}(\vec{a}))
= \prod_{V \in \vec{V} \setminus \vec{A}}
p(V \mid \pa_{\cal G}(V)) \vert_{\vec{a}_{\pa_{\cal G}(V) \cap \vec{A}}}.
\end{align*}
}A simple version of the g-formula is the adjustment formula, which yields the average causal effect of the treatment $A$ on the outcome $Y$ if all confounders of $A$ and $Y$ are observed as a vector $\vec{C}$:
$\mathbb{E}[\mathbb{E}[Y | a, \vec{C}] - \mathbb{E}[Y | a', \vec{C}]]$.  Note that the g-formula with the empty $\vec{A}$ also holds and implies that the observed data distribution $p(\vec{V})$ may be written as $\prod_{V \in \vec{V}} p(V | \pa_{\G}(V))$, and thus is Markov with respect to the DAG $\G$ \citep{pearl88probabilistic,lauritzen96graphical}.

\section{The ID Algorithm
%Identification in Hidden Variable Causal Models
%: A Review
}

If some variables in the model are unobserved, identification of interventional distributions becomes considerably more complicated, with some distributions not being identified at all, and distributions that are identified being potentially more complex functionals of the observed data distribution than the g-formula.  General identification algorithms given the observed marginal distribution $p(\vec{V})$ derived from a hidden variable causal model represented by a DAG ${\cal G}(\vec{V} \cup \vec{H})$, where $\vec{H}$ represents hidden variables, have been characterized via the ID algorithm \citep{tianIdentificationCausalEffects2002,shpitserIdentificationJointInterventional2006}.  The ID algorithm takes as input an ADMG ${\cal G}(\vec{V})$ derived from ${\cal G}(\vec{V} \cup \vec{H})$ via the latent projection operation \citep{verma90equiv}, the observed data distribution $p(\vec{V})$, and disjoint variable sets $\vec{A},\vec{Y}$ corresponding to the interventional distribution $p(\vec{Y} | \text{do}(\vec{a}))$ of interest.
%\amir{what do we mean by "representing"?}.  
The ID algorithm outputs either the identifying functional for $p(\vec{Y} | \text{do}(\vec{a}))$ in terms of $p(\vec{V})$, or the token ``not identified.''  It is known that the ID algorithm is both sound (outputs correct identifying functionals in all cases) and complete (whenever it outputs ``not identified,'' the corresponding distribution is indeed not a function of $p(\vec{V})$ in the model) \citep{huang06do,shpitserIdentificationJointInterventional2006}.

Just as the g-formula is a one line formula representing a modified DAG factorization, the ID algorithm may be formulated as a one line formula representing a modified nested Markov factorization of a latent projection ADMG \citep{richardsonNestedMarkovProperties2023}.
%The ID algorithm may be view
We now briefly review the ID algorithm formulated in this way in terms of Markov kernels, and the fixing operator.
%on graphs and kernels.

%Let ${\cal M}_{SCM}=\langle U, V, p(U), F\rangle$ denote a structural causal model, where $U$ is a set of unobserved variables distributed according to $p(U)$ that factorize as $\prod_i p(U_i)$, $V$ is a set of observed variables, and $F$ is a set of functions. Each variable $V_i \in V$ has a function $f_{V_i}\in F$ that describes the data generating process for $V_i$ in terms of $V$ and $U$. Each structural causal model induces a graph $\G$ containing a node for each $V_i \in V$, a directed edge from $V_i$ to $V_j$ if $f_{V_j}$ has $V_i$ as an argument, and a bidirected edge between $V_i$ and $V_j$ if both $f_{V_i}$ and $f_{V_j}$ have $U_i \in U$ as an argument. We will consider only structural causal models that induce acyclic graphs. 

%\textbf{DAGs and ADMGs}:

%\textbf{Kernels}:
A Markov kernel $q_{\vec{V}} (\vec{V} | \vec{W})$ is a nonnegative function that marginalizes to 1 over $\vec{V}$ for values of $\mathfrak{X}_{\vec{W}}$, 
%maps values of $\mathfrak{X}_{\vec{W}}$ to normalized densities \amir{what is a normalized density?} over $\vec{V}$
and may be viewed as a generalization of a conditional distribution, but is not necessarily constructed by applying a conditioning operation to a joint distribution.
For example, the kernel $q_Y(Y | a) \equiv \sum_{\vec{C}} p(Y | a, \vec{C}) p(\vec{C})$ which appears in the adjustment formula is not, in general, equal to $p(Y | a)$.

Given a Markov kernel, additional kernels may be constructed by the conditioning and marginalization operators, which are defined in the natural way:
{\small
\[q_{\vec{V}} (\vec{B} | \vec{W}) \equiv \sum_{\vec{V}\setminus \vec{B}}q_{\vec{V}} (\vec{V} | \vec{W}); q_{\vec{V}} (\vec{V} \setminus \vec{B} | \vec{B} \cup \vec{W}) = \frac{q_{\vec{V}} (\vec{V} | \vec{W})}{q_{\vec{V}}(\vec{B} | \vec{W})}.
\]
}
%Note that while kernels act like conditional distributions they 
%\amir{subscript for the dennominator?}

%\textbf{Fixing}:
The fixing operator \citep{richardsonNestedMarkovProperties2023} is an operator applied to graphs and kernels  %graphical and probabilistic operator
that ``removes'' vertices and random variables by rendering them ``fixed''.
%that turns random vertices into fixed vertices.
A relevant generalization of an ADMG called a conditional ADMG (CADMG) $\G(\vec{V},\vec{W})$ contains two types of vertices: random (denoted by $\vec{V}$), and fixed (denoted by $\vec{W}$).  Fixed vertices cannot have any edges with an arrowhead into them, and will be displayed as squares in graphs.  Note that an ADMG is a CADMG where $\vec{W}$ is empty.
Kernels $q_{\vec{V}}(\vec{V} | \vec{W}=\vec{w})$ 
and CADMGs $\G(\vec{V},\vec{W})$ will represent interventional distributions $p(\vec{V} | \text{do}(\vec{w}))$, and their Markov structure, respectively.
\emph{Mutilated graphs} used in \cite{pearl09causality} to describe interventional contexts may be viewed as CADMGs, provided intervened on variables are distinguished from variables that remain random.

For a CADMG $\G(\vec{V},\vec{W})$, a vertex $V \in \vec{V}$ is fixable if there is no other vertex that is both a descendant and in the same district in $\G$, that is if $\dis_{\G}(V) \cap \de_{\G}(V) = \{V\}$.
%\amir{isn't V always in this intersection?}.
If $V$ is fixable we define a new CADMG $\G(\vec{V} \setminus \{ V \}, \vec{W} \cup \{ V \}) \equiv \phi_V(\G(\vec{V},\vec{W}))$ by means of a fixing operator $\phi_V$ which renders $V$ a fixed vertex, removes all edges in $\G(\vec{V},\vec{W})$ with an arrowhead into $V$, and keeps all other vertices and edges unaltered.

Given a non-empty sequence of vertices $\pi$, we define $h(\pi)$ to be its first element, and $t(\pi)$ to be the subsequence of $\pi$ containing all elements after the first.
A sequence $\pi$ of vertices in $\vec{V}$ is said to be valid (or fixable) in a CADMG $\G(\vec{V},\vec{W})$ if either the sequence is empty, % ($\pi = \langle \rangle$), 
or $h(\pi)$ is fixable in $\G$ and $t(\pi)$ is fixable  in $\phi_{h(\pi)}(\G)$.  Any two sequences on the same set of vertices $\vec{S} \subseteq \vec{V}$ fixable in $\G(\vec{V},\vec{W})$ yield the same CADMG, allowing us to write $\phi_{\vec{S}}(\G)$ to mean ``obtain the CADMG after applying the fixing operator to $\vec{S}$ via any valid sequence''.
A set $\vec{R} \subseteq \vec{V}$ is said to be reachable in an ADMG $\G$ with vertices $\vec{V}$ if there exists a valid fixing sequence for $\vec{V} \setminus \vec{R}$.  Given a set $\vec{R} \subseteq \vec{V}$ that is not reachable in $\G$, the unique smallest superset of $\vec{R}$ that is reachable in $\G$ is called the \emph{reachable closure of $\vec{R}$}, or simply the closure of $\vec{R}$, and denoted by $\cl_{\G}(\vec{R})$.

Given a kernel $q_{\vec{V}}(\vec{V} | \vec{W})$ associated with a CADMG $\G(\vec{V},\vec{W})$ where $V$ is fixable, define $\phi_V(q_{\vec{V}}; \G)$ to be the kernel
$q_{\vec{V} \setminus \{V\}}(\vec{V} \setminus \{V\} \mid \vec{W} \cup \{ V \}) \equiv \frac{q_{\vec{V}}(\vec{V} \mid \vec{W})}{q_{\vec{V}}(V \mid \nd_{\G}(V),\vec{W})}$. 
%\amir{$\{V\}$ after bar?}
Given a CADMG $\G(\vec{V},\vec{W})$, kernel $q_{\vec{V}}(\vec{V} | \vec{W})$ and a sequence $\pi$ of vertices in $\vec{V}$ fixable in $\G$, we define $\phi_{\pi}(q_{\vec{V}}; \G)$ as $q_{\vec{V}}$ if $\pi$ is the empty sequence, and as $\phi_{t(\pi)}(\phi_{h(\pi)}(q_{\vec{V}}; \G); \phi_{h(\pi)}(\G))$ otherwise.
Given $p(\vec{V})$ which is a marginal distribution obtained from $p(\vec{V} \cup \vec{H})$ which is Markov with respect to a DAG $\G(\vec{V} \cup \vec{H})$, and corresponding latent projection ADMG $\G(\vec{V})$ of $\G(\vec{V} \cup \vec{H})$, and any two sequences $\pi_1,\pi_2$ on $\vec{S} \subseteq \vec{V}$ valid in $\G$, $\phi_{\pi_1}(q_{\vec{V}}; \G) = \phi_{\pi_2}(q_{\vec{V}}; \G)$.  We thus denote the resulting kernel by $\phi_{\vec{S}}(q_{\vec{V}}; \G)$.

The identification of interventional distributions $p(\vec{Y} | \text{do}(\vec{a}))$ for any disjoint subsets $\vec{A},\vec{Y}$ of $\vec{V}$ in a hidden variable causal model associated with a DAG $\G(\vec{V} \cup \vec{H})$ with a latent projection $\G(\vec{V})$ has been characterized as follows.  Let $\vec{Y}^* = \an_{\G(\vec{V})_{\vec{V} \setminus \vec{A}}}(\vec{Y})$, and define $\G^*$ as $\G(\vec{V})_{\vec{Y}^*}$.
%\amir{did we define this notation?}.  
Then
$p(\vec{Y} | \text{do}(\vec{a}))$ is identified if and only if every element in ${\cal D}(\G(\vec{V})_{\vec{Y}^*})$ is reachable in ${\cal G}(\vec{V})$.  If so, the reformulation of the ID algorithm by \cite{richardsonNestedMarkovProperties2023} gives
{\small
\begin{align}
\label{eqn:do-fact}
p(\vec{Y} | \text{do}(\vec{a}))
&= \sum_{\vec{Y}^* \setminus \vec{Y}}
\prod_{\vec{D} \in {\cal D}(\G(\vec{V})_{\vec{Y}^*})}
\!\!
p(\vec{D} | \text{do}(\pas_{\G(\vec{V})}(\vec{D})))\\
\notag
&=
\sum_{\vec{Y}^* \setminus \vec{Y}}
\prod_{\vec{D} \in {\cal D}(\G(\vec{V})_{\vec{Y}^*})}
\!\!
\phi_{\vec{V} \setminus \vec{D}}(p(\vec{V}); \G(\vec{V}))
\vert_{\vec{A} = \vec{a}}.
\end{align}
}
% This expression may be viewed as a generalization of the g-formula, in the sense that just as g-formula is a modified factorization of a DAG, the above experssion is a modified nested Markov factorization of a latent projection ADMG \cite{richardsonNestedMarkovProperties2023}.

\section{
The GID Algorithm: A Review
}
\label{sec:prior-work}
\begin{figure*}[h]
\centering
\begin{subfigure}[b]{0.15\textwidth}
\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[] (c) at  (0, 0) {$C$};
            \node[] (a) at  (0, -1) {$A$};
			\node[] (m) at  (1, -1) {$M$}; 
			\node[] (y) at  (2, -1) {$Y$}; 
			\node[block] (u2) at  (1, 0) {$U_2$}; 
			\node[block] (u3) at  (2, 0) {$U_3$}; 
			\node[block] (u1) at  (1, -1.5) {$U_1$}; 
            
            \draw[-stealth] (a) to (m);
            \draw[-stealth] (c) to (a);
            \draw[-stealth] (m) to (y);
            \draw[-stealth] (u1) to (a);
            \draw[-stealth] (u1) to (y);
            \draw[-stealth] (u2) to (c);
            \draw[-stealth] (u2) to (m);
            \draw[-stealth][bend right=35] (u3) to (c);
            \draw[-stealth] (u3) to (y);
                                   	                                      
        \end{tikzpicture}
        \caption{}
        \label{fig:antibiotics-study-1}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.15\textwidth}

\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[] (c) at  (0, 0) {$C$};
            \node[] (a) at  (0, -1) {$A$};
			\node[] (m) at  (1, -1) {$M$}; 
			\node[] (y) at  (2, -1) {$Y$}; 
			\node[block] (u2) at  (1, 0) {$U_2$}; 
			\node[block] (u3) at  (2, 0) {$U_3$}; 
			\node[block] (u1) at  (2, -2) {$U_1$}; 
            
            \draw[-stealth] (a) to (m);
            \draw[-stealth] (c) to (a);
            \draw[-stealth] (m) to (y);
            \draw[-stealth] (u1) to (a);
            \draw[-stealth] (u1) to (y);
            \draw[-stealth] (u2) to (c);
            \draw[-stealth][bend right=35] (u3) to (c);
            \draw[-stealth] (u3) to (y);
            
            
        \end{tikzpicture}
        \caption{}
        \label{fig:antibiotics-study-2}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.15\textwidth}
\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[] (c) at  (0, 0) {$C$};
            \node[] (a) at  (0, -1) {$A$};
			\node[] (m) at  (1, -1) {$M$}; 
			\node[] (y) at  (2, -1) {$Y$}; 
			\node[selector] (s) at  (1, -2) {$S$}; 
			\node[block] (u2) at  (1, 0) {$U_2$}; 
			\node[block] (u3) at  (2, 0) {$U_3$}; 
			\node[block] (u1) at  (2, -2) {$U_1$}; 
            
            \draw[-stealth] (a) to (m);
            \draw[-stealth] (c) to (a);
            \draw[-stealth] (m) to (y);
            \draw[-stealth] (u1) to (a);
            \draw[-stealth] (u1) to (y);
            \draw[-stealth] (u2) to (c);
            \draw[-stealth] (u2) to (m);
            \draw[-stealth] (s) to (m);
            \draw[-stealth][bend right=35] (u3) to (c);
            \draw[-stealth] (u3) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:antibiotics-scar}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.15\textwidth}
\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[] (c) at  (0, 0) {$C$};
            \node[] (a) at  (0, -1) {$A$};
			\node[] (m) at  (1, -1) {$M$}; 
			\node[] (y) at  (2, -1) {$Y$}; 
			\node[] (w) at  (0, -2) {$W$}; 
			\node[selector] (s) at  (1, -2) {$S$}; 
			\node[block] (u2) at  (1, 0) {$U_2$}; 
			\node[block] (u3) at  (2, 0) {$U_3$}; 
			\node[block] (u1) at  (2, -2) {$U_1$}; 
            
            \draw[-stealth] (a) to (m);
            \draw[-stealth] (c) to (a);
            \draw[-stealth] (m) to (y);
            \draw[-stealth] (u1) to (a);
            \draw[-stealth] (u1) to (y);
            \draw[-stealth] (u2) to (c);
            \draw[-stealth] (u2) to (m);
            \draw[-stealth] (s) to (m);
            \draw[-stealth] (w) to (s);
            \draw[-stealth] (w) to (a);
            \draw[-stealth] (w) to (y);
            \draw[-stealth] (a) to (s);
            \draw[-stealth][bend right=35] (u3) to (c);
            \draw[-stealth] (u3) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:antibiotics-sar}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.15\textwidth}
\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[] (c) at  (0, 0) {$C$};
            \node[] (a) at  (0, -1) {$A$};
			\node[] (m) at  (1, -1) {$M$}; 
			\node[] (y) at  (2, -1) {$Y$}; 
			\node[] (w) at  (0, -2) {$W$}; 
			\node[selector] (s) at  (1, -2) {$S$}; 
			\node[block] (u2) at  (1, 0) {$U_2$}; 
			\node[block] (u3) at  (2, 0) {$U_3$}; 
			\node[block] (u1) at  (2, -2) {$U_1$}; 
            
            \draw[-stealth] (a) to (m);
            \draw[-stealth] (c) to (a);
            \draw[-stealth] (m) to (y);
            \draw[-stealth] (u1) to (a);
            \draw[-stealth] (u1) to (y);
            \draw[-stealth] (u2) to (c);
            \draw[-stealth] (u2) to (m);
            \draw[-stealth] (s) to (m);
            \draw[-stealth] (w) to (s);
            \draw[-stealth] (w) to (a);
            \draw[-stealth] (w) to (y);
            \draw[-stealth] (u1) to (s);
            \draw[-stealth] (a) to (s);            
            \draw[-stealth][bend right=35] (u3) to (c);
            \draw[-stealth] (u3) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:antibiotics-snar}
%\centering
\end{subfigure}

\begin{subfigure}[b]{0.3\textwidth}
\centering
        \begin{tikzpicture}[node distance=1.5cm]
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[selector] (s) at  (0, 0) {$S$};
            \node[inner sep=0pt, right of=s] (a)  {$A$};
			\node[inner sep=0pt, right of=a] (y)  {$Y$}; 
            \node[block, above of=s, right=.5] (u1)  {$U_1$};
            \node[block, above of=a, right=.5] (u2) {$U_2$};
			%\node[] (y) at  (2, -1) {$Y$}; 
			%\node[] (w) at  (0, -2) {$W$}; 
			%\node[selector] (s) at  (1, -2) {$S$}; 
			%\node[block] (u2) at  (1, 0) {$U_2$}; 
			%\node[block] (u3) at  (2, 0) {$U_3$}; 
			%\node[block] (u1) at  (2, -2) {$U_1$}; 
            % \draw[-stealth] (u2) to (y);
            % \draw[-stealth] (u2) to (a);
            % \draw[-stealth] (u1) to (s);
            % \draw[-stealth] (u1) to (a);
            % \draw[-stealth] (s) to
            % node[below left,yshift=-0.1cm,xshift=-0.1cm]{\tiny $\{A\}$} (a);
            % \draw[-stealth] (a) to
            % node[above left,yshift=0.2cm,xshift=-0.1cm]{\tiny $\{A\}$}
            % (y);
            \draw[-stealth] (s) to (a);
            \draw[-stealth] (a) to (y);
            \draw[-stealth] (u2) to (y);
            \draw[-stealth] (u2) to  node[below,sloped]{\tiny $\{A\}$}  (a);
            \draw[-stealth] (u1) to (s);
            \draw[-stealth] (u1) to node[below,sloped]{\tiny $\{A\}$} (a);
            % \draw[-stealth] (a) to (m);
            % \draw[-stealth] (c) to (a);
            % \draw[-stealth] (m) to (y);
            % \draw[-stealth] (u1) to (a);
            % \draw[-stealth] (u1) to (y);
            % \draw[-stealth] (u2) to (c);
            % \draw[-stealth] (u2) to (m);
            % \draw[-stealth] (s) to (m);
            % \draw[-stealth] (w) to (s);
            % \draw[-stealth] (w) to (a);
            % \draw[-stealth] (w) to (y);
            % \draw[-stealth] (u1) to (s);
            % \draw[-stealth][bend right=35] (u3) to (c);
            % \draw[-stealth] (u3) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:id-double-bow}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.3\textwidth}
\centering
        \begin{tikzpicture}[node distance=1.5cm]
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[selector] (s) at  (0, 0) {$S$};
            \node[inner sep=0pt, right of=s] (a)  {$A$};
			\node[inner sep=0pt, right of=a] (y)  {$Y$}; 
            \node[block, above of=s, right=.5] (u1)  {$U_1$};
            \node[block, above of=a, right=.5] (u2) {$U_2$};
			%\node[] (y) at  (2, -1) {$Y$}; 
			%\node[] (w) at  (0, -2) {$W$}; 
			%\node[selector] (s) at  (1, -2) {$S$}; 
			%\node[block] (u2) at  (1, 0) {$U_2$}; 
			%\node[block] (u3) at  (2, 0) {$U_3$}; 
			%\node[block] (u1) at  (2, -2) {$U_1$}; 
            % \draw[-stealth] (u2) to (y);
            % \draw[-stealth] (u2) to (a);
            % \draw[-stealth] (u1) to (s);
            % \draw[-stealth] (u1) to (a);
            % \draw[-stealth] (s) to
            % node[below left,yshift=-0.1cm,xshift=-0.1cm]{\tiny $\{A\}$} (a);
            % \draw[-stealth] (a) to
            % node[above left,yshift=0.2cm,xshift=-0.1cm]{\tiny $\{A\}$}
            % (y);
            \draw[-stealth] (s) to (a);
            \draw[-stealth] (a) to (y);
            \draw[-stealth] (u2) to (y);
            %\draw[-stealth] (u2) to  node[below,sloped]{\tiny $\{A\}$}  (a);
            \draw[-stealth] (u1) to (s);
            %\draw[-stealth] (u1) to node[below,sloped]{\tiny $\{A\}$} (a);
            % \draw[-stealth] (a) to (m);
            % \draw[-stealth] (c) to (a);
            % \draw[-stealth] (m) to (y);
            % \draw[-stealth] (u1) to (a);
            % \draw[-stealth] (u1) to (y);
            % \draw[-stealth] (u2) to (c);
            % \draw[-stealth] (u2) to (m);
            % \draw[-stealth] (s) to (m);
            % \draw[-stealth] (w) to (s);
            % \draw[-stealth] (w) to (a);
            % \draw[-stealth] (w) to (y);
            % \draw[-stealth] (u1) to (s);
            % \draw[-stealth][bend right=35] (u3) to (c);
            % \draw[-stealth] (u3) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:id-double-bow-cs-graph}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.3\textwidth}
\centering
    \begin{tikzpicture}[node distance=1.5cm]
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{fixed} = [draw, rectangle, inner sep=.7pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[selector] (s)  {$S$};
            %\node[inner sep=0pt] (a) at  (0, -.5) {$A$};
           \node[name=a, shape=swig vsplit, right of= s]{\nodepart{left}{$A$} \nodepart{right}{$a$}};
            \node[block, above of=s, right=.5] (u1)  {$U_1$};
            \node[block, above of=a, right=.5] (u2)  {$U_2$};
            %\node[fixed] (aa) at  (0, -1) {$a$};
			\node[inner sep=0pt, right of=a] (y) {$Y(a)$}; 
			%\node[] (y) at  (2, -1) {$Y$}; 
			%\node[] (w) at  (0, -2) {$W$}; 
			%\node[selector] (s) at  (1, -2) {$S$}; 
			%\node[block] (u2) at  (1, 0) {$U_2$}; 
			%\node[block] (u3) at  (2, 0) {$U_3$}; 
			%\node[block] (u1) at  (2, -2) {$U_1$}; 
            \draw[-stealth] (s) to (a);
            \draw[-stealth] (a) to (y);
            \draw[-stealth] (u2) to (y);
            \draw[-stealth] (u2) to[out=200, in=100] node[below,sloped]{\tiny $\{A\}$}  (a);
            \draw[-stealth] (u1) to (s);
            \draw[-stealth] (u1) to[out=250] node[below,sloped]{\tiny $\{A\}$} (a);
            %\draw[stealth-stealth][bend right=35] (s) to (a);
            %\draw[stealth-stealth][bend right=35] (a) to (y);
            
            % \draw[-stealth] (a) to (m);
            % \draw[-stealth] (c) to (a);
            % \draw[-stealth] (m) to (y);
            % \draw[-stealth] (u1) to (a);
            % \draw[-stealth] (u1) to (y);
            % \draw[-stealth] (u2) to (c);
            % \draw[-stealth] (u2) to (m);
            % \draw[-stealth] (s) to (m);
            % \draw[-stealth] (w) to (s);
            % \draw[-stealth] (w) to (a);
            % \draw[-stealth] (w) to (y);
            % \draw[-stealth] (u1) to (s);
            % \draw[-stealth][bend right=35] (u3) to (c);
            % \draw[-stealth] (u3) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:id-double-bow-swig}
%\centering
\end{subfigure}
% \begin{subfigure}[b]{0.08\textwidth}
% \centering
%     \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
% 		\tikzstyle{fixed} = [draw, rectangle, inner sep=.5pt]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
            
%             \node[selector] (s) at  (0,0.5) {$S$};
%             \node[] (a) at  (0, -.5) {$A$};
%             \node[block] (u1) at  (-0.6, 0) {$U_1$};
%             \node[block] (u2) at  (-0.6, -1.25) {$U_2$};
%             \node[fixed] (aa) at  (0, -1) {$a$};
% 			\node[] (y) at  (0, -2) {$Y(a)$}; 
 
%             \draw[-stealth] (s) to (a);
%             \draw[-stealth] (aa) to (y);
%             \draw[-stealth] (u2) to (y);
%             \draw[-stealth] (u2) to (a);
%             \draw[-stealth] (u1) to (s);
%             \draw[-stealth] (u1) to (a);

%         \end{tikzpicture}
%         \caption{}
%         \label{fig:id-double-bow-swig}
% %\centering
% \end{subfigure}

\caption{%Labelled selection directed acyclic graphs. 
%Example LS-DAGs.
Example graphs illustrating systematic selection.
Unobserved variables are denoted with an enclosing circle, while observed variables are not.
%Variables are connected using directed edges which indicate direct causation.
The (observed) selection variable $S$ is denoted by a diamond. %enclosed in a diamond to highlight its special nature.
Half-circles represent the split-node operation in SWIGs \citep{richardsonSingleWorldIntervention2013}.
}
\label{fig:prior-work-examples}
\end{figure*}

% To provide motivation for our approach, we consider an example motivated by postoperative acute kidney injury [CITATION]. In a research hospital ${\cal H}_1$, a study is conducted measuring the impact of intra-operative oxygen $A$ on the development of AKI $Y$. A large set of pre-treatment covariates $(\vec{W}, \vec{C})$ are collected. For ethical reasons, $A$ cannot be randomly assigned, but is instead assigned according to a protocol that depends only on $(\vec{W}, \vec{C})$. The The study is then replicated at another hospital ${\cal H}_2$ in an attempt to understand if these results generalize.
%\textbf{Motivation}:
Consider an observational study ${\cal S}_1$ which aimed to assess antibiotic effectiveness for treating infections. In practice, patients are prone to non-compliance, where the full course is not taken as instructed. We represent this causal structure with antibiotic prescription $A$, compliance level $M$, and a longer term outcome variable $Y$ such as hospital 30-day readmission. Unobserved common causes $U_1, U_2, U_3$ create confounding for several study variables, but a pre-treatment proxy $C$ of $U_2, U_3$ is observed. The ID algorithm
%of \cite{shpitserIdentificationJointInterventional2006}
demonstrates that $p(Y(a))$ is not identified from data on observed variables in the model shown in \cref{fig:antibiotics-study-1}.
%this setting.

Now consider an  experimental study ${\cal S}_2$, in which the investigator is able to %ensure
control compliance level based on assigned treatment. 
%\amir{why is there an edge from A to M? what does controling the compliance mean? Do we ask some people to not fully comply??}. 
This results in \cref{fig:antibiotics-study-2}, in which the edge $A \to M$ is kept, but the edge $U_2 \to M$ is removed, representing the situation in which compliance status only depends on the prescribed medication (and not, say, the patient's mood or appetite). It turns out that if both datasets are drawn from the same population, the causal effect $p(Y(a))$ may be obtained from the combined dataset as $\sum_{m, \tilde{a}} \E_2[Y \mid m, \tilde{a}] p_1 (\tilde{a}) \sum_c p_1 (m \mid a, c) p_1 (c)$, where subscripts indicate distributions computed from datasets $\mathcal{S}_1$ or $ \mathcal{S}_2$.
%\amir{this functional comes out of no where without referring to what comes next} where subscripts indicate the dataset used for computation of that term. 
\citet{leeGeneralIdentifiabilityArbitrary2019} provided a sound and complete graphical \emph{gID algorithm} for the identification of such queries, in the special case where experimental datasets are formulated only using the $\text{do}(\cdot)$ operator, rather than more complex policy interventions, as in the above example, that may depend on other variables in the problem.

We now reformulate the gID algorithm using the fixing operator $\phi$ (see also \cite{leeCausalEffectIdentifiability2020,leeIdentificationMethodsArbitrary2020a}).
%. {\color{red}[ref our own paper, elias’ ICML paper for GID-
%PO]}
%as we detail briefly below. 
%\textbf{gID algorithm}:
Given the interventional distribution of interest $p(\vec{Y}(\vec{a}))$, and a latent projection ADMG $\G(\vec{V})$ representing a hidden variable causal model,
%be a causal effect, and 
let the available datasets be denoted $\{p_i (\vec{V} \mid \doo(\vec{Z}_i))\}_{i=1}^K$ with corresponding CADMGs $\{\G_i(\vec{V} \setminus \vec{Z}_i, \vec{Z}_i) \}_{i=1}^K$.  Note that these CADMGs need not have been obtained via the fixing operator.
Then, if for each  $\vec{D} \in {\cal D} (\G(\vec{V})_{\vec{Y}^*})$ there exists $j_{\vec{D}} \in \{1, \ldots, K\}$ such that $\vec{D}$ is reachable in 
%$p_{j_{\vec{D}}}, \G(\vec{V} \setminus \vec{Z}_i, \vec{Z}_i)$,
$\G_{j_{\vec{D}}}(\vec{V} \setminus \vec{Z}_{j_{\vec{D}}}, \vec{Z}_{j_{\vec{D}}})$,
output
{\small
\[
p(\vec{D} \mid \doo(\pas_{\G(\vec{V})}(\vec{D}))) = \phi_{(\vec{V} \setminus \vec{Z}_{j_{\vec{D}}}) \setminus \vec{D}} (p_{j_{\vec{D}}}; \G_{j_{\vec{D}}}).
\]
}
% \amir{we said nothing about how the distributions are related to one another!!}
% \jaron{that is exactly what gID does so we are leaving it as is. we address this underspecification in next paragraph}
The causal effect is then obtained by the usual district factorization \eqref{eqn:do-fact} as in the ID algorithm.
% \[
% p(\vec{Y} | \text{do}(\vec{a})) = \sum_{\vec{Y}^* \setminus \vec{Y}} \prod_{\vec{D} \in {\cal D} (\G(\vec{V})_{\vec{Y}^*})} p(\vec{D} \mid \doo(\pas_{\G(\vec{V})}(\vec{D})))
% \]

%\textbf{Limitations of current approaches}:
The primary limitation of this and related prior work is that the selection process is \emph{under-specified}. In the case of gID, the selection process is not represented at all, as nothing is specified about how distributions $\{p_i (\vec{V} \mid \doo(\vec{Z}_i)) \}_{i=1}^K$ are related. In other related work such as 
transportability \citep{bareinboimTransportabilityCausalEffects2012}, selection bias \citep{bareinboimRecoveringCausalEffects2015}, or as a general representation of interventional and observational domains in causal inference \citep{dawidDecisiontheoreticFoundationsStatistical2021}, the selectors enter the model but only as non-random indicators that index domains.

%In particular, the selection process is either not represented at all (as in the gID algorithm), or only enters the model via indicators that index domains, but which are not themselves random variables that are a part of the model. In addition to data fusion problems, the resulting models have been used to address questions of 
%transportability \citep{bareinboimTransportabilityCausalEffects2012}, selection bias \citep{bareinboimRecoveringCausalEffects2015}, or as a general representation of interventional and observational domains in causal inference \citep{dawidDecisiontheoreticFoundationsStatistical2021}.

Since domain selectors are not treated as full random variables, the resulting models do not yield a single coherent data likelihood, which is an impediment to statistical inference.  A more serious issue, however, is that by not modeling the selection process explicitly, it is not possible to represent \emph{systematic selection}, which is often how units from a single superpopulation are assigned to different experimental and observational settings in practice.
%data fusion, and related problems that feature multiple domains.

%obscures the fact that the selection process itself may be systematic, and in particular subject to the usual causal inference problem %s of observed and/or unobserved of confounding.

%\textbf{A hierarchy of selection}:
To address these issues, we will represent the selection process as a random variable $S$.  In this paper, this variable indexes intervention status of its children in the graph, but may also indicate changes in structural equations representing domain differences.  Selectors may potentially share common (and potentially unobserved) parents with other variables, creating potential confounding.

%We will use the random variable $S$ to represent the selection process.
%In this framework,

%joint distribution by requiring a marginal distribution over the selection variable $S$.


% In the next section, we describe an extension of graphical causal models that incorporate the selector variable $S$.  Since $S$ governs whether a variable functions in an observational or experimental regime, and since variables in experimental regimes are set to constants, our graphical model will naturally incorporate context-specific restrictions.

%this needs to go later.
% {\color{red} TODO: rework notation based on previous background}
% We briefly illustrate some problems that arise when $S$ is introduced as a random variable. To demonstrate the complexities, we consider \cref{fig:id-double-bow} with SWIG \cref{fig:id-double-bow-swig}. In this graph there are two studies - an observational study, and one where $A$ is intervened upon. The first issue is that $S$ is no longer an unrestricted random variable, since it is discrete and its cardinality is restricted by the studies considered. The second issue is that the structural equations and graph do not indicate that when $S$ indicates the randomized experiment on $A$, that $A$ is no longer influenced by unmeasured confounding. Finally, because this is data fusion and not transportability, we need to ensure that when $S$ does not indicate an intervention for a variable, that the structural equation for that variable remains unchanged.

% {\color{red}
% $S$ is confounded by unobserved variables, meaning this problem is SNAR, but yet identification follows by:
% \begin{align*}
%    p(Y(a)) &= p(Y(a) \mid S) \\
%    &= p(Y(a) \mid S = (\{A\}, a)) \\
%    &= p(Y \mid S = (\{A\}, a)) \\
% \end{align*}
% }

% We next introduce the additional restrictions necessary to define the context-specific structural causal model, ${\cal M}_{CS}$.

% The first restriction is that we require that a special variable called the context selection variable $S \in V$. $S$ must be a discrete variable, and its domain $\mathfrak{X}_S$ is a non-empty subset of the power set of $V \setminus \{S\}$. Intuitively, $S$ represents the various observational and experimental contexts that are available to the analyst, by reference to the sets of variables that are intervened in each context. For example, the observational context is represented by $\emptyset$, an experiment where some variable $A$ is randomized is represented by $\{A\}$, and an experiment where variables $A_1, \ldots, A_k$ are intervened on is represented by $\{A_1, \ldots, A_k\}$, and $\mathfrak{X}_S = \{\emptyset, \{A\}, \{A_1, \ldots, A_k\}\}$. Then, for each $s \in \mathfrak{X}_S$, and for each $V_i \in s$, $f_{V_i}$ has $S$ as an argument. 
% We denote the set of all such $f_{V_i}$ as $F_S \subset F$.
% There are no specific restrictions on $f_S$ itself, as long as it induces an acyclic graph. 

% Second, we specify the behavior of variables under intervention. For each $s \in \mathfrak{X}_S$, consider each $V_i \in s$. Such a value of $s$ means that we are in a context where each $V_i$ is randomized by an experimenter by probabilities of their choosing. %corresponds to This case corresponds to the randomization probabilities on the treatment variables imposed by an experimenter in experiment $s$ for variable $V_i$, and are thus in principle a priori known distributions. 
% Then, we 
% %denote each of these as $e_s (V_i)$. Additionally, we 
% define random variables $E_{s, V_i}$ whose domain matches that of $V_i$, and we let $p(E_{s, V_i})$ represent the randomization probabilities chosen by the experimenter.

% We let $E$ denote the set of all random variables $E_{s, V_i}$, and we let $p(E) = \prod_{s, V_i} p(E_{s, V_i})$ denote the distribution of all such random variables. Then, we proceed to formalize the behavior of variables under intervention using context-specificity:

\section{Causal Models For 
Selection}
%Representing
%Data Fusion Problems

In this section, we describe how to augment SCMs with an additional selector random variable $S$ that governs whether certain variables that are its children in the causal graph are intervened on or keep their natural behavior.
%The selector is formally defined as follows.

%\subsection{Context Specific SCMs}

%We impose some further restrictions to define the data fusion structural causal model. The first restriction is that we require a special variable called the context selection variable, which we call $S \in \vec{V}$. We will also require that $S$ takes on certain semantics.

%{\color{red} need to distinguish $\mathfrak{X}_S$ vs $\mathfrak{X}_S^+$}

\begin{dfn}[Context Selected SCM]\label{dfn:ics-scm}
Given an SCM with independent errors associated with a DAG $\G(\vec{V})$, a context selected SCM (CS-SCM) associated with a DAG $\bar{\G}(\vec{V} \cup \{ S \})$, such that $\ch_{\bar{\G}}(S) \neq \emptyset$ and $\pa_{\bar{\G}}(S)$ is arbitrary, is defined as follows:
\begin{itemize}
% \item $S \equiv \{ \langle S^e_V, S^v_V \rangle : V \in \ch_{\bar{\G}}(S) \}$, where for all $S^e_V$, $\mathfrak{X}_{S^e_V} = \{ 0, 1 \}$, and for all $S^v_V$, $\mathfrak{X}_{S^v_V} = \mathfrak{X}_V$, and $\mathfrak{X}^+_{S^v_V} = \mathfrak{X}_V^+$.

\item $S \equiv \{\langle S^e_{\ch_{\bar{\G}}(S)}, S^v_{\ch_{\bar{\G}} (S)} \rangle \mid S^e_{\ch_{\bar{\G}} (S)} \in \mathfrak{X}_{\ch_{\bar{\G}} (S)} \subseteq \{0, 1\}^{\abs{\ch_{\bar{\G}} (S)}}, S^v_{\ch_{\bar{\G}} (S)}  \in \mathfrak{X}_{\ch_{\bar{\G}} (S)} \equiv \otimes_{V \in \ch_{\bar{\G}} (S)}\mathfrak{X}_{V} \} $, and furthermore $\mathfrak{X}_{\ch_{\bar{\G}} (S)}^+ \equiv \otimes_{V \in \ch_{\bar{\G}} (S)}\mathfrak{X}^+_{V}$.
\item Every $V \in \vec{V} \setminus \ch_{\bar{\G}}(S)$ maintains its structural equation $f_V(\pa_{\G}(V),\epsilon_V)$ from the original SCM.
\item For every $V \in \vec{V} \cap \ch_{\bar{\G}}(S)$, the structural equation $\tilde{f}_V$
%(\pa_{\G}(W), S, \epsilon_W)
for $V$ in the CS-SCM is defined in terms of $S$ and the structural equation $f_V(\pa_{\G}(V),\epsilon_V)$
%(\pa_{\G}(W), \epsilon_W)
for $V$ in the original SCM as:
{\small
\begin{align*}
V &\gets \tilde{f}_V(\pa_{\G}(V), S, \epsilon_V) \\
\tilde{f}_V(\pa_{\G}(V), S, \epsilon_V) &\equiv \begin{cases}
f_V(\pa_{\G}(V),\epsilon_V) & \text{ if } S^e_V = 0 \\
S^v_V & \text{ if } S^e_V = 1
\end{cases}
\end{align*}
}
% \amir{
% \begin{enumerate}
%     %\item I would not use underbrace to define $\tilde{f}$
%     \item Should we have an assumption about $S_V^v$? For example, what if it is just one value and does not cover the whole support? i.e., is there a positivity req?
% \end{enumerate}
% }

\item For $S$, its structural equation is specified by a new equation
\[S \gets f_S (\pa_{\bar{\G}}(S), \epsilon_S)\]
where $\pa_{\bar{\G}} (S)$ is chosen to avoid introducing cycles.

%\amir{Note that the example you gave earlier does not work with your definition of CS-SCM!!}
\end{itemize}
\end{dfn}
In words, the CS-SCM is an SCM augmented with a selector variable $S$ consisting of intervention indicators $S^e_V$ and intervention values $S^v_V$ for every child $V$ of $S$.  If $S^e_V=1$, the child $V$ of $S$ is intervened on, and $S^v_V$ indicates the value of the intervention.  If $V$ is not intervened on, $V$ acts as a usual function of its parents other than $S$ via its structural equation $f_V$ from the original SCM.
To simplify notation, when discussing CS-SCMs, we will include $S$ in the set $\vec{V}$, and denote values of $S$ as a vector pair $\langle \vec{s}^e, \vec{s}^v \rangle$.


We note that the relationship between the random selector $S$ and its children that we describe here is closely related to \emph{context variables} in the joint causal modeling approach in \cite{mooijJointCausalInference2020}.  Our approach is also related to the decision-theoretic framework in \cite{dawidDecisiontheoreticFoundationsStatistical2021} and selection diagrams applied to selection bias and transportability problems in \cite{bareinboimRecoveringCausalEffects2015,bareinboimTransportabilityCausalEffects2012}, although these approaches do not treat their respective selectors as fully random variables in the causal model. 
%Unlike these approaches, we treat $S$ as a random variable that is allowed to be causally influenced by other variables in the system, and thus as a full part of the model. \jaron{This approach is similar }
%\amir{what do you mean by ``full part''? Do you want to also compare with Mooij et al?}

In subsequent developments we will make use of the following definition.
Given a set of variables $\vec{D} \subseteq \vec{V}$ in a CS-SCM with associated DAG $\G$, we say
a value $s = \langle\vec{s}^e, \vec{s}^v \rangle$ is \emph{laidback for $\vec{D}$} if for each $V \in \vec{D}$, either $\vec{s}^e_{V} = 0$, or $V \not \in \ch_{\G} (S)$.
We say $s$ is \emph{serious for $\vec{D}$} if it is not laidback for $\vec{D}$. 
%\amir{are we later changing these terminologies?}
We will also use the abuse of notation $s = \emptyset$ to denote any value set $\langle \vec{s}^e, \vec{s}^v \rangle$, where
$\vec{s}^e_{\vec{V}} = \vec{0}$ for observed variables $\vec{V}$. Such value sets correspond to the observational context of a CS-SCM, the context where no interventions took place.

The CS-SCM exhibits \emph{context-specific} independencies, whereby a variable is no longer a function of some others at particular values of a parent. \cite{pensarLabeledDirectedAcyclic2015} provide a succinct graphical representation of this information.
\begin{dfn}[Labelled selection DAG]\label{dfn:labelled_selection_dag}
Let ${\G}(\vec{V})$ denote a DAG associated with a CS-SCM with edges $\vec{E}$.
For each edge $E = (A \to B) \in \vec{E}$ {such that
$S \in \pa_{{\G}}(B)$ and $S \neq A$, we attach a label $L_E = \{ B \}$.}
% we attach a label {\color{red}$L_E = \{ V: s^e_V \in \mathfrak{X}^e_{S}, \textrm{$s^e$ is serious for $Y$}\}$} whenever $X \neq S$.
Then, ${\mathcal{G}}^{[]}(\vec{V})$ with edge labels $\vec{L} = \cup_{E} L_E$  denotes a labelled selection DAG (LS-DAG)
associated with the CS-SCM.

% For each $V \in \vec{V} \setminus \{S\}$ mentioned in $\mathfrak{X}_{S_0}$, and for each edge $E = (A \to B) \in \vec{R}$, we attach a label $L_{E} = \{s: s \in \mathfrak{X}_{S_0}, s \cap \{V\} \neq \emptyset\}$
% Let the set of all labels be defined as $\vec{L} = \{L_{E}: \forall E = (A \to B) \in \vec{R}, V \in \vec{V} \setminus \{S\}\}$
\end{dfn}
% LS-DAGs allow us to remove edges based on values of $S$, which we will make use of below to represent context-specific independences in observational and interventional distributions induced by ICS-SCMs.  In particular,
Given an LS-DAG ${\G}^{[]}$ and a value $s$ of $S$, we define the \emph{context selected} DAG ${\G}^{[s]}$ to be an edge subgraph of ${\G}^{[]}$ in which any edge with a label $\{ V \}$ is removed if $\vec{s}^e_V = 1$, and all remaining edge labels are removed.  Note that the graph
${\G}^{[\emptyset]}$ removes all edge labels but no edges.
\cref{fig:id-double-bow} is an example of an LS-DAG $\G^{[]}$, while \cref{fig:id-double-bow-cs-graph} is an example of $\G^{[s]}$ for a value $s$ such that $\vec{s}^e_A = 1$.

%\amir{why not focus on $s^e$ and talk about 0,1? Did we say something about removing the labels before?}

While we developed the CS-SCM in this section, we note that such models are cross-world models due to the independence of error terms in the NPSEM-IE. Since we do not rely on cross-world independencies elsewhere in this paper, this section can in principle be reformulated using single-world models such as the finest fully randomized causally interpretable structured tree graph, or FFRCISTG \citep{robins86new}. The principal difference between these models is that FFRCISTG assumptions can in theory be empirically verified under hypothetical randomized experiments where any subset of variables can be intervened upon, whereas this is not true in an NPSEM-IE as well as the CS-SCM that we defined. \cite{shpitserMultivariateCounterfactualSystems2021} provide further details on this distinction.

%The following section describes how
%the Markov structure implied by the CS-SCM, in both observational and interventional contexts, may be recovered from such edge subgraph of LS-DAGs,
%and their interventional analogues.

\subsection{SWIGs and Context Selected SWIGs}

Given a DAG $\G(\vec{V})$ representing an SCM, and a set of values $\vec{a}$ of treatments $\vec{A}$, a single world intervention graph (SWIG) \citep{thomas13swig} $\G(\vec{V}(\vec{a})) = \G(\vec{a})$ is a graph obtained from $\G$ by creating a ``random'' version $A$ and a ``fixed'' version $a$ of every vertex $A \in \vec{A}$ in $\G$, with every random version $A$ inheriting all edges with arrowheads into $A$ in $\G$, and every fixed version $a$ inheriting all outgoing edges from $A$ in $\G$.  In addition, every vertex $V$ in $\G(\vec{a})$ is relabelled as $V(\vec{a})$ to signify that these vertices represent counterfactual random variables.
%A SWIG $\G(\vec{a})$ is a CDAG $\G(\vec{V},\vec{a})$.
%\amir{This is not quite SWIG yet, you have to remove variables from labels that are not ancestors}

A SWIG $\G(\vec{a})$ represents the Markov structure of an interventional distribution $p(\vec{V}(\vec{a}))$ obtained from an SCM with a graph $\G$ via the d-separation criterion \citep{thomas13swig}.  Standard genealogic relations generalize readily to SWIGs.
%Districts in SWIGs are only defined with respect to random vertices.
%\amir{1. THis is too short to be a paragraph\\ 2. d-sep is done on G(a) not on G}

We consider a special case of SWIGs applicable to our setting.  Given a CS-SCM associated with an LS-DAG ${\G}^{[]}$ with vertices $\vec{V}$, if $S \not\in \vec{A}$, we represent $p(\vec{V}(\vec{a}))$ via a \emph{labelled selection SWIG (LS-SWIG)} ${\G}^{[]}(\vec{a})$, which is obtained by employing the standard SWIG construction while keeping the labels in ${\G}^{[]}$.

If $S \in \vec{A}$, let $s$ be the value of $S$ in $\vec{a}$.  Then we define a context-specific SWIG $\bar{\G}^{[s]}(\vec{a})$ as follows.
Given an LS-SWIG $\bar{\G}^{[]}(\vec{a})$, we remove any random vertex in $\ch_{\bar{\G}(\vec{a})}(s)$ that $s$ is serious for, and all edges adjacent to such vertices.
%In addition, any edge into a variable $s$ is serious for is removed.
This operation represents the fact that such a vertex corresponds to a constant.

Despite removal of certain vertices and edges, context-specific SWIGs correctly represent independences in interventional distributions obtained from CS-SCMs due to the following result.

\begin{restatable}{thm}{csswigs}
Given a CS-SCM associated with ${\G}^{[]}(\vec{V})$, and any $\vec{A} \subseteq \vec{V}$ such that $S \in \vec{A}$ (including $\vec{A} = \{ S \}$),
any d-separation statement in ${\G}^{[s]}(\vec{a})$,
for $s$ consistent with $\vec{a}$, implies a conditional independence statement in $p(\vec{V}(\vec{a}))$.
\end{restatable}
%Note that this result still holds for $\vec{A} = \{ S \}$.
%\amir{this is not a formal statement of a theorem}



\subsection{The Selection Hierarchy and Context Selected G-formula}

%\ilya{[move this later.]}
%{\color{red}No labels in these graph.}
Treating the selector $S$ as a part of the model allows us to represent systematic selection via a hierarchy similar to the missing data hierarchy \citep{rubinInferenceMissingData1976}, with
selected completely at random (SCAR), selected at random (SAR), and selected not at random (SNAR) models.
In particular, we can recast the earlier antibiotic example as a SCAR model, by assuming that assignment into the different studies ${\cal S}_1, {\cal S}_2$ is random and that $S$ has no causes (\cref{fig:antibiotics-scar}). One can view the SCAR model as the generalization ``closest in spirit'' to the original gID formulation that admits a coherent observed data likelihood that includes both observational and interventional contexts.

SCAR models, like MCAR models in missing data, are
%As in MCAR models, this is
often unrealistic, as we expect selection into different domains to be systematic.
%\amir{I don't believe MCAR is ``generally'' unrealistic.}
%depend on observed or unobserved variables.
In our example, if the selection mechanism into either the observational group or the experimental study is influenced by observed characteristics $W$, such as the patient's age, as well as the treatment assignment $A$, the result is a SAR model shown in \cref{fig:antibiotics-sar}.  If the patients are also selected based on unobserved characteristics that also influence patient outcomes, such as a doctor's intuition about a particular case $U_1$, the result is a SNAR model shown in \cref{fig:antibiotics-snar}.

%Since $S$ governs whether variables in $\ch_{\bar{\G}}(S)$ are intervened,

Since $S$ is a part of the model, representing situations where only some interventions %al contexts
are available to the analyst entails imposing restrictions on support of $S$.
Thus, we allow only a subset of $\mathfrak{X}_S$, termed $\mathfrak{X}^+_S$, to have support.
For example, if $S$ has children $A_1$ and $A_2$, we may allow ${\mathfrak X}_{\{ S^e_{A_1}, S^e_{A_2} \}}$
to have support on the set $\{ (0, 0), (0, 1), (1, 0) \}$.  In other words, $S$ allows either no variable to be intervened on, or exactly one of $A_1$ and $A_2$ to be intervened on, but not both $A_1$ and $A_2$ simultaneously.
Prior work
represented this by explicitly providing a set of distributions as inputs to the algorithm \citep{leeGeneralIdentifiabilityArbitrary2019}.

Queries corresponding to interventional distributions $p(\vec{Y}(\vec{a}))$ in an SCM must be modified in a CS-SCM to take the special nature of $S$ into account.  In particular, the analogue of the query $p(\vec{Y}(\vec{a}))$ in the original SCM corresponds to
%the interventional distribution of interest is
$p(\vec{Y}(\vec{a},S=\emptyset))$, which reads ``the distribution of outcomes $\vec{Y}$, when the context of the CS-SCM is set to the observational value, and the variables $\vec{A}$ are set to $\vec{a}$''. Intuitively, this excludes contexts where variables such as $\vec{Y}$ are intervened on, and whose intervened distributions are not of scientific interest.
%\amir{can we have extra independencies due to intervening on S and setting it to emptyset?}
Note that this query potentially entails a positivity violation in the sense that no positive support may exist in the observed data distribution for the situation where $\vec{A}=\vec{a}$ and $S=\emptyset$.  This occurs, in particular, if elements of $\vec{A}$ are among children of $S$.  While this may potentially prevent identification, restrictions on the CS-SCM may allow identification to be obtained in some cases.  A close analog of this phenomenon arises in the interventionist formulations of mediation analysis \citep{robins10alternative,rrs20volume_mediation_jmlr}.

If all variables in a CS-SCM are observed, we have the following result for the query $p(\vec{Y}(\vec{a},S=\emptyset))$ that directly generalizes the g-formula.
\begin{restatable}[Context Selected g-formula]{thm}{icsscm}\label{thm:ics-scm}
Fix a fully observed CS-SCM corresponding to an LS-DAG ${\G}^{[]}$ with a vertex set $\vec{V}$, and disjoint subsets $\vec{A},\vec{Y}$ of $\vec{V}$.
Let $\vec{Y}^* = \an_{{\G}^{[]}
%_{\vec{V} \setminus (\vec{A}\cup\{S\})}
(\vec{a},\emptyset)
}(\vec{Y})$. %\amir{different from before}
Then $p(\vec{Y}(\vec{a},S=\emptyset))$ is identified if and only if for every element $V \in \vec{Y}^*$ there exists a value $s_V \in {\mathfrak X}_S^+$ laidback for $V$ (i.e. $s^e_V = 0$). 
%\amir{if not child of S, laidback is not defined}  
If so, we have:
{\small
\begin{align}
%&
p(\vec{Y}(\vec{a},S=\emptyset))
%\notag\\
%&
=
\sum_{\vec{Y}^* \setminus \vec{Y}} \prod_{V \in \vec{Y}^*} p(V | \pa_{{\G}}(V)) \vert_{\vec{a}_{\vec{A} \cap \pa_{{\G}}(V)}, S^e_V = 0}.
\label{eqn:ics-g}
\end{align}
}
\end{restatable}

The above g-formula takes into consideration the requirement that $S = \emptyset$, which ensures that the causal effect is computed in the observational context only.

Note that the query may not be identified even under full observability, if available contexts for $S$ are not laidback for elements in
%without enough variation in the set
$\vec{Y}^*$.  Nevertheless, the above result allows identifiability in situations corresponding to SCAR or SAR.
While incorporating the selection process as an explicit part of the causal model allows us to explicitly represent complex types of systematic selection, it also (unsurprisingly) creates difficulties with identification of causal effects in models with hidden variables that yield systematic selection more complicated than SCAR or SAR.
% For example, we might imagine that we have three contexts - an observational context, an experimental context in which the variable $A$ is set to $0$, and another in which $A=1$. Then, we could represent each of these via $\mathfrak{X}_S = \{(\emptyset, \emptyset), (\{A\}, 0), (\{A\}, 1)\}$ respectively. We ensure that $A$ is a function of $S$. The distribution over $S$ determines both the propensity to observe the various contexts, but also the randomization probabilities within a given context. For instance, if $p(S = (\{A\}, 0)) = p(S= (\{A\}, 1))$ then that means that in our experimental context the distribution of $\{A\}$ was a fair coin.


% {\color{red}
% The simplest SCAR model can be seen as a generalization of earlier prior work \citep{leeGeneralIdentifiabilityArbitrary2019,atheyCombiningExperimentalObservational2020,dawidDecisiontheoreticFoundationsStatistical2021} that inherits identification results from that work, while simultaneously allowing a coherent observed data likelihood to be specified, allowing statistical inference.
% }


%Mechanism stability baked in?

%Subsets of ${\mathfrak X}_S$ have support?

%\cref{dfn:context_specificity} imparts a similar flavor to the non-stochastic regime indicators of .


% \begin{dfn}[Context selection variable] \label{ass:cs_var}
% Let $S \in \vec{V}$ the context selection variable, which is discrete, and be partitioned as $S = (S_0, S_1)$, where $S_0$ are the experiment indicators, and $S_1$ are the experiment values.
% %$S_0$ and $S_1$ themselves may be vectors. 

% Let the domain of $S$ be denoted \[\mathfrak{X}_S  = \{(s_0, s_1) \mid s_0 \in \mathbb{P}(\ch_\G (S)), s_1 \in \mathfrak{X}_{s_0}\},\] where $\mathbb{P}(\cdot)$ is the powerset operator, and where $\mathfrak{X}_{s_0}$ denotes the domain of the variables in $s_0$. 

% We consider subsets the domain of $S$ \[\mathfrak{X}^+_S = \{(s_0, s_1) \mid s_0 \in \mathfrak{X}^+_{S_0}\subseteq \mathbb{P}(\ch_\G (S)), s_1 \in \mathfrak{X}_{s_0} \},\] where we allow restricted sets of experiment indicators, but assume that all values of the indicators are allowed. We ensure that variable is a function of $S$ in its structural equation if and only if it appears in $\mathfrak{X}^+_{S_0}$.
% \end{dfn}


% Intuitively, $S$ is meant to represent the set of contexts available to the analyst, where $S_0$ is an indicator for the particular observational or experimental context, and $S_1$ is the value of the experimental values (if applicable). 



% The second restriction is that under an intervention, we specify the behavior of the intervened variable. We do this by imposing a context-specific assumption, in which variables under intervention do not pay attention to 
%if and only if $\exists s \in \mathfrak{X}_S$ such that  $V_i \in s$.  
%The function for $S$, $f_S$, may take on  is that it must induce an acyclic graph. This restriction means that $S$ itself cannot be determined by 

%appears in $\mathfrak{X}_S$. \amir{[what if we have $\{V_1,V_2\}\in\mathfrak{X}_S$]} 


% \amir{Perhaps we should add a comment about: We consider the case that we may have access to data from an observational domain and several interventional domains. We refer to the mechanism that chooses which domain is generating each realization of the data as the \textit{selection mechanism}. This mechanism can be influenced by both the endogenous and the exogenous variables in the system.}

% \amir{Maybe first say what we want to say, i.e., how we model interventions: }For each element $s \in \mathfrak{X}_S$ that is not the empty set, and for each $V_i \in s$, we construct an exogenous observed variable $I_{s, i}$ with associated distribution $p(I_{s, i})$. These represent the experimenter determined randomization probabilities in each context, which are assumed to be known.

% \begin{assumption}[Representing variables under intervention using context specificity] \label{dfn:context_specificity} 
% % Let $V_i \in V \setminus \{S\}$ such that $f_{V_i} \in F_S$. Let the direct causes of $V_i$ (and therefore arguments of $f_{V_i}$) be $U^{V_i} \subseteq U$, $V^{V_i} \subseteq V, E^{V_i} \subseteq E$, and $S$. For $s \in \mathfrak{X}_S$, if 
% % %Let $s \in \bar{\mathfrak{X}}_{S,i}= \{s: V_i \in s, s \in \mathfrak{X}_S\}$.  Then, if 
% % $V_i \in s$, then
% % \[f_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}, E^{V_i}, S=s) \equiv E_{s, V_i}.\]
% %\amir{what is j? what is I? why are all the variables arguments of $f_i$?}
% % {\color{red}
% % Let $V_i \in V \setminus \{S\}$ such that its structural equation is a function of $S$. Let the direct causes of $V_i$ (and therefore arguments of $f_{V_i}$) be $U^{V_i} \subseteq U$, $V^{V_i} \subseteq V, E^{V_i} \subseteq E$, and $S$. Then for $s = (s_0, s_1)$, if $V_i \in s_0$, then
% % \[f_{V_i}(U^{V_i} , V^{U_i}\setminus \{S\}, S=s) \equiv f'_{V_i}(S=s)\]
% % }

% Let $V \in \vec{V} \setminus \{S\}$ such that $S \in \pa(V)$, and let $\epsilon_V$ be a random variable.%Then, $f_{V_i}$ is defined as
% For $s = (s_0, s_1)$, if $V \in s_0$, then
% \[f_{V}(\pa(V)\setminus \{S\}, S = s, \epsilon_V) = f'_{V}(S=s) \]
% %\[f_{V_i} \equiv \begin{cases}
% %   f'_{V_i} (\pa(V_i) \setminus \{S\}, S = s, \epsilon_i), \quad V_i \not \in s_0 \\
% %   f''_{V_i} (S = s), \quad V_i \in s_0 \\
% %\end{cases}\]

% \end{assumption}


% \begin{dfn}[Context-specific structural causal model]
% Let $V$ denote observed variables, $U$ denote unobserved variables with distribution $p(E)$, $E$ denote the experimenter-determined variables with distribution $p(E)$, and $F$ denote a set of functions of variables. Let $\mathbb{P}(\cdot)$ denote the power set operator.

% The context-specific structural causal model (SCM) is a tuple ${\cal M}^\star_{CS}=\langle U , V, E, p(E), p(U), F \rangle$, satisfying the following conditions:
% \begin{enumerate}
%     \item $S \in V$, $\mathfrak{X}_S\subseteq {\mathbb P}(V \setminus \{S\})$, and $p(E) = \prod_{s \in \mathfrak{X}_S, V_i \in s} p(E_{s, V_i})$; 
%     \item For each $V_i \in \cup_{s \in \mathfrak{X}_S} s \setminus \emptyset$, $f_{V_i} \in F \in F_S \equiv \{f_{V_i} \mid V_i \in \cup_{s \in \mathfrak{X}_S} s \setminus \emptyset\}$ satisfies \cref{dfn:context_specificity}, such that for $V_i$-specific subsets $U^{V_i} \subseteq U, V^{V_i} \subseteq V$, 
% \[f_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}, \{E_{s', V_i}\}_{s' \in \mathfrak{X}_S}, S=s) = \begin{cases} E_{s, V_i} \quad\text{if $V_i \in s$}  \\ f_{V_i}' (U^{V_i}, V^{V_i} \setminus \{S\}, S=s) \quad\textrm{otherwise}\end{cases}\]
% \item For each $V_i \not \in \cup_{s \in \mathfrak{X}_S} s \setminus \emptyset$, $f_{V_i} \in F \setminus F_S$ is not a function of $E$ or $S$, and has the form 
% \[f_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}). \]
% \end{enumerate}
%For $f_S$
%satisfying \cref{dfn:context_specificity}. 
%Under \cref{dfn:context_specificity} this leads to the following functional form:



%\end{dfn}

% The third restriction is that in data fusion we expect a certain stability of mechanisms between different contexts. While the contexts vary due to interventions being performed, we assume that the underlying causal mechanisms are otherwise the same. This is reflected in the \emph{mechanism stability assumption}, which further restricts the form of the function $f_V$. 

% \begin{assumption}[Mechanism stability]\label{ass:mechanism_stability_assumption} 
% % Let $V_i \in V \setminus \{S\}$ where $S$ is in the argument of $f_{V_i}$, and let $f'_{V_i}$ denote the part of $f_{V_i}$ when $S$ takes values $s$ such that $V_i \not \in s$. Let $s_1, s_2 \in \{s: V_i \not \in s, s \in \mathfrak{X}_S\}$. Then, for $V_i$-specific subsets $U^{V_i} \subseteq U, V^{V_i} \subseteq V$, 
% % \[f'_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}, s_1) = f'_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}, s_2).\]
% % {\color{red}
% % Let $V_i \in V \setminus \{S\}$ such that its structural equation is a function of $S$. Then for $s = (s_0, s_1)$, if $V_i \not \in s_0$, then
% % \[f_{V_i}(U^{V_i} , V^{U_i}\setminus \{S\}, S=s) \equiv f'_{V_i}(U^{V_i} , V^{U_i}\setminus \{S\})\]
% % }
% Let $V \in \vec{V} \setminus \{S\}$ such that $S \in \pa(V)$. Then, for $s = (s_0, s_1)$, if $V \not \in s_0$, then
% \[f_{V}(\pa(V) \setminus \{S\}, S = s, \epsilon_V) = f''_{V} (\pa(V_i) \setminus \{S\}, \epsilon_V)\]
% \end{assumption}


% Then, we can define the data fusion context-specific structural causal model, under the restrictions imposed by \cref{ass:cs_var,dfn:context_specificity,ass:mechanism_stability_assumption}.
% Let $V$ denote observed variables, $U$ denote unobserved variables with distribution $p(U)$, $E$ denote the experimenter-determined variables with distribution $p(E)$, and $F$ denote a set of functions of variables. Let $\mathbb{P}(\cdot)$ denote the power set operator.

% The data fusion context specific structural causal model (SCM) is a tuple ${\cal M}_{CS}=\langle U , V, E, p(E), p(U), F \rangle$, satisfying the following conditions:
% \begin{enumerate}
%     \item $S \in V$, $\mathfrak{X}_S\subseteq {\mathbb P}(V \setminus \{S\})$, and $p(E) = \prod_{s \in \mathfrak{X}_S, V_i \in s} p(E_{s, V_i})$; 
%     \item For each $V_i \in \cup_{s \in \mathfrak{X}_S} s \setminus \emptyset$, $f_{V_i}$ satisfies \cref{dfn:context_specificity} and \cref{ass:mechanism_stability_assumption}, such that for $V_i$-specific subsets $U^{V_i} \subseteq U, V^{V_i} \subseteq V$, 
% \[f_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}, \{E_{s', V_i}\}_{s' \in \mathfrak{X}_S}, S=s) = \begin{cases} E_{s, V_i} \quad\text{if $V_i \in s$}  \\ f_{V_i}' (U^{V_i}, V^{V_i} \setminus \{S\}) \quad\textrm{otherwise}\end{cases}\]
% \item For each $V_i \not \in \cup_{s \in \mathfrak{X}_S} s \setminus \emptyset$, $f_{V_i}$ is not a function of $E$ or $S$, and has the form 
% \[f_{V_i}(U^{V_i}, V^{V_i} \setminus \{S\}). \]
% \end{enumerate}
%For $f_S$
%satisfying \cref{dfn:context_specificity}. 
%Under \cref{dfn:context_specificity} this leads to the following functional form:

% Let $V$ denote observed variables, $U$ denote unobserved variables, and $F$ denote a set of functions of variables, whose arguments are some subset of $U \cup V$.

% \begin{dfn}[Data fusion context-specific structural causal model]\label{dfn:data_fusion_scm}
% The data fusion context specific structural causal model is a set of variables $\vec{V} \cup \vec{H}$, a total ordering on that set $\prec$, subsets of the preceding variables $\pa(V) \subseteq \pre_\prec (V)$, and a set of random variables $\{\epsilon_V\}_{V \in \vec{V} \cup \vec{H}}$, satisfying the following conditions:
% \begin{enumerate}
%     \item There exists a variable $S \in \vec{V}$ satisfying \cref{ass:cs_var}
%     \item For each $V$ whose corresponding $f_{V}$ has $S$ as an argument and satisfies \cref{dfn:context_specificity,ass:mechanism_stability_assumption}, for each $s \in \mathfrak{X}_S$, and for $\mathfrak{X}_{S_0}^{V} = \{s_0 \in \mathfrak{X}_{S_0} \mid s_0 \cap \{V\} \neq \emptyset\}$ the set of values of $S_0$ which correspond to an experimental intervention, 
%     \begin{align*}
%        %&V=f_{V_i} (U^{V_i}, V^{V_i} \setminus \{S\}, S=s ) \\ 
%        &V = f_{V} (\pa(V) \setminus \{S\}, S = s, \epsilon_V)  \\
%        & = \begin{cases}
%        f'_{V} (S = s), \quad s_0 \in \mathfrak{X}_{S_0}^{V} \textrm{ by \cref{dfn:context_specificity}}\\
%        f''_{V} (\pa(V) \setminus \{S\}, \epsilon_V), \quad s_0 \not \in \mathfrak{X}_{S_0}^{V}\textrm{ by \cref{ass:mechanism_stability_assumption}}
%        \end{cases}
%     %    &= \begin{cases}f'_{V_i} (S=s), \quad s_0 \in \mathfrak{X}_{S_0}^{V_i} \textrm{ by \cref{dfn:context_specificity}}\\ 
%     % f''_{V_i} (U^{V_i}, V^{V_i} \setminus \{S\}), s_0 \not \in \mathfrak{X}_{S_0}^{V_i} \textrm{ by \cref{ass:mechanism_stability_assumption}}\end{cases} 
%     \end{align*}
% \end{enumerate}

% \end{dfn}

% \begin{rmk}
% Note that the definition \cref{dfn:data_fusion_scm} does not impose any restrictions on $V$ whose corresponding $f_{V}$ does not have $S$ as an argument. The structural equation is then $f_{V} (\pa(V), \epsilon_V)$.
% \end{rmk}

% \begin{rmk}
%     We can also consider $M^\star_{CS}$, in which only \cref{ass:cs_var,dfn:context_specificity} hold.
% \end{rmk}


% \begin{dfn}[Laid-back/Serious]
% A value $s \in \mathfrak{X}_S$ is laidback for $V$ if $V \not \in s_0$. $S$ is laidback for $V$ if each value $s \in \mathfrak{X}_S$ is laidback for $V$.

% A value is serious for $V$ if it is not laidback, and $S$ is serious for $V$ if there exists a value which is serious for $V$.
% \end{dfn}

% Finally, in what follows we will only consider structural causal models that induce acyclic graphs. 

% \begin{dfn}[Unlabelled graphs of structural causal models]\label{dfn:unlabelled_graph_from_scm}
% Let $M$ be a structural causal model in ${\cal M}_{CS}$. Let $\bar{\G}$ be the unlabelled graph containing a node for each $V \in \vec{V}$, a directed edge from $V$ to $W$ if $f_{V}$ is a function of $W$, and a bidirected edge between $V$ and $W$ if both $f_{V}$ and $f_{W}$ are functions of a common hidden variable $H \in \vec{H}$.
% \end{dfn}

% Then $\bar{\G}$ induced from an $M \in {\cal M}_{CS}$ (alternatively ${\cal M}^\star_{CS}$) under \cref{dfn:unlabelled_graph_from_scm} must be acyclic.

% We call $\bar{\G}$ the unlabelled DAG or ADMG as appropriate, for reasons that will become clearer in \cref{sec:graphical_representations}


% Each element of the model $M \in {\cal M}_{CS} (\G)$ induces a statistical model over all variables. Let $\vec{V}$ denote observed variables and $\vec{H}$ denote hidden variables. We define this set of distributions as
% \[{\cal P}_{CS} (\G(\vec{V} \cup \vec{H})) = \{p(\vec{V} \cup \vec{H}): \prod_{V \in \vec{V} \cup \vec{H}} p(V \mid \pa_\G (V)) \}\]
% where  if $S \in \pa_\G(V)$ then for $s = (s_0, s_1)$, if $s_0 \cap \{V\} = \emptyset$, $p(V \mid \pa_\G (V) \setminus \{S\}, S=s) = p(V \mid \pa_\G (V) \setminus \{S\})$ (by \cref{ass:mechanism_stability_assumption}), and if $s_0 \cap \{V\} \neq \emptyset$ then $p(V=s_1 \mid \pa_\G (V) \setminus \{S\}, S=s) = 1$ (by \cref{dfn:context_specificity}).

% We define observed margins of these distributions as 
% { \smaller
% \[{\cal P}_{CS}^{obs} (\G(\vec{V} )) = \Big\{ \sum_{\vec{H}} p(\vec{V} \cup \vec{H}): p(\vec{V} \cup \vec{H}) \in {\cal P}_{CS} (\G(\vec{V} \cup \vec{H})) \Big\} \]
% }


%\subsection{Graphical representations under selection}\label{sec:graphical_representations}
% In \cref{sec:background} we introduced a formal definition of the structural causal model in ${\cal M}_{CS}$. However, these models were represented by what we termed unlabelled graphs $\bar{\G}$, which did not fully capture the various restrictions we introduced.
%The ICS-SCM introduces value-specific dependencies, which induce conditional independencies that 

% In particular, the restrictions placed on the functional forms of $f_{V} \in F_S$ mean that there are context-specific independencies that are not otherwise reflected in a standard graph. %There is a long history of graphical representations of context-specific independence \citep{pensarLabeledDirectedAcyclic2015,tikkaIdentifyingCausalEffects2019}.
% The labelled DAG was introduced in \cite{pensarLabeledDirectedAcyclic2015}, in which labels were attached to edges in a DAG to indicate under what circumstances when they would go missing, and therefore imply the corresponding context specific independence. We modify this definition for our purposes in introducing the labelled selection DAG, by providing a method by which unlabelled graphs $\bar{\G}$ corresponding to a model ${\cal M}_{CS}$ can be augmented with the appropriate labels.


% \begin{dfn}[Labelled selection CDAG]\label{dfn:labelled_selection_dag}
% Let $\bar{\G}$ denote a CDAG with random vertices $\vec{V}$, fixed vertices $\vec{W}$ and edges $\vec{E}$, with $S \in V$, and $\mathfrak{X}_{S_0}$ the set of available interventions. 

% For each $V \in \vec{V} \setminus \{S\}$ mentioned in $\mathfrak{X}_{S_0}$, and for each edge $E = (A \to B) \in \vec{R}$, we attach a label $L_{E} = \{s: s \in \mathfrak{X}_{S_0}, s \cap \{V\} \neq \emptyset\}$

% Let the set of all labels be defined as $\vec{L} = \{L_{E}: \forall E = (A \to B) \in \vec{R}, V \in \vec{V} \setminus \{S\}\}$

% Then, $\G = (\vec{V}, \vec{R}, \vec{L})$ denotes a labelled selection CDAG (LS-CDAG).

% \end{dfn}


% %For example, \cref{fig:motivation_single_graphs} depicts $\bar{\G}$ generated from a model $M \in {\cal M}_{CS}$. If we want to further represent that $\mathfrak{X}_S = \{\emptyset, \{A\}\}$, we can apply \cref{dfn:labelled_selection_dag} to $\bar{\G}$ to obtain $\G$, depicted in \cref{fig:fig:motivation_ls_dag}. 


% \begin{dfn}[Labelled selection CADMG]\label{dfn:ls-cadmg}
% Let $\bar{\G}$ denote a CADMG with random vertices $\vec{V}$, fixed vertices $\vec{W}$, directed edges $\vec{E}$, and bidirected edges $\vec{B}$. Let $S \in \vec{V} \cup \vec{W}$ and let $\mathfrak{X}_{S_0}$ be defined.

% For each $V \in (\vec{V}  \cup \vec{W}) \setminus \{S\}$ mentioned in $\mathfrak{X}_{S_0}$, for each edge $E \in \vec{E}$, we attach a label $L_{E} = \{s: s \in \mathfrak{X}_{S_0}, s \cap \{V\} \neq \emptyset\}$, and for each edge $B \in \vec{B}$, we attach a label $L_{B} = \{s: s \in \mathfrak{X}_{S_0}, s \cap \vec{V} \neq \emptyset\}$.

% Let the set of all such defined labels be denoted as $\vec{L}$.

% Then, $\G = (\vec{V}, \vec{W}, \vec{R}, \vec{B}, \vec{L})$ denotes a labelled selection CADMG (LS-CADMG).
    
% \end{dfn}
% % {\color{red} Introduce an example of a CADMG here}

% % We can also consider going from a labelled graph to a set of models consistent with that graph. 
% % Let ${\cal M}_{CS} (\G)$ denote all models $M$ such that their unlabelled graphs $\bar{\G}$ under \cref{dfn:unlabelled_graph_from_scm} generate labelled graphs under \cref{dfn:ls-cadmg} that are equal to $\G.$

% \begin{dfn}
% A context $s$ specific graph ${\cal G}^{[s]} $ is a %quadruple
% $4$-tuple
% $(\vec{V}, \vec{W}, \vec{E}', \vec{B}')$ representing a CADMG derived from a LS-CADMG ${\cal G}$ with quintuple $(\vec{V}, \vec{W}, \vec{E}, \vec{B}, \vec{L})$, where $\vec{E}' \equiv \{E = (X \to Y)  \in \vec{E}\mid  s\not \in L_{E}\}$, and $\vec{B}' \equiv \{B = (X \leftrightarrow Y ) \in \vec{B} \mid  s\not \in L_{B}\}$.
% \end{dfn}

% In this paper, genealogical relations with respect to an LS-CADMG $\G$ shall not consider the presence of labels (meaning that the CADMG $\bar{\G}$ may be subsituted in place of $\G$). Furthermore, in a slight abuse of notation, for some $s = (s_0, s_1) \in \mathfrak{X}_S$, we will take $\G^{[s]}$ to mean $\G^{[s_0]}$.

% We finish by defining a latent projection operator for LS-DAGs to LS-ADMGs, in the vein of Definition A.2 of \cite{richardsonNestedMarkovProperties2023}.
% \begin{dfn} (latent projection operator)
% Let $\G = \G(\vec{V} \cup \{H\})$ be an LS-CADMG with vertex set $\vec{V} \cup \{H\}$ and label set $\vec{L}$, where $H$ is to be latent projected. The latent projection $\G'=\G(\vec{V})$ is an LS-CADMG with vertex set $\vec{V}$, where for every triple of distinct vertices $A, B, H$:
% \begin{itemize}
% \item if $A \to H \to B$ in $\G$, then $A \to B$ in $\G$, with label equal to $(L_{A \to H} \cup L_{H \to B})$. If $A \to B$ exists then this label is further intersected with  $L_{A \to B}$.
% \item if $A \leftarrow H \rightarrow B$ or $A \leftrightarrow H \to B$ then $A \leftrightarrow B$, with label equal to $(L_{A \to H} \cup L_{H \to B})$. If $A \leftrightarrow B$ also exists then this label is further intersected with  $L_{A \leftrightarrow B}$.
%     %\item $\G(V)$ contains an edge $a \to b$ if there is a directed path $a \to \ldots \to b$ on which every non-endpoint vertex is in $U$. Furthermore, the edge $a \to b$ receives a label that is formed from the union of all labels along this directed path.
%     %\item $\G(V)$ contains an edge $ a \leftrightarrow b$ if there exists a path between $a$ and $b$ such that the non-endpoints are all non-colliders in $U$, and such that the edge adjacent to $a$ and the edge adjacent to $b$ have arrowheads into those vertices. Furthermore, the edge $a \leftrightarrow b$ receives a label formed from the union of all labels along this path.
% \end{itemize}
    
% \end{dfn}

%\subsection{ISC-SCMs With Hidden Variables}


\subsection{
Latent Projections in CS-SCMs}\label{sec:latent_projections}

In order to formulate a %n appropriate
general identification algorithm
%generalization of the ID algorithm
for CS-SCMs with hidden variables, we first generalize latent projections and the fixing operator to CS-SCMs.

A \emph{labelled selection acyclic directed mixed multigraph (LS-ADMMG)} is a multigraph with directed and bidirected edges, no directed cycles, and the property that any pair of edges of the same type connecting the same vertex pair $A,B$ must have different labels.
Given an LS-DAG ${\G}^{[]}(\vec{V} \cup \vec{H})$, where $S \in \vec{V}$, and where labels may exist on any edge in this LS-DAG,
define a latent projection ${\G}^{[]}(\vec{V})$ to be an LS-ADMMG with vertices $\vec{V}$, where
for each directed path from $A \in \vec{V}$ to $B \in \vec{V}$ in ${\G}^{[]}(\vec{V} \cup \vec{H})$ where all intermediate vertices are in $\vec{H}$, a directed edge from $A$ to $B$, labelled by the union of the labels of all edges on the path, is added to ${\G}^{[]}(\vec{V})$.
Similarly, for each marginally d-connecting path from $A$ to $B$ in ${\G}^{[]}(\vec{V} \cup \vec{H})$, where the first edge is into $A$ and the last into $B$ and where all intermediate vertices are in $\vec{H}$, a bidirected edge between $A$ and $B$, labelled by the union of the labels of all edges on this path, is added to ${\G}^{[]}(\vec{V})$.
Note that the result is a multigraph since the same pair may be connected by the same edge type with multiple labels.
A similar definition yields a latent projection ${\G}^{[]}(\vec{V}(\vec{a}))$ of a labelled hidden variable SWIG ${\G}^{[]}(\vec{V}(\vec{a}) \cup \vec{H}(\vec{a}))$.  An example illustrating why labelled multigraphs are necessary to represent latent projections of LS-DAGs in general is found in the Appendix.

Similarly, we define a labelled selection conditional ADMMG (LS-CADMMG) as an LS-ADMMG with random and fixed vertices, such that fixed vertices cannot have edges with arrowheads into them.

% extension of \cite{pensarLabeledDirectedAcyclic2015}.\footnote{to take into account latent projections?}

% \begin{dfn}[Labelled selection CADMG]\label{dfn:ls-cadmg}
% Let $\bar{\G}(\vec{V}, \vec{W})$, \ilya{where $S \in \vec{V} \cup \vec{W}$}, denote a CADMG associated with the latent projection of a hidden variable ICS-SCM with set of edges $\vec{E}$.
% %with random vertices $\vec{V}$ associated , fixed vertices $\vec{W}$, directed edges $\vec{E}$, and bidirected edges $\vec{B}$. Let $S \in \vec{V} \cup \vec{W}$ and let $\mathfrak{X}_{S_0}$ be defined.
% For each edge $E = (X \to Y)$ or $(X \leftrightarrow Y)$ we attach label {\color{red}$L_E = \{s^e \in \mathfrak{X}_S^e: \textrm{$s^e$ serious for W}\}$} whenever $S \neq X$.
% Then  $\G(\vec{V}, \vec{W})$ with labels $\vec{L} = \cup_{E} L_E$ is a labelled selection CADMG.
% \end{dfn}

% For each $V \in (\vec{V}  \cup \vec{W}) \setminus \{S\}$ mentioned in $\mathfrak{X}_{S_0}$, for each edge $E \in \vec{E}$, we attach a label $L_{E} = \{s: s \in \mathfrak{X}_{S_0}, s \cap \{V\} \neq \emptyset\}$, and for each edge $B \in \vec{B}$, we attach a label $L_{B} = \{s: s \in \mathfrak{X}_{S_0}, s \cap \vec{V} \neq \emptyset\}$.
% Let the set of all such defined labels be denoted as $\vec{L}$.
% Then, $\G = (\vec{V}, \vec{W}, \vec{R}, \vec{B}, \vec{L})$ denotes a labelled selection CADMG (LS-CADMG).

Given an LS-CADMMG ${\G}^{[]}(\vec{V},\vec{W})$ where $S \in \vec{W}$, the \emph{context %$s$ specific
selected graph} ${\G}^{[s]}(\vec{V},\vec{W})$ is defined by removing every edge such that $s$ is serious for any vertex in that edge's label, %keeping
removing labels for all other edges, and removing every unlabelled duplicate edge of the same type connecting every pair of vertices.
%while removing their labels.
Note that %such a graph is
this construction always yields a CADMG.  Note also that if $s=\emptyset$, all parents and siblings of $\G$ are preserved in $\G^{[s]} = \G^{[\emptyset]}$, but the resulting object is no longer a multigraph.

%We will always apply
The fixing operator and genealogic relations generalize in a straightforward way to the multigraphs we consider.  In particular, multiple labelled edges of the same type connecting vertices $A$ and $B$ are treated as a single edge of that type, with labels ignored.
% to graphs obtained from a multigraph $\bar{\G}^{[]}$ by evaluating them at some value $s$.
If a graph index for a genealogic set is omitted, it is understood to be ${\G}^{[\emptyset]}$.
% Standard fixability and genealogical relations in an LS-CADMG $\G$ are defined with respect to $\G^{[\emptyset]}$.
%by ignoring labels.

\subsection{Towards Identification Under SNAR}
%Systematic Selection}

A seemingly reasonable approach for obtaining identification of interventional distributions $p(\vec{Y}(\vec{a}, S=\emptyset))$ given a hidden variable CS-SCM represented by an LS-ADMMG ${\cal G}^{[]}(\vec{V})$
%in the presence of systematic selection
is to first address systematic selection by identifying $q_{\vec{V}}(\vec{V}) \equiv p(\vec{V} | \text{do}(S=\emptyset))$, corresponding to the SWIG ${\cal G}^{[\emptyset]}(\emptyset)$, and then invoke the ID algorithm on this CADMG, the distribution ${p}(\vec{V})$ and the query $p(\vec{Y}(\vec{a}, S=\emptyset))$.
This strategy clearly yields a sound algorithm.  In fact, we can show that despite the extra context-specific independencies implied by a CS-SCM,
%and represented by an LS-DAG ${\cal G}$,
we have the following result.

\begin{restatable}%[Soundness and completeness for $S=\emptyset$ interventions]
{thm}{sidsoundness}
%   \cref{alg:main-general} is complete for queries of the form $p(\vec{Y} \mid \doo(\emptyset))$. 
Given a hidden variable CS-SCM represented by an LS-ADMMG ${\G}^{[]}(\vec{V})$,
the ID algorithm with causal query
$p(\vec{V} | \text{do}(S\!=\!\emptyset))$, data distribution $p(\vec{V})$, and ADMG ${\G}^{[\emptyset]}$
is sound and complete.
% for the query  in any ICS-SCM represented by the LS-DAG $\bar{\cal G}^{[]}$.
\label{thm:id-s-0}
\end{restatable}

Despite this, the above sequential strategy does not yield a complete algorithm for systematic selection for the more general causal query $p(\vec{Y} (\vec{a}, S=\emptyset))$.
To see why, consider the following simple example, illustrated by the hidden variable LS-DAG $\bar{\cal G}^{[]}$ shown in Fig.~\ref{fig:id-double-bow}, where we are interested in $p(Y(a,S=\emptyset))$.  %It is well known 
Completeness of the ID algorithm implies that this interventional distribution is not identified under standard SCM semantics corresponding to this graph.  Theorem~\ref{thm:id-s-0} above also implies that the distribution $p(Y, A \mid \text{do}(S=\emptyset))$ is not identified.

However, identification is obtained in a CS-SCM due to the following simple derivation:
{\small
\begin{align*}
p(Y(a,S=\emptyset)) &= p(Y(a)) = p(Y(a) | S = (s^e_A=1,s^v_A = a))\\
&= p(Y(a) \mid A=a, S = (s^e_A=1,s^v_A = a))\\
&= p(Y \mid A=a, S = (s^e_A=1,s^v_A = a)).
\end{align*}
}Here the first equality follows by the exclusion restrictions in this model (or by rule 3 of the potential outcomes calculus \citep{malinsky19po}). The second equality follows since $Y(a) \ci S$ in this model,  which may be verified by the context selected SWIG shown in Fig.~\ref{fig:id-double-bow-swig}.
%More details on SWIGs may be found in \citep{thomas13swig}.
The third equality follows by definition of the CS-SCM, and the final equality by consistency.

%Success of identification here
This derivation is explained by noting that $S$ acts as a \emph{perfect instrument} for the effect of $A$ on $Y$.  Specifically, $S$ only influences $Y$ through $A$, and $S$ is independent of any confounders for $A$ and $Y$.  In addition, unlike standard instruments, $S$ completely determines the value of $A$, thereby eliminating any influence of the confounder $U_2$ on $A$.
In light of examples like the one above, we formulate a general identification algorithm that is able to handle both systematic selection and unobserved confounding together.

%\begin{dfn}[Latent projections of an LS-DAG]
    
% {\color{red}[How do we define the latent projection to go from LS-DAG to LS-ADMG? The problem is, you may need a \emph{boolean function} to describe S settings that make some latent projection edge go away.  Imagine $A$ has two bidirected paths to $B$, and $S$ can break each of them in one of two places: $W_1,W_2$ and $Z_1,Z_2$.  Then is our label $(W_1 \lor W_2) \land (Z_1 \lor Z_2)$?  We either do that, or we have to potentially add multiple $\to$ and $\leftrightarrow$ between $A$ and $B$ with different labelings.]}
% \end{dfn}

%{\color{red}$S$ could be in $W$.}
% \begin{dfn}[Context $s$ specific graph]
% Given an LS-CADMG $\G(\vec{V}, \vec{W})$, \ilya{where $S \in \vec{V} \cup \vec{W}$} and a value $s = (s^e, s^v) \in \mathfrak{X}_S$, a context $s$ specific graph ${\cal G}^{[s]} $ is defined by the LS-CADMG %{\color{red}
% $\G'(\vec{V}, \vec{W}) $ with edges $\vec{E}'$ and no labels. 

% %= (\vec{V}, \vec{E}', L=\emptyset)$\footnote{[inconsistent notation]}} where $\vec{E}' = \{E \mid s^e \not \in L_E\}$.
% %$\ch_{\bar{\G}}(S) \cap \vec{R}$ that $s$ is serious about become fixed vertices, 

% % $4$-tuple
% % $(\vec{V}, \vec{W}, \vec{E}', \vec{B}')$ representing a CADMG derived from a LS-CADMG ${\cal G}$ with quintuple $(\vec{V}, \vec{W}, \vec{E}, \vec{B}, \vec{L})$, where $\vec{E}' \equiv \{E = (X \to Y)  \in \vec{E}\mid  s\not \in L_{E}\}$, and $\vec{B}' \equiv \{B = (X \leftrightarrow Y ) \in \vec{B} \mid  s\not \in L_{B}\}$.
% \end{dfn}

%Specifically, if $S$ assumes certain values, certain elements of $\ch_{\bar{\G}}(S)$ become constants.
 
% \begin{dfn}[Context specific graphical fixing operator] \label{dfn:cs_graphical_fixing_operator}
% Let $\G(\vec{R},\vec{W})$ be %an LS-CADMG
% an LS-CADMG derived from an ADMG $\bar{\G}(\vec{V} \cup \{ S \})$
% which is a latent projection of a DAG $\bar{\G}(\vec{V} \cup \{S \} \cup \vec{H})$ representing an ICS-SCM.
% Let $s \in \mathfrak{X}_S$ be a value of $S$, and let $V \in \vec{R}$ be fixable in $\G(\vec{R},\vec{W})$.
% %$\mathbb{F}(\G) = \{ V \in \vec{V} \mid \dis_\G (V) \cap \de_\G (V) = \{V\}\}$ be the set of fixable vertices
% %(recalling that genealogical relations in LS-CADMGs are defined ignoring labelled edges)
% %and let $V \in \mathbb{F}(\G)$ be any such fixable vertex.
% Then, we define the $s$-specific fixing operator $\phi_V^s(\bar{\G}(\vec{R},\vec{W}))$ as
% $\phi_V(\G(\vec{R},\vec{W}))$ if $V \neq S$, 
% %and as a CADMG obtained from $\phi_V(\G(\vec{R},\vec{W}))$ where all elements in 
% %$\ch_{\bar{\G}}(S) \cap \vec{R}$ that $s$ is serious about become fixed vertices, 
% and $\phi_V (\G^{[s]} (\vec{R}, \vec{W}))$
% if $V=S$.
% %\[
% %\phi_S^s (\G) \equiv \G^{[s], *}(\vec{V} \setminus \{V\}, \vec{W} \cup \{V\})
% %\]
% %returns a CADMG where $\G^{[s], *}$ has precisely the subset of edges of graph $\G^{[s]}$ which do not have arrowheads at $\{V\}$;
% \end{dfn}

% \begin{dfn}[Context specific probabilistic fixing operator]\label{dfn:cs_probabilistic_fixing_operator}
% For $\G(\vec{R}, \vec{W})$ an LS-CADMG with kernel $q_{\vec{V}} (\vec{R} \mid \vec{W})$, for $V \in \vec{R}$ fixable in $\G(\vec{R}, \vec{W})$, for $s \in \mathfrak{X}_S$ a value of $S$, define
% %the s-specific fixing operator
% $\phi_V^s (q(\vec{R} \mid \vec{W}); \G(\vec{R}, \vec{W}))$
% %is defined as
% as
% \[\begin{cases}
% q(\vec{R} \mid \vec{W}) / q(V \mid \nd_{\G(\vec{R}, \vec{W})} (V)) & V \neq S \\
% \left(q(\vec{R} \mid \vec{W}) / q(V \mid \nd_{\G(\vec{R}, \vec{W})} (V))\right)\mid_{S = s}& V = S
% \end{cases}\]
% \end{dfn}

% \ilya{
% A sequence $\pi$ of vertices in an LS-CADMG ${\cal G}(\vec{V},\vec{W})$
% is said to be $s$-valid for a value $s$ if either $\pi = \langle \rangle$, or $h(\pi)$ is $s$-fixable in ${\cal G}$, and $t(\pi)$ is $s$-fixable in $\phi^s_{h(\pi)}({\cal G})$.
% It is straightforward to verify that any two $s$-valid sequences yield the same LS-CADMG.  Thus, if an $s$-valid sequence for $\vec{Z} \subseteq \vec{V}$ exists in an LS-CADMG ${\cal G}$, we write $\phi^s_{\vec{Z}}({\cal G})$ to denote the resulting LS-CADMG (obtained by any such sequence).

% {\color{red}[Define kernel s-fixing carefully.  We need to get rid of $S$ and any children that mimic $S$ at once.  Or say somewhere we only get rid of such children in a way consistent with $s$.]}
% }
%The probabilistic version of the fixing operator applied to distributions in an ICS-SCM stays the same, however, it exhibits extra context-specific independences advertised by the CADMGs $\G^{[s]}$ returned by the operator $\phi^s$.

% \begin{dfn}(Context specific probabilistic fixing operator)
% Let $\G$ be a LS-CADMG induced from the latent projection of a causal model $M \in {\cal M}_{CS}(\G)$, and let $q_{\vec{V} \mid \vec{W}}(\vec{V} \mid \vec{W}) \in {\cal P}_O(\G)$ be a kernel representing the  observed margin of that model $M$. If $S \in \mathbb{F}(\G)$, then 
%         \[\phi_{S}^s (q(\vec{V} \mid \vec{W}); \G) \equiv \frac{q(\vec{V} \mid \vec{W})}{q(S=s \mid \mb_{\G} (S))}\]
% \end{dfn}

% LS-CADMGs admit regular fixing operations that ignore labelled edge information. With slight abuse of notation, we denote sequences of regular and context specific fixing as $\phi^s_{\vec{W} \cup \{S\}} (\G), \phi^s_{\vec{W} \cup \{S\}} (p; \G)$ respectively, where it is understood that the superscript $s$ applies only when the variable to be fixed in sequence is $S$.

% \subsection{The Perfect Instrument Trick}

% The special relationship between the selector $S$ and its children allows us to obtain non-parametric identification in ICS-SCMs where identification fails in the corresponding SCM.

% As we will see below, generalizing this idea will be instrumental for obtaining identification in ICS-SCMs in cases where neither the ID algorithm, nor gID algorithm are able to succeed.


\section{An Identification Algorithm for Systematic Selection}
%{\color{red} TODO we want to say a little more about the kinds of targets that we are interested in identifying - e.g. $p(Y(a))$ vs $p(Y(a, s = laidback for Y*))$
%
%}

Here, we present a general identification algorithm, shown as Algorithms~\ref{alg:main-general} and \ref{alg:cs-general}, for the query $p(\vec{Y}(\vec{a},S=\emptyset))$ in a hidden variable CS-SCM represented by a latent projection multigraph ${\G}^{[]}(\vec{V})$, where $S \in \vec{V}$.  
The algorithm proceeds from the usual factorization used by both the ID and gID algorithms:
{\small
\begin{align*}
    p(\vec{Y}(\vec{a},S=\emptyset)) =
    \sum_{\vec{Y}^*\setminus\vec{Y}}
    \prod_{\vec{D}^* \in {\cal D}({\G}^{[]}(\vec{a},\emptyset)_{\vec{Y}^*})}
    \!\!\!\!
    p(\vec{D}^* | \text{do}(\pas(\vec{D}^*)))
\end{align*}
}Consider the following example, which illustrates how each term in this factorization is identified by one of three cases: either directly by the ID algorithm, or the gID algorithm, or a new case, formalized via Algorithm~\ref{alg:cs-general}, which obtains identification via the most general version possible of the perfect instrument trick described in the previous section.  Failure cases return either the hedge \citep{shpitserIdentificationJointInterventional2006}, or the thicket \citep{leeGeneralIdentifiabilityArbitrary2019}.
\begin{algorithm}[h]
    
\caption{SS-ID (systematic selection ID)}\label{alg:main-general}
\KwData{${\G}^{[]}, \vec{a}, \vec{Y}, p(\vec{V})$}
\KwResult{$p(\vec{Y}(\vec{a}, S=\emptyset))$ or FAIL}
$\vec{Y}^* \gets \an_{{\G}^{[]}
%_{\vec{V} \setminus (\vec{A} \cup \{S\})}
(\vec{a},\emptyset)
} (\vec{Y})$ \;
\For{$\vec{D}^* \in {\cal D}({\G}^{[]}_{\vec{Y}^*})$}{
    \If{no $s$ exists that is laidback for $\vec{D}^*$}{
    \Return FAIL(positivity) \label{alg:general_fail_positivity}
    }
    %$p(\cl(\vec{D}^*) \mid \doo(\pa^s(\cl(\vec{D}^*)))) \gets \phi_{V \setminus \cl(\vec{D}^*)} (p(V); \G)$ \;
    \eIf{$\cl(\vec{D}^*) = \vec{D}^*$}{
            $q(\vec{D}^* | \pas(\vec{D}^*)) \gets \phi_{\vec{V} \setminus \vec{D}^*} (p; {\G}^{[]}
            %(\vec{V})
            )\vert_{S=s}$,\\ %\; 
                        %for any
                        $s$ laidback for $\vec{D}^*$, consistent with $\vec{a}_{\pas(\vec{D}^*)}$ %\;
            \label{alg:regular_fixing_kernel}
        
    %$q \gets p(\cl(\vec{D}^*) \mid \doo(\pas (\cl(\vec{D}^*)))) \mid_{A=a}$
    }{
                $\tilde{\G} \gets \phi_{\vec{V} \setminus \cl(\vec{D}^*)}({\G}^{[]})$;
                $\tilde{q} \gets \phi_{\vec{V} \setminus \cl(\vec{D}^*)}(p; {\G}^{[]})$\;
        \eIf{$S \not \in \cl(\vec{D}^*)$}{
                \eIf{
                there is $s$ laidback for $\vec{D}^*$, consistent with $\vec{a}_{\pas(\vec{D}^*)}$, and $\vec{D}^*$ reachable in $\tilde{\G}^{[s]}$  
                }{
                \label{alg:s_fixing_kernel}
                $q(\vec{D}^* | \pas(\vec{D}^*)) \gets %\phi^s_{\vec{V} \setminus \vec{D}^*} (p(\vec{V})\mid_{S=s}, \G^{[s]}(\vec{V}))$
                \phi_{\cl(\vec{D}^*) \setminus \vec{D}^*}(\tilde{q}; \tilde{\G}^{[s]})$
                %\;
                }{         
                \Return FAIL(thicket) \label{alg:general_fail_thicket}
                }
        }{

            $q(\vec{D}^* | \pas(\vec{D}^*))\gets{}$ %\text{output of }
            \cref{alg:cs-general}$(\tilde{\G}$,$\vec{a}$,$\tilde{q}$,$\vec{D}^*$,$\cl(\vec{D}^*))$ \label{alg:cs_general_call}
            %with ;
        
        }
        % $q \gets \cref{alg:cs}
        % (\phi_{V \setminus \cl_\G(\vec{D}^*)} (\G),$\\\quad$ a, \phi_{V \setminus \cl_\G (\vec{D}^*)} (p(V); \G)), \vec{D}^*)$}{$q \gets \cref{alg:s-fixable} (\phi_{V \setminus \cl_\G(\vec{D}^*)} (\G),$\\\quad  $a, \phi_{V \setminus \cl_\G (\vec{D}^*)} (p(V); \G)), \vec{D}^*)$}
    }
    %$r \gets r * q$
}
%Compute $\mathfrak{F}_{\Pi(D, \mathfrak{X}_S^+)}(s)$\;
%\eIf{$\mathfrak{F}_{\Pi(D, \mathfrak{X}_S^+)}(s) \neq \empty$}{
%   \Return $q \mid_{A=a}$
%    } 
%  }
%{
%\Return FAIL(positivity)
%}
\Return $\sum_{\vec{Y}^* \setminus \vec{Y}} \prod_{\vec{D}^*} q(\vec{D}^* | \pas(\vec{D}^*)) \vert_{\vec{a}_{\vec{A} \cap \pas_\G (\vec{D}^*)},s_{\vec{D}^*}}$,\\
with $s_{\vec{D}^*}$ %is a value of $S$
laidback for
$\vec{D}^*$, consistent with $\vec{a}_{\vec{A} \cap \pas_\G (\vec{D}^*)}$.
\end{algorithm}

\begin{algorithm}[h]
\caption{Identification for a confounded selector} \label{alg:cs-general}
\KwData{${\G}^{[]}(\vec{C},\pas(\vec{C})), a, q_{\vec{C}}(\vec{C} | \pas(\vec{C})), \vec{D}^*, \vec{C}$;\\
where $\vec{C} \equiv \cl(\vec{D}^*)$;}
\KwResult{$q_{\vec{D}^*}(\vec{D}^* \mid \pas(\vec{D}^*))$ or FAIL}

$\ch^*(S) \gets \ch(S) \cap ( \cl(\vec{D}^*) \setminus \vec{D}^*) $\;

\eIf{$\ch^*(S) = \emptyset$}
    { 
    
    \Return FAIL(hedge$\langle \vec{D}^*, \cl(\vec{D}^*) \rangle$)\;\label{alg:cs_fail_1}

    }{
   \For{$\bar{s} \in \mathfrak{X}^+_S$ 
   which are laidback for $\vec{D}^*$ but serious for $\vec{Z} \subseteq \ch^*(S)$ at %values
   $\vec{z}$
   %that are
   consistent with $\vec{a}_{\pas(\vec{D}^*)}$}{
        Let $\vec{D}' \in {\cal D}({{\G}^{[]}(\bar{s})})$, s.t. $\vec{D}^* \subseteq \vec{D}'$ \label{alg:encapsulating_district}\;
        \If{
            $\{ D(\bar{s}) : D \in \vec{D}' \cap \de(S) \} \ci S \mid
            \{ D(\bar{s}) : D \in \vec{D}' \cap \nd(S) \} \cup \pas(\vec{D}')$ in ${\G}^{[]}(\bar{s})$ and
            $\vec{D}^*$ is reachable in ${\G}^{[s]}(\bar{s})_{\vec{D}'}$
            \label{alg:perfect_iv_trick}
            }{
             $q^{s,\vec{z}}_{\vec{D}'}(\vec{D}' | \pas(\vec{D}')) \gets
                 \left[ \prod\limits_{D \in \de(S) \cap \vec{D}'} q_{\cl(\vec{D}^*)}(D | \bar{s},\pre_{\prec}(D)
                 %, \pas_{\G}(\vec{D}')
                 ) \right] \vert_{\vec{Z} = \vec{z}}\linebreak \times
                 \left[ \prod\limits_{D \in \nd(S) \cap \vec{D}'} q_{\cl(\vec{D}^*)}(D | \pre_{\prec}(D)
                 %, \pas(\vec{D}')
                 ) \right] \vert_{\vec{Z} = \vec{z}}
                 $, where $\pre_{\prec}(D)$ are topological predecessors of $D$ in $\vec{D}' \cup \pas(\vec{D}')$\;
             \Return $\phi_{\vec{D}' \setminus \vec{D}^*}(q^{s,\vec{z}}_{\vec{D}'};{\G}^{[s]}(\bar{s})_{\vec{D}'})$\; 
            }
        
        }
    \Return FAIL\label{alg:cs_fail_2}\;
    }
\end{algorithm}


% $p(\vec{D}^* | \text{do}(\pas(\vec{D}^*))$ is not identified directly, but is identified from an identified distribution $p(\cl(\vec{D}) | \text{do}(\pas(\cl(\vec{D}^*))))$ by taking advantage of available interventional distributions if $S \not\in \cl(\vec{D}^*)$ but $S$ indexes useful interventional distributions 

% We will consider identification of some treatment $\vec{a}$ on some outcome $\vec{Y}$, in the setting where $S$ is laid-back (i.e. not intervened upon) for all variables. Intuitively, this is the causal effect of $\vec{a}$ on $\vec{Y}$ in a purely observational domain, which excludes experimenter-determined randomization probabilities that are not of interest. This identification target is similar in spirit to the target implied by \cite{leeGeneralIdentifiabilityArbitrary2019}.

% Formally, we define this target as $p(\vec{Y}(\vec{a}, S=\emptyset))$, from the observed data distribution $p(\vec{V})$, in LS-ADMMG $\G (\vec{V})$. 

% {\color{olive}
% We introduce a further restriction on the available interventional distributions, which makes this setting less general than gID's input. We allow $\ch_\G(S)$ to be unrestricted. However, for each child of $S$, we require that there be some intervention that includes that child, and for each intervention set available, we require that every intervention superset up to $\ch_\G (S)$ is also available.
% }

%As before, note that $\mathfrak{X}_S^+ \subseteq \mathfrak{X}_S$.
% Note that
% this does not require that data from $S=(\emptyset,\emptyset)$ be available,
% because \cref{ass:mechanism_stability_assumption} or conditional independence
% relations involving $S$ could enable identification.

%The data available for identification is denoted $p(\vec{V})$, where $\mathfrak{X}_S^+ \subseteq \mathfrak{X}_S$.
\begin{figure}[h]
\centering
        \begin{tikzpicture}[rotate=90]
		\tikzstyle{block} = [draw, circle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{block2} = [draw, rectangle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}

            \node[inner sep=1pt] (w2) at (1, 0) {$W_2$};
            \node[inner sep=1pt] (a3) at (0, 0) {$A_3$};
            \node[inner sep=1pt] (c) at (1, -1) {$C$};
            \node[selector] (s) at (0, -1) {$S$};
            \node[inner sep=1pt] (a1) at (0, -2) {$A_1$};
            \node[inner sep=1pt] (a2) at (1, -2) {$A_2$};
            \node[inner sep=1pt] (w1) at (1, -3) {$W_1$};
            \node[inner sep=1pt] (m) at (0, -3) {$M$};
            \node[inner sep=1pt] (y) at (0, -4) {$Y$};
            
            \draw[-stealth] (m) to  (y);
            \draw[-stealth] (w1) to  (y);
            \draw[-stealth] (a1) to  (m);
            \draw[-stealth][bend left=15] (a2) to  (w1);
            \draw[-stealth] (s) to  (a1);
            \draw[-stealth][bend right=35] (s) to  (a2);
            \draw[-stealth] (c) to  (s);
            \draw[-stealth][bend left=55] (c) to (y);
            \draw[-stealth] (a3) to  (s);
            \draw[-stealth][bend right=0] (w2) to  (s);
            \draw[-stealth][bend left=30] (w2) to  (w1);
            \draw[stealth-stealth][bend right=20] (a2) to node[above]{{ \tiny $\{A_2\}$}}(y);
            \draw[stealth-stealth][bend right=15] (a1) to node[below]{\tiny $\{A_1\}$}(m);
            \draw[stealth-stealth][bend left=60] (w2) to (y);
            \draw[stealth-stealth][bend left=0] (w2) to (a3);
            \draw[stealth-stealth][bend left=0] (s) to node[above, xshift=-0.1cm]{\tiny $\{A_2\}$}(a2);
            \draw[stealth-stealth][bend right=0] (a2) to node[below]{\tiny $\{A_2\}$}(w1);
        \end{tikzpicture}
  %       \begin{tikzpicture}[rotate=90]
		% \tikzstyle{block} = [draw, circle, inner sep=1.5pt, fill=lightgray]
		% \tikzstyle{block2} = [draw, rectangle, inner sep=1.5pt, fill=lightgray]
		% \tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		% \tikzstyle{input} = [coordinate]
		% \tikzstyle{output} = [coordinate]
  %           \tikzset{edge/.style = {->,> = latex'}}

  %           \node[] (w2) at (-1, 0) {$W_2$};
  %           \node[] (a3) at (0, 0) {$A_3$};
  %           \node[] (c) at (1, 0) {$C$};
  %           \node[selector] (s) at (0, -1) {$S$};
  %           \node[] (a1) at (-.75, -2) {$A_1$};
  %           \node[] (a2) at (.75, -2) {$A_2$};
  %           \node[] (w1) at (.75, -3) {$W_1$};
  %           \node[] (m) at (-.75, -3) {$M$};
  %           \node[] (y) at (0, -4) {$Y$};
            
  %           \draw[-stealth] (m) to  (y);
  %           \draw[-stealth] (w1) to  (y);
  %           \draw[-stealth] (a1) to  (m);
  %           \draw[-stealth] (a2) to  (w1);
  %           \draw[-stealth] (s) to  (a1);
  %           \draw[-stealth] (s) to  (a2);
  %           \draw[-stealth] (c) to  (s);
  %           \draw[-stealth][bend left=45] (c) to (y);
  %           \draw[-stealth] (a3) to  (s);
  %           \draw[-stealth] (w2) to  (s);
  %           \draw[-stealth] (w2) to  (w1);
  %           \draw[stealth-stealth][bend right=35] (a2) to node[above]{{ \tiny $\{A_2\}$}}(y);
  %           \draw[stealth-stealth][bend right=35] (a1) to node[below]{\tiny $\{A_1\}$}(m);
  %           \draw[stealth-stealth][bend right=45] (w2) to (y);
  %           \draw[stealth-stealth][bend left=35] (w2) to (a3);
  %           \draw[stealth-stealth][bend left=35] (s) to node[above]{\tiny $\{A_2\}$}(a2);
  %           \draw[stealth-stealth][bend left=35] (a2) to node[above]{\tiny $\{A_2\}$}(w1);
  %       \end{tikzpicture}
        %\caption{}
%\centering
%\end{subfigure}%
\caption{An LS-ADMMG illustrating %the application of
Algorithm~\ref{alg:main-general}.}
\label{fig:general-case-example}
%\label{fig:fixing_example}
\end{figure}

% In this section we prove soundness and completeness for identifying $p(\vec{Y}(\vec{a}, S = \emptyset))$. All proofs are deferred to the supplement.


% \begin{restatable}[Positivity construction]{thm}{positivityconstruction}\label{thm:positivity-construction}
%     Let  $\vec{D}^*$ be a district of $\G$ where $S$ is not laid-back for $D^*$ for each value in $\mathfrak{X}_S^+$. Let $p(\vec{V}) \equiv p(\vec{V} \mid \textrm{$S$ is not laid-back for $D^*$})$ denote the available data. Then $p(\vec{D}^*) \equiv p(\vec{D}^* \mid \textrm{$S$ is laid-back for $\vec{D}^*$})$ is not identified.
% \end{restatable}
% \begin{proof}
% We first begin by considering $\G^0$, where we retain all vertices of $\G$, bidirected edges for $\vec{D}^*$, and all directed edges for children of $S$ as appropriate. 

% The proof is by induction over the number of directed edges between $\G^0$ and $\G$.

% For the base case in $\G^0$, it is clear that because $\vec{D}^*$ is not laid-back in the provided data, we cannot know the distribution of $\vec{D}^*$ for any value of $S$ which is laid-back for $\vec{D}^*$. This is because marginals cannot identify a joint distribution. Thus $p(\barr_\G (\vec{D}^*)) $ is not identified (QUESTION: why just the barren set here?)

% We now seek to prove the inductive hypothesis. Assume that the barren set of some $\G^i$ is not identified for values of $S$ which are laid back for that set.  Then, construct $\G^{i + 1}$ in one of two ways: addition of  directed edge, where endpoints are in $\vec{D}^*$; addition of directed path where endpoints are in  $\vec{D}^*$, but intermediate nodes are not.

% <some details about adding tilde nodes via 1:1, then cartesian products being applied to combine variables back, with an extra twist to ensure that $S$ relations respected>

% TO MOVE INTO COMPLETENESS PROOF: Once we establish that $p(\vec{D}^*)$ in the original graph w.r.t. laid-back $S$ is not identified, then later we may employ the 1:1 argument to show it's not ID. This will go directly from $\vec{D}^*$ to $\vec{Y}'$.

% \end{proof}

% \begin{restatable}[Noise-injected context-specific hedge]{thm}{noiseinjectedhedge}\label{thm:noise-injected-hedge}
% Fix a graph $\G$ with a vertex set $\vec{V}$ representing a causal model.
% Let $\vec{F}$ and $\vec{F}'$ be two districts in $\G$, where $\vec{F} \subset \vec{F}'$, and $\vec{F}$ and $\vec{F}'$ agree on the set of childless vertices in $\G_{\vec{F}}$ and $\G_{\vec{F}'}$.
% %Let $\G$ denote the graph over $F'$.
% Let $S \in \vec{F}$ the selection variable. Then $p(\vec{F} \mid \doo (\vec{a}))$ for any $\vec{A} \subseteq \vec{F}' \setminus \vec{F}$ is not identified, and we denote this hedge as $\langle \vec{F}, \vec{F}' \rangle$. 
% \end{restatable}




% \begin{restatable}[Generalized perfect instrument trick]{lem}{fullgeneralizedtrick} \label{lem:full_generalized_trick}
% Let $\G$ be a graph containing $S, A$ that is a district $D=\cl(\vec{D}^*)$. Let $A_S = A \cap \ch(S)$. Define $\G^* = \G(\{D \setminus A\}(a))$, and let $\tildep{\vec{D}}$ be a district of $\G(\{D \setminus A_S\})$. Let $\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}})) \ci S \mid (\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}}))) \cup \pas(\tildep{\vec{D}})$ hold, where genealogical relations are taken with respect to $\G^*$ unless otherwise stated. 
% %Let $(A \setminus A_S) \cap \tildep{\vec{D}}$ be fixable in $\tildep{\vec{D}}$. 
% Let $\prec$ be a topological ordering on $\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}}))$ according to $\G_{\tildep{\vec{D}}}$. Let $s$ denote a value of $S$ that is serious for $A_S$ and consistent for $A=a$. Then,
% \restatemarker
% \end{restatable}

% {\color{olive}
% \begin{restatable}[Hedge for $S=\emptyset$ interventions]{thm}{sonlyhedge}\label{thm:s_only_hedge}

% Let $\G$ be a graph with vertex set $\vec{V}$. Let $\vec{F}', \vec{F}$ be districts where $\vec{F} \subset \vec{F}'$, and $S \in \vec{F}' \setminus \vec{R}$, where $\vec{R}$ is the root set. Let $S=\emptyset$ be the desired intervention.
% Then, $p(\vec{F} \mid \doo(S=\emptyset))$ is not identified.

% \end{restatable}
% }

\begin{ex}[Identifying $p(Y(\vec{a}, S = \emptyset))$ in \cref{fig:general-case-example}]\label{ex:general-case-example} 
%Putting all the cases together we have:
The identifying functional is
$p(Y (\vec{a}, S=\emptyset)) = \sum_{\vec{Y}^* \setminus Y} \!\!\prod_{\vec{D}^*_i \in {\cal D}(\G^{[]}(\vec{a}, \emptyset)_{\vec{Y}^*})}  q_{\vec{D}^*_i}(\vec{D}^*_i | \pas(\vec{D}^*_i))$, which is equal to
{\small
\begin{align*}
\sum_{M, W_1, W_2, C}
& p(C) p(M | a_1,%s^e_{A_1}=1,s^v_{A_1}=a_1
    s_{a_1})
 p(W_1 | W_2, a_2, %s^e_{A_2}=1,s^v_{A_2}=a_2
    s_{a_2})\\
&  \sum_{A_3} p(Y | M,W_2,W_1,C,s_{a_1,a_2},A_3) p(W_2,A_3),
%s^e_{\{ A_1,A_2\}}=1,S^v_{\{A_1,A_2\}} = a_1,a_2
\end{align*}
}where $s_{\vec{a}}$ is a shorthand for any value of $S$ which is serious for $\vec{A}$ at values $\vec{a}$.
See the Appendix for a detailed derivation.
\end{ex}

Our results show that the proposed algorithm is sound, and that its failure implies a non-identified query in all but one of its failure cases.  We illustrate a number of failure cases of the algorithm in the Appendix.  We conjecture that this algorithm is also complete.
\vspace{-0.4cm}
\begin{restatable}[Soundness]{thm}{generalsoundness}
    \cref{alg:main-general} is sound.
\end{restatable}
\vspace{-0.2cm}
\begin{restatable}[Non-identification]{thm}{partialcompleteness}\label{alg:partial_completeness}
   If \cref{alg:main-general} fails at \ARef*{alg:general_fail_positivity}, \ARef*{alg:general_fail_thicket}, or \ARef*{alg:cs_fail_1} then the causal effect is not identified.
\end{restatable}
\vspace{-0.4cm}
\section{Conclusions}

In this paper, we have considered the problem of identification of causal effects in settings with multiple datasets, corresponding to the observational or interventional contexts derived from a causal model where units are selected into different contexts \emph{systematically}.  Unlike prior approaches, we represent systematic selection by means of an indicator random variable that is potentially related to other variables in the model in complicated ways.
We show that in the resulting \emph{Context Selected Structural Causal Model (CS-SCM)} systematic selection may be arranged into a hierarchy resembling the hierarchy of systematic censoring in missing data, with possible models including selected completely at random (SCAR), selected at random (SAR), and selected not at random (SNAR).  We show that in SCAR and SAR models, identification of interventional distributions may be obtained by a generalization of the g-formula.

In SNAR settings, where systematic selection and unobserved confounding are present, we provide a general identification algorithm which generalizes the gID algorithm \citep{leeGeneralIdentifiabilityArbitrary2022,kivvaRevisitingGeneralIdentifiability2022}, but which applies in causal models with arbitrarily complex types of systematic selection, and is able to achieve novel identification results using context-specific restrictions found in CS-SCMs.

% {\color{red}Note to self: how do we get around getting label info after fixing, if we need it in, e.g. alg. 2. }
% \begin{restatable}[Completeness for $S=\emptyset$ interventions]{thm}{generalcompleteness}
%    \cref{alg:main-general} is complete for queries of the form $p(\vec{Y} \mid \doo(\emptyset))$. 
% \end{restatable}
    % \begin{itemize}
    %     \item $\ch(S)$ not in forest. Then, immediately appeal to the 1:1 construction of \cite{shpitserIdentificationJointInterventional2006}.
    %     \item $\ch(S)$ is in forest.
    %     \begin{itemize}
    %         \item $S$ is never laidback for some $W \in \vec{Y}^*$ (i.e. there exists $W$ such that $\forall s_0 \in \mathfrak{X}_{S_0}, W \in s_0$). Then, two models can be constructed that differ on the distribution for $p(W)$, which is never observed in the given data. Since $W \in \vec{Y}^*$ this means that the causal effect will differ in the two models, while the observed agree.
    %         \item There exists $s$ which is laidback for $\vec{Y}^*$. Then, the 1:1 construction can be provided in that setting. The observed distributions will agree, and the causal effect will be a sum over $S$, where if $S=s$ then the causal effect in that strata disagrees, and it agrees otherwise. 
    %         \item None of the above.
    %     \end{itemize}
    % \end{itemize}


    % Two modes of failure:
    % 1. if cs-hedge/thicket: assume enough data such that there is a domain that is laidback from D* to Y'.  Assuming more data makes the problem easier. Show that this enables a construction witnessing non-ID. 

    % 2. If it's positivity: do the regular construction.
% UAI 2024 papers have to be prepared using \LaTeX.
% To start writing your paper, copy \texttt{uai2024-template.tex} and replace title, authorship, and content with your own.

% The UAI 2024 paper style is based on a custom \textsf{uai2024} class.
% The class file sets the page geometry and visual style.\footnote{%
%     The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
% }
% The class file also loads basic text fonts.\footnote{%
%     Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
% }
% \emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
% (Also do not use \verb|\vspace| for this.)
% Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
% The class enables hyperlinking by loading the \textsf{hyperref} package.

% You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
% (Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
% Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
% Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
%     See \url{https://ctan.org/pkg/l2tabu}.
% }

% \swap[ ]{in the header of your source file.}{Feel free to include your own macros}

% \section{General Formatting Instructions}
% As a general rule: \emph{follow the template}.

% \subsection{Authorship}
% Reviewing is double-blind.
% However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
% Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

% \subsection{Sectioning}
% Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
% Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
% One unnumbered sectioning command is provided, \verb|\paragraph|.
% It can be used directly below any numbered section level.
% Do not use any other sectioning commands.

% \subsubsection{Typing the Section Titles}
% The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
% Please type them in title case.
% (This is used in the PDF bookmarks.)
% Please also write the \verb|\subsubsection| titles in title case.

% \paragraph{What is title case?}
% \href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
% \begin{quote}
%     Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
%     When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
% \end{quote}

% \subsection{References, Citations, Footnotes}\label{sec:etc}
% \subsubsection{Cross-Referencing}
% Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
% For example, this subsection is Section~\ref{sec:etc}.

% \subsubsection{Citations}
% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:city}.

% \begin{figure}[!htb]
%   \centering
%   \includegraphics[width=0.7\linewidth]{barcelona.jpg}
%   \caption{A View of a Nice City.}\label{fig:city}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.

% \section{Back Matter}
% There are a some final, special sections that come at the back of the paper, in the following order:
% \begin{itemize}
%   \item Author Contributions (optional)
%   \item Acknowledgements (optional)
%   \item References
% \end{itemize}
% They all use an unnumbered \verb|\subsubsection|.

% For the first two special environments are provided.
% (These sections are automatically removed for the anonymous submission version of your paper.)
% The third is the ‘References’ section.
% (See below.)

% (This ‘Back Matter’ section itself should not be included in your paper.)


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{A 
General Identification Algorithm For Data Fusion Problems Under Systematic Selection\\(Supplementary Material)}
\maketitle



%This Supplementary Material should be submitted together with the main paper.

\appendix
\section{Proofs}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \fullgeneralizedtrick*
% \begin{proof}
% Let $s=(s_0, s_1)$ be a value of $\mathfrak{X}_S$ such that $A_S \subseteq s_0$, and $s_1$ is a value consistent with $A=a$. Then,
%     \begin{align*}
%     &q_{\vec{V}} (\vec{D}' \mid \pas_\G (\vec{D}')) \\
%     &=p(\tildep{\vec{D}} \mid \doo(\pas(\tildep{\vec{D}}))) \\
%     &= p(\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}})) \mid \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \doo(\pas(\tildep{\vec{D}})))p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})))  \\
%     &= p(\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}})) \mid \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \doo(\pas(\tildep{\vec{D}})), S = s)p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})))  \\
%     &= \prod_{V_i \in \prec_{\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}}))}} p(V_i \mid \pre_{\prec_{\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}}))}}(V_i), \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \pas(\tildep{\vec{D}}, S=s)  p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})))
%     \end{align*}

% where the first equality holds by relating kernels to intervential distributions, the second holds by chain rule and the fact that non-descendants of an interventional set are not a function of that intervention, the third holds by the assumed conditional independence, and the fourth holds via an application of the g-formula to identification of the first term in the third equality.
% \end{proof}

\csswigs*
\begin{proof}

 This result is established by the proof of soundness of the CSI-separation criterion in \cite{boutilierContextSpecificIndependenceBayesian1996}, the definition of CS-ICMs (specifically, the definition of seriousness of $S$ with respect to any element of $\ch_{\bar{\G}}(S)$), and a direct extension of the results in \citet{thomas13swig}.
\end{proof}

\icsscm*
\begin{proof}
Soundness follows by application of the g-formula to the CS-SCM \citep{robins86new}, and the definition of our query.
%Due to the particular relationship between $S$ and its children, we must also make sure that there exists values of $S$ which are laidback for each element of $\vec{Y}^*$. 

Completeness holds by the following argument. Assume that $S$ is never laidback for some element of $\vec{Y}^*$, which we call $Z$. Then, it is possible to produce two models which have different distributions on $Z$ when $S=\emptyset$. It is then possible to construct two models
which agree on the observed data distribution (which only includes elements in $\mathfrak{X}^+_S$), but disagree on
%This will clearly affect the distribution of
$p(\vec{Y}(a, S=\emptyset))$.
In particular, fix $Y \in \vec{Y}$.  Since $Z \in \vec{Y}^*$, there must be a directed path from $Z$ to $Y$.
Consider two elements of the causal model where the only edges are on the directed path from $Z$ to $Y$, and all vertices are otherwise mutually independent.
Then it is straightforward to construct two elements of the causal model where the mapping from $p(Z)$ to $p(Y)$ given by $p(Y) = \sum_Z p(Y \mid Z) p(Z)$ is one to one.  Since the rest of the vertices of the model are mutually independent, we have that in the two elements we are considering, $p(\vec{Y}(a, S=\emptyset)) = \prod_{\tilde{Y} \in \vec{Y}} p(\tilde{Y})$.  This immediately yields non-identification since we can construct two elements that agree on $p(\tilde{Y})$ for every $\tilde{Y} \in \vec{Y} \setminus \{ Y \}$, and indeed on the observed data distribution, but disagree on $p(Y)$.
% ; however, because $S$ is never laidback for $Y$ this discrepancy does not appear in the observed data distribution.
\end{proof}



\begin{restatable}[Hedge for $S=\emptyset$ interventions]{thm}{sonlyhedge}\label{thm:s_only_hedge}

Let $\G$ be a graph with vertex set $\vec{V}$, with $S \in \vec{V}$ representing a CS-SCM. Let $\vec{F}', \vec{F}$ be bidirected-connected sets in $\G$ where $\vec{F} \subset \vec{F}'$, and $S \in \vec{F}' \setminus \vec{R}$, and $\vec{R}$ is the root set of both $\vec{F}$ and $\vec{F}'$. %Let $S=\emptyset$ be the desired intervention.
Then, $p(\vec{F} \mid \doo(S=\emptyset))$ is not identified.

\end{restatable}

\begin{proof}
We first begin by defining an edge subgraph $\G'$ of $\G$, in which we retain all vertices, all bidirected edges, all directed edges in $\an_\G (S)$, and all directed edges from $S$ to its children. Define $\vec{Z} = \cl_{\G'} (\vec{D}^*) \setminus \an_{\G'} (S)$.

We then consider the districts $\vec{D}' \in {\cal D}(\G'_{\vec{Z}})$. These districts are all bidirected connected components that have bidirected edges to $\an_{\G'} (S)$, and may or may not have children of $S$.

Choose a particular child of $S$, call it $L$. We consider $\vec{D}'_L \in {\cal D}(\G'_{\vec{Z}})$ which contains $L$.

Then, $L$ may be connected to $S$ in one of two ways by a bidirected path (which exists solely in $\vec{D}'_L \cup \an_{\G'} (S)$): either this bidirected path enters $\an_{\G'}(S)$ via $S$, or via $\an_{\G'} (S) \setminus \{S\}$.

\begin{enumerate}
    \item

\textbf{Bidirected path enters $\an_{\G'}(S)$ via $S$}: In the first case, we can isolate the bidirected path from $L$ to $S$. Denote the variables on this path as $W_1, \ldots, W_k$, where some subset of these variables may also be children of $S$. Then it is possible to provide a modified hedge construction in which $p(L, \vec{W} \mid \doo(S=\emptyset))$ is not identified. 


We first describe a process to augment the graph $\G'$, which we denote as $\G''$. $\G''$ inherits all vertices of $\G'$, but only edges present in the subgraph over $\G'_{\vec{W} \cup \{L, S\}}$. We then split $S$ into separate nodes $S^e_{C_i}$ for each child $C_i$, and these $S^e_C$ form a bidirected chain $S^e_Y \leftrightarrow S^e_{C_1} \leftrightarrow \ldots \leftrightarrow S^e_{C_k}$. The last $S^e_{C_k}$ inherits the bidirected edge into $S$. For each child $C$, we replace the $S \to C$ edge with $S^e_C \to C$.  For each child $C$, we create $S^v_C$ that has a single unobserved variable $U_{S^v_C}$ as parent, and has directed edge $S^v_C \to C$. Finally, we replace each bidirected edge $V \leftrightarrow W$ with $V \leftarrow U_{V, W} \rightarrow W$, where $U_{V, W}$ denotes an unobserved variable. $\G''$ is a subgraph of $\G'$ if you marginalize $\vec{U}$ and perform a Cartesian product operation on all $\vec{S}^e = \{S^e_C\}_{C \in \ch_{\G'}(S)}, \vec{S}^v = \{S^v_C \}_{C \in \ch_{\G'}(S)}$.

Given $\G''$, we now describe a procedure to construct two models respecting \cref{dfn:ics-scm} which agree on the observed distribution, but disagree on the causal effect $p(\vec{F} \mid \doo(S=\emptyset))$. Let the cardinality of all variables be 2. In model 1, the value of each variable is equal to the bit parity of the parents. If $S^e_C$ is in the parents, then if $S^e_C = 0$ it takes on the bit parity of the other parents (noting that this is equivalent to the bit parity of all parents, since $0 \oplus X = X$ for any bit $X$), and if $S^e_C = 1$ then the variable takes on the value of $S^v_C$.  The same is true in model 2, except $W_1$ does not pay attention to the bit connecting it and the last $S^e_C$. 

In the observational setting, in both models, the structural equations are the same for $\vec{S}^e, \vec{S}^v$. In model 1, when $\vec{S}^e = \vec{0}$, $L \cup \vec{W}$ effectively counts the bit parity of each $U$ in $\G''$ twice, since $S^e_L$ is zero only if the $U'$ connecting it to $W_1$ is zero. This forms a distribution where values of $L \cup \vec{W}$ which have even bit parity have equal probability, and there is no probability mass elsewhere. If any $S^e_C = 1$, then there exists at least one such $U$ which has only one path to $L \cup \vec{W}$ and so the distribution is uniform. In model 2, when $\vec{S}^e = 0$, $L \cup \vec{W}$ effectively counts the bit parity of each $U$ in $\G''_{L \cup \vec{W}}$ twice, ignoring the $U$ that connects $\vec{W}$ to $S$, resulting in a distribution where values of $L \cup \vec{W}$ which have even bit parity have equal probability and no mass elsewhere. As with model 1, when any $S^e_C = 1$ then the distribution becomes uniform. Thus the observed data distributions agree.

Under an intervention $S = \emptyset$, this has the effect of ensuring that in model 2, the bit parity of $L \cup \vec{W}$ is always even, whereas in model 1 it is uniform, because there is the contribution of the $U'$. This establishes the non-identification of $p(\vec{D}'_L \mid \doo(S=\emptyset))$.

    \item 
\textbf{Bidirected path enters $\an_{\G'}(S)$ via $\an_{\G'} (S) \setminus \{ S \}$}: Since $S$ must have at least one child, we select any child at random; call this child $L$. We consider the districts of $\G'_{\vec{Z}}$, and define $\vec{D}'_L$ to be the district containing $L$. Define $\vec{R}$ to be the root set of $\vec{F}$.

We first begin by providing an augmented graph $\G''$ constructed from $\G'$. This graph retains all vertices of $\G'$, all children of $S$, and all directed edges in the ancestors of $S$. We then split $S$ according to the following procedure: $S$ is replaced with vertices $S^e_C, S^v_C$ for each $C \in \ch_{\G'} (S)$. $S^e_L$ inherits all incoming arrowheads previously into $S$. We create directed edges $S^e_C \to C, S^v_C \to C$ for each child $C$. For all $S^v_C$, we add an unobserved variable $U_{S^v_C} \to S^v_C$, and for all $S^e_C$ other than $S^e_L$, we add $U_{S^e_C} \to S^e_C$. We note that $\G''$ is an edge subgraph of $\G'$ if the $\vec{U}$ are latent projected, and the $\vec{S}^e, \vec{S}^v$ are grouped into a single vertex via the Cartesian product operator.

Next, we define a CS-SCM, by modifying the construction of \cite{shpitserIdentificationJointInterventional2006} in such a way as to respect the context-specific restrictions. 

In model 1, if $V \not \in \ch_{\G''} (S)$ then $V \equiv \oplus \pa_{\G''}(V)$. Otherwise, $V \equiv \begin{cases} \oplus \pa_{\G''}(V) & S^e_V = 0 \\ S^v_V & S^e_V = 1\end{cases}$. 

In model 2, the same is true, except for variables in $\vec{D}'_L$. For those variables, if $V \not \in \ch_{\G''} (S)$ then they only pay attention to parents in $\vec{D}'_L$, and otherwise $V \equiv \begin{cases} \oplus \pa_{\G''_{\vec{D}'_L}}(V) & S^e_V = 0 \\ S^v_V & S^e_V = 1\end{cases}$.

In the observational distribution, both models induce the same distribution. First, we point out that for variables outside of $\vec{D}'_L$, the structural equations (and therefore the parts of the distribution associated with those variables) are the same. Next, we consider variables in $\vec{D}'_L$. First we point out that the distributions over $\vec{S}^e$ are the same in both models, and are uniform random distributions. When $\vec{S}^e = 0$, in model 1 variables in $\vec{D}'_L$ count the bit parity of their parents twice, whereas in model 2 variables in $\vec{D}'_L$ count the bit parity of their parents in $\vec{D}'_L$ twice. Either way this induces a conditional distribution (of $p(\vec{D}'_L \mid \vec{S}^e, \vec{S}^v)$) with equal probability over even bit parities, and zero probability otherwise. When there exists a child $C$ of $S$ such that $S^e_C = 1$, then in both models the conditional distribution $p(\vec{D}'_L \mid \vec{S}^e, \vec{S}^v)$ is uniform because there exists at least one $U$ which has only one path down to $\vec{D}'_L$.

In the interventional distribution where the intervention $\doo(S=\emptyset)$ is applied, the distributions in the two models differ. In model 1, the distribution over $\vec{D}'_L$ is uniform, because the $U$ variables which connect $\an_{\G''} (S)$ to $\vec{D}'_L$ now only have one path to $\vec{D}'_L$. In model 2, the distribution has equal mass assigned to even bit parities for $\vec{D}'_L$, and no mass to other values. 

This establishes the non-identification of $p(\vec{D}'_L \mid \doo(S=\emptyset))$.

\end{enumerate}
If %$\vec{D}'_L \cap \vec{R} \neq \emptyset$
$\vec{R} \subseteq \vec{D}'_L$, then we have a witness for
%are done since this witnesses
the non-identifiability of $p(\vec{R} \mid \doo(S=\emptyset))$, and since $\vec{R} \subseteq \vec{F}$ this proves the claim. 

%Otherwise, the downward extension of \cite{shpitserIdentificationJointInterventional2006} can be used.

%In the second case, there is a bidirected path that enters $\an_{\G'} (S) \setminus \{S\}$. As before, we replace $S$ with the $S^e_C$ and $S^v_C$  for each child of $S$, we replace each $S \to C$ edge with $S^v_C \to C$ and $S^e_C \to C$. We pick a $S^e_C$ to inherit all the bidirected edges of $S$, with each other $S^e_C$ and all $S^v_C$ being independent and receiving a fair coin $U$ as a parent. Finally we replace each bidirected edge with unobserved fair coins $U$.

%[NEED TO DOUBLE CHECK CONSTRUCTION]

% For each district $\vec{D}' \in {\cal D}(\cl_{\G'} (\vec{D}^*) \setminus \an_{\G'} (S))$, if $\vec{D}'$ does not contain a child of $S$, each variable is equal to the bit parity of its parents. If $\vec{D}'$ does contain a child of $S$, then in model 1 it is equal to the bit parity of its parents, whereas in model 2 it is equal to the bit parity of parents ignoring the unobserved $U$ bits to $\an_{\G'} (S)$.

% Then, in the observational setting (where $S$ is not intervened) the two models agree. For $\vec{D}'$ not containing a child of $S$ this is clear because the structural equations are the same. For $\vec{D}'$ which do contain a child of $S$, when $S$ is serious for $\vec{D}'$, this results in a uniform distribution over $\vec{D}'$, and when $S$ is laidback for $\vec{D}'$, in model 1 each unobserved $U$ is counted twice (including the $U$ that span $\vec{D}'$ and $\an_{\G'} (S)$) whereas in model 2 only the $U$ between variables in $\vec{D}'$ are counted twice, resulting in each even bit parity over $\vec{D}'$ having equal probability and no mass elsewhere. Because there are no directed edges between the $\vec{D}'$ directly, this means that conditioned on $\an_{\G'}(S)$ the probability of a given $\vec{D}'$ is independent of another $\vec{D}''$, for $\vec{D}', \vec{D}'' \in {\cal D}(\cl_\G (\vec{D}^*) \setminus \an_\G (S))$. Under an intervention of $S = \emptyset$, the only potential for change is in $\vec{D}'$ containing $S$ (since otherwise the structural equations are exactly the same). In those districts, in model 2 the distribution assigns equal probability to values of $\vec{D}'$ with even bit parity and zero otherwise. However, in model 1 the distribution will be uniform, because of the influence of the $U$ spanning $\vec{D}'$ and $\an_{\G'}(S)$.

% To establish the required result, we return our attention to $\G$, and of $\vec{F}$ generally. At present, the above results show that $p(\vec{D}'_{C} \mid \doo(S=\emptyset))$ are not identified. In order to complete the proof, we must establish that $p(\vec{F} \mid \doo (S = \emptyset))$ is not identified. We can equivalently show that $p(\vec{R} \mid \doo(S = \emptyset))$, where $\vec{R}$ is the set of nodes in $\vec{F}$ which have no children in $\G$. There is no requirement that $\vec{D}'$ and $\vec{R}$ intersect, but there must exist a directed path from any $\vec{D}'_C$ containing a child of $S$ to $\vec{R}$. 

Otherwise, since the intervention is $S = \emptyset$, all edges in the graph $\bar{\G}^{[]}$ may be used in our construction of counterexamples.
%we can rely on the edges from $\vec{D}'_C$ to $\vec{R}$ to exist.
In particular, we can employ the downward extension of Theorem 4 found in \cite{shpitserIdentificationJointInterventional2006} %can be applied
to both elements of the causal model we are constructing as counterexamples. This gives
\begin{align*} 
p(\vec{R} \mid \doo(S = \emptyset)) &= \sum_{\vec{D}'_C} p(\vec{R} \mid \vec{D}'_C, S=\emptyset)  p(\vec{D}'_C \mid \doo (S= \emptyset))
\end{align*}
where it suffices to choose  $p(\vec{R} \mid \vec{D}'_C, S = \emptyset)$ that will yield a one to one mapping from $p(\vec{D}'_C \mid \doo (S= \emptyset))$ to $p(\vec{R} \mid \doo(S = \emptyset))$ in the above equation. %is made.
%where $\vec{D}' \neq \vec{D}'_Y$, we
%Such a bidirected path has to exist. 
% Finally, extending this result to $p(\vec{Y} \mid \doo(S = \emptyset))$ entails another application of downward extension, since $\vec{R}$ is guaranteed to be among the ancestors of $\vec{Y}$ in the SWIG $\G^{[\emptyset]}(S=\emptyset)$.
%In this case, we isolate the 


\end{proof}

% \noiseinjectedhedge*
% \begin{proof}
% {\color{olive}

% %Assume that $S$ is in $\vec{F}' \setminus F$.
% To begin, we start with a revision of the techniques introduced in \cite{shpitserIdentificationJointInterventional2006}. [INSERT REVISION HERE]% Given this augmented graph $\G'(\vec{V} \cup \{S\})$, we construct two elements of the causal model according to the bit parity scheme of \cite{shpitserIdentificationJointInterventional2006}. We start by replacing each bidirected edge in $\G'$ with a random fair coin $U_{X, Y}$ for endpoints $X, Y$. For each $S^v_C$, we add a random fair coin parent. In model 1, each variable $V \in \vec{V}$ is equal to the bit parity of the parents, whereas in model 2 the same is true except variables in $\vec{F}$ only pay attention to parents in $\vec{F}$. Each such model is an SCM, as this constitutes a structural equation defining each $f_V (\pa_\G' (V), \epsilon_V)$.

% We will consider three variations that we must handle, corresponding to the three fail cases present in \cref{thm:noise-injected-hedge}.

% \begin{enumerate}
%     \item \ARef*{alg:cs_fail_1}: 
%  First, we consider the special case where $\ch_{\G}(S) \cap (\vec{F}' \setminus \vec{F}) = \emptyset$. 

% IF $S$ does not have children in $\vec{F}$, then $S$ does not have children anywhere in $\vec{F}'$. Then, there are no context specific edges in the subgraph $\G_{\vec{F}'}$, and therefore the regular construction from \cite{shpitserIdentificationJointInterventional2006} immediately applies. 

% If $S$ does have children in $\vec{F}$, then the regular construction from \cite{shpitserIdentificationJointInterventional2006} applies with modifications ensuring that the construction respects context specificity. The intuition is that when $S$ is laid-back for $\vec{F}$, the regular hedge construction is active (and the desired discrepancy is exhibited). When $S$ is serious for $\vec{F}$, then for $\abs{\vec{F}} \geq 2$ this ensures that there exists at least one unobserved bit which has only one path to the root, meaning that the distribution is uniform in both models. If $\vec{F}$ only has one variable then in both models then all the mass is on the value implied by $S$ in both models. In either case, when $S$ is serious for $\vec{F}$ both models will agree on the observed and counterfactual distributions. 


% \item \ARef*{alg:cs_fail_2}: In this instance, $S \not \in \vec{F}'$, but may be a parent of $\vec{F}'$. We use the standard techniques of \cite{shpitserIdentificationJointInterventional2006} to construct two models for $\langle \vec{F}, \vec{F}' \rangle$. To ensure this is a valid ICS-SCM, we must introduce further modifications. First, for each variable which is a child of $S$, we modify the equation as 
% \[\begin{cases} C & S^e_C = 0 \\ S^v_C & S^e_C = 1 \end{cases}\] in both models, where $C$ is the random variable as it appeared in the original formulation of \cite{shpitserIdentificationJointInterventional2006}. Then, when $S^e_C = 0$ for all such $C$, the hedge acts normally and the discrepancy in the distribution $p(Y(a, \emptyset))$ is due to the hedge construction. When $S^e_C = 1$ for some $C$, then in both models, in both the observed and interventional distributions, the distribution over the root is uniform as there exists at least one unobserved variable which has only one path down to the root set.

% To finish the construction, we perform a modified downward extension (to be discussed momentarily) and add marginals over all other variables.
%  \item \ARef{alg:cs_fail_3}:
% %it so that when $S^e_C = 1$  
 
%  % Then the construction is a regular hedge as defined in
%  % \cite{shpitserIdentificationJointInterventional2006}. This construction is possible because there are no context-specific relationships in $F'$.  Note that the construction is such that each variable in $\G_{\vec{F}'}$ has at most one child (this is without loss of generality, since structural equations for variables in a causal model defined by $\G$ need not be defined in terms of every parent).
%  \item 


% \end{enumerate}
% % There are three cases that we must handle - the first case where $\ch_\G (S) \cap \vec{F}' = \emptyset$, the second case where the reachable closure did not include $S$, and

% % First, we consider the special case where $\ch_{\G}(S) \cap \vec{F}' = \emptyset$. Then the construction is a regular hedge as defined in
% % \cite{shpitserIdentificationJointInterventional2006}. This construction is possible because there are no context-specific relationships in $F'$.  Note that the construction is such that each variable in $\G_{\vec{F}'}$ has at most one child (this is without loss of generality, since structural equations for variables in a causal model defined by $\G$ need not be defined in terms of every parent).


% % Otherwise, $\ch_\G(S) \cap \vec{F}' \neq \emptyset$. This leads to the following cases:
% % \begin{enumerate}
% %     \item $S$ has siblings in $\vec{F}$.
% %     \item $S$ has no siblings in $\vec{F}$, and has children in $\vec{F}$ only
% %     \item $S$ has no siblings in $\vec{F}$, has children in $\vec{F}$ and $\vec{F}' \setminus \vec{F}$
% % \end{enumerate}

% % When we provide constructions for these cases, in all instances we shall assume that the available data is the powerset of the children of $S$. We will prove that it is not identified even in that circumstance, therefore non-identification holds with less data.


% % \textbf{Case 1}:
% % We replace $S$ with a bidirected chain of $S^e_C$ for each $C \in \ch_{\G_{\vec{F}'}}(S)$, letting the first variable in the chain inherit incoming edges of $S$. We add an additional variable $S^v_C$. 
% % %Each $S^e_C, S^v_C$ has a directed edge to $C$. 
% % Other edges and vertices in $\G_{\vec{F}'}$ are retained with no changes. % except for directed edges outgoing from $S$. 

% % Given this augmented graph $\G'(\vec{V} \cup \{S\})$, we construct two elements of the causal model according to the bit parity scheme of \cite{shpitserIdentificationJointInterventional2006}. We start by replacing each bidirected edge in $\G'$ with a random fair coin $U_{X, Y}$ for endpoints $X, Y$. For each $S^v_C$, we add a random fair coin parent. In model 1, each variable $V \in \vec{V}$ is equal to the bit parity of the parents, whereas in model 2 the same is true except variables in $\vec{F}$ only pay attention to parents in $\vec{F}$. Each such model is an SCM, as this constitutes a structural equation defining each $f_V (\pa_\G' (V), \epsilon_V)$.

% % However, these structural equations do not respect the ICS-SCM. To ensure they do, we amend the structural equations such that for each $C \in \ch_\G (S)$, 
% % \[C \equiv \begin{cases}
% %    f_C (\pa_\G'(C), \epsilon_C) & S^e_C = 0 \\
% %    S^v_C & S^e_C = 1
% % \end{cases}\]

% % If we are in the first case, where $\ch_{\G'}(S) \cap \vec{V} \neq \emptyset$, then this constitutes a valid set of structural equations for the witness. When $S^e_C =1$ for some $C$, then that $C \in \vec{F}$ becomes $S^v_C$ and is no longer influenced by other unobserved fair coins $\vec{U} \cap \vec{F}$. This means that there is a $U \in \vec{U}$, that lives in $\vec{F}$, which has $C$ as one of its endpoints, that now only has one path down to the root $\vec{R}$. This means that the distribution on the root will be uniform in both models. If $S^e_C= 0$ for all $C$ then we return to the original hedge construction, which as previously discussed preserves agreement on the observed data distribution. Under an intervention, where $A=a$, and $S^e_C = 0$ for all $C$, in model 1, the distribution on the root set is now uniform, whereas in model 2 it remains unchanged.

% % \textbf{Case 2}: Compared with case 1, we alter the base construction slightly.

% % We replace $S$ with a bidirected chain of $S^v_C$ (as opposed to $S^e_C$) for each $C \in \ch_{\G_{\vec{F}'}} (S)$, letting the first variable in the chain inherit incoming edges of $S$. We add an additional variable $S^e_C$ which is a sibling of the corresponding $S^v_C$. 

% % We construct the underlying hedge construction with ICS-SCM compliant modifications as per Case 1.


% % If we are in the second case, where there is a sibling of $S$ in $\vec{F}$, then additional modifications are required. We choose the variable $Y$ that is the sibling of $S$ in $\vec{F}$, and we use the unobserved fair coin $U$ (which now has children $Y, S^e_Y, S^v_Y$) connecting them to selectively alter the structural equations in both models as follows:
% % \[Y \equiv \begin{cases} f_Y (\pa_{\G'} (Y), \epsilon_Y), &U = 0 \\
% % f_Y (\pa_{\G'} (Y) \oplus \tilde{U} & U = 1\end{cases}\]
% % where $\tilde{U}$ is a new random fair coin that only points to $Y$.

% % The reason this modification is required, is  that when $S^e_C = 1$, that there is a $U'$ which previously pointed into $C$ that now has only one path to the root. This will affect agreement on the observed data distributions, since in model 1 the distribution over the root will be uniform, whereas in model 2 the distribution will continue to be uniform only on even bit parities. The modification ensures that model 2 has uniform bit parity whenever $S^e_C = 1$ through the shared $U$ bit.
% %This is because when $S^e_C = 1$ for some $C \in \ch_\G (S) \cap (\vec{F}' \setminus 

% % First case: Augment starting graph, where all the $S^e$ form part of the hedge, all the $S^v$ hang off to the side. Add noise injection (MODIFY EXISTING PROOF FOR THIS CASE)

% % Second case: Regular hedge except that we make sure it conforms to the ICS-SCM, no noise needed.

% }
% First, we consider the special case where $ch_{\G}(S) \cap \vec{F}' = \emptyset$. Then the construction is a regular hedge as defined in
% \cite{shpitserIdentificationJointInterventional2006}. This construction is possible because there are no context-specific relationships in $F'$.  Note that the construction is such that each variable in $\G_{\vec{F}'}$ has at most one child (this is without loss of generality, since structural equations for variables in a causal model defined by $\G$ need not be defined in terms of every parent).

% Second, we consider cases where $ch_{\G}(S) \cap \vec{F}' \neq \emptyset$. We now describe a procedure to augment the starting graph $\G_{F'}$ to yield an augmented graph $\G'$. The idea is to create separate bits of $S$ for each variable that represent the experiment indicator ($S_0^C$) and value ($S_1^C$).% code for the value ($S^v$)

% We replace $S$ with a bidirected chain of $S_{1}^C$ for each $C \in \ch_{\G_{\vec{F}'}} (S)$, letting the first such variable inherit edges of $S$ in $\G$ except for directed edges outgoing from $S$. Then, we add a sibling $S_0^C$ to each $S_1^C$ for each $C$.  Other edges and vertices in $\G_{\vec{F}'}$ are kept without change.

% Note that merging all $S_{0}^C,S_{1}^C$ vertices in $\G'$ into $S$ (by merging all $S_0^C$ to form $S_0$, and all $S_0^C$ to form $S_1$, and then defining $S = (S_0, S_1)$), with $S$ inheriting all edges between those vertices and vertices in $\G'$ other than $S_{0}^C,S_{1}^C$ yields $\G_{\vec{F}'}$, hence a set of structural equations for $\G'$ is in the causal model represented by $\G_{\vec{F}'}$, and thus yields a proof of non-identification in $\G$.

% Given this augmented graph $\G'(\vec{V})$, we construct two elements of the causal model according to the bit parity scheme of \citet{shpitserIdentificationJointInterventional2006}. We start by replacing each bidirected edge in $\G'$ with a random fair coin $U_{X, Y}$ for endpoints $X, Y$. Then, in model 1, each variable $V \in \vec{V}$ is equal to the bit parity of the parents. In model 2, the same is true, except variables in $\vec{F}$ pay attention only to parents in $\vec{F}$. It is easy to show that without any interventions, in both models each fair coin $U$ will have zero or two paths down to the root set, defined as $\vec{R}=\sterile_{\G'} (F)$. This means that $p(\sum_{R \in  \vec{R}} R \mod 2 = 1) = 0$ in both models. However under intervention $\doo(a)$, at least one $U \in \vec{F}' \setminus \vec{F}$ in model 1 has only one path into $\vec{R}$, whereas in model 2 none of the variables in $\vec{F}' \setminus  \vec{F}$ have paths to the root. Then in model 1 $p(\sum_{R \in \vec{R}} R \mod 2 = 1) > 0$ whereas in model 2 $p(\sum_{R \in \vec{R}} R \mod 2 = 1) = 0$.


% Then, we amend the resulting structural equations of each of the two elements such that for each $C$,
% \[
% C \equiv
% \begin{cases}
%  C &\text{ if } S_0^C = 0 \\
%  S_{1}^C &\text{ if }S_{0}^C = 1
% \end{cases}
% \]

% If it happens that $\ch_{\G'}(S) \cap \vec{F} \neq \emptyset$ then this constitutes a valid set of structural equations for the witness. Whenever $S_{0}^C = 1$ for some $C$, then that $C \in \vec{F}$ becomes the value of $S_{1}^C$ and is no longer influenced by other variables $\vec{U} \cap \vec{F}$. This will mean at least one $U \in \vec{U}$ will have only one path down to the root of $\vec{F}$, and so the distribution $p(\vec{F} \mid \vec{F}' \setminus \vec{F})$ will be uniform 
% in both elements of the model. If $S_{0}^C = 0$ for all $C$ then this is the original hedge construction, which in both models has equal probability on values where the bit parity of elements in $F$ is even, and zero otherwise. Finally, under an intervention where $A$ is set to $a$, in model 1, when $S_{0}^C = 0$ for all $C$, the distribution on the remainder of the root set is uniform, but in model 2, it remains unchanged from its observational distribution. 

% If it happens that $\ch_{G'}(S) \subset \vec{F}' \setminus \vec{F}$, then an additional modification is required. When $S_{0}^C = 1$ for a $C \in \ch(S) \cap (\vec{F}' \setminus \vec{F})$, then in model 1 there will be a $U \in \vec{F}' \setminus \vec{F}$ which is a parent of $C$ (such a $U$ must exist as $\G'$ is a district). This $U$ previously had two paths to the root, but now only has one, as the path through $C$ now only pays attention to the value of $S_{1}^C$. In model 2, the same $U$ has zero paths to the root. This will cause a discrepancy in the observed distribution, in which in model 1 the root is uniformly distributed on $R \setminus S_{S, C}$, but in model 2 the root is distributed uniformly only on values of $R \setminus S_{0}^C$ of odd bit parity (i.e. even for the whole $R$).

% The modification is as follows: choose any $Y \in \sib_{\G} (S) \cap \vec{F}$, and amend the structural equations in both models as
% \[Y \equiv \begin{cases} Y, &U_{S_{1}^{Y}, S_{0}^{Y}} = 0 \\ Y \oplus \tilde{U}, &U_{S_{1}^{Y}, S_{0}^Y}\end{cases}\]

% Then, the two models shall have the correct forms, such that the observed distributions agree, but the counterfactual distributions disagree for interventions on $\vec{F}' \setminus \vec{F}$.

% \end{proof}

\sidsoundness*

\begin{proof}
The ID algorithm is sound for queries from models in the CS-SCM. This is because the CS-SCM is a submodel of (that is, it imposes more restrictions than) the models considered in \cite{shpitserIdentificationJointInterventional2006}, and the ID algorithm was established to be sound in that same paper.


To see that the ID algorithm is complete for this query, we need to establish that, whenever the ID algorithm fails, we can construct two models which have the same observed data distribution but different counterfactual distributions for $p(\vec{V} \mid \doo(S = \emptyset))$.

As established in \cite{shpitserIdentificationJointInterventional2006,richardsonNestedMarkovProperties2023}, the ID algorithm fails when there is a district $\vec{D}^* \in {\cal D}(\G_{\vec{V} \setminus \{S\}})$ whose 
closure $\cl_\G (\vec{D}^*)$ is such that $\vec{D}^* \subset \cl_\G (\vec{D}^*)$. Furthermore, we can establish that $ S \in \cl_\G (\vec{D}^*) \setminus \vec{D}^*$, and that $\vec{D}^*$ must not be a district of $\G$, since otherwise $\vec{D}^*$ is reachable in $\G$.

When such a $\vec{D}^*$ is encountered in the process of running the ID algorithm, we will return a construction from \cref{thm:s_only_hedge}. The original hedge construction of \cite{shpitserIdentificationJointInterventional2006} is not suitable because it does not incorporate the special behavior introduced via the $S$ context variable. The construction from \cref{thm:s_only_hedge} bears witness to the non-identifiability of $p(\vec{D}^* \mid \doo(S=\emptyset))$.

Because $\vec{D}^* \subseteq \vec{V}$, it immediately follows that $p(\vec{V} \mid \doo(S = \emptyset))$ is not identified.

%, since if this were not the case, then $\vec{D}^*$ would be reachable.

\end{proof}


\generalsoundness*
\begin{proof}

The algorithm aims to identify $p(\vec{Y} \mid \doo(\vec{a}, S=\emptyset))$ in the causal model $\G$, with additional restrictions pertaining to the semantics of $S$, and its relationship to its children in $\G$, from the observed data distribution $p(\vec{V})$. These restrictions do not affect district factorizations of the observed and interventional distributions which hold due to \cite{shpitserIdentificationJointInterventional2006,richardsonNestedMarkovProperties2023}.

Then, for value assignment $v \in \mathfrak{X}_{\vec{V}}$,
\[p(\vec{Y} = v_{\vec{Y}} \mid \doo(\vec{a}, S=\emptyset)) = \sum_{\vec{Y}^* \setminus \vec{Y}} \prod_{D \in {\cal D}_{\G_{\vec{Y}^*}}} p(v_D \mid \doo (v_{\pas_\G (D)})), \]
where values $v_{\pas_\G (D)}$ in each term are consistent with $\vec{a}$ and $S=\emptyset$.

Each term is identified by one of three cases.

The first case is triggered at \ARef*{alg:regular_fixing_kernel}. In this case, we are justified in the choice of using any laid-back value $s$ for $\vec{D}^*$, because either $S$ is independent of $\vec{D}^*$ given its Markov blanket, or because the structural equations are all the same under those values due to mechanism invariance %assumption 
implied by the definition of the CS-SCM (see the restriction on the structural equation when $S^e_V = 0$ in \cref{dfn:ics-scm}). For that value of $s$, and kernels evaluated to that value, soundness follows by the standard soundness argument of the ID algorithm \citep{shpitserIdentificationJointInterventional2006,richardsonNestedMarkovProperties2023}, which holds in any SCM with independent errors, and thus also in a CS-SCM.


The second case is triggered at \ARef*{alg:s_fixing_kernel}, where $\vec{D}^* \subset \cl_\G (\vec{D}^*)$ and $S \not \in \cl_\G (\vec{D}^*)$. This follows from the soundness of the gID algorithm \citep{leeGeneralIdentifiabilityArbitrary2019}. Specifically, this case shows that the distribution $p(\cl_\G (\vec{D}^*) \mid \doo(\pas_\G (\cl_\G ( \vec{D}^*))))$ is identified, and represents the observed data distribution corresponding to a causal model represented by graph $\G_{\cl_\G (\vec{D}^*)}$. Since $S \not \in \cl_{\G} (\vec{D}^*)$, the available datasets in this model may be reformulated as observational and interventional distributions on $\cl_{\G} (\vec{D}^*)$ indexed by values of $S$. These are precisely the inputs of the gID algorithm, and soundness follows by the soundness of that algorithm. 

 The third case is triggered at \ARef*{alg:cs_general_call}, where $\vec{D}^* \subset \cl_\G (\vec{D}^*)$ and $S \in \cl_\G (\vec{D}^*)$. 

Pick an appropriate value $\bar{s}$. We note that
$p(\vec{D}'(\bar{s},\pas(\vec{D}')))$ is identified because of the following derivation:
\begin{align*}
p(\vec{D}'(\bar{s},\pas(\vec{D}'),\vec{V} \setminus \cl_{\G}(\vec{D}^*)))
&=
\prod_{D \in \vec{D}'} q_{\cl_{\G}(\vec{D}^*)}(D(\bar{s}) \mid \{ W(\bar{s}) : W \in \pre_{\prec}(D) \})\\
&=
\left(
\prod_{D \in \vec{D}' \cap \de_{\G}(S)}
q_{\cl_{\G}(\vec{D}^*)}(D(\bar{s}) \mid \{ W(\bar{s}) : W \in \pre_{\prec}(D) \})
\right) \times\\
& \times 
\left(
\prod_{D \in \vec{D}' \cap \nd_{\G}(S)}
q_{\cl_{\G}(\vec{D}^*)}(D(\bar{s}) \mid \{ W(\bar{s}) : W \in \pre_{\prec}(D) \})
\right)\\
&=
\left(
\prod_{D \in \vec{D}' \cap \de_{\G}(S)}
q_{\cl_{\G}(\vec{D}^*)}(D \mid S = \bar{s}, \{ W : W \in \pre_{\prec}(D) \})
\right) \times\\
& \times 
\left(
\prod_{D \in \vec{D}' \cap \nd_{\G}(S)}
q_{\cl_{\G}(\vec{D}^*)}(D \mid \{ W : W \in \pre_{\prec}(D) \})
\right)
\end{align*}
where $\pre_{\prec}(D)$ is the subset of $\vec{D}' \cup \pas_{\G}(\vec{D}')$ earlier than $D$ under some topological ordering $\prec$ of $\G$.  Here the first equality follows by the top-level district factorization of any interventional distribution in an SCM, the second equality follows by rearranging terms, and the third by independence and rule 3 of the potential outcomes calculus \citep{malinsky19po}.

Given this, soundness of the third case follows from soundness of the ID algorithm formulated via the fixing operator.

% Some facts:
%  \begin{itemize}
%      \item $\vec{D}^* \cap \de_\G (S)$ is never empty
%      \item When a distribution $p$ is evaluated to values $S=s$, then context specific independence restrictions may be exploited in $p\mid_{S=s}$, however functions of this distribution that marginalize over $S$ are not well-defined
%      \item The graph $\hat{\G}$ enforces fixing restrictions (to prevent summations over $S$?)
%      \item In \cref{alg:cs-general} $q_1$ being concerned with nondescendants of $S$, is identified using regular fixing, since there exists a fixing sequence for all descendants of $S$ (starting with a childless vertex and repeating). $q_2$ does depend on $S$ so need to check fixing in $\hat{\G}$ evaluated at the appropriate value $s^*$.
%      \item Provided no ill-defined operations are performed then the kernel returned at the value $s^*$ is sound by the soundness of the ID alg?
%  \end{itemize}
%This case follows from the soundness of \cref{alg:cs-general}. Let $D = \cl_\G (\vec{D}^*)$, and let $\vec{A}_S = \vec{A} \cap \ch_\G (S) \cap \vec{D}$. Then, there exists a $\vec{D}' \in {\cal D} (\G (\{ \cl(\vec{D}) \setminus \vec{A}_S\}))$ which has the property $\vec{D}^* \subseteq \vec{D}'$. This is because $\vec{D}*$ is a district of $\G (\{ \cl(\vec{D}) \setminus \vec{A}\})$, and therefore continues to be a district in the the supergraph $\G(\cl(\vec{D}) \setminus \vec{A}_S)$. Hence there must exist an encapsulating district $\vec{D}'$ that contains it.

% If the conditional independence involving $S$ holds, then soundness of $q_{\vec{V}} (\vec{D}' \mid \pas_\G (\vec{D}'))$ follows by \cref{lem:full_generalized_trick}, and soundness of $q_{\vec{V}} ( \vec{D}^* \mid \pas_\G (\vec{D}^*))$ follows via soundness of the fixing operator \citep{richardsonNestedMarkovProperties2023}.

%For each value $d \in \mathfrak{X}_D$, \cref{alg:cs-general} factorizes $p(d_{\vec{D} \setminus \vec{A}_S} \mid \doo (\vec{A}_S))$.




  %       {\color{red}TO REWORK 
  %       note to self: need to indicate that in line 7 we are allowed to use any s laidback for D* because of either CI assumptions or mechanism stability






    
  % The algorithm aims to identify
  % $p(V \setminus A \mid \text{do}(a))$ for a singleton treatment $A$ in the causal model represented by $\G$,
  % with additional restrictions pertaining ot the relationship of $S$ to its children in $\G$.  Such restrictions do not affect district factorizations of the observed and interventional distributions which hold due to results in \cite{tianIdentificationCausalEffects2002}.

  % Thus, for every value assignment $v$,

  % \begin{align*}
  % &p(\{ W = v_W : W \in V \setminus A \} \mid \text{do}(a)) \\ 
  % &=\prod_{D \in {\cal D}_{\G_{V \setminus A}}}
  % p(v_D \mid \text{do}(v_{\pa_{\cal G}(D) \setminus D})).
  % \end{align*}
  % Each term is identified by one of three cases.
  
  % The first case, where $\cl_{\G}(\vec{D}^*) = \vec{D}^*$, follows by the soundness proof of the ID algorithm.  See \citep{shpitserIdentificationJointInterventional2006,richardsonNestedMarkovProperties2023} for details.

  % The second case, where $\vec{D}^* \subset \cl_{\G}(\vec{D}^*)$ and $S \not\in \cl_{\G}(\vec{D}^*)$ follows from the soundness of the gID algorithm.  Specifically, this case shows that the distribution $p(\cl_{\G}(\vec{D}^*) \mid \text{do}(\pa_{\G}(\cl_{\G}(\vec{D}^*)) \setminus \cl_{\G}(\vec{D}^*)))$ is identified, and represents the observed data distribution corresponding to a causal model represented by $\G_{\cl_{\G}(\vec{D}^*)}$.  Since $S \not\in \cl_{\G}(\vec{D}^*)$, the available datasets in this model may be reformulated as observational and interventional distributions on $\cl_{\G}(\vec{D}^*)$ indexed by values of $S$.  This is precisely the inputs for the gID algorithm, and identifiability follows by the soundness of that algorithm \cite{leeGeneralIdentifiabilityArbitrary2019}.

  % The third case, where $\vec{D}^* \subset \cl_{\G}(\vec{D}^*)$ and $S \in \cl_{\G}(\vec{D}^*)$ follows from the soundness of \cref{alg:cs}.

  % Let $D = \cl_{\G}(\vec{D}^*)$.  For every value $d$ of $D$, \cref{alg:cs} factorizes $p(D = d \setminus A \mid \text{do}(a))$ as $\prod_{\tildep{\vec{D}} \in {\cal D}(\G_{D \setminus A})} p(d_{\tildep{\vec{D}}} \mid d_{\pa_{\G}(\tildep{\vec{D}}) \setminus \tildep{\vec{D}}})$.
  % Given this factorization, each term corresponding to $\tildep{\vec{D}}$ is identified via one of two cases.  If $\tildep{\vec{D}}$ is reachable in $\G$, the soundness of the derivation for the term
  % $p(d_{\tildep{\vec{D}}} \mid d_{\pa_{\G}(\tildep{\vec{D}}) \setminus \tildep{\vec{D}}})$ follows by
  % the soundness of the fixing operator reformulation of the ID algorithm, as described in \citep{richardsonNestedMarkovProperties2023}.

  % Otherwise, soundness follows by Lemma \ref{lem:generalized_trick}.

    
  %   }
\end{proof}


%\generalcompleteness*
% {\color{red}
% \textbf{PLACEHOLDER - TO REMOVE}

% \begin{proof}

% To prove completeness for queries of the form $p(\vec{Y} \mid \doo(S=\emptyset))$, we must establish that whenever a FAIL is returned in \cref{alg:main-general}, that a suitable construction (in this case, \cref{thm:s_only_hedge}) can be provided.
    
%     %To prove completeness we will establish that whenever FAIL is returned, that a suitable construction bearing witness to non-identification of $p(\vec{Y}(\vec{a}))$ can be provided. We will consider three cases in which a non-identification witness is generated:

% \begin{enumerate}
%     \item \ARef*{alg:general_fail_positivity}: If there is no $s$ laidback for $\vec{D}^*$ then identification fails. This follows from the fact that identifying the joint distribution $p(\vec{D}^* \mid \text{do}(\pas_{\cal G}(\vec{D}^*) ))$ is impossible if only marginal distributions of this joint distribution are available and random variables in $\vec{D}^*$ do not exhibit additional factorization structure, since $\vec{D}^*$ is a district.

%     %In this case, we fail due to the positivity construction per \cref{thm:positivity-construction}
    
%     %is because then we have a collection of various margins of $\vec{D}^*$, but no joint. Then, because
%     %{ \color{red} TO REWORK
    
    
%     Since joint distributions cannot be recovered from marginal distributions without further assumptions, we fail to identify $q_{V}(\vec{D}^* \mid \pas({\vec{D}^*}))$. Recall that $S$ cannot be in $\vec{D}^*$ due to the definition of $\vec{Y}^*$.

    
%     This case handles the degenerate identification failure case of the gID algorithm (e.g. Section 3.1 of \cite{leeGeneralIdentifiabilityArbitrary2019}, or Section 3 of supplementary of \cite{kivvaRevisitingGeneralIdentifiability2022}, both provide details of explicit constructions showing this), where we may consider a model where $S$ has no parents and siblings to mimic the constructions.

%     \item \ARef*{alg:general_fail_thicket}: For $p(\vec{Y} \mid \doo(S=\emptyset))$ queries, this failure case can never happen. This is because the district $\vec{D}^*$ is either immediately reachable under the regular fixing operator $\phi$, or fails with $S$ inside the closure.
    
%     %A failure at this point involves the district $\vec{D}^*$ not containing $S$, but for each $s$ that is both consistent with $a_{\pas(\vec{D}^*)}$ and laidback for $\vec{D}^*$, $\vec{D}^*$ is not reachable in the corresponding $\G^{[s]}$. Then, we may adopt either the thicket construction of \cite{leeGeneralIdentifiabilityArbitrary2022} or corresponding alternative in \cite{kivvaRevisitingGeneralIdentifiability2022}, where $S$ is now viewed simply as an indexing operator for the various distributions that are inputs into gID. Then, the thicket construction immediately witnesses the non-identifiability of the desired causal effect $p(\vec{D}^* \mid \doo(\pas(\vec{D}^*)))$. 
%     %To move this construction into the model defined in \cref{dfn:data_fusion_scm} we can introduce any suitable marginal distribution over $S$ and other variables not involved in $\cl(\vec{D}^*)$.

%     \item \ARef*{alg:cs_fail_1}, \ARef*{alg:cs_fail_2}: In these cases, we fail with \cref{thm:s_only_hedge}.
%     %, \ARef*{alg:cs_fail_3}: In each of these cases, we fail with a special context-specific hedge per \cref{thm:noise-injected-hedge}.
%     %\ARef*{alg:general_fail_cs_case_1}, \ARef*{alg:general_fail_cs_case_3}, \ARef*{alg:general_fail_cs_case_2}: In each of these cases, we fail with a hedge constructed for the two districts named in the failure case, per \cref{thm:noise-injected-hedge}. 
    

% \end{enumerate}

%     %}

%     %To extend this construction to all of $V$, we introduce marginal distributions over all variables not otherwise mentioned in the construction.

    
%     %{\color{red} something about extending the hedge down to the root}
%     Given a non-identification structure for $p(\vec{D}^* \mid \doo (S=\emptyset)$ in any of the above cases, we now consider an argument for showing that $p(\vec{Y} \mid \doo (S=\emptyset))$ is not identified. 
    
%     %We first note that $\vec{A} \cap \pas(\vec{D}^*)$ is guaranteed to be not empty, as this is the only way $\cl(\vec{D}^*) \subset \vec{D}^*$. 

%     Let $\vec{R}^*$ be the root of the structure $p(\vec{D}^* \mid \doo (S=\emptyset))$. Retain a subset of edges in $\G_{\vec{Y}^*}$ that form a forest from the root down to $\vec{Y}$. Let $\vec{Y}' \subseteq \vec{Y}$ be the subset of $\vec{Y}$ where the forest connects.

%     If $\ch(S)$ are not in this forest the original downward extension construction as detailed in Theorem 4 of  \cite{shpitserIdentificationJointInterventional2006} holds.

%     Otherwise, without loss of generality, we may assume that there exists a value $s' \in \mathfrak{X}_S$ such that the forest connecting $\vec{R}$ to $\vec{Y}'$ is laid-back. Variables along the forest are equal to the bit parity of their parents if $S$ is laid-back for that variable, and equal to the suitable bit of $S$ if $S$ is serious for that variable. Then, we employ in that value a suitable one-to-one construction \citep{shpitserIdentificationJointInterventional2006,leeGeneralIdentifiabilityArbitrary2019} in both models. This gives
%     \begin{align*}
%         p(\vec{Y}' \mid \doo(S=\emptyset)) &= \sum_{\vec{D}^*} p(\vec{Y}' \mid \vec{R}, S=\emptyset) p(\vec{R} \mid \doo(S=\emptyset))\\
%         %&=\sum_{\vec{D}^*} p(\vec{Y}' \mid \vec{D}^*, S=\emptyset) p(\vec{R} \mid \doo(S=\emptyset))
%     \end{align*}
%      % \begin{align*}
%      %     p(\vec{Y}'\mid \doo(\vec{a})) &= \sum_{\vec{D}^*, S} p(\vec{Y}' \mid \vec{R}, S ) p(S)p(\vec{R} \mid \doo(\pas(\vec{D}^*))\\
%      %     &= \sum_S p(S) p(\vec{Y}' \mid S, \doo(\pas(\vec{D}^*))).
%      % \end{align*}

%      Any invertible choice for $p(\vec{Y}' \mid \vec{R}, S=\emptyset)$ will suffice. 

     
%     %For all other variables in $\G$ that do not appear in the forest, we may assume uniform distributions that are identical in both models. 
%     For values $s$ which are laid-back for the forest, 
%     $p(\vec{Y}' \mid \vec{R}^*, S=\emptyset)$
%     %$p(\vec{Y}' \mid S=s, \doo(\pas(\vec{D}^*)))$
%     will disagree between the two models, and for other values of $S$, they will agree (at least from the earliest serious variable onwards). However, this still suffices to prove that $p(\vec{Y}' \mid \doo(S=\emptyset))$ is not identified, which being a margin of $p(\vec{Y} \md \doo(S=\emptyset))$, implies that the latter is not identified.

%     Finally, we note that if we didn't have positive support on the value $s'$, then this would correspond to having less data available, and the causal target would still not be identified with less data.

% \end{proof}

% }

\partialcompleteness*
\begin{proof}
To demonstrate non-identification at each of these failure points, we will provide a construction.

\begin{enumerate}
  \item \ARef*{alg:general_fail_positivity}: If there is no $s$ laid-back for $\vec{D}^*$ then identification fails. This follows from the fact that identifying the joint distribution $p(\vec{D}^* \mid \text{do}(\pas_{\cal G}(\vec{D}^*) ))$ is impossible if only marginal distributions of this joint distribution are available and random variables in $\vec{D}^*$ do not exhibit additional factorization structure, since $\vec{D}^*$ is a district.

    %In this case, we fail due to the positivity construction per \cref{thm:positivity-construction}
    
    %is because then we have a collection of various margins of $\vec{D}^*$, but no joint. Then, because
    %{ \color{red} TO REWORK
    
    
    Since joint distributions cannot be recovered from marginal distributions without further assumptions, we fail to identify $q_{V}(\vec{D}^* \mid \pas({\vec{D}^*}))$. Recall that $S$ cannot be in $\vec{D}^*$ due to the definition of $\vec{Y}^*$.

    
    This case handles the degenerate identification failure case of the gID algorithm (see, e.g., Section 3.1 of \cite{leeGeneralIdentifiabilityArbitrary2019} or Section 3 of the supplementary material of \cite{kivvaRevisitingGeneralIdentifiability2022}, both of which provide explicit constructions showing this), where we may consider a model where $S$ has no parents or siblings to mimic the constructions.

    \item \ARef*{alg:general_fail_thicket}: A failure at this point involves the district $\vec{D}^*$ not containing $S$, but for each $s$ that is both consistent with $a_{\pas(\vec{D}^*)}$ and laid-back for $\vec{D}^*$, $\vec{D}^*$ is not reachable in the corresponding $\G^{[s]}$. Then, we may adopt either the thicket construction of \cite{leeGeneralIdentifiabilityArbitrary2019} or the corresponding alternative in \cite{kivvaRevisitingGeneralIdentifiability2022}, where $S$ is now viewed simply as an indexing operator for the various distributions that are inputs into gID. Then, the thicket construction immediately witnesses the non-identifiability of the desired causal effect $p(\vec{D}^* \mid \doo(a, S=\emptyset))$. 
    %To move this construction into the model defined in \cref{dfn:data_fusion_scm} we can introduce any suitable marginal distribution over $S$ and other variables not involved in $\cl(\vec{D}^*)$.
    \item \ARef*{alg:cs_fail_1}: Since $\ch^*(S)$ is empty, there are two possibilities -- it is either the case that $S$ also has no children in $\vec{D}^*$,  or $S$ has children in $\vec{D}^*$.  
    
    In the first case the construction is simply given as a regular hedge per \cite{shpitserIdentificationJointInterventional2006}, since $S$ has no children inside. 

    In the second case, we will modify the argument of \cite{shpitserIdentificationJointInterventional2006} slightly in order to respect the constraints of \cref{dfn:ics-scm}. For variables in $\cl(\vec{D}^*)$ which do not have $S$ as a parent, the structural equations are exactly as they appear in \cite{shpitserIdentificationJointInterventional2006} in both models. For the other variables $V \in \ch(S) \cap \vec{D}^*$, when $S$ is laidback for $V$ (meaning $S^e_V = 0$), the variable is equal to the bit parity of its parents as defined in \cite{shpitserIdentificationJointInterventional2006}. When $S$ is serious for $V$ (meaning that $S^e_V = 1$) the variable is equal to $S^v_V$ as required by \cref{dfn:ics-scm}.

    This construction is a valid witness, and the proof is essentially the argument laid out in the second part of the proof for \cref{thm:s_only_hedge}. 

    %\item \ARef*{alg:general_fail_cs_case_1}, \ARef*{alg:general_fail_cs_case_3}, \ARef*{alg:general_fail_cs_case_2}: In each of these cases, we fail with a hedge constructed for the two districts named in the failure case, per \cref{thm:noise-injected-hedge}. 

\end{enumerate}

    %}

    %To extend this construction to all of $V$, we introduce marginal distributions over all variables not otherwise mentioned in the construction.

    
    %{\color{red} something about extending the hedge down to the root}
    Given a non-identification structure for $p(\vec{D}^* \mid \doo (a, S=\emptyset))$ in any of the above cases, we now consider an argument for showing that $p(\vec{Y} \mid \doo (\vec{a}, S=\emptyset))$ is not identified. 
    
    We first note that $\vec{A} \cap \pas(\vec{D}^*)$ is guaranteed to be nonempty, as this is the only way that $\vec{D}^* \subset \cl(\vec{D}^*)$ can hold. 

    Let $\vec{R}$ be the set of variables with no children in the graph $\G_{\cl(\vec{D}^*)}$. Let $\G'$ be the edge subgraph used in this construction, which consists of the edges in $\G_{\cl(\vec{D}^*)}$, and a subset of edges in $\G_{\vec{Y}^*}$ that form a forest from the root set $\vec{R}$ down to $\vec{Y}$. Let $\vec{Y}' \subseteq \vec{Y}$ be the subset of $\vec{Y}$ to which the forest connects. %We can {\color{red} TODO}

    If no element of $\ch(S)$ is in this forest, the original construction as detailed in Theorem~4 of \cite{shpitserIdentificationJointInterventional2006} holds.

    Otherwise, without loss of generality, we may assume that there exists a value $s' \in \mathfrak{X}_S$ that is laid-back for the forest connecting $\vec{R}$ to $\vec{Y}'$. Variables along the forest are equal to the bit parity of their parents if $S$ is laid-back for that variable, and equal to the suitable bit of $S$ if $S$ is serious for that variable. Then, for that value, we employ a suitable one-to-one construction \citep{shpitserIdentificationJointInterventional2006,leeGeneralIdentifiabilityArbitrary2019} in both models. This gives
     \begin{align*}
         p(\vec{Y}'\mid \doo(\vec{a}, S =\emptyset)) &= \sum_{\vec{D}^*} p(\vec{Y}' \mid \vec{R}, \doo(S=\emptyset) ) p(\vec{R} \mid \doo(a, S=\emptyset)).
         %&= \sum_S p(S) p(\vec{Y}' \mid S, \doo(\pas(\vec{D}^*))).
     \end{align*}
     For all other variables in $\G$ that do not appear in the forest, we may assume uniform distributions that are identical in both models. Then, for values $s$ which are laid-back for the forest, $p(\vec{Y}' \mid \vec{R}, \doo( S=\emptyset))$ will disagree between the two models, and for other values of $S$, they will agree (at least from the earliest serious variable onwards). However, this still suffices to prove that $p(\vec{Y}' \mid \doo(\vec{a}, S=\emptyset))$ is not identified. Since this is a margin of $p(\vec{Y} \mid \doo (\vec{a}, S=\emptyset))$, this proves that the latter is also not identified.

    Finally, we note that if we didn't have positive support on the value $s'$, then this would correspond to having less data available, and the causal target would still not be identified with less data.

\end{proof}
\section{Examples}
\subsection{Full Example}
\begin{ex}[continues=ex:general-case-example]
Throughout this example, we will use the shorthand $s_{\vec{a}}$ to mean any value of $S$ where $\{ s^e_{A} = 1, s^v_A = \vec{a}_A : A \in \vec{A} \}$.

%Applying \cref{alg:main-general},
We first note that $\vec{Y}^* = \{Y, M, W_1, W_2, C\}$, and ${\cal D}({\G}^{[]}_{\vec{Y}^*})\!=\!\{\vec{D}^*_1 \!=\!\{Y, W_2\}$, $\vec{D}^*_2\!=\!\{M \}$,$\vec{D}^*_3\!=\!\{W_1\}$,$\vec{D}^*_4\!=\!\{C\}\}$. 

$\vec{D}^*_1$ invokes \ARef*{alg:cs_general_call}.
%be laidback for $\vec{D}^*_1$ and serious for $\ch^*(S) = \{\}$
% Then, $\cl(D^*_1) = \{Y, W_1, S, W_2, A_3\}$, while the districts of this closure are ${\cal D}(\G(\cl(D^*_1) \setminus \vec{A})) = \{\{S\}, \{Y, A_3, W_2\}, \{W_1\}\}$. 
Per \ARef*{alg:encapsulating_district}, the relevant district of ${\G}^{[]}({s})$ encapsulating $\vec{D}^*_1$ is $\vec{D}' = \{Y,  A_3, W_2\}$, for $s = s_{a_1,a_2}$.
%Let $\bar{s} = ((A_1, A_2), (a_1, a_2))$. 

%$\vec{D}'_1$ does not trigger \ARef*{alg:instrument_trick} since $S$ cannot be independent of itself. Instead, $\vec{D}'_1 \cap \ch(A_S) = \emptyset$, and therefore \ARef*{alg:regular_fixing_in_cs} is triggered, returning $\phi_{\cl(D^*_1) \setminus (\vec{D}'_1 \setminus \{A_3\}))} (p; \G) = p(S)$.

$\vec{D}'$ triggers \ARef*{alg:perfect_iv_trick}, since 
$Y(a_1, a_2) {\ci} S \mid M(a_1), W_1(a_2), W_2, A_3$ and $\vec{D}^*_1$ is reachable in ${\G}^{[{s}]}({s})_{\vec{D}'}$. Then, 
$q_{\vec{D}'}^{\bar{s}, \vec{z}}(\vec{D}' | \pas(\vec{D}')) =$ $p(Y | M, W_2, W_1, C,s_{a_1,a_2},A_3) p(W_2,A_3)$, and $q_{\vec{D}_1^*}(\vec{D}_1^* | \pas(\vec{D}^*_1))$$=\phi_{A_3} (q_{\vec{D}'}^{\bar{s}, \vec{z}}(\vec{D}' | \pas(\vec{D}')); {\G}^{[\bar{s}]} (\bar{s})_{\vec{D}'})$\\$=\sum_{A_3} p(Y | M,W_2,W_1,C,s_{a_1,a_2},A_3) p(W_2,A_3)$.
%$\de(\pas(\vec{D}')) = \{Y\}, \pas(\vec{D}') = \{W_1\}, \nd(\pas(\vec{D}')) = \{W_2, A_3\}$, and $Y \ci_{\G^*} S \mid W_1, W_2, A_3$. 

%Going to \cref{lem:full_generalized_trick}, the kernel is computed as $q(\vec{D}' \mid \pas (\vec{D}')) =  (p(Y \mid W_1, S=s, W_2, A_3) p(W_2, A_3)$ where $s$ is serious for $(A \setminus A_S) \cap \pas(\vec{D}') = \{ A_2\}$. We then return $q_{\vec{V}}(\vec{D}_1^*  \mid \pas_\G  (\vec{D}_1^*)) = \phi_{A_3} (q(\vec{D}' \mid \pas (\vec{D}')); \G^{[s]}(\vec{V}))$

%$\vec{D}'_3$ also triggers \ARef*{alg:instrument_trick}, with $\pas(\vec{D}'_3) = \{W_2, A_2\},\vec{D}'_3 \cap \de(\pas(\vec{D}'_3)) = \{W_1\},  \vec{D}'_3 \cap \nd(\pas(\vec{D}'_3)) = \emptyset$, and $W_1 \ci S \mid W_2, A_2$. Going to \cref{lem:full_generalized_trick}, we see that the kernel is computed as $\phi_{A_3} p(W \mid S=s)= p(W \mid S=s)$ for $s$ serious for $(\vec{A} \setminus \vec{A}_S) \cap \pas(\vec{D}'_3) = \{A_2\}$.

%Then, for $\vec{D}^*_1$, \cref{alg:cs-general} returns $q(\vec{D}^*_1 \mid\pas(\vec{D}^*_1)) =\sum_{S, W_1, A_3} p(W_1 \mid S = (\{A_2\}, a_2))p(S) \sum_{A_3} p(Y \mid W_1, W_2, A_3, S = (\{A_2\}, a_2)) p(W_2, A_3) = \sum_{ W_1, A_3} p(W_1 \mid S = (\{A_2\}, a_2))  p(Y \mid W_1, W_2, A_3, S = (\{A_2\}, a_2)) p(W_2, A_3)$

$\vec{D}^*_2$ reaches \ARef*{alg:s_fixing_kernel}, with 
$\tilde{q}=\phi_{\vec{V} \setminus \{M,A_1\} }(p(\vec{V}); \tilde{\G}^{[]})=p(M, A_1 | S)$, yielding
 $q_{\vec{D}^*_2}(\vec{D}^*_2 | \pas(\vec{D}^*_2)) = \phi_{A_1}(\tilde{q}; \tilde{\G}^{[s_{a_1}]}) \vert_{a_1} = p(M | a_1, s_{a_1})$.
 % for $S=s_{a_1}$,
%returning $q(\vec{D}^*_2 \mid \pas(\vec{D}^*_2)) = \phi_{p(M \mid A_1=a_1, S=(\{M\}, a_1))$. 

$\vec{D}^*_3$ reaches \ARef*{alg:cs_general_call}
%. and calls \ARef*{alg:cs_general}
with
$\bar{q} = q_{\{W_1,A_2,S\}}(W_1,A_2,S | W_2,A_3,C)$ and the corresponding LS-CADMMG.  Let ${s} = s_{a_2}$.
Per \ARef*{alg:encapsulating_district}, the district of ${\G}^{[]}({s})$ encapsulating $\vec{D}^*_3$ is $\vec{D}' = \{W_1\}$. $\vec{D}'$ triggers \ARef*{alg:perfect_iv_trick}, because $W_1(a_2) \ci S | W_2,A_3,C$ in $\bar{q}$ and $\vec{D}^*_3$ is (trivially) reachable in ${\G}^{[{s}]}({s})_{\vec{D}'}$. Then, $q_{\vec{D}'}^{\bar{s}, \vec{z}} (\vec{D}' \mid \pas(\vec{D}')) = p(W_1 \mid W_2, a_2, {s})$.

%Then, $\cl(D^*_3) = \{W_1, A_2, S\}$, while the districts of this closure are ${\cal D} (\G(\cl(D^*_3) \setminus \vec{A})) = \{ \{W_1\}, \{S\}\}$. The encapsulating district for $\vec{D}^*$ is $\vec{D}' = \{W_1\}$. $\vec{D}'$ triggers \ARef*{alg:instrument_trick} because $W_1 \ci_{\G^*} S$, and is computed in \cref{lem:full_generalized_trick} as $p(W_1 \mid A_2, S=(\{A_2\}, a_2))$. Since $\vec{D}^*_3 = \vec{D}'$ this result is equal to $q_{\vec{V}} (\vec{D}^*_3 \mid \pas_{\G} (\vec{D}^*))$.

%$\vec{D}'_2$ is computed directly as $p(S)$. Then, for $\vec{D}^*_3$, \cref{alg:cs-general} returns $q(\vec{D}^*_3 \mid \pas(\vec{D}^*_3)) = \sum_{S} p(W_1 \mid A_2, S=(\{A_2\}, a_2))p(S) = p(W_1 \mid A_2, S=(\{A_2\}, a_2))$

$\vec{D}^*_4$ reaches \ARef*{alg:regular_fixing_kernel}, giving $q_{\vec{D}^*_4}(\vec{D}^*_4 | \pas(\vec{D}^*_4))$$ = p(C)$. 
This is a case our algorithm has in common with the ID algorithm, as no special structure of the problem involving the selector $S$ needs to be exploited.


The identifying functional is
$p(Y (\vec{a}, S=\emptyset)) = \sum_{\vec{Y}^* \setminus Y} \!\!\prod_{\vec{D}^*_i \in {\cal D}(\G_{\vec{Y}^*})}  q_{\vec{D}^*_i}(\vec{D}^*_i | \pas(\vec{D}^*_i))$, which is equal to
{\small
\begin{align*}
\sum_{M, W_1, W_2, C}
& p(C) p(M | a_1,%s^e_{A_1}=1,s^v_{A_1}=a_1
    s_{a_1})
 p(W_1 | W_2, a_2, %s^e_{A_2}=1,s^v_{A_2}=a_2
    s_{a_2})\\
&  \sum_{A_3} p(Y | M,W_2,W_1,C,s_{a_1,a_2},A_3) p(W_2,A_3)
%s^e_{\{ A_1,A_2\}}=1,S^v_{\{A_1,A_2\}} = a_1,a_2
\end{align*}
}
\end{ex}

\subsection{Illustrating the Use of Multigraphs For Representing Latent Projections of LS-DAGs}

\begin{figure}[h]
\centering
\begin{subfigure}{0.3\textwidth}
\centering
        \begin{tikzpicture}[rotate=0]
		\tikzstyle{block} = [draw, circle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{block2} = [draw, rectangle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[] (a) at (0, 0) {$A$};
            \node[selector] (s) at (0, -1) {$S$};
            \node[] (z1) at (-1, -1) {$Z_1$};
            \node[] (z2) at (1, -1) {$Z_2$};
            \node[] (w1) at (-1, -2) {$W_1$};
            \node[] (w2) at (1, -2) {$W_2$};
            \node[] (b) at (0, -3) {$B$};

            \draw[-stealth] (a) to node[above,sloped]{\tiny $\{Z_1\}$}(z1);
            \draw[-stealth] (a) to node[above,sloped]{\tiny $\{Z_2\}$}(z2);
            \draw[-stealth] (s) to (z2);
            \draw[-stealth] (s) to (z1);
            \draw[-stealth] (s) to (w2);
            \draw[-stealth] (s) to (w1);
            \draw[-stealth] (z2) to node[right]{\tiny $\{W_2\}$}(w2);
            \draw[-stealth] (z1) to node[left]{\tiny $\{W_1\}$}(w1);
            \draw[-stealth] (w1) to (b);
            \draw[-stealth] (w2) to (b);

            % \node[] (w2) at (-1, 0) {$W_2$};
            % \node[] (a3) at (0, 0) {$A_3$};
            % \node[] (c) at (1, 0) {$C$};
            % \node[selector] (s) at (0, -1) {$S$};
            % \node[] (a1) at (-.75, -2) {$A_1$};
            % \node[] (a2) at (.75, -2) {$A_2$};
            % \node[] (w1) at (.75, -3) {$W_1$};
            % \node[] (m) at (-.75, -3) {$M$};
            % \node[] (y) at (0, -4) {$Y$};
            
            % \draw[-stealth] (m) to  (y);
            % \draw[-stealth] (w1) to  (y);
            % \draw[-stealth] (a1) to  (m);
            % \draw[-stealth] (a2) to  (w1);
            % \draw[-stealth] (s) to  (a1);
            % \draw[-stealth] (s) to  (a2);
            % \draw[-stealth] (c) to  (s);
            % \draw[-stealth][bend left=45] (c) to (y);
            % \draw[-stealth] (a3) to  (s);
            % \draw[-stealth] (w2) to  (s);
            % \draw[-stealth] (w2) to  (w1);
            % \draw[stealth-stealth][bend right=35] (a2) to node[above]{{ \tiny $\{A_2\}$}}(y);
            % \draw[stealth-stealth][bend right=35] (a1) to node[below]{\tiny $\{A_1\}$}(m);
            % \draw[stealth-stealth][bend right=45] (w2) to (y);
            % \draw[stealth-stealth][bend left=35] (w2) to (a3);
            % \draw[stealth-stealth][bend left=35] (s) to node[above]{\tiny $\{A_2\}$}(a2);
            % \draw[stealth-stealth][bend left=35] (a2) to node[above]{\tiny $\{A_2\}$}(w1);
        \end{tikzpicture}
        \caption{}
        \label{fig:multigraph}
%\centering
\end{subfigure}%
\begin{subfigure}{0.3\textwidth}
\centering
  \begin{tikzpicture}[rotate=0]
		\tikzstyle{block} = [draw, circle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{block2} = [draw, rectangle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[] (a) at (0, 0) {$A$};
            \node[selector] (s) at (0, -2) {$S$};
            \node[] (b) at (0, -1) {$B$};
            \draw[-stealth] (s) to (b);
            \draw[-stealth] (a) to node[left]{\tiny $\{(Z_1 \lor W_1) \land (Z_2 \lor W_2)\}$}(b);
            % \node[] (z2) at (1, -1) {$Z_2$};
            % \node[] (w1) at (-1, -2) {$W_1$};
            % \node[] (w2) at (1, -2) {$W_2$};
            % \node[] (b) at (0, -3) {$B$};

            % \draw[-stealth] (a) to node[above,sloped]{\tiny $\{Z_1\}$}(z1);
            % \draw[-stealth] (a) to node[above,sloped]{\tiny $\{Z_2\}$}(z2);
            % \draw[-stealth] (s) to (z2);
            % \draw[-stealth] (s) to (z1);
            % \draw[-stealth] (s) to (w2);
            % \draw[-stealth] (s) to (w1);
            % \draw[-stealth] (z2) to node[right]{\tiny $\{W_2\}$}(w2);
            % \draw[-stealth] (z1) to node[left]{\tiny $\{W_1\}$}(w1);
            % \draw[-stealth] (w1) to (b);
            % \draw[-stealth] (w2) to (b);

            % % \node[] (w2) at (-1, 0) {$W_2$};
            % % \node[] (a3) at (0, 0) {$A_3$};
            % % \node[] (c) at (1, 0) {$C$};
            % % \node[selector] (s) at (0, -1) {$S$};
            % % \node[] (a1) at (-.75, -2) {$A_1$};
            % % \node[] (a2) at (.75, -2) {$A_2$};
            % % \node[] (w1) at (.75, -3) {$W_1$};
            % % \node[] (m) at (-.75, -3) {$M$};
            % % \node[] (y) at (0, -4) {$Y$};
            
            % % \draw[-stealth] (m) to  (y);
            % % \draw[-stealth] (w1) to  (y);
            % % \draw[-stealth] (a1) to  (m);
            % % \draw[-stealth] (a2) to  (w1);
            % % \draw[-stealth] (s) to  (a1);
            % % \draw[-stealth] (s) to  (a2);
            % % \draw[-stealth] (c) to  (s);
            % % \draw[-stealth][bend left=45] (c) to (y);
            % % \draw[-stealth] (a3) to  (s);
            % % \draw[-stealth] (w2) to  (s);
            % % \draw[-stealth] (w2) to  (w1);
            % % \draw[stealth-stealth][bend right=35] (a2) to node[above]{{ \tiny $\{A_2\}$}}(y);
            % % \draw[stealth-stealth][bend right=35] (a1) to node[below]{\tiny $\{A_1\}$}(m);
            % % \draw[stealth-stealth][bend right=45] (w2) to (y);
            % % \draw[stealth-stealth][bend left=35] (w2) to (a3);
            % % \draw[stealth-stealth][bend left=35] (s) to node[above]{\tiny $\{A_2\}$}(a2);
            % % \draw[stealth-stealth][bend left=35] (a2) to node[above]{\tiny $\{A_2\}$}(w1);
        \end{tikzpicture}
    \caption{}
    \label{fig:multigraph_boolean_label}
\end{subfigure}%
\begin{subfigure}{0.3\textwidth}

  \centering
  \begin{tikzpicture}[rotate=0]
		\tikzstyle{block} = [draw, circle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{block2} = [draw, rectangle, inner sep=1.5pt, fill=lightgray]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[] (a) at (0, 0) {$A$};
            \node[selector] (s) at (0, -2) {$S$};
            \node[] (b) at (0, -1) {$B$};
            \draw[-stealth] (s) to (b);
            \draw[-stealth, bend right=30] (a) to node[left]{\tiny $\{Z_2, W_2\}$}(b);
            \draw[-stealth, bend left=30] (a) to node[right]{\tiny $\{Z_1, W_1\}$}(b);
        \end{tikzpicture}
    \caption{}
    \label{fig:multigraph_multi_label}
\end{subfigure}
\caption{An LS-ADMMG illustrating the need for multigraph representations.}
\end{figure}

In this section we provide some further details on why multigraphs are required, especially under latent projections. Consider \cref{fig:multigraph}. In a latent projection that marginalizes out $Z_1, Z_2, W_1, W_2$, only the variables $A, B, S$ remain. The directed edge $A \to B$ can now disappear if we consider an intervention where at least one of $Z_1, W_1$ is intervened upon, and at least one of $Z_2, W_2$ is intervened upon. This could be represented by a boolean logic label (e.g., $\{ (Z_1 \lor W_1) \land (Z_2 \lor W_2) \}$). However, computing the correct $\G^{[s]}$ graph then requires the evaluation of a boolean expression on each label, which is difficult to interpret visually (see \cref{fig:multigraph_boolean_label}).

Instead, we provide an alternative representation of the latent projection of \cref{fig:multigraph} via a multigraph shown in \cref{fig:multigraph_multi_label}. Here, if given a value of $s$, the corresponding $\G^{[s]}$ graph can be recovered by checking for each edge whether the label intersects the value of $s$.

\subsection{Failure Cases}
In this section we illustrate various failure cases that could occur throughout the application of \cref{alg:main-general}.


\begin{figure*}[h]
\centering
\begin{subfigure}[b]{0.15\textwidth}
\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[] (y) at  (0, 0) {$Y$};
            \node[selector] (s) at  (0, 1) {$S$};
            
            \draw[-stealth] (s) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:fail-case-1}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.15\textwidth}

\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            \node[selector] (s) at  (0, 0) {$S$};
            \node[] (a1) at  (0, -1) {$A_1$};
			\node[] (a2) at  (1, 0) {$A_2$}; 
			\node[] (y) at  (1, -1) {$Y$}; 
            \draw[-stealth] (a1) to (y);
            \draw[-stealth] (a2) to (y);
            \draw[-stealth] (s) to (a1);
            \draw[-stealth] (s) to (a2);
            \draw[stealth-stealth][bend right=35] (a1) to (y);
            \draw[stealth-stealth][bend left=35] (a2) to (y);
            
            
        \end{tikzpicture}
        \caption{}
        \label{fig:fail-case-2}
%\centering
\end{subfigure}%
\begin{subfigure}[b]{0.15\textwidth}
\centering
        \begin{tikzpicture}
		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
		\tikzstyle{input} = [coordinate]
		\tikzstyle{output} = [coordinate]
            \tikzset{edge/.style = {->,> = latex'}}
            
            \node[selector] (s) at  (0, 0) {$S$};
            \node[] (a) at  (0, -1) {$A$};
			\node[] (y) at  (1, -1) {$Y$}; 
            
            \draw[-stealth] (a) to (y);
            \draw[stealth-stealth][bend left=35] (a) to (y);
            \draw[stealth-stealth][bend left=35] (s) to (y);
            \draw[-stealth] (s) to (y);
        \end{tikzpicture}
        \caption{}
        \label{fig:fail-case-3}
%\centering
\end{subfigure}%
% \begin{subfigure}[b]{0.15\textwidth}
% \centering
%         \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
            
%             \node[selector] (s) at  (0, 0) {$S$};
%             \node[] (a) at  (0, -1) {$A$};
% 			\node[] (y) at  (1, -1) {$Y$}; 
            
%             \draw[-stealth] (a) to (y);
%             \draw[-stealth] (s) to (a);
%             \draw[stealth-stealth][bend left=35] (s) to (a);
%             \draw[stealth-stealth][bend left=35] (s) to (y);
%         \end{tikzpicture}
%         \caption{}
%         \label{fig:fail-case-4}
% %\centering
% \end{subfigure}%
% \begin{subfigure}[b]{0.15\textwidth}
% \centering
%         \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
% 		\tikzstyle{fixed} = [draw, rectangle, inner sep=1pt]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
            
%             \node[selector] (s) at  (0, 0) {$S$};
%             \node[fixed] (a) at  (0, -1) {$a$};
% 			\node[] (y) at  (1, -1) {$Y(a)$}; 
            
%             \draw[stealth-stealth][bend left=35] (s) to (y);
%             \draw[-stealth] (a) to (y);
%         \end{tikzpicture}
%         \caption{}
%         \label{fig:fail-case-4-star}
% %\centering
% \end{subfigure}%
% \begin{subfigure}[b]{0.15\textwidth}
% \centering
%         \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=.5pt]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
            
            
%             \node[] (a) at  (0, 0) {$A$};
%             \node[selector] (s) at  (1, 0) {$S$};
% 			\node[] (w1) at  (1, -1) {$W_1$}; 
% 			\node[] (w2) at  (2, 0) {$W_2$}; 
% 			\node[] (y) at  (1, -2) {$Y$}; 
%             \draw[-stealth] (w2) to (s);
%             \draw[-stealth] (a) to (s);
%             \draw[-stealth] (a) to (y);
%             \draw[-stealth] (s) to (w1);
%             \draw[-stealth] (w1) to (y);
%             \draw[stealth-stealth][bend left=35] (w2) to (y);
%             \draw[stealth-stealth][bend right=35] (w2) to (a);
            
            
            
            
            
            
            
            
            
            
            
            
            
            
            
%         \end{tikzpicture}
%         \caption{}
%         \label{fig:fail-case-5}
% %\centering
% \end{subfigure}%
\caption{Examples of Failure Cases in \cref{alg:main-general} and \cref{alg:cs-general}}
\label{fig:examples-failures}
\end{figure*}

In all cases, we are interested in identifying a kernel for a particular district $\vec{D}^*$, which is in the graph $\G_{\vec{Y}^*}$, where we remind the reader that $\vec{Y}^* = \an_{\G_{\vec{V} \setminus (\vec{A} \cup \{S\}) }} (\vec{Y})$.

\begin{enumerate}
\item \ARef*{alg:general_fail_thicket}: Consider \cref{fig:fail-case-2}, where we let $\mathfrak{X}_{S_0} = \{\{A_1\}, \{A_2\}\}$. Then, under each value of $S$, the district $\{Y\}$ is not reachable. A thicket can be constructed according to \cite{leeGeneralIdentifiabilityArbitrary2019,kivvaRevisitingGeneralIdentifiability2022}.
\item \ARef*{alg:general_fail_positivity}: Consider \cref{fig:fail-case-1}, where $\mathfrak{X}_{S_0} = \{\{Y\}\}$, and we are interested in identifying $p(Y \mid \doo(S=\emptyset))$. In this case we never observe the true distribution of $Y$, i.e., the one where $S$ is laid-back for $Y$. It is then easy to conceive of two models which have different distributions for $p(Y \mid S = (\emptyset, \emptyset))$, which is not part of the observed data. The observed data in this case are the randomization probabilities on $Y$ that were specified by the experimenter.

\item \ARef*{alg:cs_fail_1}: Consider \cref{fig:fail-case-3}, where $\vec{D}^*= \{Y, S\}$, and let $\mathfrak{X}_{S_0} = \{\emptyset, \{Y\}\}$. We see that \ARef*{alg:cs_general_call} gets triggered because there is an $s$ which is laid-back for $\vec{D}^*$ (namely $\emptyset$), $A$ is not fixable so $\cl(\vec{D}^*) = \{Y, A, S\} \neq  \{Y, S\}=\vec{D}^*$, and that $S \in \cl(\vec{D}^*)$.

Then, since $S$ is not a parent of $A$, the possibly modified hedge $\langle \{S, Y\}, \{S, Y, A\}\rangle$ is returned. See \cref{alg:partial_completeness} for details.

% \item \ARef*{alg:general_fail_cs_case_2}: Consider \cref{fig:fail-case-4}. Let $ \mathfrak{X}_{S_0} = \{\emptyset, \{A\}\}$.  Then, let $\vec{D}^* = \{Y\}$. We see that \ARef*{alg:cs_general_call} gets triggered because there is an $s$ which is laid back for $\vec{D}^*$, $A$ is not fixable so $\cl(\vec{D}^*) = \{Y, A, S\} \neq \{Y\} = \vec{D}^*$, and that $S \in \cl(\vec{D}^*)$.

% Then, in \cref{alg:cs-general}, we now consider $\G^* = \G(\{\cl(\vec{D}^*) \setminus \vec{A}\} (a)) = \G(\{S, Y\}(a))$ depicted in \cref{fig:fail-case-4-star}. \ARef*{alg:general_fail_cs_case_1} does not get triggered since $S \in \pa_{\G^*} (A)$. To find the encapsulating district, we see that because $S$ is a parent of $A$, that $\G^\dagger = \G^*$. We see that the encapsulating district for $\vec{D}^*$ in \cref{fig:fail-case-4-star} is $\vec{D}' = \{Y, S\}$. 
% Then, we perform the conditional independence check in \ARef*{alg:instrument_trick}. Note that $\pas_{\G^*} (\vec{D}') = \emptyset$, and therefore $\de_{\G^*} (\pas_{\G^*} (\vec{D}')) = \emptyset$. Then, the check queries $\emptyset \ci_{\G^*} S \mid Y, S$ which is false. Thus we terminate with the noise-injected hedge $\langle \{Y, S\}, \{Y, A, S\} \rangle$.
% \item \ARef*{alg:general_fail_cs_case_3}: Consider \cref{fig:fail-case-5}. Let $\vec{D}^* = \{W_2, Y\}$. The closure is $\cl(\vec{D}^*) = \{W_2, A, Y\}$. Then $\vec{D}' = \{W_2, Y\}$ is the encapsulating district. Then, we can apply the independence $Y \ci S \mid W_2, W_1$ in the SWIG where $A$ is projected out, to obtain $q(W_2, A, Y \mid W_1)$. But then $A$ cannot be fixed leaving us with the noise-injected hedge $\langle \{W_2, Y\}, \{W_2, Y, A\} \rangle$.

\end{enumerate}

\section{Analysis}
\subsection{Complexity of Algorithm~\ref{alg:main-general}}
As with all identification algorithms in graphical models, we may ask two distinct computational complexity questions. 

The first question treats the algorithm as a decision procedure answering a YES/NO question about identification of a given query in a given model (and potentially giving additional useful information, such as the identifying functional if the answer is YES). The computational complexity of this version of our algorithm has an upper bound of a low-order polynomial in the number of edges $\abs{E}$ and vertices $\abs{V}$ of the input graph. Specifically, here are the computational complexity calculations of a number of operations that appear in the algorithm: the computation of the set $\vec{Y}^*$ is $O(\abs{E} + \abs{V})$, via a depth-first traversal of the graph. The computation of districts in $\vec{Y}^*$ is similarly $O(\abs{E} + \abs{V})$, via a depth-first traversal. Positivity (``laidbackness'') checks are linear in $\abs{\mathfrak{X}_{S}}$, the size of the state space of $S$. Finding a fixable vertex is at most quadratic in $\abs{V}$, and may be linear with clever use of hashing (since we must intersect descendants and districts). Iteration of the fixing operation happens at most $O(\abs{V})$ times. Calculating reachability of a set entails iterative fixing. The overall algorithm is thus at most $O((\abs{V} + \abs{E} + \abs{\mathfrak{X}_{S}})^3)$, and more efficient implementations are likely possible. 

The second computational complexity question is the time it takes to evaluate the identification query itself given, for example, categorical data. This version of the algorithm is likely intractable for the simple reason that the sum over $\vec{Y}^* \setminus \vec{Y}$ at the front of the identifying functional may be difficult to evaluate in general graphs, for much the same reason that variable elimination and belief propagation algorithms are intractable in dense graphs. While some prior work exists on making this version of the algorithm tractable using ideas from variable elimination algorithms \citep{shpitserEfficientAlgorithmComputing2011}, this work only applies to certain types of sparse graphs, and has not yet been generalized to the algorithm we present (although this is a very interesting area of future work).



% \begin{figure}[h]
% %\centering
% \begin{subfigure}[b]{0.2\textwidth}
% \centering
%         \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=2.5pt, fill=lightgray]
% 		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
%             % vertices
%             %\node[] (l1) at  (1.5,-.85) {$S=1$};
%             %\node[] (l2) at  (-2.95,-.75) {$S=1$};            
%             \node[selector] (s) at  (0, 0) {$S$};
%             \node[] (a) at  (0, -1) {$A$};
% 			\node[] (y) at  (0, -2) {$Y$}; 
%             %edges
            
%             \draw[-stealth] (s) to (a);
%             \draw[-stealth] (a) to (y);
%             %\draw[-stealth] (c) to (a);
%             %\draw[-stealth] (a) to (y);
%             %\draw[-stealth] (c) to (y);
%     %         \draw[-stealth] (s) to (x); 
%  			\draw[stealth-stealth][bend left=35] (s) to node[right]{\small $\{A\}$} (a);           
%  			\draw[stealth-stealth][bend right=35] (s) to (y);           
    
%  			% \draw[-stealth] (s) to (y);
%  			% \draw[stealth-stealth] (s) to (z); 			           
%     %         \draw[-stealth] (z) to node[above, sloped]{ \tiny $S=1$} (x);
%  			% \draw[stealth-stealth][bend left=-35] (y) to node[above, sloped]{ \tiny $S=1$} (w);                                                 
%     %         \draw[-stealth] (x) to (y);
%     %         \draw[-stealth] (y) to  (w);                                      	                                      
%         \end{tikzpicture}
%         \caption{}
%         \label{fig:double_bow_arc_case_1}
% %\centering
% \end{subfigure}%
% \begin{subfigure}[b]{0.2\textwidth}
% \centering
%         \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=2.5pt, fill=lightgray]
% 		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
%             % vertices
%             %\node[] (l1) at  (1.5,-.85) {$S=1$};
%             %\node[] (l2) at  (-2.95,-.75) {$S=1$};            
%             \node[selector] (s) at  (0, 0) {$S$};
%             \node[] (a) at  (0, -1) {$A$};
% 			\node[] (y) at  (0, -2) {$Y$}; 
%             %edges
            
%             \draw[-stealth] (s) to (a);
%             \draw[-stealth] (a) to (y);
%             %\draw[-stealth] (c) to (a);
%             %\draw[-stealth] (a) to (y);
%             %\draw[-stealth] (c) to (y);
%     %         \draw[-stealth] (s) to (x); 
%  			\draw[stealth-stealth][bend left=35] (s) to node[right]{\small $\{A\}$} (a);           
%  			\draw[stealth-stealth][bend right=35] (a) to node[left]{\small $\{A\}$ }(y);           
    
%  			% \draw[-stealth] (s) to (y);
%  			% \draw[stealth-stealth] (s) to (z); 			           
%     %         \draw[-stealth] (z) to node[above, sloped]{ \tiny $S=1$} (x);
%  			% \draw[stealth-stealth][bend left=-35] (y) to node[above, sloped]{ \tiny $S=1$} (w);                                                 
%     %         \draw[-stealth] (x) to (y);
%     %         \draw[-stealth] (y) to  (w);                                      	                                      
%         \end{tikzpicture}
%         \caption{}
%         \label{fig:double_bow_arc_case_2}
% %\centering
% \end{subfigure}
% \begin{subfigure}[b]{0.2\textwidth}
% \centering
%         \begin{tikzpicture}
% 		\tikzstyle{block} = [draw, circle, inner sep=2.5pt, fill=lightgray]
% 		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
% 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% 		\tikzstyle{input} = [coordinate]
% 		\tikzstyle{output} = [coordinate]
%             \tikzset{edge/.style = {->,> = latex'}}
%             % vertices
%             %\node[] (l1) at  (1.5,-.85) {$S=1$};
%             %\node[] (l2) at  (-2.95,-.75) {$S=1$};            
%             \node[] (w2) at (0, 0) {$W_2$};
%             \node[selector] (s) at (0, -1) {$S$};
%             \node[] (a) at (0, -2) {$A$};
%             \node[] (w1) at (0, -3) {$W_1$};
%             \node[] (y) at (0, -4) {$Y$};
%             %edges
            
%             \draw[-stealth] (w2) to (s);
%             \draw[-stealth] (s) to (a);
%             \draw[-stealth] (a) to (w1);
%             \draw[-stealth] (w1) to (y);
%     %         \draw[-stealth] (s) to (x); 
    
%  			% \draw[-stealth] (s) to (y);
%  			% \draw[stealth-stealth] (s) to (z); 			           
%     %         \draw[-stealth] (z) to node[above, sloped]{ \tiny $S=1$} (x);
    
%  			 \draw[stealth-stealth][bend left=-35] (s) to node[left]{\smaller $\{A\}$} (a);                                                 
%                 \draw[stealth-stealth][bend left=50] (w2) to (y);
%                 \draw[stealth-stealth][bend left=35] (a) to node[right]{$\{A\}$}(w1);
%                 \draw[stealth-stealth][bend right=35] (a) to node[left]{$\{A\}$}(y);
                
%     %         \draw[-stealth] (x) to (y);
%     %         \draw[-stealth] (y) to  (w);                                      	                                      
%         \end{tikzpicture}
%         \caption{}
%         \label{fig:double_bow_arc}
% %\centering
% \end{subfigure}%
% % \begin{subfigure}[b]{0.2\textwidth}
% % \centering
% %         \begin{tikzpicture}
% % 		\tikzstyle{block} = [draw, circle, inner sep=2.5pt, fill=lightgray]
% % 		\tikzstyle{block2} = [draw, rectangle, inner sep=2.5pt, fill=lightgray]
% % 		\tikzstyle{selector} = [draw, diamond, inner sep=.5pt]
% % 		\tikzstyle{input} = [coordinate]
% % 		\tikzstyle{output} = [coordinate]
% %             \tikzset{edge/.style = {->,> = latex'}}

% %             \node[] (w2) at (-1, 0) {$W_2$};
% %             \node[] (a3) at (0, 0) {$A_3$};
% %             \node[] (c) at (1, 0) {$C$};
% %             \node[selector] (s) at (0, -1) {$S$};
% %             \node[] (a1) at (-.75, -2) {$A_1$};
% %             \node[] (a2) at (.75, -2) {$A_2$};
% %             \node[] (w1) at (.75, -3) {$W_1$};
% %             \node[] (m) at (-.75, -3) {$M$};
% %             \node[] (y) at (0, -4) {$Y$};
            
% %             \draw[-stealth] (m) to  (y);
% %             \draw[-stealth] (w1) to  (y);
% %             \draw[-stealth] (a1) to  (m);
% %             \draw[-stealth] (a2) to  (w1);
% %             \draw[-stealth] (s) to  (a1);
% %             \draw[-stealth] (s) to  (a2);
% %             \draw[-stealth] (c) to  (s);
% %             \draw[-stealth][bend left=45] (c) to (y);
% %             \draw[-stealth] (a3) to  (s);
% %             \draw[-stealth] (w2) to  (s);
% %             \draw[-stealth] (w2) to  (w1);
% %             \draw[stealth-stealth][bend right=35] (a2) to node[above,sloped]{{ \tiny $\{A_2\}$}}(y);
% %             \draw[stealth-stealth][bend right=35] (a1) to node[below,sloped]{\tiny $\{A_1\}$}(m);
% %             \draw[stealth-stealth][bend right=45] (w2) to (y);
% %             \draw[stealth-stealth][bend left=35] (w2) to (a3);
% %             \draw[stealth-stealth][bend left=35] (s) to node[above,sloped]{\tiny $\{A_2\}$}(a2);
% %             \draw[stealth-stealth][bend left=35] (a2) to node[above,sloped]{\tiny $\{A_2\}$}(w1);
% %         \end{tikzpicture}
% %         \caption{}
% %         \label{fig:general-case-example}
% % %\centering
% % \end{subfigure}%
% \caption{Context-specific LS-ADMGs}
% \label{fig:fixing_example}
% \end{figure}

% \section{Tian-Style Algorithm}
% \subsection{Identifying $p(\{V \setminus A\}(a))$}
% Following the example of \cite{tianIdentificationCausalEffects2002} we begin by providing a sound and complete algorithm for identifying the effect of a single treatment on all other variables, $p(\{V \setminus A \}(a))$.
% % \ilya{
% % Notes on completeness of algorithm for $p(\{ V \setminus A \}(a))$
% % via context-specific hedges.

% % The input $D = cl(\vec{D}^*) = \vec{D}^* \cup \{ A \}$ must:
% % \begin{itemize}
% %     \item be a district in ${\cal G}_{D}$.
% %     \item must contain $S$.
% %     \item must contain $A$.
% %     \item $\ch_{{\cal G}_{D}}(A) \neq \emptyset$ (otherwise $\vec{D}^*$ would have been identified).
% % \end{itemize}
% % If $\ch(S) = \emptyset$, we are not identified via an ordinary hedge construction.
% % If $A \not\in \ch(S)$, we are not identified since a Tian structure
% % $A,Y,W_1, \ldots, W_k$ must exist (where $A \to Y$), and if $S$ has any children in this structure, it must be laid back for $\{ Y, W_1, \ldots W_k \}$, hence an ordinary hedge construction suffices.

% % If $A \in \ch(S)$, it must be the case that $S$ is not a descendant of
% % $\vec{D}^* \cap \de_{\G^*}(a)$.  Consequently if alg. 3 fails, there must
% % exist $\vec{D}^*$ such that $S \in \vec{D}^* \cup \pa(\vec{D}^*)$.

% % Let's say $S$ has a single child in $\vec{D}^*$.  Then this child must be $A$.  Then $S \in \vec{D}^*$.

% % %we have a structure of the form $S \to A \to Y$ with vertices on this structure bidirected connected in $\G_D$
% % }

% When \cref{alg:cs} is called, the following facts must hold:
% \begin{enumerate}
%      \item $\cl(\vec{D}^*)$ must be a district (otherwise something would have been fixable)
%      \item $\cl(\vec{D}^*)$ must contain $A$ (otherwise $\cl(\vec{D}^*) = \vec{D}^*$)
%      \item $\cl(\vec{D}^*)$ must contain $S$
%      \item $A$ must have children (or it would have been fixable and thus not in $\cl(\vec{D}^*)$)
% \end{enumerate}


% \begin{algorithm}
% \caption{Identify}\label{alg:main}
% \KwData{$\G, a, p(V)$}
% \KwResult{$p(\{V \setminus A\}(a))$ or FAIL}
% % $y \gets 1$\;
% % $X \gets x$\;
% % $N \gets n$\;
% %$ r \gets 1$\;
% \For{$\vec{D}^* \in {\cal D}(\G_{V \setminus A})$}{
%     \If{no $s$ exists that is laidback for $\vec{D}^*$}{
%     \Return FAIL(positivity) \label{alg:fail_positivity}
    
%     }
%     %$p(\cl(\vec{D}^*) \mid \doo(\pa^s(\cl(\vec{D}^*)))) \gets \phi_{V \setminus \cl(\vec{D}^*)} (p(V); \G)$ \;
%     \eIf{$\cl(\vec{D}^*) = \vec{D}^*$}{
%             Choose any $s$ laidback for $\vec{D}^*$, consistent with $a$\;
%             $q[\vec{D}^*] = \phi_{V \setminus \vec{D}^*}^s (p\mid_{S=s}, \G^{[s]})$\;
        
%     %$q \gets p(\cl(\vec{D}^*) \mid \doo(\pas (\cl(\vec{D}^*)))) \mid_{A=a}$
%     }{
%         \eIf{$S \not \in \cl(\vec{D}^*)$}{
%                 \eIf{
%                 for any $s$ laidback for $\vec{D}^*$,  consistent with $a$, $\vec{D}^*$ reachable in $\G^{[s]}$  
%                 }{
%                 $q[\vec{D}^*] \gets \phi^s_{V \setminus \vec{D}^*} (p\mid_{S=s}, \G^{[s]})$\;
%                 }{
                
%                 \Return FAIL(thicket) \label{alg:fail_thicket}
%                 }
%         }{

%             $q[\vec{D}^*] \gets \cref{alg:cs}$
        
%         }
%         % $q \gets \cref{alg:cs}
%         % (\phi_{V \setminus \cl_\G(\vec{D}^*)} (\G),$\\\quad$ a, \phi_{V \setminus \cl_\G (\vec{D}^*)} (p(V); \G)), \vec{D}^*)$}{$q \gets \cref{alg:s-fixable} (\phi_{V \setminus \cl_\G(\vec{D}^*)} (\G),$\\\quad  $a, \phi_{V \setminus \cl_\G (\vec{D}^*)} (p(V); \G)), \vec{D}^*)$}
%     }
%     %$r \gets r * q$
% }
% %Compute $\mathfrak{F}_{\Pi(D, \mathfrak{X}_S^+)}(s)$\;
% %\eIf{$\mathfrak{F}_{\Pi(D, \mathfrak{X}_S^+)}(s) \neq \empty$}{
% %   \Return $q \mid_{A=a}$
% %    } 
% %  }
% %{
% %\Return FAIL(positivity)
% %}
% \Return $\prod_{\vec{D}^*} q[\vec{D}^*]$
% \end{algorithm}

% % \begin{algorithm}
% % \caption{Identify GID-style}\label{alg:s-fixable}
% % \KwData{$\G, a, q(V \mid W), D $}
% % Compute $\mathfrak{X}_S' \subseteq \mathfrak{X}_S$ which is laid back $S$ values for $D$, consistent for $A=a$\;
% % \For{$s \in \mathfrak{X}_S'$}{


% % \If{valid fixing sequence on $\cl(D) \setminus D$ in $\G^{[s]}$ exists}{
% % \Return $\phi^s_{\cl(D) \setminus D}(q(V \mid W) \mid_{S=s}, \G^{[s]}) \mid_{A=a}$
% % }
% % }
% % \Return FAIL(thicket)
% % \end{algorithm}
% \begin{lem} \label{lem:generalized_trick}
% Let $\G$ be a graph containing $S, A$ that is a district $D$. Define $\G^* = \G(\{D \setminus A\}(a))$, and let $\tildep{\vec{D}}$ be a district of $\G(\{D \setminus A\})$. Let $\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}})) \ci S \mid (\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}}))) \cup \pas(\tildep{\vec{D}})$ hold, where genealogical relations are taken with respect to $\G^*$ unless otherwise stated. Let $A \in \ch_\G (S)$. 
% Then, 
% {\small
% \begin{align*}
%     &p(\tildep{\vec{D}} \mid \doo (\pas (\tildep{\vec{D}}))) \\
%     &=p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}}))) \prod_{\tildep{\vec{D}}_i \in \prec_{\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}}))}}\\
%     &  p(\tildep{\vec{D}}_i \mid \pre_\prec (\tildep{\vec{D}}_i), \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \pas(\tildep{\vec{D}}), S=s).
% \end{align*}
% }
    
% \end{lem}
% \begin{proof}
% Let $s$ be a value of $S$ which intervenes on $A$, is laid back for the rest of $\tildep{\vec{D}}$, and is consistent with the value $A=a$.

% {\small
% \begin{align*}
%     &p(\tildep{\vec{D}} \mid \doo (\pas (\tildep{\vec{D}}))) \\
%     &= p(\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}})) \mid \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \doo(\pas(\tildep{\vec{D}}))) \\
%     &\times p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}}))) \\ 
%     &= p(\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}})) \mid \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \doo (\pas(\tildep{\vec{D}})), S=s)\\
%     &\times p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}}))) \\
%     &= p(\tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}}))) \prod_{\tildep{\vec{D}}_i \in \prec_{\tildep{\vec{D}} \cap \de(\pas(\tildep{\vec{D}}))}}\\
%     &p(\tildep{\vec{D}}_i \mid \pre_\prec (\tildep{\vec{D}}_i), \tildep{\vec{D}} \cap \nd(\pas(\tildep{\vec{D}})), \pas(\tildep{\vec{D}}), S=s).
% \end{align*}
% }
% \end{proof}
% \begin{algorithm}
% \caption{Identify CS-Hedge} \label{alg:cs}
% \KwData{$\G_{\cl(\vec{D}^*)}, a, q[\cl(\vec{D}^*)], \vec{D}^*, \cl(\vec{D}^*)$}
% \KwResult{$q[\vec{D}^*]$ or FAIL}
% % \Comment{
% % \begin{enumerate}
% %     \item $\cl(\vec{D}^*)$ must be a district (otherwise something would have been fixable)
% %     \item $\cl(\vec{D}^*)$ must contain $A$ (otherwise $\cl(\vec{D}^*) = \vec{D}^*$)
% %     \item $\cl(\vec{D}^*)$ must contain $S$
% %     \item $A$ must have children (or it would have been fixable and thus not in $\cl(\vec{D}^*)$)
% % \end{enumerate}}
% $\G^* \gets \G(\{\vec{D}^* \setminus A\} (a))$ \;
% $D \gets \cl(\vec{D}^*)$\;
% \If{$S \not \in \pa_{\G^*}(A)$}{
%     \Return FAIL(cs-hedge) \label{alg:fail_cs_case_1}
% }
% \For{$\tildep{\vec{D}} \in {\cal D}(\G(\{D \setminus A\}))$}{
  
      
%     \eIf{$\tildep{\vec{D}} \cap \de_{\G^*} (\pas(\tildep{\vec{D}})) \ci S \mid (\tildep{\vec{D}} \cap \nd_{\G^*}(\pas_{\G^*} (\tildep{\vec{D}}))) \cup \pas(\tildep{\vec{D}})$}{
%         $q[\tildep{\vec{D}}]\gets$ computed via \cref{lem:generalized_trick}
    
%     }{
%         \eIf{$\tildep{\vec{D}} \cap \ch(A) = \emptyset$}{
%            \eIf{$\tildep{\vec{D}}$ reachable in $\G$}{
%               $q[\tildep{\vec{D}}] \gets \phi_{D \setminus \tildep{\vec{D}}}(q[D], \G)$
%         }{
%               \Return FAIL(cs-hedge for $\tildep{\vec{D}}, \pas(\tildep{\vec{D}})$) \label{alg:fail_cs_case_2}
%         }
        
%         }{
%             \Return FAIL(cs-hedge for $\tildep{\vec{D}}, A$) \label{alg:fail_cs_case_3}
        
%         }
    
%     }


%         % \eIf{$\vec{D}^* \cap \de_{\G^*} (a) \ci S \mid (\vec{D}^* \cup \pa_{\G^*} (\vec{D}^*)) \setminus (\vec{D}^* \cap \de_{\G^*} (a))$}{
            
%         %     $r_{\vec{D}^*} \gets \phi_{V \setminus (\vec{D}^* \cap \de_{G^*} (a))}(q(V \mid W), \G)\mid_{S=s} \times \phi_{V \setminus ((\vec{D}^* \cup \pa_{G^*} (\vec{D}^*)) \setminus (\vec{D}^* \cap \de_{\G^*} (a)))}(q(V \mid W), \G)$
        
%         % }{
%         %     \Return FAIL(context-specific hedge)
        
%         % }
    
    
   

% }
% \Return $\sum_{D \setminus \vec{D}^*} \prod_{\tildep{\vec{D}} \in {\cal D}(\G(\{D \setminus A\}))} q[\tildep{\vec{D}}]$
% %\For{$\vec{D}^* \in {\cal D}'= \{\vec{D}^* \in {\cal D}(\G^*) | \vec{D}^* \cap \de_{(\G*)}(a) \neq \emptyset\}$}{
% % \If{$S \not \ci \vec{D}^* \cap \de(a)_{\G^*} \mid (\vec{D}^* \cup \pa(\vec{D}^*)) \setminus (\vec{D}^* \cap \de(a)_{\G^*}))$}{
% %     \Return FAIL(context-specific hedge)

% %     }
% % }
% % \Return $\prod_{\vec{D}^* \in {\cal D}'}q(\vec{D}^* \mid S=s, \pa_{\G(a)}(\vec{D}^*) \setminus \vec{D}^*) \times \prod_{\vec{D}^* \in {\cal D}(\G^*) \setminus {\cal D}'} q(\vec{D}^* \mid \pa_{\G} (\vec{D}^*) \setminus \vec{D}^*)$
% % where $s$ is laidback for each $\vec{D}^*$ and consistent with $A=a$


% \end{algorithm}


% To establish completeness, we rely on a novel construction which we describe below. This new construction is needed because identification can fail on a structure which contains an $S$, and possibly children of $S$. Existing constructions such as the hedge of \citet{shpitserIdentificationJointInterventional2006} would therefore not be in the model defined in \cref{dfn:data_fusion_scm}.

% \begin{thm}[Double bow arc graph]
% $p(Y, S \mid \doo(A))$ is not identified in the double bow arc graph.
% \end{thm}
% \begin{proof}
% We first rename $S$ to $S_S$ and add a sibling of $S_S$ called $S_V$. We replace bidirected arcs with $U$ variables named after their endpoints. We remove the directed edge $S \to A$.

% Then, following \cite{shpitserIdentificationJointInterventional2006}, we construct two models under their bit parity scheme, in which all variables are binary. For model 1:
% \begin{align*}
%     A &= U_{A, Y} \\
%     Y &= A \oplus U_{S_S, Y} \\
%     S_S &= U_{A, S_S} \oplus U_{S_V, S_S} \oplus U_{S_S, Y} \\
%     S_V &= U_{S_V, S_S}
% \end{align*}
% whereas in model 2
% \begin{align*}
%     A &= U_{A, Y} \\
%     Y &= U_{S_S, Y} \\
%     S_S &= U_{S_V, S_S} \oplus U_{S_S, Y} \\
%     S_V &= U_{S_V, S_S}
% \end{align*}

% Then, the observed distributions agree such that positive support exists only where the bit parity of the root of $F$ is even, but when $A$ is intervened on, in model 1 there is a uniform distribution on the root, whereas it is unchanged in model 2.

% Then, in the structural equation for $A$ we enforce the semantics of intervening on $A$, and so replace the structural equation for $A$ in both models with 
% \[A = \begin{cases} U_1, S_S = 0 \\ S_V, S_S = 1 \end{cases}\]

% This has the effect of preserving the original hedge bit parity property when $S_S = 0$, but when $S_S = 1$, in model 1 the root now receives $U_{A, S_S}$ only once as the pathway $U_{A, S_S} \to A \to S_S$ is disrupted, whereas in model 2 the root never paid attention to $U_{A, S_S}$ in the first instance, creating a discrepancy in the observed distribution.

% To restore agreement on the observed distribution we augment the structural equation of $Y$. When $S_S = 1$ we need to add a random bit to the structural equation of $Y$, to ensure uniform distributions on the root. Since $S \to Y$ is not in $\G$, we instead use $U_{S_V, S_S}$ in its place, and create a new random bit $\tilde{U}$ such that in model 1
% \[Y = \begin{cases} A \oplus U_{S_S, Y}, U_{S_V, S_S} = 0 \\ A \oplus U_{S_S, Y} \oplus \tilde{U}, U_{S_V, S_S} = 1 \end{cases}\]

% whereas in model 2 
% \[Y = \begin{cases} U_{S_S, Y}, U_{S_V, S_S} = 0 \\ U_{S_S, Y} \oplus \tilde{U}, U_{S_V, S_S} = 1 \end{cases}\]
% \end{proof}


% %\begin{restatable}[Generalized trick]{lem}{fullgeneralizedtrick} 

% \begin{proof}

% \end{proof}

% \begin{thm}
%   \cref{alg:main} is sound.
% \end{thm}
% \begin{proof}
%   The algorithm aims to identify
%   $p(\vec{V} \setminus \{A\} \mid \text{do}(a))$ for a singleton treatment $A$ in the causal model represented by $\G$,
%   with additional restrictions pertaining to the relationship of $S$ to its children in $\G$.  Such restrictions do not affect district factorizations of the observed and interventional distributions which hold due to results in \cite{tianIdentificationCausalEffects2002}.

%   Thus, for every value assignment $v \in \mathfrak{X}_{\vec{V} \setminus \{A\}}$,

%   \begin{align*}
%   &p(\{ W = v_W : W \in V \setminus A \} \mid \text{do}(a)) \\ 
%   &=\prod_{D \in {\cal D}_{\G_{V \setminus A}}}
%   p(v_D \mid \text{do}(v_{\pa_{\cal G}(D) \setminus D})).
%   \end{align*}
%   Each term is identified by one of three cases.
  
%   The first case, where $\cl_{\G}(\vec{D}^*) = \vec{D}^*$, follows by the soundness proof of the ID algorithm.  See \citep{shpitserIdentificationJointInterventional2006,richardsonNestedMarkovProperties2023} for details.

%   The second case, where $\vec{D}^* \subset \cl_{\G}(\vec{D}^*)$ and $S \not\in \cl_{\G}(\vec{D}^*)$ follows from the soundness of the gID algorithm.  Specifically, this case shows that the distribution $p(\cl_{\G}(\vec{D}^*) \mid \text{do}(\pa_{\G}(\cl_{\G}(\vec{D}^*)) \setminus \cl_{\G}(\vec{D}^*)))$ is identified, and represents the observed data distribution corresponding to a causal model represented by $\G_{\cl_{\G}(\vec{D}^*)}$.  Since $S \not\in \cl_{\G}(\vec{D}^*)$, the available datasets in this model may be reformulated as observational and interventional distributions on $\cl_{\G}(\vec{D}^*)$ indexed by values of $S$.  These are precisely the inputs for the gID algorithm, and identifiability follows by the soundness of that algorithm~\citep{leeGeneralIdentifiabilityArbitrary2019}.

%   The third case, where $\vec{D}^* \subset \cl_{\G}(\vec{D}^*)$ and $S \in \cl_{\G}(\vec{D}^*)$ follows from the soundness of \cref{alg:cs}.

%   Let $D = \cl_{\G}(\vec{D}^*)$.  For every value $d$ of $D$, \cref{alg:cs} factorizes $p(D = d \setminus A \mid \text{do}(a))$ as $\prod_{\tildep{\vec{D}} \in {\cal D}(\G_{D \setminus A})} p(d_{\tildep{\vec{D}}} \mid d_{\pa_{\G}(\tildep{\vec{D}}) \setminus \tildep{\vec{D}}})$.
%   Given this factorization, each term corresponding to $\tildep{\vec{D}}$ is identified via one of two cases.  If $\tildep{\vec{D}}$ is reachable in $\G$, the soundness of the derivation for the term
%   $p(d_{\tildep{\vec{D}}} \mid d_{\pa_{\G}(\tildep{\vec{D}}) \setminus \tildep{\vec{D}}})$ follows by
%   the soundness of the fixing operator reformulation of the ID algorithm, as described in \citep{richardsonNestedMarkovProperties2023}.

%   Otherwise, soundness follows by \cref{lem:generalized_trick}.

% \end{proof}

% \begin{thm}
%    \cref{alg:main} is complete. 
% \end{thm}
% \begin{proof}
%     To prove completeness, we will establish that whenever FAIL is returned, a suitable construction bearing witness to non-identification can be provided.

%     \ARef*{alg:fail_positivity}: If there is no $s$ laidback for $\vec{D}^*$ then identification fails. This follows from the fact that identifying the joint distribution $p(\vec{D}^* \mid \text{do}(\pa_{\cal G}(\vec{D}^*) \setminus \vec{D}^*))$ is impossible if only marginal distributions of this joint distribution are available and random variables in $\vec{D}^*$ do not exhibit additional factorization structure, since $\vec{D}^*$ is a district.
    
%     %is because then we have a collection of various margins of $\vec{D}^*$, but no joint. Then, because
%     Since joint distributions cannot be recovered from marginal distributions without further assumptions, we fail to identify $q[\vec{D}^*]$. This case handles the degenerate identification failure case of the gID algorithm (see, e.g., Section~3 of the supplementary material of \cite{kivvaRevisitingGeneralIdentifiability2022}, which provides an explicit construction showing this).

%     \ARef*{alg:fail_thicket}: A failure at this point involves the district $\vec{D}^*$ not containing $S$, but for each $s$ that is both consistent with $a$ and laidback for $\vec{D}^*$, $\vec{D}^*$ is not reachable in the corresponding $\G^{[s]}$. Then, we may adopt either the thicket construction of \cite{leeGeneralIdentifiabilityArbitrary2022} or the corresponding alternative in \cite{kivvaRevisitingGeneralIdentifiability2022}, where $S$ is now viewed simply as an indexing operator for the various distributions that are inputs into gID. Then, the thicket construction immediately witnesses the non-identifiability of the desired causal effect $p(\vec{D}^* \mid \doo(\pas(\vec{D}^*)))$. To move this construction into the model defined in \cref{dfn:data_fusion_scm} we can introduce any suitable marginal distribution over $S$ and other variables not involved in $\cl(\vec{D}^*)$.

%     \ARef*{alg:fail_cs_case_1}, \ARef*{alg:fail_cs_case_2}, \ARef*{alg:fail_cs_case_3}: In each of these cases, we fail with a hedge constructed for the two districts named in the failure case, per \cref{thm:noise-injected-hedge}. To extend this construction to all of $V$, we introduce marginal distributions over all variables not otherwise mentioned in the construction.
% \end{proof}

% \begin{ex}[Identifying $p(\{Y, S\}(a))$ in \cref{fig:double_bow_arc_case_2}]

% Assume that $\mathfrak{X}_S = \{ (\emptyset, \emptyset), (\{A\}, 0), (\{A\}, 1)\}$. Applying \cref{alg:main}, we get that $\vec{D}^*_2= \{Y\}, \vec{D}^*_1 = \{S\}$. We notice that $\vec{D}^*_1$ triggers line 4 and admits the fixing sequence $Y, A$ and thus $q[\vec{D}^*_1] = p(S)$.  For $\vec{D}^*_2$, line 14 is triggered as $\cl(\vec{D}^*_2) = \{Y, A, S\}$. Going to \cref{alg:cs}, which is called for $\vec{D}^*_2$, we find that $\tildep{\vec{D}}_1 = \{Y\}, \tildep{\vec{D}}_2 = \{S\}$. For $\tildep{\vec{D}}_1$, since $Y \ci_{\G(\{\vec{D}^*_2 \setminus A\}(a))} S$, \cref{lem:generalized_trick} is triggered and therefore $q[\tildep{\vec{D}}_1] = p(Y \mid S=(\{A\}, a))$. For $\tildep{\vec{D}}_2$, line 11 is triggered and returns $q[\tildep{\vec{D}}_2] = p(S)$, which is immediately summed out in line 16. The final result is \[p(\{Y, S\}(a)) =  p(S) p(Y \mid S = (\{A\}, a)). \]
    
% \end{ex}
% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%         \toprule % from booktabs package
%         \bfseries Dataset & \bfseries Result\\
%         \midrule % from booktabs package
%         Data1 & 0.12345\\
%         Data2 & 0.67890\\
%         Data3 & 0.54321\\
%         Data4 & 0.09876\\
%         \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%     F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


\end{document}
