\section{Proofs}
\label{sec:app_proofs}

\textbf{\Cref{prop:decomposition} restated}. \textit{Given a PAG $\1P$ over $\V$ and $\A\subset\C\subseteq\V$, let the region of $\A$ with respect to $\C$ be denoted $\1R_\A$. $Q[\C]$ can be decomposed as,
    \begin{align}
        Q[\C] = Q[\1R_\A] \cdot Q[\1R_{\C\backslash\A}] \hspace{0.1cm}/\hspace{0.1cm} Q[\1R_\A\inter\1R_{\C\backslash\A}].
    \end{align}
}
    
\begin{proof}
    The proof can be found in \cite[Thm.1]{jaber2019causal}.
\end{proof}

\textbf{\Cref{prop:duality} restated}. \textit{Let $\1P$ be the PAG underlying $P(\V)$. Under faithfulness, a causal effect is partially identifiable from $P(\V)$ with bound $[a,b]$ if and only if it is partially identifiable from $\1P$ and $P(\V)$ with bound $[a,b]$.}

\begin{proof}
    Let $\1P$ be the PAG underlying a given distribution $P(\V)$. Let $\3M$ be the set of all SCMs over a set of endogenous variables $\V$, and let $\3M(\1P)$ be the subset of SCMs over a set of endogenous variables $\V$ whose induced causal diagrams are consistent with the PAG $\1P$. The partial identification problem from $P(\V)$ (\Cref{def:partial_identification}) may be stated as follows,
    \begin{align}
        \underset{\M \in \3M}{\text{min / max}}\hspace{0.1cm} P_\M (\y_{\x}), \quad \text{such that}\quad P_\M(\v)=P(\v).
    \end{align}
    Similarly, the partial identification problem from the PAG $\1P$ and $P(\V)$ (\Cref{def:partial_identification_pag}) may be stated as follows,
    \begin{align}
        \underset{\M \in \3M(\1P)}{\text{min / max}}\hspace{0.1cm} P_\M (\y_{\x}), \quad \text{such that}\quad P_\M(\v)=P(\v).
    \end{align}
    These definitions highlight that the bounding problem involves a search over structural models consistent with the observational distribution $P(\V)$, and optionally $\1P$. We will show that under faithfulness, $\{\M\in\3M: P_\M(\v)=P(\v)\} = \{ \M\in\3M(\1P): P_\M(\v)=P(\v)\}$. In that case, the optimization problems coincide and therefore their solutions coincide.
    
    To see this note that $\{ \M\in\3M(\1P): P_\M(\v)=P(\v)\} \subseteq \{ \M\in\3M: P_\M(\v)=P(\v)\}$ by definition since $\3M(\1P)$ introduces a restriction on the space $\3M$. Under faithfulness, consider any SCM $\M\in\3M$ such that $P_\M(\v)=P(\v)$. It follows then that,
    \begin{align*}
        (\X\indep\Y\mid\Z)_{P_\M} \Leftrightarrow (\X\indep\Y\mid\Z)_{\G_\M}.
    \end{align*}
    In other words, under faithfulness, the graph $\G_\M$ entails a $d$-separation for every conditional independence in the data, and vice versa. $\G_\M$ must then be included in the set of diagrams represented by the PAG $\1P$, as the PAG is defined as the set of diagrams whose $d$-separation statements match the conditional independencies in the data, that is, $\M \in \3M(\1P)$. As $\M$ was arbitrary (up to agreement with the observational distribution), we have that, under faithfulness, $\{ \M\in\3M: P_\M(\v)=P(\v)\} \subseteq \{ \M\in\3M(\1P): P_\M(\v)=P(\v)\}$. This implies then that $\{\M\in\3M: P_\M(\v)=P(\v)\} = \{ \M\in\3M(\1P): P_\M(\v)=P(\v)\}$, showing the claim.
\end{proof}

Next we introduce two utility lemmas that will be useful in the derivation of the following proofs.

\begin{lemma}
    \label{lemma:subset_scm}
    Given a PAG $\1P$ over $\V$, let $\C\subset\V$. Then,
    \begin{align}
        \Big\{Q[\C;\M]: \M \in \3M(\1P)\Big\} =  \Big\{Q[\C;\M]: \M \in \3M(\1P_{\widetilde{\V\backslash\C}})\Big\}.
    \end{align}
\end{lemma}

\begin{proof}
    For a given causal diagram $\G$, a causal effect of interest $P_{\x}(\y)$ can be written, 
    \begin{align}
    \label{eq:param_effect}
        P_{\x}(\y) = \sum_{\v\backslash \{\x\cup\y\}}\int_{\Omega_{\U}} \prod_{V \in \V\backslash \X} P(v \mid \pa_V,\boldsymbol u_V) dP(\boldsymbol u) = \sum_{\s \backslash \{\y\}}\int_{\Omega_{\U}} \prod_{V\in\S} P(v \mid \pa_V,\boldsymbol u_V)  dP(\boldsymbol u),\nonumber
    \end{align}
    where $\S = \An(\Y)_{\G_{\overline{\X}}} \backslash \X$ and $\s$ is some value in the domain of $\S$. This expression depends only on probabilities associated with variables $\S$, $\U_S, S\in\S$ and their functional dependencies through terms $P(v \mid \pa_V,\boldsymbol u_V)$ that in turn are determined by the underlying SCMs compatible with the graph and data, which in this case are parameterized by $\{f_V:V\in\S\}$ and $\{P(u_V):V\in \An(\S)_{\G[\C]}\}$ where $\C$ is the $c$-component in $\G$ that contains $\S$. The distribution and values of any other variable are of no consequence to the desired causal effect. In particular, any descendants of $\Y$ in $\G$ can be marginalized out without loss of generality.
    
    The same reasoning applies to $Q[\C]:= P_{\v\backslash\c}(\c)$ which depends only on $\An(\C)_{\G_{\overline{\V\backslash\C}}} \backslash (\V\backslash\C)$ and the functional dependencies of associated variables. Recall that $\1P_{\widetilde{\V\backslash\C}}$ denotes the graph in which all edges that are visible in $\1P$ and are into variables in $\V\backslash\C$ are deleted, and in which all invisible edges that are into variables in $\V\backslash\C$ are replaced with bi-directed edges. Since $\1P_{\widetilde{\V\backslash\C}}$ only modifies the functional assignment of descendants of $\C$ and these do not influence the causal effect of interest we have that,
    \begin{align}
        \Big\{Q[\C;\M]: \M \in \3M(\1P)\Big\} =  \Big\{Q[\C;\M]: \M \in \3M(\1P_{\widetilde{\V\backslash\C}})\Big\}.
    \end{align}
\end{proof}

This lemma implies that it is sufficient to consider the set of causal diagrams compatible with $\1P_{\widetilde{\V\backslash\C}}$, i.e., the set of ME causal diagrams $\G$ in which edges $X\rightarrow Y, X\in pa(\C)_{\G}, Y \in \C$ have been removed, to characterize lower and upper bounds over queries of the form $Q[\C]$ from $\1P$. The next lemma shows how to compute quantities $Q[\cdot]$ from larger ones and is an extension of an analogous result defined for causal diagrams \cite{tian2002general}.

%For the quantities in the above equation, the following proposition is identifiable from $Q[\C]$ by the following proposition. 

\begin{lemma}
    \label{lem:ancestor_id}
    Let $\W\subseteq\C\subset\V$ such that $\W = \texttt{PossAn}(\W)_{\1P_{\C}}$. Then, $Q[\W] = \sum_{\c\backslash\w}Q[\C]$.
\end{lemma}
    
\begin{proof}
    By \cite[Prop. 1]{jaber2018causal}, if $X$ is an ancestor of $Y$ in a causal diagram $\G_{\C}$, then $X$ is a possible ancestor of $Y$ in the PAG $\1P_{\C}$. By the contrapositive of this implication, if $X$ is not a possible ancestor of $Y$ in $\1P_{\C}$, then $X$ is not an ancestor of $Y$ in $\G_\C$. Let $\W\subseteq\C\subset\V$ such that $\W = \texttt{PossAn}(\W)_{\1P_{\C}}$. By \cite[Prop. 1]{jaber2018causal} therefore, no variable in $\R = \C\backslash\W$ being a possible ancestor of $\W$ in $\1P_{\C}$ implies that no variable in $\R = \C\backslash\W$ is an ancestor of $\W$ in any ME causal diagram $\G_{\C}$. For any causal diagram $\G_{\C}$, by \cite[Lemma 3]{tian2003identification} $Q[\W] = \sum_{\r}Q[\C]$. Moreover, if $Q[\C]$ could be uniquely computed from $P(\V)$ and $\1P$, then $Q[\W]$ could be uniquely computed from $P(\V)$ and $\1P$.
\end{proof}

We are now ready to prove the validity of lower and upper bounds.

\textbf{\Cref{prop:lowerbound} restated}. 
    \textit{Given a PAG $\1P$, consider sets $\S \subset \C \subseteq \V$ and define $\W = \texttt{PossAn}(\S)_{\1P_\C}$, $\R = \W \backslash \S$, and $\T=\texttt{PossSp}(\S)_{\1P_\C}\backslash \S$. Let $\A,\B$ partition $\R$ such that $\B = \texttt{PossDe}(\T)_{\1P_\C}\inter\R, \A=\R\backslash\B$. $Q[\S]$ is lower bounded as follows:
    \begin{align}
        \label{eq:lowerbound_app}
        Q[\S] \geq  \max_\z \frac{Q[\W]}{\sum_{\s,\b} Q[\W]},
    \end{align}
    where $\Z = \texttt{PossPa}(\W)_{\1P} \backslash \texttt{PossPa}(\S)_{\1P}$.}

\begin{proof}
    If $Q[\S]$ is not identifiable given $\1P$, then $Q[\S]$ is not identifiable in one or more ME causal diagrams $\G$ by \cite[Thm. 4]{jaber2019causal}. In each of those diagrams $\G$, there must then exist an open backdoor path from a node in $\V\backslash\S$ to a node in $\S$ that could be blocked with access to a set of unobserved confounders $\U$. Let $\U$ be the union of exogenous variables that block such open backdoor paths. In turn, let $\S\subset\C$, where $\C$ is a $pc$-component in a sub-graph of $\1P$ with $Q[\C]$ identifiable. Following the statement of the proposition, let $\W = \texttt{PossAn}(\S)_{\1P_\C}$, $\R = \W \backslash \S$, and $\T=\texttt{PossSp}(\S)_{\1P_\C}\backslash \S$. Further, let $\A,\B$ partition $\R$ such that $\B = \texttt{PossDe}(\T)_{\1P_\C}\inter\R, \A=\R\backslash\B$. Without loss of generality, by \Cref{lemma:subset_scm} inference on $Q[\S]$ given $\1P$ is equivalent to inference on $Q[\S]$ given $\1P_{\widetilde{\V\backslash\S}}$.
    
    To show the claim, we show that $Q[\S]$ is lower bounded in every causal diagram compatible with $\1P_{\widetilde{\V\backslash\S}}$. Let $\G$ be any such causal diagram. In light of the definitions above, it holds that,
    \begin{enumerate}
        \item $\R$ is $d$-separated from $\S$ conditioned on $\U$ in $\G_{\overline{\V\backslash\W}, \underline{\R}}$,
        \item $\U$ is exogenous and thus $d$-separated from $\R$ conditioned on $\V\backslash\W$ in $\G_{\overline{\V\backslash \W, \R}}$,
        \item  $\A$ is $d$-separated from $\U$ conditioned on $\V\backslash\W$ in $\G_{\overline{\V\backslash\W}}$. 
    \end{enumerate}
    Condition (1) states that all backdoor paths from $\R$ to $\S$, \textit{i.e.} those starting with an edge $R \leftarrow \cdots$ or $R \dashleftarrow\dashrightarrow \cdots, R\in\R$, are blocked conditioned on $\U$. To see this, note that $\U$ is chosen to block all backdoor paths through an unobserved confounder and that any other backdoor paths are assumed away in $\1P_{\widetilde{\V\backslash\S}}$, \textit{i.e.} there is no edge of the form $X \rightarrow Y, X \in \S, Y\in \V\backslash\S$ in any causal diagram compatible with $\1P_{\widetilde{\V\backslash\S}}$. Condition (2) holds because $\U$ is exogenous; in $\G_{\overline{\V\backslash \W, \R}}$ no open path between $\R$ and $\U$ conditioned on $\V\backslash \W$ could exist. Condition (3) holds because $\A$ is defined precisely as the set of variables in $\R$ that are not descendants of $\T$ and thus that are not descendants of $\U$; any path between $\A$ and $\U$ is therefore blocked by a child or descendant of $\U$ that acts as a collider on the path.
    
    The following derivation applies to $Q[\S]$ in $\G$.
    \begin{align*}
        Q[\S] &:= P(\s \mid do(\v\backslash \s)) \\
        &= P(\s \mid do(\r, \v\backslash \w))\\
        &\stackrel{(1)}{=} \sum_{\u} P(\s \mid \u, \r, do(\v\backslash \w)) P(\u \mid do(\r, \v\backslash \w))\\
        &\stackrel{(2)}{=} \sum_{\u} P(\s \mid \u, \r, do(\v\backslash \w)) P(\u \mid do(\v\backslash \w))\\
        &\stackrel{(3)}{=} \sum_{\u} P(\s \mid \u, \a, \b, do(\v\backslash \w)) P(\u \mid \a, do(\v\backslash \w))\\
        &\geq \sum_{\u} P(\s \mid \u, \a, \b, do(\v\backslash \w)) P(\u, \b \mid \a, do(\v\backslash \w)) \\
        &= P(\s, \b \mid \a, do(\v\backslash \w))\\
        &= \frac{P(\w \mid do(\v\backslash \w))}{\sum_{\s,\b} P(\w \mid do(\v\backslash \w))}\\
        &= \frac{Q[\W]}{\sum_{\s,\b} Q[\W]}.
    \end{align*}
    (1) follows from condition 1; (2) follows from condition 2; (3) follows from condition 3. The inequality follows from the fact that the event $\{\u,\b\}$ is no more likely than the event $\{\u\}$ under any probability mass function. Finally, $Q[\S]$ is a function of $Pa(\S)_{\G}$ only and thus this bound holds for any value of $\Z := Pa(\W)_{\G} \backslash Pa(\S)_{\G}$ and therefore any $\Z = \texttt{PossPa}(\W)_{\1P} \backslash \texttt{PossPa}(\S)_{\1P}$. In particular,
    \begin{align*}
        Q[\S] \geq \max_{\z} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]}.
    \end{align*}
\end{proof}

\textbf{\Cref{prop:upperbound} restated}. 
    \textit{Given a PAG $\1P$, consider sets $\S \subset \C \subseteq \V$ and let a partial topological ordering of $\S$ be $\S_1 \prec \cdots \prec \S_k$. Define $\W = \texttt{PossAn}(\S)_{\1P_\C}$, $\R = \W \backslash \S$, and $\T=\texttt{PossSp}(\S)_{\1P_\C}\backslash \S$. Let $\A,\B$ partition $\R$ such that $\B = \texttt{PossDe}(\T)_{\1P_\C}\inter\R, \A=\R\backslash\B$. $Q[\S]$ is upper bounded as follows:
    \begin{align}
        Q[\S] \leq \min_{\z} \left\{\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} \right\} + Q[\S\backslash \S_k],
    \end{align}
    where $\Z = \texttt{PossPa}(\W)_{\1P} \backslash \texttt{PossPa}(\S)_{\1P}$.
}

\begin{proof}
    Similarly to the proof of \Cref{prop:lowerbound}, to show the claim we show that $Q[\S]$ is upper bounded in every causal diagram compatible with $\1P_{\widetilde{\V\backslash\S}}$ and rely on facts (1,2,3) above. Let $\G$ be any such causal diagram. Let a partial topological ordering of $\S$ be $\S_1 \prec \cdots \prec \S_k$.
    
    It then holds that,
    \begin{align*}
        &Q[\S] := P(\s \mid do(\v\backslash \s)) \\
        &= P(\s \mid do(\r, \v\backslash \w))\\
        &= \sum_{\u} P(\s \mid \u, \r, do(\v\backslash \w)) P(\u \mid do(\r, \v\backslash \w)) \\
        &= \sum_{\u} P(\s \mid \u, \r, do(\v\backslash \w)) P(\u \mid do(\v\backslash \w)) \\
        &= \sum_{\u} P(\s \mid \u, \a, \b, do(\v\backslash \w)) P(\u \mid \a, do(\v\backslash \w)) \\
        &= \sum_{\u} P(\s \mid \u, \a, \b, do(\v\backslash \w)) \Big(P(\u, \b \mid \a, do(\v\backslash \w)) + P(\u \mid \a, do(\v\backslash \w))- P(\u, \b \mid \a, do(\v\backslash \w))\Big) \\
        &\stackrel{(1)}{=} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + \sum_{\u} P(\s \mid \u, \a, \b, do(\v\backslash \w)) \Big(P(\u \mid \a, do(\v\backslash \w))- P(\u, \b \mid \a, do(\v\backslash \w))\Big) \\
        &\stackrel{(2)}{\leq} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + \sum_{\u} P(\s \backslash \s_k \mid \u, \a, \b, do(\v\backslash \w)) \Big(P(\u \mid \a, do(\v\backslash \w))- P(\u, \b \mid \a, do(\v\backslash \w))\Big) \\
        &\stackrel{(3)}{=} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + \sum_{\u} P(\s \backslash \s_k \mid \u, \a, \b, do(\v\backslash \w, \s_k)) P(\u \mid \a, do(\v\backslash \w, \s_k)) \\
        & - \sum_{\u, \s_k} P(\s \mid \u, \a, \b, do(\v\backslash \w))P(\u, \b \mid \a, do(\v\backslash \w)) \\
        &\stackrel{(4)}{=} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + P(\s\backslash \s_k \mid \a, do(\v\backslash \w, \b, \s_k)) - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]}\\
        &\stackrel{(5)}{=} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + P(\s\backslash \s_k \mid do(\v\backslash \w, \a, \b, \s_k)) - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]}\\
        &\stackrel{(6)}{=} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + Q[\S\backslash \S_k] - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]}.
    \end{align*}
    The first four equalities follow from the observations (1,2,3) as in the derivation of the lower bound (\Cref{prop:lowerbound}); (1) follows from the derivation in the lower bound; (2) follows from the fact that the event $\{\s\}$ is no more likely than the event $\{\s\backslash\s_k\}$ under any probability mass function and that the difference in brackets is greater than or equal to zero; (3) follows by rule 3 of the do-calculus (\Cref{thm:do_calculus}) since $\S_k$ is $d$-separated from $\S\backslash \S_k$ given $\U,\A,\B,\V\backslash\W$ in $\G_{\overline{\S_k,\V\backslash\W}}$ and since $\S_k$ is $d$-separated from $\U$ given $\A$ in $\G_{\overline{\S_k,\V\backslash\W}}$; (4) follows by marginalizing out $\U$ and similarly to (1); (5) follows by rule 2 of the do-calculus (\Cref{thm:do_calculus}) since $\S\backslash\S_k$ is $d$-separated from $\A$ in $\G_{\overline{\V\backslash\W, \B, \S_k}\underline{\A}}$; (6) follows by the definition of $Q[\cdot]$.
    
    
    Similarly $P(\s \mid do(\v\backslash \s)) $ is a function of $Pa(\S)_{\G}$ only and thus this bound holds for any value of $\Z := Pa(\W)_{\G} \backslash Pa(\S)_{\G}$. In particular,
    \begin{align*}
        Q[\S] \leq \min_{\z} \left\{\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} \right\} + Q[\S\backslash \S_k].
    \end{align*}
\end{proof}

\textbf{\Cref{prop:bounds_vs_natural} restated.} \textit{Consider a query $P_\x(\y)$ and let $\1P$ be the PAG over $\{\X,\Y\}$ compatible with $P$. Then, under an assumption of faithfulness, the bounds given in \Cref{prop:lowerbound,prop:upperbound} are at least as tight as the natural bounds.}

\begin{proof}
    Following the premise of the proposition, consider a query $P_\x(\y)$ and let $\1P$ be the PAG over $\V=\{\X,\Y\}$ compatible with $P$. $P_\x(\y) = Q[\Y]$ and therefore we will compare bounds on $Q[\Y]$ with the natural bounds.
    
    By \Cref{lem:ancestor_id}, for any $\W = \texttt{PossAn}(\S)_{\1P}$ it holds that $Q[\W] = \sum_{\v\backslash\w} Q[\V] = \sum_{\v\backslash\w} P(\v) = P(\w)$. Further, since $\W\subseteq \{\X,\Y\}$ it holds that $P(\w) \geq P(\x,\y)$. The proposed lower bound can then be shown to be larger or equal to the natural lower bound as
    \begin{align*}
        \max_{\z} \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} \geq \frac{Q[\W]}{\sum_{\s,\b} Q[\W]}
        \geq Q[\W]
        \geq P(\x,\y).
    \end{align*}
    The last term being the expression of the natural lower bound.
    
    Let $\S=\Y, \R=\W\backslash\S$, and let $\S_k\subseteq \S$ be any subset of $\S$. Note that $\R\union(\V\backslash\W) = \X$ in this example. From the definition of $\W$ as the set of possible ancestors of $\Y$ in $\1P$, it holds that  $(\R \indep \V\backslash\W)_{\G_{\overline{\V\backslash\W}}}$ for any causal diagram $\G$ compatible with an arbitrary $\1P$ as there cannot be any directed paths from $\V\backslash\W$ to $\R$ (otherwise at least some element in $\V\backslash\W$ would be defined as a possible ancestor of $\S$ which we ruled out by the definition of $\W$). 
    
    Denote by $\U$ the set of unobserved confounders. It holds then that conditioning on $\U$ blocks all backdoor paths from $\R$ to $\S$ in graphs $\G_{\underline{\V\backslash\S}}$ in which directed edges into $\S$ are removed, that is $(\S \indep \R \mid \U)_{\G_{\underline{\V\backslash\S}}}$. This holds for any causal diagram $\G$ compatible with an arbitrary $\1P$. With these facts, we consider the following derivation to show that the proposed upper bound is smaller than or equal to the natural upper bound,
    \begin{align*}
        \min_{\z} \left\{\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} \right\}& + Q[\S\backslash \S_k]\\
        &\leq  \frac{Q[\W]}{\sum_{\s,\b} Q[\W]} - \sum_{\s_k}\frac{Q[\W]}{\sum_{\s,\b} Q[\W]} + Q[\S\backslash \S_k]\\
        &\stackrel{(1)}{\leq} Q[\W] - \sum_{\s_k}Q[\W] + Q[\S\backslash \S_k]\\
        &= P(\w) - P(\s\backslash\s_k, \r \mid do(\v\backslash\w)) + P(\s\backslash\s_k \mid do(\v\backslash\s, \s_k))\\
        &= P(\w) +\sum_{\u} P(\s\backslash\s_k \mid \u, \r, do(\v\backslash\w))\{P(\u) - P(\u,\r)\}\\
        &\stackrel{(2)}{\leq} P(\w) +\sum_{\u} \{P(\u) - P(\u,\r)\}\\
        &= \sum_{\v\backslash\w} \{P(\w, \v\backslash\w) -  P(\r,\v\backslash\w)\} + 1\\
        &\stackrel{(3)}{\leq} P(\w, \v\backslash\w) -  P(\r,\v\backslash\w) + 1\\
        &\stackrel{(4)}{=} P(\x, \y) -  P(\x) + 1.
    \end{align*}
    The last term being the expression of the natural upper bound. In the above, (1) holds since $Q[\W] - \sum_{\s_k}Q[\W] \leq 0$ and therefore multiplying by a number no greater than 1, namely $\sum_{\s,\b} Q[\W]$, results in a larger or equal expression; (2) holds by a similar observation since $P(\u) - P(\u,\r)\geq 0$ and $P(\s\backslash\s_k \mid \u, \r, do(\v\backslash\w))\leq 1$; (3) holds since $P(\w, \v\backslash\w) -  P(\r,\v\backslash\w) \leq 0$; (4) holds by definition since $\W \union \V\backslash\W = \X\union\Y$ and $\R\union\V\backslash\W = \X$.
\end{proof}


\textbf{\Cref{pro:soundness_alg} restated}. 
    \textit{Partial IDP (\Cref{alg:partialid}) terminates and is sound.}

\begin{proof}
    The proof follows from the termination guarantee of IDP, and the soundness of IDP \citep{jaber2019causal} and \Cref{prop:lowerbound,prop:upperbound}.
    
    For the run time, let $n$ be the number of variables. Operations in the \texttt{PID} function of \Cref{alg:partialid}, such as computing $pc$-components or finding the set of possible ancestors or descendants (as done in Props. 3 and 4), could be done in $\mathcal O(n^2)$ time, e.g., with a Breadth-First Search algorithm. Line 10 in \texttt{PID} decomposes the input set $\boldsymbol D$ into at most $n$ subsets, each requiring a new call to \texttt{PID}. In turn, line 7 in \texttt{PID}, if triggered, will reduce the set $\boldsymbol V$ of size $n$ by at least one variable at a time, resulting in at most $n$ additional separate calls to \texttt{PID}. Since line 7 might be triggered repeatedly for each decomposed $C$-factor in line 10, overall, \Cref{alg:partialid} requires $\mathcal O(n^2)$ calls to \texttt{PID} and consequently $\mathcal O(n^4)$ time to return the bounds.
\end{proof}


\textbf{\Cref{prop:expressiveness_leg} restated.} \textit{Given a PAG $\1P$, let $\1L$ be the set of ME LEGs. Then, $\3M(\1P) = \3M(\1L)$.}

\begin{proof}
Each causal graph is a coarse representation of an underlying SCM, in which the presence of an edge $X\rightarrow Y$ implies the potential presence of $X$ as an argument in the function $f_Y$. In turn the absence of an edge implies a constraint in the system, e.g. no edge $X\rightarrow Y$ implies that $X$ is not an argument of $f_Y$. In general therefore the addition of an edge results in a family of SCMs that is strictly more general. 

Each MAG can be converted to an LEG by replacing some of the bi-directed arrows by directed arrows. Bi-directed arrows exclude an ancestral relationship whereas directed arrows do not exclude unobserved confounding; a directed edge $X\rightarrow Y$ thus defines a strictly more general $f_Y$ in contrast with $X\dashleftarrow\dashrightarrow Y$. Since this is the only difference between a MAG and its LEG, it follows that the set of SCMs compatible with a given MAG is strictly contained in the set of SCMs compatible with its LEG. This reasoning applied to every MAG in the equivalence class implies that $\3M(\1P) = \3M(\1L)$.
\end{proof} 

\textbf{\Cref{prop:expressiveness_causal_graphs_in_leg} restated.} \textit{Given a PAG $\1P$, let $\1D$ be the set of ME MBD diagrams. Then, $\3M(\1P) = \3M(\1D)$.}

\begin{proof} For a given LEG $L\in\1L$, let $\mathbb G_L$ be the set of maximally bi-directed causal graphs compatible with $L$. We proceed by showing that $\3M(\mathbb G_{L}) = \3M(L)$.

Let $L$ be a given LEG, and write $\G_L$ for the causal graph with the same structure as $L$. Adding bi-directed edges, without breaking conditional independencies, leads to a graph $\G_L'$ with the same ancestral and conditional independencies as $L$. The difference between the constructed causal graph $\G_L'$ and $\G_L$ is that for at least one variable $X\in\V$, the structural function takes the form $f_{X,\G_L}(\pa_X, \boldsymbol u_X)$ in $\G_L$ while it takes the form $f_{X,\G_L'}(\pa_X, \boldsymbol u_X, u_{X,Z})$ in $\G_L'$, where $Z\in\Pa_X$. The class of functions $f_{X}$ defined by $\G_L$ is included in that defined by $\G_L'$, and therefore $\3M(\G_L) \subset \3M(\G_L')$.

In general, multiple graphs $\G_L'$ in which we repeatedly add bi-directed edges until no invisible edges exist can be constructed from a single LEG $L$. For example, the LEG $L:= \{Y_1 \leftarrow X \rightarrow Y_2\}$ admits two graphs that can be constructed by adding bi-directed edges without removing statistical independencies: $\G_1:=\{Y_1 \leftarrow X \rightarrow Y_2, Y_1 \dashleftarrow\dashrightarrow X\}$ and $\G_2:=\{Y_1 \leftarrow X \rightarrow Y_2, X \dashleftarrow\dashrightarrow Y_2\}$. These graphs can be recovered exactly by \Cref{alg:causal_graphs_from_LEG}. Let $\mathbb G_L$ be the set of all maximally bi-directed graphs compatible with $L$. Then, since any other causal graph compatible with $L$ defines a family of SCMs which is subsumed in that of a maximally bi-directed causal graph, $\3M(\mathbb G_{L}) = \3M(L)$.
\end{proof}

\textbf{\Cref{prop:nonredundancy_causal_graphs_in_leg} restated.}
\textit{Given a PAG $\1P$, let $\G$ and $\1H$ be two MBD diagrams constructed from a ME LEG. In general, $\3M(\G)\not\subseteq \3M(\1H)$ and $\3M(\1H)\not\subseteq \3M(\G)$.}

\begin{proof}
We give explicit counterexamples to demonstrate this fact.

\begin{figure}[t]
\centering
\hfill
\begin{subfigure}[t]{0.30\linewidth}\centering%(d)
  \begin{tikzpicture}[SCM,scale=1]
        \node (X) at (0,0) {$X$};
        \node (Y1) at (1,-0.7) {$Y_1$};
        \node (Y2) at (1,0.7) {$Y_2$};

        \path [->] (X) edge (Y1);
        \path [->] (X) edge (Y2);
    \end{tikzpicture}
\caption{$L$}
\label{fig:app_examples4:a}
\end{subfigure}\hfill
\begin{subfigure}[t]{0.30\linewidth}\centering%(d)
  \begin{tikzpicture}[SCM,scale=1]
        \node (X) at (0,0) {$X$};
        \node (Y1) at (1,-.7) {$Y_1$};
        \node (Y2) at (1,.7) {$Y_2$};

        \path [->] (X) edge (Y1);
        \path [->] (X) edge (Y2);
        \path [conf-path] (X) edge[out = -90, in=180] (Y1);
    \end{tikzpicture}
\caption{$\G_1$}
\label{fig:app_examples4:b}
\end{subfigure}
\hfill
\begin{subfigure}[t]{0.30\linewidth}\centering%(d)
  \begin{tikzpicture}[SCM,scale=1]
        \node (X) at (0,0) {$X$};
        \node (Y1) at (1,-.7) {$Y_1$};
        \node (Y2) at (1,.7) {$Y_2$};

        \path [->] (X) edge (Y1);
        \path [->] (X) edge (Y2);
        \path [conf-path] (X) edge[out = 90, in=180] (Y2);
    \end{tikzpicture}
\caption{$\G_2$}
\label{fig:app_examples4:c}
\end{subfigure}
\hfill\null
  \caption{First example of MBD causal diagrams used in the proof of \Cref{prop:nonredundancy_causal_graphs_in_leg}.}
  \label{fig:app_examples4}
\end{figure}


In general, the set of maximally bi-directed causal diagrams (MBD) compatible with a given LEG to consider for causal effect computation cannot be reduced without loss of generality. For instance, this could be shown for the computation of $P(y_1, y_2 \mid do(x))$ given the LEG $L$ in \Cref{fig:app_examples4:a}. Here two MBD causal diagrams could be constructed: $\G_1$ and $\G_2$ in \Cref{fig:app_examples4:b} and \Cref{fig:app_examples4:c} respectively. Note that $P(y_1, y_2 \mid do(x)) = P(y_1\mid do(x))P(y_2\mid do(x))$ and that in $\G_1$: $P(y_1\mid do(x)) \in [P(y_1, x), P(y_1, x) + 1 - P(x)], P(y_2\mid do(x)) = P(y_2\mid x)$. (In this particular diagram, bounds could be derived analytically and are known to be provably tight \cite[Section 8.2]{pearl2009causality}. To further demonstrate this we provide below two SCMs compatible with $\G_1$ that evaluate to the upper and lower bounds respectively.) In contrast, in $\G_2$: $P(y_1\mid do(x)) = P(y_1\mid x), P(y_2\mid do(x)) \in [P(y_2, x), P(y_2, x) + 1 - P(x)]$. The causal effect differs across $\G_1$ and $\G_2$. Neither of the bounds computed from $\G_1$ or $\G_2$ includes the other and therefore both have to be considered for correctly bounding causal effects from $L$.


Below we give two SCMs compatible with $\G_1$ whose causal effect $P(y_1 \mid do(x))$ evaluate to the lower and upper bounds obtained through posterior sampling illustrating the tightness of the returned bounds. Let $\M_1, \M_2 \in \3M(\G_1)$ be defined by,
\begin{align*}
    \M_1 := \begin{cases}
    x := f_X(u) \\
    y_1 := \begin{cases} 
    f_{Y_1}(x, u) &\text{if } x = f_X(u), \\
    0 &\text{otherwise}. \\
    \end{cases}
    \end{cases}
\end{align*}
and,
\begin{align*}
    \M_2 := \begin{cases}
    x := f_X(u) \\
    y_1 := \begin{cases} 
    f_{Y_1}(x, u) &\text{if } x = f_X(u), \\
    1 &\text{otherwise}. \\
    \end{cases}
    \end{cases}
\end{align*}
Assume further that $P_{\M_1}(u)=P_{\M_2}(u)$. Then, both SCMs agree on observational distributions $P_{\M_1}(x, y_1)=P_{\M_2}(x, y_1)=P(x, y_1)$. However the following derivations show that the interventional distribution $P(y_1=1\mid do(x=1))$ differs across models: for $\M_1$ equal to the analytical lower bound, and for $\M_2$ equal to the analytical upper bound demonstrating that (in this case) the bound is tight. In particular,
\begin{align*}
    P_{\M_1}&(y_1=1\mid do(x=1)) \\
    &= P_{\M_1}(y_1=1\mid x=1, u: x=f_X(u))P(u: x=f_X(u)) + P_{\M_1}(y_1=1\mid x=1, u: x\neq f_X(u))P(u: x\neq f_X(u))\\
    &= P(y_1=1\mid x=1)P(x=1)\\
    &= P(y_1=1, x=1),\\
    P_{\M_2}&(y_1=1\mid do(x=1)) \\
    &= P_{\M_2}(y_1=1\mid x=1, u: x=f_X(u))P(u: x=f_X(u)) + P_{\M_2}(y_1=1\mid x=1, u: x\neq f_X(u))P(u: x\neq f_X(u))\\
    &= P(y_1=1\mid x=1)P(x=1) + P_{\M_2}(y_1=1\mid x=1, u: x\neq f_X(u))P(u: x\neq f_X(u))\\
    &= P(y_1=1, x=1) + 1 - P(x=1).
\end{align*}

\begin{figure}[t]
\centering
\hfill
\begin{subfigure}[t]{0.30\linewidth}\centering%(d)
  \begin{tikzpicture}[SCM,scale=1]
        \node (X) at (0,0) {$X$};
        \node (V1) at (1,-0.7) {$V_1$};
        \node (V2) at (1,0.7) {$V_2$};
        \node (Y) at (2,0) {$Y$};

        \path [->] (X) edge (V1);
        \path [->] (X) edge (V2);
        \path [->] (V1) edge (Y);
        \path [->] (V2) edge (Y);
    \end{tikzpicture}
\caption{$L$}
\label{fig:app_examples5:a}
\end{subfigure}\hfill
\begin{subfigure}[t]{0.30\linewidth}\centering%(d)
  \begin{tikzpicture}[SCM,scale=1]
        \node (X) at (0,0) {$X$};
        \node (V1) at (1,-0.7) {$V_1$};
        \node (V2) at (1,0.7) {$V_2$};
        \node (Y) at (2,0) {$Y$};

        \path [->] (X) edge (V1);
        \path [->] (X) edge (V2);
        \path [->] (V1) edge (Y);
        \path [->] (V2) edge (Y);
        \path [conf-path] (X) edge[out = -90, in=180] (V1);
    \end{tikzpicture}
\caption{$\G_1$}
\label{fig:app_examples5:b}
\end{subfigure}
\hfill
\begin{subfigure}[t]{0.30\linewidth}\centering%(d)
  \begin{tikzpicture}[SCM,scale=1]
        \node (X) at (0,0) {$X$};
        \node (V1) at (1,-0.7) {$V_1$};
        \node (V2) at (1,0.7) {$V_2$};
        \node (Y) at (2,0) {$Y$};

        \path [->] (X) edge (V1);
        \path [->] (X) edge (V2);
        \path [->] (V1) edge (Y);
        \path [->] (V2) edge (Y);
        \path [conf-path] (X) edge[out = 90, in=180] (V2);
    \end{tikzpicture}
\caption{$\G_2$}
\label{fig:app_examples5:c}
\end{subfigure}
\hfill\null
  \caption{Second example of MBD causal diagrams used in the proof of \Cref{prop:nonredundancy_causal_graphs_in_leg}.}
  \label{fig:app_examples5}
\end{figure}

This holds also more generally for single output causal effect of the form $P(y \mid do(x))$. For instance, bounding $P(y \mid do(x))$ given the LEG $L$ in \Cref{fig:app_examples5:a} requires two MBD causal diagrams without loss of generality, given in \Cref{fig:app_examples5:b} and \Cref{fig:app_examples5:c} respectively. It could be shown by writing $P(y \mid do(x)) = \sum_{v_1,v_2} P(y \mid x, v_1,v_2)P(v_1,v_2 \mid do(x))$ that the upper bounds computed from $\G_1$ and $\G_2$ will disagree as upper bounds for $P(v_1,v_2 \mid do(x))$ will differ (as shown in the example above).
\end{proof}






