

\section{Graph Notations}\label{app:graphnotations}


\begin{definition} A graph $D=(\V , \Eb)$ consists of a set of nodes (variables) $\V$ and a set of edges $\Eb$. We use $(X, Y)$ to denote an edge between a variable $X$ and another variable $Y$ in $D$.  A directed graph has only directed edges ($\rightarrow$). A partially directed graph may have both undirected ($-$) and directed edges ($\rightarrow$). A graph $D'=(\V', \Eb')$ is a \textit{subgraph} of $D=(\V, \Eb)$ and $D$ is a \textit{supergraph} of $D'$ if $\V' \subseteq \V$ and $\Eb' \subseteq \Eb$.  $D'$ is an \textit{induced subgraph} of $D$ if $\Eb'$ are all edges in $\Eb$ between nodes in $\V'$. A mixed graph consists of directed edges ($\rightarrow$), undirected edges ($-$) and bidirected edges ($\leftrightarrow$).
\end{definition}

\begin{definition}[Path]
    Two vertices in a graph are said to be \textit{adjacent} if there is an edge between them. Given a partially directed graph $D$, a \textit{path} from $V_{0}$ to $V_{n}$ in $D$ is a sequence of distinct vertices $\langle V_{0}, V_{1}, \ldots, V_{n} \rangle$ such that for $0 \le i \le n-1$, $V_{i}$ and $V_{i+1}$ are adjacent. It is called a \textit{causal} (or \textit{directed}) path from $V_{0}$ to $V_{n}$ in $D$ if $V_{i}$ is a parent of $V_{i+1}$ for $0 \le i \le n-1$. 
\end{definition}

\begin{definition}[Skeleton] A \textit{skeleton} of a causal graph is the undirected graph obtained by making every adjacent pair connected via an undirected edge.
\end{definition}

\begin{definition}[Colliders]
     A consecutive triple of nodes $\langle X, Y, Z \rangle$ on a path is called a \textit{collider} if both the edge between $X$ and $Y$ and the edge between $Y$ and $Z$ have arrowheads pointing to $Y$. If additionally $X$ and $Z$ are not adjacent, it is called an \textit{unshielded collider}. Any other consecutive triple is called a \textit{non-collider}. If, additionally, the two end vertices of a non-collider are not adjacent, it is called an \textit{unshielded non-collider}.
\end{definition}



\begin{definition}[Ancestrality]
    In a graph $D$, for any two nodes $X, Y$ in $D$, if there is a directed edge  $X\rightarrow Y$, then $X$ is a \textit{parent} of $Y$ and $Y$ is a \textit{child} of $X$ in $D$. If there is a causal path from $X$ to $Y$, then $X$ is called an \textit{ancestor} of $Y$ and $Y$ is called a \textit{descendant} of $X$.
    We denote a set of parents of $X$, a set of children of $X$, a set of ancestors of $X$, a set of descendants of $X$ and a set of non-descendants of $X$  in $D$ as $Pa_{D}(X)$, $Ch_{D}(X)$, $An_{D}(X)$, $De_{D}(X)$ and $NDe_{D}(X)$ respectively.
   By convention, $X$ is both an ancestor and a descendant of $X$ in $D$. A source (or root) node has no parents.  A sink node does not have any child.
\end{definition}



% In general, constraint-based algorithms can only learn up to an equivalence class of models, a set of DAGs that induce the same conditional independencies via d-separation, which gives the following definition. 
% \begin{definition}[Markov Equivalence] Two DAGs $D_{1}, D_{2}$ with the same set of vertices are \textit{Markov equivalent} if for any three disjoint set of vertices $\mathbf{X}, \mathbf{Y} ,\mathbf{Z}$, $\mathbf{X}$ and $\mathbf{Y}$ are d-separated by $\mathbf{Z}$ in $D_{1}$ if and only if $\mathbf{X}$ and $\mathbf{Y}$ are d-separated by $\mathbf{Z}$ in $D_{2}$. A set of DAGs that encode the same set of conditional independence induced only by the causal Markov assumption is
% called the \textit{Markov equivalence class}. Denote the Markov equivalence class of a DAG $D$ by $[D]$.
% \end{definition}
% \begin{definition}
%  (Essential Graph) The \textit{essential graph} of a DAG $D$ has the same skeleton as $D$, with directed edges $X_i \rightarrow X_j$ if such edge direction between $X_i$ and $X_j$ holds for all DAGs in $[D]$, and undirected edges otherwise. 
% \end{definition}

% The essential graph is also called the \textit{completed partially directed acyclic graph} (CPDAG) \citep{perkovic2017interpreting, castelletti2018learning}.  

For learning $D_{aug}$, we need to leverage distributional invariances across the normal and anomalous datasets via the following two assumptions. For a more detailed discussion on these assumptions, please see \cite{jaber2020causal}.

\begin{assumption}[$\Psi$-Markov conditions]\label{assumption:causal Markov assumption}
Let $\mathbf{P}$ denote an ordered tuple of distributions and let $\mathcal{I}$ be an ordered tuple of the children of F-NODE. $\mathbf{P}$ is called \textit{$\Psi$-Markov} relative to a graph $D_{aug} = (\mathbf{V}, \mathbf{E})$ if the following holds for $\mathbf{Y}, \mathbf{Z}, \mathbf{W} \subseteq \mathbf{V}$:
\begin{enumerate}
    \item For $\mathbf{I}_{i} \in \mathcal{I}$: $P_{i}(\mathbf{y}| \mathbf{w}, \mathbf{z})= P_{i}(\mathbf{y}| \mathbf{w})$ if $\mathbf{Y} \indep \mathbf{Z} | \mathbf{W}$ in $D_{aug}$
    \item For $\mathbf{I}_{i}, \mathbf{I}_{j} \in \mathcal{I}$: $P_{i}(\mathbf{y}| \mathbf{w})= P_{j}(\mathbf{y}| \mathbf{w})$ if $\mathbf{Y} \indep \mathbf{K} | \mathbf{W}_{\mathbf{K}}$ in ${D_{aug}}_{\underline{\mathbf{W}_{\mathbf{K}}}, \overline{\mathbf{R}(\mathbf{W})}}$
\end{enumerate}
where $\mathbf{K}:= (\mathbf{I}_{i} \setminus \mathbf{I}_{j}) \cup (\mathbf{I}_{j} \setminus \mathbf{I}_{i})$, $\mathbf{W}_{\mathbf{K}}:= \mathbf{W} \cap \mathbf{K}$, $\mathbf{R} := \mathbf{K} \setminus \mathbf{W}_{\mathbf{K}}$, and $\mathbf{R}(\mathbf{W}) \subseteq \mathbf{R}$ are non-ancestors of $\mathbf{W}$ in $D_{aug}$.
\end{assumption}

\begin{assumption}[c-faithfulness]\label{assumption:faithfulness}
    A tuple of distributions $\mathbf{P}$ is said to be \textit{c-faithful} to $D_{aug}$ if the converse of each of the $\Psi$-Markov conditions holds.
\end{assumption}



\input{sections/related-work}

\section{Theorems and Proofs}\label{app:proofs}
For the sake of clarity, we first provide Theorem $1$ from~\cite{shangqi2023partial} and Theorem $2$ from~\cite{tao2019interactive}. \citet{shangqi2023partial} term the IGS problem as the POMS problem and they refer to a DAG as an input graph.  

\begin{theorem}\cite{shangqi2023partial} \label{app:shang_theorem}
    For the POMS problem, let $n$ represent the number of vertices in the input graph $D$ and $d$ denote the maximum vertex out-degree in $D$. Both of the following statements are true:
    \begin{itemize}
        \item There is an algorithm that can find the target in $O(\log_{1+k} n + (d/k)\log_{1+d}n)$ probes.
        \item Any POMS algorithm must perform $\Omega(\log_{1+k}n+(d/k)\log_{1+d}n)$ probes to find the target in the worst case. 
    \end{itemize}
\end{theorem}

\begin{theorem}\cite{tao2019interactive} \label{app:tao_theorem}
    Let $h$ be the length of the longest path in the DAG, $n$ be the number of variables, and $d$ be the maximum out-degree. Both of the following statements are true about the IGS problem:
    \begin{itemize}
        \item \texttt{DFS-interleave} asks at most $\lceil\log_{2}h\rceil \cdot (1 + \lfloor \log_{2}n\rfloor) + (d - 1) \cdot \lceil\log_{d}h \rceil$ questions.
        \item Any algorithm must ask at least $(d - 1) \cdot \lfloor\log_{d}h \rfloor$ questions in the worst case.
    \end{itemize}
\end{theorem}

We provide the pseudocode of \texttt{DFS-interleave}, which has been modified for RCA, in Algorithm~\ref{alg:modified-dfs-tree}.

\notancestors*

\begin{proof}
    For the sake of contradiction, suppose $F\rightarrow A$ in $D_{aug}$ for some $A\in An_{D}(X)$. Since $A$ is an ancestor of $X$ in $D$, there must be a directed path $q$ from $A$ to $X$ in $D$. Thus, $q$ must also exist in $D_{aug}$. Consider the path obtained by concatenating $F\rightarrow A$ with $q$ in $D_{aug}$. This path must be d-connecting in $D_{aug}$. Thus, it must be that $(F\not\indep X)_{D_{aug}}$. From interventional faithfulness, we have that $(F\not \indep X)_P$, which is a contradiction. 
\end{proof}

\notdecendants*
\begin{proof}
For the sake of contradiction, suppose $F\rightarrow Q$ in $D_{aug}$ for some $Q\in NAn_{D}(X)$. Since $Q$ is a non-ancestor of $X$ in $D$, without loss of generality, there are several cases: (i) there exists a directed path $q$ from $X$ to $Q$ in $D$, (ii) there is no path between $Q$ and $X$ in $D$, and (iii) any path $p$ between $X$ and $Q$ must have a collider on $p$ in $D$.  

For case (i), $q$ must also exist and be directed in $D_{aug}$. By concatenating the path from $X$ to $Q$ and $F\rightarrow Q$, we see the path from $F$ to $X$ is blocked, since $Q$ is a collider on it. Thus, we have $(F \indep X)_{D_{aug}}$, which implies $(F \indep X)_{P}$ by Assumption~\ref{assumption:causal Markov assumption}, which is a contradiction. 

For case (ii), there is no path between $X$ and $Q$ in $D$, which implies $(F \indep X)_{D_{aug}}$ so that we reach the same contradiction. 

For case (iii), every collider on any path $p$ between $Q$ and $X$ in $D$ must also be a collider in $D_{aug}$ such that we have $(F \indep X)_{D_{aug}}$ by concatenating $F\rightarrow Q$ with $p$, which implies $(F \indep X)_{P}$ by Assumption~\ref{assumption:causal Markov assumption}, which is a contradiction.
\end{proof}

\reduction*
\begin{proof}
Consider a node $X \in \V$ and suppose $(F \indep X)_{P}$. Then $X \in NDe_{D}(R)$ by Lemma~\ref{lem:ancestors_not_F}. Note that $NDe_{D}(R) = NAn_{D'}(R)$ due to $De_{D}(R) = An_{D'}(R)$ by the given conditions for $D$ and $D'$. Therefore, $X \in NAn_{D'}(R)$. As $X \in NAn_{D'}(R) \Leftrightarrow Q(X) = $ no, we have that $(F \indep X)_{P} \Rightarrow Q(X) = $ no. Similarly, suppose $(F \not \indep X)_{P}$. Then $X \in De_{D}(R)$ by Lemma~\ref{lem:descendants_cannot_be_targets}. As $De_{D}(R) = An_{D'}(R)$, we have that $(F \not \indep X)_{P} \Rightarrow X \in An_{D'}(R)$, which is equivalent to $Q(X) = $ yes. 
\end{proof}

\reductionoptimal*
\begin{proof}
    This follows from Lemma \ref{lem:reduction} and Theorem 1 in \citep{shangqi2023partial}, which says that any algorithm must ask $\Omega(\log_{2}n + d\log_{1+d}n)$ queries to identify the target node selected by an adversary in a DAG $D'$ with a single root node for the problem of IGS, where $d$ is the maximum out-degree in $D'$ and there is an algorithm that can find the target node in $\mathcal{O}(\log_{2}n + d\log_{1+d}n)$ number of queries.
\end{proof}


% \begin{lemma}
%     Given a DAG $D$, construct a DAG $D_{aug}$ with all the edges reversed in $D$. A query to check if a node $Z$ is a descendant of a given node $X$ in $D$ is equivelant to checking if $Z$ is an ancestor of $X$ in $D_{aug}$.
% \end{lemma}
% \begin{proof}
%     Suppose $Z$ is a descendant of $X$ in $D$. This means there exists a directed path $P = (X \rightarrow \dots \rightarrow Z)$ in $D$ where every edge on $P$ points from left to right. Now consider $D_{aug}$ where the direction of every edge is flipped, therefore the path $P$ in $D$ becomes a reversed path $P' = (Z \rightarrow \dots \rightarrow X)$ in $D_{aug}$. This reversed path $P'$ in $D_{aug}$ shows that there is a directed path from $Z$ to $X$ in $D_{aug}$ and hence $Z$ is an ancestor of $X$ in $D_{aug}$.
% \end{proof}


% \igscorrectness*
% \begin{proof}
%     First note that a probing query in IGS is actually a reachability query. A probe on a node $X$ in IGS is a query to an oracle $\mathcal{O}$ whether $X$ has a directed path to the target node $Z$. We can map this reahability query to a marginal CI test between $F$ and a node in the graph. Note that a CI test $(F \indep Y)_{P}$, $Y \in \V$ indicates that there is no directed path from $F$ to $Y$. Similarly, $(F \dep Z)_{P}$, $Z \in \V$ shows that the there is a directed path from $F$ to $Z$. This is true due to the fact there is only a single target and thus $F$ has only one child. Moreover~\ref{lem:ancestors_not_F} claims that 
% \end{proof}

\subsection{Proof of Theorem \ref{thm:soundness}} We first leverage an existing result from \citet{wienobst2020recovering}  (see Lemma~\ref{lem:no_causal_path}). Then, we will prove Lemma~\ref{lem:conditioning_on_posspa}. While Lemma~\ref{lem:no_causal_path} ensures the correctness of lines 5-6 in Algorithm~\ref{alg:sample-version}, Lemma~\ref{lem:conditioning_on_posspa} proves the correctness of using possible parent sets to rank the top root causes under a more fine-grained representation of CPDAGs due to the orientations that take place in lines 6-7 in Algorithm~\ref{alg:sample-version}. 


\begin{restatable}{lemma}{nocausalpath}\cite{wienobst2020recovering}
\label{lem:no_causal_path}
Given a distribution $P$ defined over a DAG $D$, for any $X, Y \in \V$ and $\Z\subseteq \mathbf{V}\setminus \{X,Y\}, |\mathbf{Z}| \le k$ for some $k\ge 0$, if $(\ci{X}{Y}{\Z})_{P}, (\nci{X}{W}{\Z})_{P}$, then no DAG faithful to $P$ contains the edge $W \rightarrow Y$.
\end{restatable}


% \begin{proof}
%     For the sake of contradiction, assume that there is DAG that contains the directed edge from $W$ to $Y$. Since  $(\nci{X}{W}{\Z})_{P}$, we have that $X$ is d-connecting with $W$ given $\mathbf{Z}$, concatenating this d-connecting path with $W\rightarrow Y$, we have that $X$ is also d-connecting with $W$ given $\mathbf{Z}$, which is a contradiction.
% \end{proof}



% \begin{theorem}[\citealt{kocaoglu2023characterization}]\label{thm:two-k-closure-graphs-mec-equal-two-DAGs-mec}
%     Two DAGs $D_{1}$, $D_{2}$ are $k$-Markov equivalent if and only if $\mathcal{C}_{k}(D_{1})$ and $\mathcal{C}_{k}(D_{2})$ are Markov equivalent. 
% \end{theorem}

\begin{restatable}{lemma}{optimal}\label{lem:conditioning_on_posspa} 
Let $M$ be the graph returned by lines 6-7 in Algorithm \ref{alg:sample-version}. Then $F$ is not adjacent to $X$ in $D_{aug}$ if and only if $F$ is d-separated from $X$ given $PossPa_{M}(X)$ in $D_{aug}$.  
\end{restatable}

\begin{proof} \label{proof-of-lem-possibleparents}
  % The algorithm works as follows: First, we run $n$ marginal independence tests between $F$ and $X$ for all $X \in \V$. Then, for each node $Y$ that has been tested to be marginally dependent with $F$. We run $m$ independent tests between $Y$ and $F$ conditional on the possible parents of $Y$ in $\varepsilon_{k}(D)$. 
     We first prove the only if ($\Rightarrow$) direction. 
    
   We first give a critical insight. We note that if F-NODE points to any variable that is a collider $H$ on some paths $p$ in $D_{aug}$, then running marginal tests must have allowed us to orient $F \rightarrow H \leftarrow U$ and $F \rightarrow H \leftarrow Q$ for some variables $U, Q$ on $p$ in the given CPDAG $\mathcal{C}(D)$ due to Lemma \ref{lem:no_causal_path}. Thus, we call this resulting graph $M$ rather than $\mathcal{C}(D)$. If $F$ is marginally independent with all members in the adjacency set of $H$, then the result follows. 
   
   Suppose there is more than one node being marginally dependent on $F$. We call this set $\mathbf{Z}$. Then, we know $F$ must have a directed path to all such nodes $Z \in \Z$ in $D_{aug}$ as there are no incoming edges to $F$ and each of these nodes is marginally dependent on $F$. 
    We will prove the claim that if $F$ is not adjacent to $Z$ in $D_{aug}$, then $F$ is d-separated with $Z$ given $PossPa_{M}(Z)$ in $D_{aug}$ for all $Z \in \Z$. 

    For the sake of contradiction, assume that $F$ is d-connecting with $Z$ given $PossPa_{M}(Z)$ in $D_{aug}$. First, we note that $PossPa_{M}(Z)$ must contain all parents of $Z$ in $D_{aug}$. Since there exists a directed path from $F$ to $Z$, we call this path $r$ as shown below:
    \begin{equation}
        F \rightarrow T \rightarrow \ldots \rightarrow W \rightarrow \ldots \rightarrow Z.
    \end{equation} 
    Since $PossPa_{M}(Z)$ must contain all parents of $Z$, we consider two cases: (i) conditioning on $PossPa(Z)$ opens an active path from $F$ to $Z$ through the backdoor of some ancestors of $PossPa(Z)$ e.g., $W$ by concatenating a subpath of $r$ as follows :
    \begin{equation}\label{path:ex1}
        F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow Z
    \end{equation}
    and case (ii): there exists a d-connecting path from $F$ to $Z$ given some variables $K$ as follows
     \begin{equation}\label{path:ex2}
        F \rightarrow T \rightarrow \ldots \rightarrow W \rightarrow \ldots \rightarrow K \leftarrow Z
    \end{equation}

Case (i) - conditioning on $PossPa(Z)$ opens an active path from $F$ to $Z$ through the backdoor of some ancestors of $PossPa(Z)$ by concatenating with a subpath of $r$: We will first show a contradiction in this case. Note that we cannot have $Q \in An_{D_{aug}}(Z)$. To see that, suppose $Q$ and $Z$ are adjacent in $\mathcal{C}(D)$, then $Q$ must be in $PossPa_{M}(Z)$ as $(F \not \indep Z)_{P}$ so that Algorithm \ref{alg:sample-version} will not change the orientation of this edge in $M$. This yields a contradiction as $Q \in PossPa_{M}(Z)$ would have blocked this path. Suppose they are not adjacent in $\mathcal{C}(D)$. As any directed path from $Q$ to $Z$ would have been blocked by conditioning on $PossPa_{M}(Z)$, there must exist a collider $U_{1}$ on a path from $Q$ to $Z$, with a member of $De_{D_{aug}}(U_{1})$ in $PossPa_{M}(Z)$, in order for the path in (\ref{path:ex1}) to be a d-connecting path from $F$ to $Z$ as follows.

\begin{equation}\label{path:ex3}
        F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow U_{1} \leftarrow  \ldots Z
    \end{equation}
Consider $U_{1}$ is a descendant of $Z$. This means there exists another backdoor path being active by conditioning on $PossPa_{M}(Z)$ where some members $\mathbf{J} \subseteq PossPa_{M}(Z)$ are descendants of $U_{1}$. We first note that $U_{1}$ cannot have a directed path to $Z$ due to acyclicity. That implies $\mathbf{J}$ must be children of $Z$ in $D_{aug}$. Note that there cannot be any descendant $R$ of $U_{1}$ that forms an unshielded collider with any member in $\mathbf{J}$ because it will orient $Z \rightarrow C$ for any such child $C$ in $\mathbf{J}$ such that they will not be in $PossPa_{M}(Z)$. Thus, for any $J$ in $\mathbf{J}$ and some descendants $R$ of $U_{1}$, we must have shielded colliders $\langle R, J, Z \rangle$.  This implies that $U_{1}$ must be adjacent to $Z$, which implies that $U_{1}$ is a child of $Z$. It also implies that the backdoor path in (\ref{path:ex3}) is active due to conditioning on a child of $U_{1}$, which is also in $PossPa_{M}(Z)$. We call this child $H$. Consider the variable $U_{2}$ that is closest to $U_{1}$ on the path in (\ref{path:ex3}) e.g.
\begin{equation}\label{path:ex4}
        F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow U_{2} \rightarrow U_{1} \leftarrow   Z
    \end{equation}


Suppose $U_{2}$ is adjacent to $Z$ in $D_{aug}$, if $U_{2} \leftarrow Z$ is in $D_{aug}$, then we can use the same argument by treating $U_{2}$ as $U_{1}$ repeatedly until we reach $Q$ to be the closest node such that we reach the conclusion that conditioning on $Q$ would have blocked the backdoor path to reach the same contradiction. If $U_{2} \rightarrow Z$  is in $D_{aug}$, then $U_{2}$ is in $PossPa_{M}(Z)$ such that the backdoor path would not be active either by conditioning on $PossPa_{M}(Z)$. Thus, $Z$ and $U_{2}$ cannot be adjacent in $D_{aug}$.     
Suppose $U_{2}$ is not adjacent to $H$ in $D_{aug}$, then $U_{2} \rightarrow U_{1} \rightarrow H$ must have been oriented in $\mathcal{C}(D)$ by first Meek rule such that $Z \rightarrow H$ is also oriented in $\mathcal{C}(D)$ due to acyclicity, which leads to a contradiction as $H \in PossPa_{M}(Z)$. Thus, $U_{2}$ is adjacent to $H$ in $D_{aug}$. This implies that $U_{2} \rightarrow H$ is in $D_{aug}$ due to acyclicity. Since $Z$ and $U_{2}$ cannot be adjacent in $D_{aug}$, then $U_{2}\rightarrow H \leftarrow Z$ must have been oriented in $\mathcal{C}(D)$ such that $U_{2}\rightarrow H \leftarrow Z$ is also in $M$. Then, $H$ will not be in $PossPa_{M}(Z)$, which is a contradiction. 


Case (ii): Consider the path in (\ref{path:ex2}). We will use a similar argument we made in case (i). That is, there cannot be any descendant $R$ of $W$ that forms an unshielded collider with any member $C$ in $PossPa_{M}(Z)$ because it will orient $Z \rightarrow C$ for any such child $C$ such that they will not be in $PossPa_{M}(Z)$. Thus, for the variable $J$ that is the descendant closest to $W$ and is a child of $Z$ must form a shielded collider e.g. $\langle W, J, Z\rangle$. However, since $W$ has a directed path to $Z$, that implies $W \rightarrow Z$ must also be in $D_{aug}$ such that $W \in PossPa_{M}(Z)$, which leads to a contradiction as the path $(\ref{path:ex2})$ is no longer d-connecting.

For the if ($\Leftarrow$) direction, for the sake of contradiction, assume $F$ and $X$ are adjacent in $D_{aug}$. Since $F$ and $X$ are d-separated given the possible parents set of $X$ in $M$, then there is no d-connecting path from $F$ to $X$ given the possible parents set of $X$, which is a contradiction as $F$ is adjacent to $X$.  
\end{proof}

\soundness*
\begin{proof}
    The proof is based on Lemmas \ref{lem:no_causal_path} and \ref{lem:conditioning_on_posspa}. Lemma \ref{lem:no_causal_path} proves the correctness of the orientation rules in Algorithm \ref{alg:sample-version} to refine the given CPDAG. Lemma \ref{lem:conditioning_on_posspa} proves the correctness of lines 10-13 in Algorithm \ref{alg:sample-version} where we use the highest CMI value e.g. $I(F;X|PossPa_{M}(X))$ to identify the true root causes. Specifically, Assumption \ref{assumption:faithfulness} ensures  that the non-root cause $\bar{R}$ will have low CMI value e.g. $I(F;\bar{R}|PossPa_{M}(\bar{R}))$ relative to the true root causes $R$ e.g. $I(F;R|PossPa_{M}(R)) > I(F;\bar{R}|PossPa_{M}(\bar{R}))$. 
\end{proof}



% \sizeofposspa*
% \begin{restatable}
%     {corollary}{sizeofposspa}\label{cor:size_of_poss_pa}
%     Given two graphs $M_{1}, M_{2}$ returned by Algorithm $\ref{alg:marginal-invariance-test}$ based on two different $\mathcal{C}$-essential graphs $\varepsilon_{\mathcal{C}_{1}}(D)$ and $\varepsilon_{\mathcal{C}_{2}}(D)$, if $\mathcal{C}_{1} \subset \mathcal{C}_{2}$, then $|PossPa_{M_{1}}(X)| \ge |PossPa_{M_{2}}(X)|$.
% \end{restatable}
% \begin{proof}
%     Since $\mathcal{C} \subset \mathcal{C}_{2}, \mathcal{C}$-PC will conduct more CI tests based on $\mathcal{C}_{2}$, which can result in a sparser $\mathcal{C}$-essential graph, it follows that $|PossPa_{M_{1}}(X)| \ge |PossPa_{M_{2}}|$ for all $X \in \mathbf{V}$
% \end{proof}

\section{Discussion on Incorporating Other Partial Causal Structures}\label{app:extension_discussion}

In this section, we will briefly introduce what other partial causal structures can be incorporated into our Algorithm \ref{alg:sample-version}. Then, we will discuss the main differences between these structures and a CPDAG. These structures are called $k$-CPDAG \cite{wienobst2020recovering} and $k$-essential graphs \cite{kocaoglu2023characterization}. These graphical representations are mainly for characterizing a set of causal graphs that share the same d-separation constraints with conditioning set size bounded by $k$, e.g., $(X \indep Y | \mathbf{Z})_{D}, |\mathbf{Z}| \le k$ for some causal graphs $D$. For the sake of clarity, we call these \textit{degree-$k$ d-separation statements}. As $k$-essential graphs carry strictly more information about the set of causal graphs that entail the set of degree-$k$ d-separation statements than $k$-CPDAG (see Section D.6 in \citet{kocaoglu2023characterization}), any result that we extend to $k$-essential graphs in this section will also be applicable to $k$-CPDAG. Thus, our discussion will focus on $k$-essential graphs for the rest of this section. These partial causal structures are developed for practical scenarios where there is only a very limited sample, which can quickly compromise the reliability of CI tests due to the lack of statistical power. \citet{kocaoglu2023characterization} have provided an algorithm known as $k$-PC for learning $k$-essential graphs from observed data. By letting $k=|\V|-2$, a $k$-essential graph will resemble a CPDAG. Thus, one can view it as a more general form of the CPDAG.


\begin{figure}[t]
    \centering 
    \footnotesize
    % figure a 
     \subfigure[$D_{1}$]
    {     
        \begin{tikzpicture}[scale=0.3]
            \node (2) at (0,0) {$X_{1}$};
            \node (3) at (3,0) {$X_{2}$};
            \node (4) at (6,0) {$X_{3}$};
            \node (5) at (3,3) {$X_{4}$};
            \path[->] (4) edge (3);
            \path[->] (5) edge (3);
            \path[->] (5) edge (2);
        \end{tikzpicture}
        \label{fig:g-a1}
    }
    \subfigure[$\C{D_{1}}$]
    {     
        \begin{tikzpicture}[scale=0.3]
            \node (2) at (0,0) {$X_{1}$};
            \node (3) at (3,0) {$X_{2}$};
            \node (4) at (6,0) {$X_{3}$};
            \node (5) at (3,3) {$X_{4}$};
            \path[<->] (2) edge (3);
            \path[->] (4) edge (3);
            \path[->] (5) edge (3);
            \path[->] (5) edge (2);
        \end{tikzpicture}
        \label{fig:kpc-a}
    }
    % figure b 
    \subfigure[$D_{2}$] 
    {         
        \begin{tikzpicture}[scale=0.3]
                       \node (2) at (0,0) {$X_{1}$};
            \node (3) at (3,0) {$X_{2}$};
            \node (4) at (6,0) {$X_{3}$};
            \node (5) at (3,3) {$X_{4}$};
            \path[->] (2) edge (3);
            \path[->] (4) edge (3);
            \path[->] (2) edge (5);
        \end{tikzpicture}
        \label{fig:kpc-b}
    }
    % figure c 
    \subfigure[$\C{D_{2}}$] 
    {         
         \begin{tikzpicture}[scale=0.3]
                       \node (2) at (0,0) {$X_{1}$};
            \node (3) at (3,0) {$X_{2}$};
            \node (4) at (6,0) {$X_{3}$};
            \node (5) at (3,3) {$X_{4}$};
            \path[<->] (5) edge (3);
            \path[->] (2) edge (3);
            \path[->] (4) edge (3);
            \path[->] (2) edge (5);
        \end{tikzpicture}
        \label{fig:kpc-c}
    }
    % figure d 
    \subfigure[$\varepsilon_k$]
    {     
           \begin{tikzpicture}[scale=0.35]
                       \node (2) at (0,0) {$X_{1}$};
            \node (3) at (3,0) {$X_{2}$};
            \node (4) at (6,0) {$X_{3}$};
            \node (5) at (3,3) {$X_{4}$};
            \path[o->] (5) edge (3);
            \path[o->] (2) edge (3);
            \path[->] (4) edge (3);
            \path[-] (2) edge (5);
        \end{tikzpicture}
        \label{fig:kpc-d}
    }
    \caption{\cite{kocaoglu2023characterization} Two \kmarkov equivalent DAGs for $k=0$ with the same \kessential graph. As $\C{D_1}$ is Markov equivalent to  $\C{D_2}$, they have the same %
 \kessential graphs $\E{D_1}\!=\!\E{D_2}\!=\!\varepsilon_k$, obtained as the edge union of their \kclosures. (b): the $k$-closure graph of $D_{1}$, where $k=0$. (d): the $k$-closure graph of $D_{2}$, where $k=0$. (e): the $k$-essential graph of the two $k$-closure graphs $\C{D_{1}}, \C{D_{2}}$.
    }
    \label{fig:kPC_example}
\end{figure}

\subsection{Background on $k$-essential graphs}
For readability, we provide the definitions and theorems from \citet{kocaoglu2023characterization} that are necessary to facilitate the discussion. For details, please refer to \citet{kocaoglu2023characterization}. We begin by introducing several key concepts: $k$-covered, $k$-closure graphs, and their relationships with $k$-essential graphs. 

\begin{definition}\cite{kocaoglu2023characterization} \label{def:k-covered}
    Given a DAG $D=(\mathbf{V},\mathbf{E})$ and an integer $k$, a pair of nodes $X, Y$ are said to be \kcovered if there is no $\mathbf{Z}\subset \mathbf{V}$ with $\lvert \mathbf{Z}\rvert \leq k$ such that $(\ci{X}{Y}{\mathbf{Z}})_{D}$.
\end{definition}
Definition \ref{def:k-covered} says that two variables are $k$-covered if there is no separating set of size up to $k$ that can d-separate them. The idea of having two nodes being \kcovered is to facilitate the construction of an augmented graphical representation of a DAG known as \kclosure graphs.  
\begin{definition}\cite{kocaoglu2023characterization}
\label{def:kclosure}
    Given a DAG $D=(\mathbf{V},\mathbf{E})$ and an integer $k$, the \kclosure of $D$ is defined as the graph shown by $\C{D}$ that satisfies the following:
    \begin{enumerate}
        \item If: $X,Y$ are \kcovered in $D$\\
        \hspace{-2em} $i)$ if $X\in An_{D}(Y)$, then $X\rightarrow Y$ in $\C{D}$,
        \space$ii)$  if $Y\in An_{D}(X)$, then $X\leftarrow Y$ in $\C{D}$,
         \\  $iii)$ else $X\leftrightarrow Y$ in $\C{D}$
        \item Else: $X, Y$ are non-adjacent in $\C{D}$
    \end{enumerate}
\end{definition}
\citet{kocaoglu2023characterization} provides a graphical way to help determine whether two $k$-closure graphs are Markov equivalent. \citet{kocaoglu2023characterization} also proved that the $k$-closure graph of a DAG $D$ induces the same set of degree-$k$ d-separation statements as $D$.
\begin{corollary}\cite{kocaoglu2023characterization}
\label{cor:kclosureequivalence}
    Two \kclosure graphs $K_1, K_2$ are Markov equivalent if and only if 
    \begin{enumerate}
        \item They have the same skeleton and
        \item They have the same unshielded colliders.
    \end{enumerate}
\end{corollary}

\begin{lemma}\cite{kocaoglu2023characterization}
\label{lem:kclosure}
$k$-closure $\C{D}$ of a DAG $D$ entails the same degree-$k$ d-separation statements as the DAG, i.e., 
 $   (\ci{X}{Y}{\mathbf{Z}})_D\iff (\ci{X}{Y}{\mathbf{Z}})_{\C{D}}, \forall \mathbf{Z}\subset \mathbf{V}:\lvert \mathbf{Z}\rvert\leq k.$
\end{lemma}


 Moreover, $k$-Markov equivalence of two DAGs can be reduced to checking Markov equivalence of their $k$-closure graphs.
Based on this result,
\citet{kocaoglu2023characterization} shows that one can only hope to learn up
to the equivalence class of $k$-closure graphs just by CI tests.

\begin{theorem}\cite{kocaoglu2023characterization}
\label{thm:kmarkov}
Two DAGs $D_1,D_2$ are $k$-Markov equivalent if and only if $\C{D_1}$ and $\C{D_2}$ are Markov equivalent.
\end{theorem}
\citet{kocaoglu2023characterization} uses the following edge union to represent a set of Markov equivalent $k$-closure graphs. 

\begin{definition}[edge unions: $\mbox{---},o\mbox{---}o, \crightarrow$ \citep{kocaoglu2023characterization}] The edge union operations of a set of $k$-closure graphs are defined as:\label{def:c-closure-graphs}
(i) $X\mbox{ --- }Y := X\rightarrow Y \cup X\leftarrow Y$, (ii) $X\, o\mbox{---}o \, Y := X\rightarrow Y \cup X\,\leftarrow Y \cup X\leftrightarrow Y$, (iii) $X\,\crightarrow \, Y := X\rightarrow Y \cup X\leftrightarrow Y$. A wildcard mark $*$ denotes it can be a circle, a tail, or an arrowhead mark.
\end{definition}


\begin{definition}[\kessential graph] \cite{kocaoglu2023characterization}
\label{def:kessential}
For any DAG $D$, the edge union of all \kclosure graphs that are Markov equivalent to $\C{D}$ is called the \kessential graph of $D$, shown by $\E{D}$. %
\end{definition}

Figure \ref{fig:kPC_example} illustrates the difference between DAGs, $k$-closure graphs, and $k$-essential graphs.



\subsection{Theorems and Proofs for Extending Our Results to $k$-essential Graphs}
% We can use an existing result Lemma \ref{lem:no_causal_path} directly implies the following as follows.
We will first prove a result that is similar to Lemma~\ref{lem:no_causal_path} for $k$-essential graphs. We will leverage an existing result from \citet{wienobst2020recovering}.

\begin{lemma}\cite{wienobst2020recovering}\label{lem:loci_result}
    Given a distribution $P$ defined over a DAG $D$, for any $X, Y \in \V$ and $\Z\subseteq \mathbf{V}\setminus \{X,Y\}, |\mathbf{Z}| \le k$ for some $k\ge 0$, if $(\ci{X}{Y}{\Z})_{P}, (\nci{X}{W}{\Z})_{P}, (\nci{W}{Y}{\Z})_{P}$, then no DAG $k$-faithful to $P$ contains a causal path from $W$ to $Y$.
\end{lemma}

\begin{lemma}\label{lem:no_causal_path_extended}
    Given a distribution $P$ defined over a DAG $D$, for any $X, Y \in \V$ and $\Z\subseteq \mathbf{V}\setminus \{X,Y\}, |\mathbf{Z}| \le k$ for some $k\ge 0$, if $(\ci{X}{Y}{\Z})_{P}, (\nci{X}{W}{\Z})_{P}$, then the $k$-essential graph $\varepsilon_{k}({D})$ of all $k$-closure graphs that are Markov equivalent to $\C{D}$  does not contain $W \rightarrow Y$.
\end{lemma}
\begin{proof}
    By design of $k$-essential graphs $\varepsilon_{k}(D)$, two nodes $X,Y$ are adjacent in $\varepsilon_{k}(D)$ only when $(X \dep Y | \mathbf{Z})_{D}$ for all $\mathbf{Z} \subseteq \mathbf{V}, |\mathbf{Z}| \le k$. This is because, by design, there exists at least one $k$-closure graph that is Markov equivalent to $\C{D}$ where $X$ and $Y$ are adjacent, which implies $X$ and $Y$ are $k$-covered in $D$. Given that $(\ci{X}{Y}{\Z})_{P}, (\nci{X}{W}{\Z})_{P}$, we have that $\varepsilon_{k}(D)$ does not contain $W \rightarrow Y$ by Lemma \ref{lem:loci_result}.
\end{proof}

Next, we will prove a result for $k$-essential graphs that is similar to Lemma \ref{lem:conditioning_on_posspa} to show the correctness of conditioning on the possible parents in the more fine-grained $k$-essential graph in Algorithm \ref{alg:extended-version}.

\begin{restatable}{lemma}{optimal}\label{lem:conditioning_on_posspa_extended} 
Let $M$ be the graph returned by lines 6--9 in Algorithm~\ref{alg:extended-version}. Then $F$ is not adjacent to $X$ in $D_{aug}$ if and only if $F$ is d-separated from $X$ given $PossPa_{M}(X)$ in $D_{aug}$.
\end{restatable}


\begin{proof} \label{extended_version_proof-of-lem-possibleparents}
  % The algorithm works as follows: First, we run $n$ marginal independence tests between $F$ and $X$ for all $X \in \V$. Then, for each node $Y$ that has been tested to be marginally dependent with $F$. We run $m$ independent tests between $Y$ and $F$ conditional on the possible parents of $Y$ in $\varepsilon_{k}(D)$. 
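     We first prove the forward ($\Rightarrow$) direction: if $F$ is not adjacent to $X$ in $D_{aug}$, then $F$ is d-separated from $X$ given $PossPa_{M}(X)$ in $D_{aug}$.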
    
   We first give a critical insight. We note that if the F-NODE points to any variable $H$ that is a collider on some path $p$ in $D_{aug}$, then running marginal tests must have allowed us to orient $F \srightarrow H \sleftarrow U$ and $F \srightarrow H \sleftarrow Q$ for some variables $U, Q$ on $p$ in the given $k$-essential graph $\varepsilon_{k}(D)$ due to Lemma \ref{lem:no_causal_path_extended}. Thus, we call this resulting graph $M$ rather than $\varepsilon_{k}({D})$. If $F$ is marginally independent of all members in the adjacency set of $H$, then the result follows.
   
   Suppose there is more than one node that is marginally dependent on $F$. We call the set of such nodes $\mathbf{Z}$. Then, we know $F$ must have a directed path to every node $Z \in \Z$ in $D_{aug}$, as there are no incoming edges to $F$ and each of these nodes is marginally dependent on $F$.
    We will prove the claim that if $F$ is not adjacent to $Z$ in $D_{aug}$, then $F$ is d-separated with $Z$ given $PossPa_{M}(Z)$ in $D_{aug}$ for all $Z \in \Z$. 

    For the sake of contradiction, assume that $F$ is d-connected to $Z$ given $PossPa_{M}(Z)$ in $D_{aug}$. First, we note that $PossPa_{M}(Z)$ must contain all parents of $Z$ in $D_{aug}$. Since there exists a directed path from $F$ to $Z$, we call this path $r$ as shown below:
    \begin{equation}
        F \rightarrow T \rightarrow \ldots \rightarrow W \rightarrow \ldots \rightarrow Z.
    \end{equation} 
    Since $PossPa_{M}(Z)$ must contain all parents of $Z$, we consider two cases: (i) there exists a backdoor active path from $F$ to $Z$ by concatenating with a subpath of $r$ as follows:
    \begin{equation}\label{path:ex1}
        F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow Z
    \end{equation}
    and case (ii): there exists a d-connecting path from $F$ to $Z$ given some variables $K$ as follows
     \begin{equation}\label{path:ex2}
        F \rightarrow T \rightarrow \ldots \rightarrow W \rightarrow \ldots \rightarrow K \leftarrow Z
    \end{equation}

Case (i) - there exists a backdoor active path from $F$ to $Z$ by concatenating with a subpath of $r$: We will first show a contradiction in this case. Note that we cannot have $Q \in An_{D_{aug}}(Z)$. To see that, suppose $Q$ and $Z$ are adjacent in $\varepsilon_{k}(D)$. Then $Q$ must be in $PossPa_{M}(Z)$, as $(F \not \indep Z)_{P}$ so that Algorithm \ref{alg:extended-version} will not change the orientation of this edge in $M$. This yields a contradiction as $Q \in PossPa_{M}(Z)$ would have blocked this path. Suppose instead that they are not adjacent in $\varepsilon_{k}(D)$. As any directed path from $Q$ to $Z$ would have been blocked by conditioning on $PossPa_{M}(Z)$, there must exist a collider $U_{1}$ on a path from $Q$ to $Z$ such that a member of $De_{D_{aug}}(U_{1})$ is in $PossPa_{M}(Z)$ in order for the path in (\ref{path:ex1}) to be a d-connecting path from $F$ to $Z$, as follows.

\begin{equation}\label{path:ex3}
        F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow U_{1} \leftarrow  \ldots Z
    \end{equation}
Consider the case where $U_{1}$ is a descendant of $Z$. This means there exists another backdoor path that is active by conditioning on $PossPa_{M}(Z)$ where some members $\mathbf{J} \subseteq PossPa_{M}(Z)$ are descendants of $U_{1}$. We first note that $U_{1}$ cannot have a directed path to $Z$ in $D_{aug}$ due to acyclicity. That implies $\mathbf{J}$ must be children of $Z$ in $D_{aug}$. Note that there cannot be any descendant $R$ of $U_{1}$ that forms an unshielded collider with any member in $\mathbf{J}$ because it will orient $Z \rightarrow C$ for any such child $C$ in $\mathbf{J}$ such that they will not be in $PossPa_{M}(Z)$. Thus, for any $J$ in $\mathbf{J}$ and some descendants $R$ of $U_{1}$, we must have shielded colliders $\langle R, J, Z \rangle$.  This implies that $U_{1}$ must be adjacent to $Z$, which implies that $U_{1} \sleftarrow Z$ is in $M$. 

It also implies that the backdoor path in (\ref{path:ex3}) is active due to conditioning on a variable that is adjacent to $U_{1}$ and it is in $PossPa_{M}(Z)$. Let us call this variable $H$. Consider the variable $U_{2}$ that is closest to $U_{1}$ on the path in (\ref{path:ex3}) in $D_{aug}$ e.g.
\begin{equation}\label{path:ex4}
        F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow U_{2} \rightarrow U_{1} \leftarrow   Z
    \end{equation}

Suppose $U_{2}$ is adjacent to $Z$ in $M$. If $U_{2} \sleftarrow Z$ is in $M$, then we can repeat the same argument, treating $U_{2}$ as $U_{1}$, until the closest node is $Q$, at which point we conclude that conditioning on $Q$ would have blocked the backdoor path, reaching the same contradiction. If $U_{2} \srightarrow Z$ or $U_{2} \circlecircle Z$ is in $M$, then $U_{2}$ is in $PossPa_{M}(Z)$ such that the backdoor path would not be active either by conditioning on $PossPa_{M}(Z)$ in $D_{aug}$. Thus, $Z$ and $U_{2}$ cannot be adjacent in $M$. Suppose $U_{2}$ is not adjacent to $H$ in $D_{aug}$. If $U_{2}$ and $H$ are not $k$-covered, then $U_{2} \srightarrow U_{1} \srightarrow H$ must have been oriented in $\varepsilon_{k}(D)$ by the first Meek rule in the $k$-PC algorithm~\cite{kocaoglu2023characterization} such that $Z \srightarrow H$ is also oriented in $\varepsilon_{k}(D)$ due to acyclicity, which leads to a contradiction as $H \in PossPa_{M}(Z)$. Thus, $U_{2}$ is adjacent to $H$ in $M$. This trivially also holds if $U_{2}$ and $H$ are $k$-covered. This implies that $U_{2} \rightarrow H$ is in $D_{aug}$. Since $Z$ and $U_{2}$ cannot be adjacent in $M$, $U_{2}\srightarrow H \sleftarrow Z$ must have been oriented in $\varepsilon_{k}(D)$ such that $U_{2}\srightarrow H \sleftarrow Z$ is also in $M$. Then, $H$ will not be in $PossPa_{M}(Z)$, which is a contradiction. 


Case (ii): Consider the path in (\ref{path:ex2}). We will use an argument similar to the one we made in case (i). That is, there cannot be any descendant $R$ of $W$ that forms an unshielded collider with any member $C$ in $Ch_{M}(Z)$ because it will orient $Z \srightarrow C$ for any such child $C$ such that they will not be in $PossPa_{M}(Z)$. Thus, the variable $J$ that is the descendant closest to $W$ and is a child of $Z$ must form a shielded collider, e.g., $\langle W, J, Z\rangle$, in $M$. However, since $W$ has a directed path to $Z$, that implies $W \srightarrow Z$ must also be in $M$ such that $W \in PossPa_{M}(Z)$, which leads to a contradiction as the path $(\ref{path:ex2})$ is no longer d-connecting.

For the converse ($\Leftarrow$) direction, for the sake of contradiction, assume $F$ and $X$ are adjacent in $D_{aug}$. Since $F$ and $X$ are d-separated given the possible parents set of $X$ in $M$, there is no d-connecting path from $F$ to $X$ given the possible parents set of $X$, which is a contradiction as $F$ is adjacent to $X$.
\end{proof}

\begin{theorem}
    Given the $k$-essential graph of the true DAG $D$ and under causal sufficiency and the extended faithfulness assumption, Algorithm \ref{alg:extended-version} returns the true root cause variables. 
\end{theorem}
\begin{proof}
    The proof is based on Lemmas \ref{lem:no_causal_path_extended} and \ref{lem:conditioning_on_posspa_extended}. Lemma \ref{lem:no_causal_path_extended} proves the correctness of the orientation rules in Algorithm \ref{alg:extended-version} to refine the given $k$-essential graph. Lemma \ref{lem:conditioning_on_posspa_extended} proves the correctness of lines 16--19 in Algorithm \ref{alg:extended-version}, where we use the highest CMI values $I(F;X|PossPa_{M}(X))$ to identify the true root causes. Specifically, Assumption~\ref{assumption:faithfulness} ensures that any non-root cause $\bar{R}$ has a lower CMI value $I(F;\bar{R}|PossPa_{M}(\bar{R}))$ than any true root cause $R$, i.e., $I(F;R|PossPa_{M}(R)) > I(F;\bar{R}|PossPa_{M}(\bar{R}))$.
\end{proof}
% \begin{proof} \label{proof-of-lem-possibleparents_extended}
%   % The algorithm works as follows: First, we run $n$ marginal independence tests between $F$ and $X$ for all $X \in \V$. Then, for each node $Y$ that has been tested to be marginally dependent with $F$. We run $m$ independent tests between $Y$ and $F$ conditional on the possible parents of $Y$ in $\varepsilon_{k}(D)$. 
%     We will use $*$ as a wildcard to denote the endpoint of an edge to be a tail or a circle or an arrowhead. We first prove the if ($\Rightarrow $) direction. 
    
%    We first give a critical insight. We note that if F-NODE points to any variable that is a collider $H$ on some paths $p$ in $D_{aug}$, then running marginal tests must have allowed us to orient $F \crightarrow H \cleftarrow U$ and $F \crightarrow H \cleftarrow Q$ for some variables $U, Q$ on $p$ in the given $\varepsilon_{\mathcal{C}}(D)$ due to Lemma \ref{lem:no_causal_path}. Thus, we call this resulting graph $M$ rather than $\varepsilon_{\mathcal{C}}(D)$. If $F$ is marginally independent with all members in the adjacency set of $H$, then the result follows. 
   
%    Suppose there is more than one node being marginally dependent on $F$. We call this set $\mathbf{Z}$. Then, we know $F$ must have a directed path to all such nodes $Z \in \Z$ in $D_{aug}$ as there is no incoming edges to $F$ and each of these nodes is marginally dependent with $F$. 
%     We will prove the claim that if $F$ is not adjacent to $Z$ in $D_{aug}$, then $F$ is d-separated with $Z$ given $PossPa_{M}(Z)$ in $D_{aug}$ for all $Z \in \Z$. 

%     For the sake of contradiction, assume that $F$ is d-connecting with $Z$ given $PossPa_{M}(Z)$ in $D_{aug}$. First, we note that $PossPa_{M}(Z)$ must contain all parents of $Z$ in $D_{aug}$. Since there exists a directed path from $F$ to $Z$, we call this path $r$ as shown below:
%     \begin{equation}
%         F \rightarrow T \rightarrow \ldots \rightarrow W \rightarrow ...\rightarrow Z.
%     \end{equation}
%     Then, since $PossPa_{M}(Z)$ must contain all parents of $Z$, we consider two cases: (i) there exists a backdoor active path from $F$ to $Z$ by concatenating with a subpath of $r$ as follows:
%     \begin{equation}\label{path:ex1}
%         F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow Z
%     \end{equation}
%     and case (ii): there exists a d-connecting path from $F$ to $Z$ given some variables $K$ as follows
%      \begin{equation}\label{path:ex2}
%         F \rightarrow T \rightarrow \ldots \rightarrow W \rightarrow \ldots \rightarrow K \leftarrow Z
%     \end{equation}


% Case (i) - there exists a backdoor active path from $F$ to $Z$ by concatenating with a subpath of $r$: We will first show a contradiction in case (i). Note that we cannot have $Q \in An_{D_{aug}}(Z)$. To see that, suppose $Q$ and $Z$ is $\mathcal{C}$-covered, then $Q$ must be in $PossPa_{M}(Z)$ as $(F \not \indep Z)_{P}$ so that Algorithm \ref{alg:marginal-invariance-test} will not change the orientation of this edge. Suppose they are not $\mathcal{C}$-covered, there exists a member along this path from $Q$ to $Z$ conditioned on which d-separates $Q$ and $Z$, which contradicts with the fact there is an active backdoor path. Thus, there exists a collider $U_{1}$ on the path from $Q$ to $Z$ as follows.

% \begin{equation}\label{path:ex3}
%         F \rightarrow T \rightarrow \ldots \rightarrow W \leftarrow  Q \rightarrow \ldots \rightarrow U_{1} \leftarrow  \ldots Z
%     \end{equation}
% Then,  a member in $De_{D_{aug}}(U_{1})$ must be in $PossPa_{M}(Z)$ in order for the path in (\ref{path:ex3}) to be a d-connecting path from $F$ to $Z$. Consider $U_{1}$ is a child of $Z$ in $D_{aug}$ and the node $U_{2}$ that is closest to $U_{1}$ to form $U_{2} \rightarrow U_{1} \leftarrow Z$ in $D_{aug}$. If $U_{2}$ and $Z$ are not $\mathcal{C}$-covered, then $\langle U_{2},U_{1}, Z \rangle$ must be unshielded in $M$. Then, $U_{1}$ cannot be in $PossPa_{M}(Z)$ as $Z \srightarrow U_{1}$ must have been oriented as an unshielded collider in $M$, which is a contradiction. If $U_{2}$ and $Z$ are $\mathcal{C}$-covered, then $U_{2}$ is adjacent to $Z$ in $M$. We will consider two cases: (a)  $U_{2} \not \in PossPa_{M}(Z)$ and (b) $U_{2}  \in PossPa_{M}(Z)$.




% Case(a): $U_{2} \not \in PossPa_{M}(Z)$: Suppose $U_{2} \not \in PossPa_{M}(Z)$, then it must be that $U_{2} \sleftarrow Z$ in $M$. Then, we have a collider $\langle U_{3}, U_{2}, Z\rangle $ on the path from $W$ to $Z$, where $U_{3}$ is the next closest node to $U_{2}$ on the same path. If $\langle U_{3}, U_{2}, Z\rangle $  is unshielded in $M$, then the $\mathcal{C}$-essential graph provided would have oriented $U_{3}\srightarrow U_{2}  \srightarrow U_{1}$ in $M$ by using the first Meek rule. Then, using acyclicity (second Meek rule) infers that $Z \srightarrow U_{1}$ in $M$ such that $U_{1} \not \in PossPa_{M}(Z)$. Since we have $Z \srightarrow U_{2}$ in $M$, there exists a $\mathcal{C}$-closure graph $\mathcal{S}_{\mathcal{C}}(D')$ of some causal graph $D'$ that is $\mathcal{C}$-Markov equivalent to $D_{aug}$ by Theorem \ref{thm:mec_ccset} and $Z \in An_{D'}(U_{2})$. The path $F \rightarrow W \leftarrow Q \rightarrow U_{2}$ concatenating this directed path from $Z$ to $U_{2}$ cannot be a d-connecting path from $F$ to $Z$ given $PossPa_{M}(Z)$ because the child of $Z$ on this path would not be in $PossPa_{M}(Z)$ as $U_{1}$ is also a child of $Z$. Hence, we reach a contradiction. Suppose  $\langle U_{3}, U_{2}, Z\rangle $ is shielded, we see that the same argument repeats by picking the next closest node to $U_{3}$ until we have reached that $\langle Q, U_{j}, Z\rangle $ is shielded for some $j$, if $Q\in PossPa_{M}(Z)$, then we will also reach a contradiction because the path in (\ref{path:ex3}) will no longer be active from $F$ to $Z$ given $ PossPa_{M}(Z)$. We will see that it is impossible to have $Q \not \in PossPa_{M}(Z)$ either.  Suppose $Q \not \in PossPa_{M}(Z)$, then there must exist $Z \srightarrow Q$ in $M$. However, this is also a contradiction for the following reason: any DAGs that is $\mathcal{C}$-Markov equivalent to $D_{aug}$ must have $F \rightarrow \ldots \rightarrow W \leftarrow Q$ as $F$ has a directed path to $W$ and no incoming edges. 
% Having $Z \srightarrow Q$ in $M$ implies, for some DAG $D''$, there exists a $\mathcal{C}$-closure graph $\mathcal{S}_{\mathcal{C}}(D'')$ that is Markov equivalent to $\mathcal{S}_{\mathcal{C}}(D_{aug})$ has $Z\rightarrow Q$. We see that there will be a directed cycle in $D''$ as $F$ must have a directed path to $Z$ and $F\rightarrow \ldots \rightarrow W \leftarrow Q$ and $Z$ has a directed path to $Q$.  

% Case(b): $U_{2}  \in PossPa_{M}(Z)$: Suppose  $U_{2}  \in PossPa_{M}(Z)$. Consider the node that is closest to $U_2$ in the path in $(\ref{path:ex3})$ from $Q$ to $Z$. We call this node $U_{3}$. Since $U_{2}  \in PossPa_{M}(Z)$, $\langle U_{3}, U_{2}, Z \rangle$ cannot be an unshielded collider on the path from $Q$ to $Z$ in $M$. That implies $\langle U_{3}, U_{2}, Z \rangle$ must be shielded. We can repeat this argument by picking the next closest node until the next closest node is $Q$ so that we have $\langle Q, U_{j}, Z \rangle$ being shielded for some $j$. Then, the same argument as in case (a) repeats, reaching a contradiction. 

% Case (ii): there exists a d-connecting path from $F$ to $Z$ given some variables $K$: Now, we consider the case (ii) with the path in (\ref{path:ex2}). Consider the node closest to $K$. We call this node $K_{1}$ such that $\langle K_{1}, K, Z \rangle $ form a collider on the path in (\ref{path:ex2}) in $D_{aug}$. If  $\langle K_{1}, K, Z \rangle $ is unshielded in $M$, then $K$ cannot be in $PossPa_{M}(Z)$ as $Z \star \rightarrow K$ would have been oriented by $\mathcal{C}$-PC. Suppose  $\langle K_{1}, K, Z \rangle $ is not unshielded in $M$. 
% Consider the node closest to $K_1$. We call this node $K_{2}$ If $\langle K_{2}, K_{2}, Z \rangle $ is unshielded in $M$, then $K$ cannot be in $PossPa_{M}(Z)$ as $Z \srightarrow K$ would have been oriented by $\mathcal{C}$-PC. We can see this repeated argument until the closest node to $K_{i}$ for some $i$ is $T$. Then, $T$ must be in $PossPa_{M}(Z)$. Therefore, $F$ is d-separated from $Z$ given $PossPa_{M}(Z)$, which is a contradiction, blocking the path from $W$ to $Z$ such that $F$ is d-separated from $Z$ given $PossPa_{M}(Z)$, which is a contradiction.

% For the only if direction, for the sake of contradiction, assume $F$ and $X$ is adjacent in $D_{aug}$. Since $F$ and $X$ are d-separated given the possible parents set of $X$ in $M$, then there is no d-connecting path from $F$ to $X$ given the possible parents set of $X$, which is a contradiction as $F$ is adjacent to $X$.  
% \end{proof}


\section{Algorithms}\label{app:algorithms}


\begin{algorithm}[H] 
            \small
            \caption{RCA with Causal Graphs (Extended \name)} \label{alg:extended-version}
            \begin{algorithmic}[1]
                \INPUT Observational data $\mathcal{D}$, interventional data $\mathcal{D^{\star}}$, a $k$-essential graph $\E{D}=(\mathbf{V}, \mathbf{E})$, maximum number of root causes $l$
                \OUTPUT Top-$l$ root causes

                \STATE Concatenate $\mathcal{D}$ and $\mathcal{D^{\star}}$ with a binary indicator variable $\fnode$.
                \FOR{$X \in \V$}
                    \STATE $A_X \leftarrow I(F;X)$
                \ENDFOR
                % \STATE Sort $A$ in ascending order
                \STATE $A \gets$ Sort the values $\{A_{X} : X\in \V\}$ in ascending order
                \STATE Create an empty list $\textbf{V}^{\star}_s$
                \FOR{$\alpha \in A$}
                    \STATE $G \leftarrow \E{D}$
                    \FOR{$X, Y \in \V$}
                        \IF{$I(F;X) < \alpha$ and $I(F;Y) \ge \alpha$}
                        \STATE If $X \leftarrow Y$ is in $G$, remove $X \leftarrow Y$ 
                        \STATE If $X-Y$ is in $G$, orient $X\rightarrow Y$ 
                                            \STATE If $X \circlecircle Y$ is in $G$, orient $X \crightarrow Y$
                    \STATE If $X \cleftarrow Y$ is in $G$, orient $X \leftrightarrow Y$
                      
                        \ENDIF
                    \ENDFOR
                    \FOR{$X \in \V$}
                        \STATE $CMI_{X} \leftarrow I(F;X|PossPa_{G}(X))$
                    \ENDFOR
                    \STATE $\mathbf{V}_s \leftarrow$ Sort $X\in \V$ by $CMI_{X}$ in descending order
                    \IF{ $\exists X$ that has $I(F;X) < \alpha$ and $CMI_{X}$ is ranked on top-$l$ in $\textbf{V}_s$}
                        \STATE \textbf{Return} the first $l$ root causes from $\textbf{V}^{\star}_s$.
                    \ENDIF
                    \STATE $\textbf{V}^{\star}_s \gets \textbf{V}_s$
                \ENDFOR
                \STATE \textbf{Return} the first $l$ root causes from $\textbf{V}^{\star}_s$.
            \end{algorithmic}
        \end{algorithm}
        
 %        \begin{algorithm}[H] 
 %            \small
 %            \caption{RCA with Causal Graphs (Extended \name)} \label{alg:extended-version}
 %            \begin{algorithmic}[1]
 %                \INPUT Observational data $\mathcal{D}$, interventional data $\mathcal{D^{\star}}$, a $k$-essential graph $\epsilon_{k}(D)=(\mathbf{V}, \mathbf{E})$, Max. no of root causes $l$, a boolean value $B$
 %                \OUTPUT top $l$ root causes
 %                \STATE $\alpha  \leftarrow 0.001$; $\tau = 0.001$; $D \leftarrow$ Concatenate $\mathcal{D}$ and $\mathcal{D^{\star}}$ with a binary indicator variable $\fnode$; Create an empty list $L$
 %                \WHILE{\textbf{True}}
 %                 \STATE $G \leftarrow \epsilon_{k}(D)$
 %                \FOR{$X, Y \in \V$}
 %                    \IF{$I(F;X) < \alpha$ and $I(F;Y) \ge \alpha$}
 %                    \STATE If $X \leftarrow Y$ is in $G$, remove $X \leftarrow Y$
 %                    \STATE If $X-Y$ is in $G$, orient $X\rightarrow Y$
 %                    \STATE If $X \circlecircle Y$ is in $G$, orient $X \crightarrow Y$
 %                    \STATE If $X \cleftarrow Y$ is in $G$, orient $X \leftrightarrow Y$
 %                    \ENDIF
 %                \ENDFOR
 %                % \STATE $G \leftarrow$ \textbf{MARGINAL-INVARIANCE}($D, G$)
 %                \FOR{$X \in \V$}
 %                % \STATE $\mathbf{Z} \leftarrow \min_{\mathbf{Z}} I(F;X|\mathbf{Z}),$ where  $\mathbf{Z} \subseteq PossPa_{G}(X)$ and $|\mathbf{Z}| \le k$ 
 %                \IF{$B$ is \textbf{True}}
 %                \STATE \textbf{POSS}($t, X, G$)
 %                \ELSE
 %                 \STATE $I_{X} \leftarrow I(F;X|PossPa_{G}(X))$
 %                \ENDIF
 %                \ENDFOR
 % \STATE $\mathbf{V}_s \leftarrow $Sort $X\in \V$ by $CMI_{X}$ in descending order
 %                    \IF{ $\exists X$ that has $I(F;X) < \alpha$ and $CMI_{X}$ is ranked on top-$l$ in $\textbf{V}_s$}
 %                        \STATE \textbf{Return} the first $l$ root causes from $\textbf{V}^{\star}_s$.
 %                    \ENDIF
 %                    \STATE $\textbf{V}^{\star}_s \gets \textbf{V}_s$
 %                \ENDWHILE
 %            \end{algorithmic}
 %        \end{algorithm}
        % \begin{algorithm}[H] 
        %     \small
        %     \caption{Possible Parents Subset Search (POSS)} \label{alg:subset-search}
        %     \begin{algorithmic}[1]
        %         \INPUT upper limit $t$, a target variable $X$, a refined $k$-essential graph $G$
        %         \OUTPUT the smallest mutual information $I(F;X|\mathbf{Z})$, where $\mathbf{Z} \subseteq PossPa_{G}(X), |\mathbf{Z}| \le t $ 
        %         \STATE $M = \infty$
        %        \FOR{$\mathbf{Z} \subseteq PossPa_{G}(X), |Z| \le t$}
        %        \STATE $I_{X} \leftarrow I(F; W|\mathbf{Z})$
        %        \IF{$I_{X} < M$}
        %        \STATE $M = I_{X}$
        %        \ENDIF
        %        \ENDFOR
        %        \STATE \textbf{return} $M$
        %     \end{algorithmic}
        % \end{algorithm}

    % \begin{minipage}{0.48\textwidth}
    %     \begin{algorithm}[H]
    %      \footnotesize    \caption{MARGINAL-INVARIANCE} 
    %         \footnotesize
    %         \label{alg:marginal-invariance-test}
    %         \begin{algorithmic}[1]
    %             \INPUT Observational and interventional data distribution $P$, a partial causal structure $G=(\V, E)$, CI tester.
    %             \OUTPUT $G$
    %             \FOR{$X, Y \in \V$}
    %                 \IF{$(F \indep X)_{P}$ and $(F \dep Y)_{P}$}
    %                 \STATE If $X \leftarrow Y$ is in $G$, remove $X \leftarrow Y$
    %                 \STATE If $X-Y$ is in $G$, orient $X\rightarrow Y$
    %                 % \STATE If $X \circlecircle Y$ is in $G$, orient $X \crightarrow Y$
    %                 % \STATE If $X \cleftarrow Y$ is in $G$, orient $X \leftrightarrow Y$
    %                 \ENDIF
    %             \ENDFOR
    %              \STATE \textbf{Return} $G$
    %         \end{algorithmic}
    %     \end{algorithm}
    % \end{minipage}


% \begin{algorithm}[H]
%          \footnotesize    \caption{MARGINAL-INVARIANCE} 
%             \footnotesize
%             \label{alg:marginal-invariance-test}
%             \begin{algorithmic}[1]
%                 \INPUT Observational and interventional data distribution $P$, a partial causal structure $G=(\V, E)$, CI tester.
%                 \OUTPUT $G$
%                 \FOR{$X, Y \in \V$}
%                     \IF{$(F \indep X)_{P}$ and $(F \dep Y)_{P}$}
%                     \STATE If $X \leftarrow Y$ is in $G$, remove $X \leftarrow Y$
%                     \STATE If $X-Y$ is in $G$, orient $X\rightarrow Y$
%                     \STATE If $X \circlecircle Y$ is in $G$, orient $X \crightarrow Y$
%                     \STATE If $X \cleftarrow Y$ is in $G$, orient $X \leftrightarrow Y$
%                     \ENDIF
%                 \ENDFOR
%                  \STATE \textbf{Return} $G$
%             \end{algorithmic}
%         \end{algorithm}
        




\begin{algorithm}[H]
    \caption{CONSTRUCT-HEAVY-PATH-DFS-TREE~\cite{tao2019interactive}} 
    \label{alg:dfs-tree}
    \begin{algorithmic}[1]
    \INPUT DAG $D=(\V, \Eb)$ \\
    \OUTPUT A heavy-path-DFS-tree $T$\\
    \STATE Create a stack $\mathcal{S}$ with the root node $R$ in $D$ and mark $R$ visited.
    \REPEAT
    \STATE $J \leftarrow $ get the top member in the stack. 
    \IF{$J$ has any child $A$ that has not been visited previously}
    \STATE $A' \leftarrow$ Find the unvisited child of $J$ that can reach the highest number of unvisited nodes via a directed path.
    \STATE Push $A'$ into the stack $\mathcal{S}$ and mark it visited.
    \ELSE 
    \STATE Pop $J$ out of the stack $\mathcal{S}$.
    \ENDIF
    \UNTIL{$\mathcal{S}$ is empty}
    \end{algorithmic}
\end{algorithm}

\begin{algorithm}[H]
    \caption{Modified IGS (DFS-Interleave~\cite{tao2019interactive}) for RCA} 
    \label{alg:modified-dfs-tree}
    \begin{algorithmic}[1]
    \INPUT DAG $D=(\V, \Eb)$, interventional data $\mathcal{D}$, CI tester \\
    \OUTPUT A root cause $R$\\
    \IF{$D$ has more than one sink node}
    \STATE  $D \leftarrow$ Add a dummy vertex $S$ to $D$ where all the sink nodes in $D$ point to $S$. 
    \ENDIF
    \STATE $D \leftarrow $ Reverse all the edges in $D$
    \STATE $T \leftarrow $
    \textbf{CONSTRUCT-HEAVY-PATH-DFS-TREE}($D$) \COMMENT{See Algorithm \ref{alg:dfs-tree}}
    \STATE $\hat{R} \leftarrow $ Select the root of $T$
   \REPEAT
   \STATE $\pi \leftarrow $ Select the leftmost $\hat{R}$-to-leaf path of $T$
   \STATE $U \leftarrow$ Perform binary search on $\pi$ to find the last node $U$ that gives $(F \dep U)_{P}$.
   \STATE $W \leftarrow $ Find the leftmost child of $U$ in $T$ where $(F \not \indep W)_{P}$.
   \IF{$W$ does not exist}
    \STATE \textbf{return} $U$
    \ELSE
    \STATE \textbf{update} $\hat{R} \leftarrow W$
   \ENDIF
   \UNTIL{$\hat{R}$ has not been updated.}
    \end{algorithmic}
\end{algorithm}



% \begin{algorithm}[H]
%     \caption{CONSTRUCT-SEPARATOR \cite{shangqi2023partial}} 
%     \label{alg:seprator}
%     \begin{algorithmic}[1]
%     \INPUT A Heavy-path Depth First Search Tree $T$, the number of vertices in a DAG $n$,
%     \OUTPUT A subtree $S$ with at most $n/2$ nodes\\
%     \STATE $S \leftarrow \emptyset; T' \leftarrow T$
%    \REPEAT
%     \STATE $U \leftarrow$ get the smallest node under the total order in $T'$ s.t. $|T_{U}'| \ge \floor{n/2} +1$ and $|T_{Q}'| \le \floor{n/2}$ for each child $Q$ of $U$ in $T'$, where $T_{U}'$ denotes a subgraph induced by the descendants of $U$ in $T'$
%     \STATE Add $U$ to $\mathbf{S}$
%     \STATE Remove $T_{U}'$ from $T'$
%    \UNTIL{$|T'| < \floor{n/ 2} +1$ }
%    \STATE
%   \textbf{return} $S$
%     \end{algorithmic}
% \end{algorithm}

% \begin{algorithm}[H]
%     \caption{POMS \cite{shangqi2023partial}} 
%     \label{alg:igs}
%     \begin{algorithmic}[1]
%     \INPUT DAG $D=(\V, \Eb)$, $\mathcal{O}$ an oracle on $Z$'s reachability from any node in $D$ for a single variable $Z \in \V$ \\
%     \OUTPUT A target node $Z$ that an adversary chooses arbitrarily in $\V$ \\
%     \STATE $G \leftarrow D$
%    \REPEAT
%     \STATE $T \leftarrow$ \textbf{CONSTRUCT-HEAVY-PATH-DFS-TREE}($G$)
%     \STATE $S \leftarrow $ \textbf{CONSTRUCT-SEPARATOR}$(T, |\mathbf{V}|)$
%    \STATE $W \leftarrow$ Use a single query to obtain all nodes in $S$ that can reach $Z$.
%    \STATE $S^{\star} \leftarrow$ Get the smallest node in terms of total order in $S$
%    \STATE $L \leftarrow LF(S^\star) \cup \{S^{\star}\}$, where $LF(S^{\star})$  is the set that includes all the left siblings of $S^\star$ in $T$. 
%    \STATE $S^{\star \star} \leftarrow$ Get the smallest node in terms of total order in $L$.
%     \IF{$S^{\star \star} \not \in S$}
%     \STATE $G \leftarrow $ Get the subgraph by removing all subtrees $T_{Q}$ from $T_{S^{\star \star}}$ for all $Q \in S$, where $T_{S^{\star\star}}$ is the subtree induced by $De_{T}(S^{\star \star})$ 
%     \ELSE
%     \STATE $S^{\#} \leftarrow $Query the oracle for each child of $S^{\star \star}$ in the acsending order according to the total order of $T$ until it encounters the smallest child of $S^{\star\star}$ according to the total order.
%     \IF{$S^{\#}$ does not exist}
%     \STATE \textbf{return $S^{\star\star}$} as the target
%     \ELSE
%     \STATE $G \leftarrow $ Get the subgraph by removing all subtrees $T_{Q}$ from $T_{S^{\#}}$ for all $Q \in S$.
%     \ENDIF
%     \ENDIF
%    \UNTIL{$G$ has only a single vertex.}
   
%     \end{algorithmic}
% \end{algorithm}


% \begin{algorithm}[H]
%     \caption{DFS-interleave \cite{tao2019interactive}} 
%     \label{alg:igs}
%     \begin{algorithmic}[1]
%     \INPUT DAG $D=(\V, \Eb)$, $\mathcal{O}$ an oracle on $Z$'s reachability from any node in $D$ for a single variable $Z \in \V$ \\
%     \OUTPUT A target node $Z$ that an adversary chooses arbitrarily in $\V$ \\
%     \STATE $T \leftarrow $ \textbf{CONSTRUCT-HEAVY-PATH-DFS-TREE}($D$)
%     \STATE $\hat{U} \leftarrow $ Select the root of $T$
%    \REPEAT
%    \STATE $\pi \leftarrow $ Select the leftmost $\hat{U}$-to-leaf path of $T$
%    \STATE $U \leftarrow$ Perform binary search on $\pi$ to find the last node $U$ that has a directed path to $Z$ by querying $\mathcal{O}$ whether $X$ has a directed path to $Z$ for each variable $X$ on this binary search.
%    \STATE $V \leftarrow $ Find the leftmost child $V$ of $U$ in $T$ where $V$ has a directed path to $Z$ by querying $\mathcal{O}$
%    \IF{$V$ does not exists}
%     \STATE \textbf{return} $U$
%     \ELSE
%     \STATE \textbf{update} $\hat{U} \leftarrow V$
%    \ENDIF
%    \UNTIL{$\hat{U}$ has not been updated.}
%     \end{algorithmic}
% \end{algorithm}


% \begin{algorithm}[H]
%     \caption{Modified POMS for RCA} 
%     \label{alg:modified-dfs-tree}
%     \begin{algorithmic}[1]
%     \INPUT DAG $D=(\V, \Eb)$, interventional data $\mathcal{D}$, CI tester, \\
%     \OUTPUT A root cause $R$\\
%     \IF{$D$ has more than one sink node}
%     \STATE  $D \leftarrow$ Add a dummy vertex $S$ to $D$ where all the sink nodes in $D$ point to $S$. 
%     \ENDIF
%     \STATE $G \leftarrow $ Reverse all the edges in $D$
%      \REPEAT
%     \STATE $T \leftarrow$ \textbf{CONSTRUCT-HEAVY-PATH-DFS-TREE}($G$)
%     \STATE $S \leftarrow $ \textbf{CONSTRUCT-SEPARATOR}$(T, |\mathbf{V}|)$
%    \STATE $W \leftarrow$ Obtain all $X \in S$ that gives $(F \not \indep X)_{P}$.
%    \STATE $S^{\star} \leftarrow$ Get the smallest node in terms of total order in $S$
%    \STATE $L \leftarrow LF(S^\star) \cup \{S^{\star}\}$, where $LF(S^{\star})$  is the set that includes all the left siblings of $S^\star$ in $T$. 
%    \STATE $S^{\star \star} \leftarrow$ Get the smallest node in terms of total order in $L$.
%     \IF{$S^{\star \star} \not \in S$}
%     \STATE $G \leftarrow $ Get the subgraph by removing all subtrees $T_{Q}$ from $T_{S^{\star \star}}$ for all $Q \in S$, where $T_{S^{\star\star}}$ is the subtree induced by $De_{T}(S^{\star \star})$ 
%     \ELSE
%     \STATE $S^{\#} \leftarrow $ Obtain the child $C$ of $S^{\star \star}$ that gives $(F\not \indep C)_{P}$ in the acsending order according to the total order of $T$.
%     \IF{$S^{\#}$ does not exist}
%     \STATE \textbf{return $S^{\star\star}$} as the root cause $R$
%     \ELSE
%     \STATE $G \leftarrow $ Get the subgraph by removing all subtrees $T_{Q}$ from $T_{S^{\#}}$ for all $Q \in S$.
%     \ENDIF
%     \ENDIF
%    \UNTIL{$G$ has only a single vertex.}\end{algorithmic}
% \end{algorithm}


% \begin{algorithm}[H]
%     \caption{DFS-interleave-for-RCA} 
%     \label{alg:igs-rca}
%     \begin{algorithmic}[1]
%     \INPUT DAG $D=(\V, \E)$, interventional data, CI tester \\
%     \OUTPUT A root cause $R$ \\
%     \STATE $T \leftarrow $ \textbf{CONSTRUCT-HEAVY-PATH-DFS-TREE}($D$)
%     \STATE $T \leftarrow$ Reverse every edge in $T$
%     \STATE $\hat{U} \leftarrow $ Select the sink node of $T$
%    \FOR{$V \in \V$}
%    \STATE $\pi \leftarrow $ Select the leftmost $\hat{U}$-to-root path of $T$
%    \STATE $U \leftarrow$ Perform binary search on $\pi$ to find the last node $U$ that is dependent with $F$ by testing whether $(F \not \indep X)_{P}$ for each variable $X$ on this binary search.
%    \STATE $V \leftarrow $ Find the leftmost parent $V$ of $U$ in $T$ where $(F \not \indep V)_{P}$
%    \IF{$V$ does not exists}
%     \STATE \textbf{return} $U$
%     \ELSE
%     \STATE \textbf{update} $\hat{U} \leftarrow V$
%    \ENDIF
%    \ENDFOR
%     \end{algorithmic}
% \end{algorithm}


% \begin{algorithm}[H]
%     \caption{DFS-interleave-for-RCA} 
%     \label{alg:igs-rca}
%     \begin{algorithmic}[1]
%     \INPUT DAG $D=(\V, \E)$, interventional data, CI tester \\
%     \OUTPUT A root cause $R$ \\
%     \STATE $D' \leftarrow$ Reverse every edge in $D$
%     \STATE $T \leftarrow $ \textbf{CONSTRUCT-HEAVY-PATH-DFS-TREE}($D'$)
%     \STATE $\hat{U} \leftarrow $ Select the source node of $T$
%    \FOR{$V \in \V$}
%    \STATE $\pi \leftarrow $ Select the leftmost $\hat{U}$-to-root path of $T$
%    \STATE $U \leftarrow$ Perform binary search on $\pi$ to find the last node $U$ that is dependent with $F$ by testing whether $(F \not \indep X)_{P}$ for each variable $X$ on this binary search.
%    \STATE $V \leftarrow $ Find the leftmost child $V$ of $U$ in $T$ where $(F \not \indep V)_{P}$
%    \IF{$V$ does not exists}
%     \STATE \textbf{return} $U$
%     \ELSE
%     \STATE \textbf{update} $\hat{U} \leftarrow V$
%    \ENDIF
%    \ENDFOR
%     \end{algorithmic}
% \end{algorithm}




\section{Sample Run of RCD \cite{ikram2022root}}\label{app:samplerun-rcd}

\begin{figure*}[h!]
    \centering 
    \footnotesize
   \begin{tikzpicture} [scale=0.4]
     \node (1) at (-3,0) {$X_{1}$};
     \node (2) at (0,0) {$X_{2}$};
     \node (3) at (3,0) {$X_{3}$};
     \node (4) at (3,3) {$X_{4}$};
     \node (5) at (-3,3) {$F$};
     \path (1) edge (2);
     \path (2) edge (3);
     \path (4) edge (2);
     \path (4) edge (3);
     \path[red, line width=1.0] (5) edge (2);
    \end{tikzpicture}
    \caption{An example to show how RCD works. RCD would need to increase the size of the separating set to 2 to find the root cause ($X_2$). However, we can leverage the causal graph to know precisely the separating set for every node.}\label{fig:rcd-sample-run}
\end{figure*}

RCD is based on the observation that a failure in a microservice can be treated as an intervention in the underlying causal graph. By treating the root cause as the interventional target, RCD leverages recent advances in causal discovery to identify the root cause. Consistent with the broader causal discovery literature, RCD determines the interventional target (the root cause) through a series of CI tests. RCD operates by introducing a special node, referred to as \fnode, into the dataset and connecting it to every other node in a complete undirected graph. The algorithm's primary goal is to trim down the children of \fnode, as the true root cause will ultimately be the sole remaining child. However, due to the lack of information about the underlying graphical structure, RCD must condition on every possible set of variables until it identifies a separating set that can exclude a potential node as the root cause.

For instance, consider the ground truth causal graph shown in Figure~\ref{fig:rcd-sample-run}, where the root cause is $X_2$. Initially, RCD constructs an undirected graph with \fnode having outgoing edges to every node. It begins with a separating set of size 0 and executes all possible CI tests. After conducting the tests $(F \indep X_{1})_{P}$ and $(F \indep X_{4})_{P}$, RCD removes the edges between \fnode and both $X_1$ and $X_4$. At this point, only two candidates for the root cause remain: $X_2$ and $X_3$. To narrow it down to the true root cause, RCD increases the size of the separating set. If it tests $X_2$, it runs $(F \dep X_{2} | X_{3})_{P}$. Since $X_2$ is the root cause, it cannot be independent of \fnode. When testing $X_3$ by running $(F \dep X_{3} | X_{2})_{P}$, conditioning on $X_2$ opens a backdoor path from \fnode to $X_3$, preventing its elimination. RCD then increases the size of the separating set once more and runs $(F \indep X_{3} | X_{2}, X_{4})_{P}$, which removes the edge between \fnode and $X_3$. Finally, RCD stops, identifying $X_2$ as the root cause.

Since RCD lacks access to the causal graph, it must perform CI tests on all possible conditioning sets (up to size 2) to identify the root cause, resulting in an exponential growth in tests and higher computational costs. To address this, RCD limits the conditioning set size using a hyperparameter, though this can lead to incomplete results. We propose that knowing the causal graph can significantly reduce the number of required CI tests. A causal graph provides precise separating sets, allowing the root cause to be identified with at most $n$ CI tests, where $n$ corresponds to the number needed for validation of the structure.

% \section{Sample Run of $\mathcal{C}$-PC Algorithm and Interpretations of $\mathcal{C}$-essential graph}\label{app:samplerun-cpc}

% As $\mathcal{C}$-PC is highly relevant to our algorithm. We give a sample run of $\mathcal{C}$-PC algorithm as in \citet{lee2024constraint} to demonstrate how it works in Figure \ref{fig:sample-run}. The ground truth is provided in Figure \ref{figa-hello}. Suppose we let $\mathcal{C} = \{\emptyset, \{Y\}\}$. It means that one will only conduct all marginal independence tests and CI tests with conditioning set $\{Y\}$. The resulting graphical representation after finishing step 4 of Algorithm \ref{alg:IPC-algo} is in Figure \ref{fig:sr-a}. The definition of various marks on the graph is provided in Definition \ref{definition:edge_union}. Then, by applying some orientation rules in step 5 of Algorithm \ref{alg:IPC-algo}, we can obtain the final output shown by Figure \ref{fig:sr-c}. 

% We will use the output of $\mathcal{C}$-PC in figure \ref{fig:sr-c} to illustrate the meaning of a $\mathcal{C}$-essential graph.  The interpretation of this graphical object known as $\mathcal{C}$-essential graph is that it represents a set of conditional independence relations induced by the ground truth in Figure \ref{figa-hello} with respect to the set $\mathcal{C}=\{\emptyset, \{Y\}\}$. These CI relations are $(Z \indep J)_{P}, (W \indep J)_{P}, (Z \indep Q|Y)_{P}, (Q \indep J|Y)_{P}, (W \indep Q|Y)_{P}, (Z \indep Q|Y)_{P}$. Both the arrowheads and directed edges e.g. $J \rightarrow Y$ in Figure \ref{fig:sr-c} are invariant across all the DAGs that are $\mathcal{C}$-Markov (see Definition \ref{def:c-markov}) to the ground truth by Lemma \ref{lem:c-closure-and-DAGs} and Theorem \ref{thm:mec_ccset}. An undirected edge $Z-W$ denotes that there exists a $\mathcal{C}$-closure graph that has $Z \rightarrow W$ and another $\mathcal{C}$-closure graph that has $W \rightarrow Z$ within the same Markov equivalence class. Please see Definition \ref{def:c-closure} for the relationships between DAGs and $\mathcal{C}$-closure graphs.  As $\mathcal{C}$-essential graph represents a set of $\mathcal{C}$-closure graphs, the edge union operation (see Definition \ref{def:c-closure-graphs}) is then used to represent different orientations in these $\mathcal{C}$-closure graphs that are Markov equivalent.  

% \begin{figure*}[t]
%     \centering 
%     \footnotesize
%     \subfigure[Ground truth $D$]
%      {     \begin{tikzpicture} [scale=0.3] \label{figa-hello}
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (0,3) {$W$};
%     \node (5) at (3,3) {$J$};
%     \path(4) edge (2);  
%     \path(4) edge (1);
%     \path(5) edge (2);
%     \path(2) edge (3);
%     \end{tikzpicture}}
%     \quad
%     \subfigure[$M$ after Step 4]
%      {\begin{tikzpicture} [scale=0.3] \label{fig:sr-a}
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (0,3) {$W$};
%     \node (5) at (3,3) {$J$};
%     \path[o->](4) edge (2);  
%     \path[o-o](4) edge (1);
%     \path[o->](5) edge (2);
%     \path[o-o](2) edge (3);
%     \path[o->](1) edge (2);
%     \end{tikzpicture}} \quad
%         \subfigure[$M$ after Step 5 with $\mathcal{R}11$ applied to $J$]
%      {     \begin{tikzpicture} [scale=0.3] \label{fig:sr-b}
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (0,3) {$W$};
%     \node (5) at (3,3) {$J$};
%     \path[o->](4) edge (2);  
%     \path[o-o](4) edge (1);
%     \path (5) edge (2);
%     \path(2) edge (3);
%     \path[o->](1) edge (2);
%     \end{tikzpicture}}    \quad    \subfigure[$M$ after Step $5$ with  $\mathcal{R}12$ with applied to $W$]
%      {     \begin{tikzpicture} [scale=0.3] \label{fig:sr-c}
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (0,3) {$W$};
%     \node (5) at (3,3) {$J$};
%     \path[o->](4) edge (2);  
%     \path[-] (4) edge (1);
%     \path (5) edge (2);
%     \path(2) edge (3);
%     \path[o->](1) edge (2);
%     \end{tikzpicture}}
   
%     \caption{(a)-(d): Given $\mathcal{C}=\{\emptyset, \{Y\}\},$ this is an example of the execution of Algorithm \ref{alg:IPC-algo}. Particularly, \ref{fig:sr-c} shows the output of $\mathcal{C}$-PC for learning the ground truth in \ref{figa-hello}.} \label{fig:sample-run}
% \end{figure*}

% \section{An Example that shows the benefits of Lemmas \ref{lem:ancestors_not_F} and \ref{lem:descendants_cannot_be_targets}}\label{app:example-show-working-of-lemma4}

% We will use Figure \ref{fig:lemma_illustration} to illustrate how Lemmas \ref{lem:ancestors_not_F} and \ref{lem:descendants_cannot_be_targets} may help identify the root cause, which is $X_{1}$ in this case, with less than $n$ invariance tests. We can start by arbitrarily picking a variable for testing conditional independence with $F$. Suppose we select $X_{2}$ to test whether $(F\indep X_{2})_{P}$. By Assumption \ref{assumption:faithfulness}, we will observe $(F\not \indep X_{2})_{P}$. Then, Lemma \ref{lem:descendants_cannot_be_targets} says that $X_{3}$ cannot be the root cause. Suppose we pick $X_{1}$ to test for conditional independence, then we will observe $(F\indep X_{1})_{P}$. Then, by Lemma \ref{lem:ancestors_not_F}, we know that $X_{5}$ cannot be the root cause either. Then, we are only left with $X_{4}$ to test for conditional independence. This results in a total of $3$ marginal independence tests, which is less than $n=5$. 


% \begin{figure*}[h!]
%     \centering 
%     \footnotesize
%    \begin{tikzpicture} [scale=0.4] \label{fig:working-example-for-lemmas}
%      \node (2) at (0,0) {$X_{1}$};
%       \node (3) at (3,0) {$X_{2}$};
%     \node (4) at (6,0) {$X_{3}$};
%     \node (5) at (2,3) {$X_{4}$};
%     \node (6) at (-1,3) {$F$};
%       \node (7) at (-3,0) {$X_{5}$};
%     \path (2) edge (3);
%     \path (3) edge (4);
%     \path (5) edge (3);
%     \path (7) edge (2);
%     \path[red, line width=1.0] (6) edge (3);
%     \end{tikzpicture}
%     \caption{An example to show how Lemma \ref{lem:ancestors_not_F} and \ref{lem:descendants_cannot_be_targets} helps identify the root cause with a few invariance tests given a causal graph, where $X_{1}$ is the root cause.} \label{fig:lemma_illustration}
% \end{figure*}


% \section{Discussion on the trade-off between sample complexity of learning $\mathcal{C}$-essential graph and computational efficiency of computing CMI} \label{app:trade-off-sample-computational-efficiency}


% In this section, we discuss the trade-off between computational efficiency and sample complexity in Algorithm~\ref{alg:sample-version}. As noted by Corollary~\ref{cor:size_of_poss_pa}, a larger set $\mathcal{C}$ allows the $\mathcal{C}$-PC algorithm to conduct more CI tests, potentially including high-order tests. While this tends to result in a sparser graph, it also increases the time needed to learn the causal graph during normal operations and requires more samples for reliable CI tests. The goal is to reduce the set of possible parents during normal operation by conducting more informative CI tests based on data reliability. Although our method can leverage advancements in consistent CMI estimators for high-dimensional datasets~\citep{mukherjee2020ccmi, NEURIPS2023_48db6744}, a smaller set of possible parents will reduce the time needed to compute CMI during critical failure situations. 



% We will use Table \ref{tab:sample_complexity_computational_proposed_algo} to illustrate the trade-off between the sample complexity and computational efficiency of the proposed algorithm \name for RCA. We use $D$ to denote the ground truth DAG. We use $\mathcal{C}0$ to denote $\mathcal{C} = \{\{\emptyset\}\}.$ We use $\mathcal{C}1$ to denote $\mathcal{C}= \{\{\emptyset\}, \{W\}, \{J\}, \{T\}, \{Z\}, \{Y\}, \{Q\}\}$.  We use $\mathcal{C}2$ to denote $\mathcal{C}= \{\{\emptyset\}, \{W\}, \{J\}, \{T\}, \{Z\}, \{Y\}, \{Q\},\{W,Z\}, \{W,Q\}, \{W,Y\}, $ $\{W,J\}, \{W,T\}, \{Z,Y\}, \{Z,J\},  \{Z,T\},  \{Z,Q\}, \{Y,J\},  \{Y,T\},  \{Y,Q\}, \{J,T\}, \{J,Q\}, \{T,Q\}\}$. These sets are defined for Algorithm \ref{alg:IPC-algo} to obtain the respective $\mathcal{C}$ essential graphs during the normal operation time and \name can then take these graph objects as input for RCA post-failure. 

% We see that as we increase the number of conditioning sets in $\mathcal{C}$, the resulting $\mathcal{C}$-essential graph will become sparser. During the failure time, \name will conduct an additional $n$ marginal invariance tests to further refine the graph objects shown by Table \ref{tab:sample_complexity_computational_proposed_algo} depending on the defined $\mathcal{C}$. Thus, the possible parents of each observed variable will potentially get smaller. This will increase the computational efficiency and reduce the sample complexity of computing conditional mutual information in \name during the failure time. However, as $\mathcal{C}$ gets larger, the sample complexity and time complexity also increase for using $\mathcal{C}$-PC during the normal operation time. Hence, there is a trade-off between learning $\mathcal{C}$-essential graphs during normal operation time and computing conditional mutual information post-failure in terms of sample and time complexity.  
% \begin{table*}[h]
% \small
% \centering
% \begin{tabular}{|c|c|c|c|c|}
% \hline
%   \cellcolor{gray!20} Ground truth $D$     &         \cellcolor{green!20} Choice of $\mathcal{C}$                         &   \cellcolor{green!20} $\mathcal{C}0$ & \cellcolor{green!20} $\mathcal{C}1$ & \cellcolor{green!20} $\mathcal{C}2$ \\ \hline  \begin{tikzpicture} [scale=0.3] \label{figa}
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (-3,3) {$W$};
%     \node (5) at (0,3) {$J$};
%     \node (6) at (3,3) {$T$};
%     \path(4) edge (2);  
%     \path(4) edge (1);
%     \path(5) edge (2);
%     \path(2) edge (3);
%     \path(5) edge (6);
%     \path(6) edge (3);
%     \end{tikzpicture}  
%  &  $\mathcal{C}$-essential graph   & \begin{tikzpicture} [scale=0.3] 
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (-3,3) {$W$};
%     \node (5) at (-1,3) {$J$};
%     \node (6) at (2,3) {$T$};
%     \path[o->](1) edge (2); 
%     \path[o->](4) edge (2);  
%     \path[o-o](4) edge (1);
%     \path[o-o](6) edge (2);
%     \path[o->](5) edge (3);
%     \path[o->](5) edge (2);
%     \path[o-o](2) edge (3);
%     \path[o-o](5) edge (6);
%     \path[o->](6) edge (3);
%     \path[o->] (1) edge[bend right=30] (3);
%      \path[o->] (4) edge[bend left=100] (3);
%     \end{tikzpicture}                  & \begin{tikzpicture} [scale=0.3] 
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (-3,3) {$W$};
%     \node (5) at (-1,3) {$J$};
%     \node (6) at (2,3) {$T$};
%     \path(4) edge (2);  
%     \path[-](4) edge (1);
%     \path[o->](5) edge (3);
%     \path(5) edge (2);
%     \path(2) edge (3);
%     \path[-](5) edge (6);
%     \path[o->](6) edge (3);
%     \path[o->] (1) edge[bend right=30] (3);
%      \path[o->] (4) edge[bend left=100] (3);
%     \end{tikzpicture}       & \begin{tikzpicture} [scale=0.3] 
%       \node (1) at (-3,0) {$Z$};
%       \node (2) at (0,0) {$Y$};
%     \node (3) at (3,0) {$Q$};
%     \node (4) at (-3,3) {$W$};
%     \node (5) at (0,3) {$J$};
%     \node (6) at (3,3) {$T$};
%     \path(4) edge (2);  
%     \path[-](4) edge (1);
%     \path(5) edge (2);
%     \path(2) edge (3);
%     \path[-](5) edge (6);
%     \path (6) edge (3);
%     \end{tikzpicture}                                  \\
% \hline
% \end{tabular}
% \caption{A table that shows the trade-off between sample complexity and computational efficiency before and after failure for RCA of the proposed algorithm \name using different $\mathcal{C}$ to learn $\mathcal{C}$-essential graphs.}
% \label{tab:sample_complexity_computational_proposed_algo}
% \end{table*}


% % \section{Discussion on  challenges of incorporating $\mathcal{C}$-essential graphs for RCA with CI tests only}\label{app:motivating-examples-and-challenges}

% % \begin{figure*}[h]
% %     \centering 
% %     \footnotesize
% %     % figure a 
% %     \subfigure[ $D_{1_{aug}}$]
% %      {     \begin{tikzpicture} [scale=0.3]
% %      \node (2) at (0,0) {$X_{1}$};
% %       \node (3) at (3,0) {$X_{2}$};
% %     \node (4) at (6,0) {$X_{3}$};
% %     \node (5) at (2,3) {$X_{4}$};
% %     \node (6) at (-1,3) {$F$};
% %     \path (2) edge (3);
% %     \path (3) edge (4);
% %     \path (5) edge (3);
% %     \path[red, line width=1.0] (6) edge (2);
% %     \end{tikzpicture}\label{fig:g-a}}
% %     \hspace{1.5ex}
% %     % figure b 
% %      \subfigure[$\varepsilon_{\{\{\emptyset\}\}}(D_{1})$] 
% %      {         \begin{tikzpicture} [scale=0.3]\label{fig:g-b}
% %      \node (2) at (-1.5,0) {$X_{1}$};
% %       \node (3) at (2,0) {$X_{2}$};
% %     \node (4) at (6,0) {$X_{3}$};
% %     \node (5) at (2,2.5) {$X_{4}$};
% %     \path (2) edge (3);
% %     \path (2) edge[bend right=30] (4);
% %     \path  (3) edge (4);
% %     \path(5) edge (3);
% %     \path (5) edge (4);
% %     \end{tikzpicture} } 
% %     \hspace{1.5ex}
% %     % figure c 
% %   \subfigure[ $D_{2_{aug}}$] 
% %      {         \begin{tikzpicture}[scale=0.3]\label{fig:g-c}
% %      \node (1) at (0,0) {$X_{1}$};
% %     \node (2) [right=of 1] {$X_{2}$};
% %      \node (3) at (2,3) {$X_{3}$};
% %      \node (4) at (-2,3) {$F$};
% %      \path[red, line width=1.0] (4) edge (3);
% %     \path (1) edge (2);
% %      \path (3) edge (2);
% %       \path (3) edge (1);
% %     \end{tikzpicture}} 
% %      \hspace{1.5ex}
% %     \subfigure[$\varepsilon_{\{\{\emptyset\}\}}(D_{2})$] 
% %      {         \begin{tikzpicture}[scale=0.3]\label{fig:g-d}
% %      \node (1) at (0,0) {$X_{1}$};
% %     \node (2) [right=of 1] {$X_{2}$};
% %      \node (3) at (2.5,3) {$X_{3}$};
% %     \path[o-o] (1) edge (2);
% %      \path[o-o] (3) edge (2);
% %       \path[o-o] (3) edge (1);
% %     \end{tikzpicture}} \\
% %     \subfigure[$D_{3_{aug}}$]
% %      {     \begin{tikzpicture} [scale=0.3]\label{fig:g-e}
% %          \node (2) at (0,0) {$X_{1}$};
% %       \node (3) at (3,0) {$X_{2}$};
% %     \node (4) at (6,0) {$X_{3}$};
% %     \node (1) at (3,3) {$F$};
% %     \path (2) edge (3);
% %     \path (3) edge (4);
% %     \path[red, line width=1.0] (1) edge (3);
% %     \end{tikzpicture}}\hspace{1.5ex}
% %     \subfigure[$\varepsilon_{\{\{\emptyset\}, \{X_{2}\}\}}(D_{3})$]
% %      {     \begin{tikzpicture} [scale=0.3]\label{fig:g-f}
% %      \node (2) at (0,0) {$X_{1}$};
% %       \node (3) at (3,0) {$X_{2}$};
% %     \node (4) at (6,0) {$X_{3}$};
% %     \path[-] (2) edge (3);
% %     \path[-] (3) edge (4);
% %     \end{tikzpicture}}
% %     \caption{\ref{fig:g-a} - \ref{fig:g-b}: an example shows how a $\mathcal{C}$-essential graph learned from observed data can be used to find root cause more efficiently where $\mathcal{C}=\{\{\emptyset\}\}$. \ref{fig:g-c} - \ref{fig:g-d}: an example shows how a $\mathcal{C}$-essential graph may not help identify root causes with more CI tests since it does not have any orientations. \ref{fig:g-c} - \ref{fig:g-f}: an example shows that not all $\mathcal{C}$-essential graphs that have no orientations are equally informative for RCA, where $\mathcal{C}=\{\{\emptyset\}, \{X_{2}\}\}$}.
% %     \label{fig:challenge_example}
% % \end{figure*}


% % In this section, we first show how a partial causal graph represented by $\mathcal{C}$-essential graph learned from observed data before the failure period can facilitate an efficient RCA method with CI tests under the faithfulness assumption. Then, we discuss three difficulties of incorporating a $\mathcal{C}$-essential graph for RCA. 

% % Given a $\mathcal{C}$-essential graph of a DAG $D_{1}$ shown in Figures \ref{fig:g-a} and \ref{fig:g-b} and by assumption \ref{assumption:faithfulness}, we will show that it is possible to run a single CI test to identify the root cause during the fault period. To illustrate this concept, suppose an algorithm can pick on $X_{1}$ and test the CI relation $(F \indep X_{1})_{P}$. Since $X_{2}, X_{3}, X_{4}$ are non-ancestors of $X_{1}$ in $\varepsilon_{\{\{\emptyset\}\}}(D_{1})$ and $(F \dep X_{1})_{P}$, one can infer that $X_{1}$ must be a child of $F$ in the ground truth. Hence, $X_{1}$ is the root cause.  

% % In contrast, we will show how RCD \citep{ikram2022root} is inefficient in terms of the number of CI tests used to identify root causes in this example and how the worst case for an algorithm that leverages partial causal structure still outperforms RCD in its best case. Suppose the ground truth DAG augmented by F-NODE is shown in Figure \ref{fig:g-a}. Note that the best case for RCD must have tested $6$ CI statements since the following CI statements must be observed based on the design of RCD in order to conclude $X_{1}$ to be root cause: $(F \indep X_{4})_{P}, (F \dep X_{2})_{P}, (F \indep X_{2}|X_{1})_{P}$, $(F \dep X_{3})_{P}$ and $(F \indep X_{3} | X_{2})_{P}$ (or $(F \indep X_{3} | X_{1})_{P}$). Otherwise one will need to test CI relation between $F$ and $X_{1}$ by conditioning on all subsets of the power set of $\{X_{2}, X_{3}, X_{4}\}$. However, if we compare with the best case of an algorithm that leverages partial causal structure, it only requires to observe a single CI statement: $(F\dep X_{1})_{P}$. Note that even in the worst case, it only takes at most $4$ CI statement, i.e., $(F\indep X_{4})_{P}, (F\dep X_{3})_{P}, (F \dep X_{2})_{P}, (F \dep X_{1})_{P}$ in order to conclude $X_{1}$ as it first searches through all marginal tests and can leverage the graph structure of $\varepsilon_{\{\{\emptyset\}\}}(D_{1})$ learned from observed data. 




 

% % % RCD does not consider the prior causal structure and relies on a hyperparameter, $m$, to control the size of the separating set. It divides all the variables into groups such that every group has at most $m$ variables. Then, it uses a series of CI tests in each group to identify the top-$l$ root causes, where $l$ is arbitrary. Following our example in Figure \ref{fig:a}, we observe that the least number of CI tests that RCD can invoke is $6$. For example, by observing $(F \not \indep X_1)_{P}, (F \not \indep X_{2})_{P}, (F \not \indep X_{3})_{P}, (F \indep X_4)_{P}, (\ci{F}{X_{2}}{X_{1}})_{P}$ and $(F \indep X_3 |X_{1})_{P}$, eventually identifying $X_{1}$ to be the root cause. However, if one can leverage the graphical structure as shown in \ref{fig:motiviating_examples_2}, the least number of CI tests can be $1$. It is possible by observing $(F\not \indep X_{1})_{P}$ and using the fact that there is a single root cause such that no non-ancestors of $X_{1}$ can be the root cause. Since $X_1$ has no ancestors, we can immediately conclude that $X_1$ is the root cause. 

% % However, there are a few challenges in incorporating a $\mathcal{C}$-essential graph for RCA. First, it is not clear how one should select a variable initially in a graph for testing conditional independence. Consider the same example in Figure \ref{fig:g-b}, if $X_{3}$ is selected first instead of $X_{1}$ for testing the CI relation $(F\indep X_{3})_{P}$, then one should observe $(F\dep X_{3})_{P}$, implied by assumption \ref{assumption:faithfulness}. Unfortunately, this test result does not eliminate the possibility that $X_{3}$ can be the root cause. It also does not give information to exclude $X_{1}, X_{2}, X_{4}$ from being the root cause. This shows that, given a $\mathcal{C}$-essential graph, the number of CI tests needed for RCA depends on both the graphical structure and the actual root cause location. 

% % % accounting for both the uncertainty of the root cause location and the partial causal structure given as a priori. The example in Figure \ref{fig:b} requires that one first select $X_1$ to test for conditional independence to be able to utilize the ancestral relationships provided by the $k$-essential graph.

% % Second, some $\mathcal{C}$-essential graphs may not show any orientations. This posits a challenge that one may not hope to use fewer CI tests for RCA even when a partial causal structure is learned from observational data. For example, in Figure \ref{fig:g-d}, $F$ is d-connecting with all observed variables. Unlike the example in Figure \ref{fig:g-b}, even when we have exhausted all marginal CI tests among the observed variables and $F$ during the failure period, we cannot utilize any ancestral relationships in the graph structure to determine which variable cannot be the root cause. 

% % Third, all $\mathcal{C}$-essential graphs that do not have any orientations may not be equally informative for RCA. For instance, if the $\mathcal{C}$-essential graph is the graph shown in Figure \ref{fig:g-f}, according to Figure \ref{fig:g-e}, we see that $(F\indep X_{1})_{P}$ and $(F\dep X_{2})_{P}$ hold based on assumption \ref{assumption:faithfulness}. One can infer that i.) $F$ cannot point to $X_{1}$ due to $(F\indep X_{1})_{P}$, ii.) $F$ does not have a directed path to $X_{1}$ and iii.) $F$ has a directed path to $X_{2}$. Therefore, $X_{1}-X_{2}$ can further be oriented as $X_{1}\rightarrow X_{2}$ in Figure \ref{fig:g-f} with interventional data. Since all the unshielded colliders in Figure \ref{fig:g-f} should have been oriented by $\mathcal{C}$-PC (see line 4 in Algorithm \ref{alg:IPC-algo}), $X_{2}-X_{3}$ can then be further oriented as $X_{2} \rightarrow X_{3}$, resulting in $X_{1}\rightarrow X_{2} \rightarrow X_{3}$. Hence, we can conclude $X_{2}$ to be the root cause as $X_{2}$ is the parent of $X_{3}$. As such, the $\mathcal{C}$-essential graph in Figure \ref{fig:g-f} is more informative than the one in Figure \ref{fig:g-d} for RCA.

% % % % As shown by the example in figure \ref{fig:d}, we see that there are no edges oriented as a prior. 

% % % % Furthermore, among various essential graphs that do not have any orientations, they may not be equally informative. For example, figure \ref{fig:1e} shows that $X_{1}$ and $X_{3}$ are d-separated by $X_{2}$, whereas in figure \ref{fig:1c}, every pair of nodes is d-connected given any subsets of the variables. This difference can have an impact on the efficiency of finding the root cause. Consider the example in \ref{fig:1d} and \ref{fig:1e} where $X_{2}$ is the root cause. If one has tested $(F \indep X_{1})_{P}$ and $(F \not \indep X_{2})_{P}$, then one can immediately see that the only possible DAG consistent with these CIs and the given essential graph is $X_{1}\rightarrow X_{2} \rightarrow X_{3}$ which leads to $X_{2}$ being the root cause. To see this, note that the Markov equivalent class of DAGs depicted by the essential graph only include  $X_{1}\rightarrow X_{2} \rightarrow X_{3}, X_{1}\leftarrow X_{2} \leftarrow X_{3},$ and $X_{1}\leftarrow X_{2} \rightarrow X_{3}$. Since $X_{1}$ is not the root cause due to $(F \indep X_{1})_{P}$, among the three possible DAGs, only the DAG $X_{1}\rightarrow X_{2} \rightarrow X_{3}$ can have a blocked path between $F$ and $X_{1}$ while $X_{2}$ and $F$ are d-connected. 

% % % % Now, suppose one observes the same set of CI statements, but the given essential graph is shown in figure \ref{fig:1c}. Note that one can also infer some new orientations e.g. $X_{1} \rightarrow X_{2}$ and $X_{1} \rightarrow X_{3}$ from $(F \indep X_{1})_{P}$, but it is still uncertain whether $X_{2}$ is the actual root cause. This leads to the hird challenge; there exists a trade-off between learning new orientations in an essential graph using the data after failure to increase the efficiency in terms of the number of CI tests and finding the root cause. Consider an example where the ground truth DAG is $X_{1}\leftarrow X_{2} \leftarrow \ldots \leftarrow X_{q} \rightarrow \ldots \rightarrow X_{n}$ and the root cause is $X_{q}$. The essential graph of this DAG without $F$ node is an undirected graph: $X_{1}- X_{2} - \ldots -X_{q} - \ldots - X_{n}$. With CI tests, one will have to first learn the adjacencies between $F$ and the rest of the variables, which is the same as finding conditional independence relations for every variable except for the root cause, leading to worse performance compared to RCD which potentially condition on a smaller subset of the variables due to partitioning.  

% \section{On complexity of \name and Mutual Information for RCA}\label{app:complexity_of_rcg_and_mi}

% Using mutual information (MI) is both more time- and sample-efficient than \name, as \name relies on conditional mutual information (CMI). However, there is a trade-off between efficiency and the granularity of the input structure provided to \name. For example, when using the PC algorithm to construct a CPDAG as input, the resulting graph is typically sparser than a $k$-essential graph, leading to more efficient CMI computations due to smaller conditioning sets. In contrast, a $k$-essential graph often results in larger conditioning sets, making CMI computations more costly. This efficiency gap, however, is not obvious when CPDAGs are obtained from other discovery algorithms besides PC.

\section{Additional Experiments}\label{app:additional-experiments}

\subsection{RCD with Higher-Order CI tests}\label{app:rcd-higher-order-ci}

\begin{figure}
    \centering
    \includegraphics[width=0.35\textwidth]{figures/rcd-ci-tests.pdf}
    \caption{The number of CI tests executed by RCD and the size of the separating set used in those tests. As the number of nodes increases, RCD relies on higher-order CI tests to identify the root cause. However, these higher-order tests are less reliable with limited samples, which diminishes RCD's effectiveness.}
    \label{fig:rcd-ci-tests}
\end{figure}

Figure~\ref{fig:rcd-ci-tests} illustrates the number of CI tests executed by RCD alongside the size of the separating sets used. RCD identifies the root cause by gradually increasing the size of these sets. However, the statistical power of CI tests diminishes with larger separating sets, particularly when sample sizes are limited, as is often the case in RCA, where quick failure resolution is crucial \citep{shah2020hardness,kocaoglu2023characterization}. This reliance on higher-order CI tests leads to poorer performance with an increasing number of nodes, as discussed in Section~\ref{sec:experiments} of the main paper. In contrast, we demonstrate the utility of the extended version of \name (see Algorithm \ref{alg:extended-version}), which mitigates this issue by using $k$-PC, which is more effective than full graph learning. It relies solely on $n$ marginal invariance tests after failure.

\subsection{Experiments with Sampled Version}\label{app:sample-version-result}

% \begin{figure}[!t]
%     \centering
%     \begin{minipage}{0.68\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{ICLR 2025 Template/figures/top-l-accuracy-sample.pdf}
%     \end{minipage}
%     \hfill
%     \begin{minipage}{0.31\textwidth}
%         \centering
%         \vspace{8pt}
%         \includegraphics[width=\textwidth]{ICLR 2025 Template/figures/runtime-sample.pdf}
%     \end{minipage}
%     \caption{(\textbf{Left}) Top-$l$ accuracy of \name compared to RCD. The results demonstrate that \name with \name-2 consistently provides better accuracy compared to RCD. While MI struggles due to its inability to condition on the parents of each node, whereas RCD is capable of conditioning on other nodes but lacks information about the causal structure. In contrast, \name overcomes these challenges by learning a causal graph and using CMI to rank the nodes effectively. (\textbf{Right}) The execution time of the baselines.}
%     \label{fig:sample-version}
% \end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.65\textwidth]{figures/top-l-accuracy-sample.pdf}
    \caption{Top-$l$ accuracy and runtime of the extended version of \name (see Algorithm~\ref{alg:extended-version}) compared to the baselines. The input graph in this experiment was learned from the data using $k$-PC.}
    \label{fig:sample-version}
\end{figure}

Figure~\ref{fig:sample-version} illustrates the performance of \name in comparison to MI and RCD. Similar to the experiment using the ground truth causal graph, we utilized 10,000 samples for the observational dataset and only 1,000 samples for the interventional dataset. Additionally, we included RCG-0 and RCG-1 based on Algorithm~\ref{alg:extended-version} to demonstrate the performance across different values of $k$ for $k$-PC, which uses all separating sets of size up to $k$. We did not include RUN in this experiment, as it requires continuous data, while our dataset in this experiment is discrete. Furthermore, RCG(IGS) and RCG(CPDAG) were omitted since we cannot derive a complete DAG from the samples, and learning the full CPDAG from the samples is exceedingly time-consuming~\citep{ikram2022root}.

The results align with our earlier findings presented in the main paper. RCD exhibits poor performance because it lacks access to causal relationships, leading it to condition on all nodes until a separator is found. This results in lower accuracy for RCD. In contrast, \name yields better results as the value of $k$ increases. Notably, RCG-1 consistently outperforms RCD, while RCG-0 occasionally produces results similar to RCD, but sometimes fails to identify the root cause. This inconsistency arises because RCG-0 struggles to learn a sufficiently sparse graph, resulting in conditioning on a larger set of nodes, which diminishes the reliability of the conditional independence test.

\subsection{Accuracy with Larger Graphs}\label{app:more-int-samples-accuracy}

\begin{figure}
    \centering
    \subfigure[50-node graphs]{%
        \includegraphics[width=0.35\textwidth]{figures/50-int_samples_recall.pdf}
    }
    \subfigure[100-node graphs]{%
        \includegraphics[width=0.35\textwidth]{figures/100-int_samples_recall.pdf}
    }
    \caption{Top-1 accuracy on 50- and 100-node graphs with a varying number of interventional samples. \name(CPDAG) (Algorithm~\ref{alg:sample-version}) uses the ground truth CPDAG.}
    \label{fig:larger-graphs-int-samples}
\end{figure}

Figure~\ref{fig:larger-graphs-int-samples} presents the top-1 accuracy of the three approaches across 50- and 100-node graphs with varying sample sizes. As the performance of \name shows, when graph complexity increases with more nodes, a larger number of samples is required to accurately estimate relationships and identify the root cause.

\subsection{Execution Time with Varying Number of Samples}\label{app:int-samples-runtime}

\begin{figure}
\centering
\includegraphics[width=0.35\textwidth]{figures/int_samples_time.pdf}
   \caption{The execution time for ranking the top-1 root cause with the three competing approaches under a varying number of interventional samples.}
   \label{fig:execution-time}
\end{figure}

Figure~\ref{fig:execution-time} presents the execution time of \name alongside MI and RCD given the ground truth CPDAG. MI maintains a consistent runtime as it computes a fixed number of mutual information scores. In contrast, RCD's runtime grows sharply beyond 1000 samples due to an increasing number of dependencies detected by CI tests, leading to a larger number of subsets for analysis. \name shows the highest but stable runtime. This is because, as \name has access to more samples, the CMI scores given a separating set become more reliable, enabling \name to explore different values of $\alpha$ and achieve a more consistent ranking.

% Azam: I added BARO to main experiments
% \subsection{Linear Gaussian Additive Models with BARO}\label{app:linear_gaussian_model_comparison}

% We want to demonstrate the merit of correct causal knowledge and do so by comparing our proposed method with access to a correct directed acyclic graph of the underlying system with one of the state-of-the-art methods called BARO \cite{pham2024baro}. 

% As BARO is restricted to data where median and interquartile range can be computed, we provide a synthetic experiment that generates DAGs of size $n \in \{5, 25, 50, 75, 100\}$. The sample size for observational data and interventional is proportional to the number of variables e.g. $100n$. A root cause is randomly assigned and there are at least $k$ descendants randomly assigned to the root cause where $k>0$. Then, there is a probability of $0.7$ that there exists a confounder between the root cause and one of its descendants. Then, directed edges are randomly assigned between a pair of nodes that are not the root cause and its descendants with a probability of $0.6$ while acyclicity is maintained. Each variable that has no parents follows a standard Gaussian distribution. Any variable that has parents will take a weighted sum of its parents with an additive standard Gaussian noise. The weight from each parent is sampled from a uniform distribution between $0.5$ and $1.5$ over the size of the graph. If the root cause variable does not have any parents, then it follows a Gaussian distribution with mean sampled from a uniform distribution between $-10$ and $10$ and standard deviation sampled from a uniform distribution between $1.5$ and $10$. Otherwise, it is a weighted sum of its parents plus a noise term that follows a Gaussian distribution with mean sampled from a uniform distribution between $-10$ and $10$ and standard deviation sampled from a uniform distribution between $1.5$ and $10$. We repeat the experiment for $100$ times per graph size.  We provide the exact index of the data point that follows the interventional distribution for BARO.  We discretize the dataset with $k$-bins discretizer in \texttt{scikit-learn}~\citep{scikit-learn} with the setting: $k=3$, \texttt{encode = ordinal, strategy=kmeans}. We compute $I(X;F|PossPa_{D}(X))$ by counting the frequencies for each node $X$ given a correct DAG. 
% We rank each node by sorting $I(X;F|PossPa_{D}(X))$ for each $X$ in descending order. This approach is denoted as CMI.  We limit both the observational and interventional sample sizes to $100$ for each size of the graph.


% \begin{figure}
% \centering\includegraphics[width=0.7\textwidth]{figures/100_Samples_accuracy_comparison_plot_adjusted_legend_further_down.pdf}
%    \caption{Average top-$l$ accuracy of CMI compared to BARO with $l\in {1, 3, 5}$ over $100$ repeated experiment per graph size. The results demonstrate that CMI with correct causal knowledge consistently provides better accuracy compared to the state-of-the-art algorithm BARO even when the distribution is only continuous. Both observational and interventional sample sizes are $100n$ for $n \in \{5, 25, 50, 75, 100\}$. }
%    \label{fig:comparison_with_baro}
% \end{figure}

% From Figure \ref{fig:comparison_with_baro}, we see that the use of conditional mutual information with the correct causal knowledge consistently outperforms BARO under a limited sample across all graph sizes. There is almost $0.4$ average top-1 accuracy difference for the small graph of size $5$. The difference becomes small as the graph size increases. This is expected as the data gets noisier with larger graphs due to the experimental setup. We see that our approach achieves $100\%$ average top-3 accuracy for graphs with $5$ variables. This experiment shows the benefits of having correct causal knowledge in the presence of spurious correlation.   

\section{Memory Leak Failure in Sock-shop}\label{app:sockshop-result}

\begin{table}[H]
\centering

\label{tab:sockshop-mem}
\begin{tabular}{|cl|rrrrrrrr|}
\hline
\multicolumn{2}{|c|}{}                                                                   & \multicolumn{1}{l}{MI}                                             & \multicolumn{1}{l}{cRCA}                  & \multicolumn{1}{l}{RUN}                  & \multicolumn{1}{l}{BARO}                                            & \multicolumn{1}{l}{RCD}                  & \multicolumn{1}{c}{\name-0}                         & \multicolumn{1}{c}{\name-1}                         & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}\name-\\ Expert\end{tabular}} \\ \hline
\multicolumn{1}{|c|}{\multirow{5}{*}{top-1}} & Carts                                     & 0.87                                                               & 0.20                                      & 0.02                                     & 1.00                                                                & 0.58                                     & 1.00                                                               & 0.87                                                               & 0.36                                                                                        \\
\multicolumn{1}{|c|}{}                       & Catalogue                                 & 0.10                                                               & 0.20                                      & 0.00                                     & 1.00                                                                & 0.20                                     & 0.97                                                               & 0.12                                                               & 0.49                                                                                        \\
\multicolumn{1}{|c|}{}                       & Orders                                    & 1.00                                                               & 0.00                                      & 0.00                                     & 1.00                                                                & 1.00                                     & 0.95                                                               & 0.98                                                               & 0.95                                                                                        \\
\multicolumn{1}{|c|}{}                       & Payment                                   & 0.99                                                               & 0.40                                      & 0.00                                     & 1.00                                                                & 0.93                                     & 0.88                                                               & 0.99                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & User                                      & 0.98                                                               & 0.40                                      & 0.00                                     & 1.00                                                                & 1.00                                     & 0.44                                                               & 0.98                                                               & 0.97                                                                                        \\
\multicolumn{1}{|c|}{}                       & \cellcolor{gray!20} Avg. & \cellcolor{gray!20}0.79                           & \cellcolor{gray!20} 0.24 & \cellcolor{gray!20}0.00 & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20}0.74 & \cellcolor{gray!20} 0.85 & \cellcolor{gray!20}0.79                           & \cellcolor{gray!20} 0.75                                                   \\ \hline
\multicolumn{1}{|c|}{\multirow{5}{*}{top-3}} & Carts                                     & 1.00                                                               & 0.60                                      & 0.40                                     & 1.00                                                                & 0.76                                     & 1.00                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & Catalogue                                 & 0.98                                                               & 0.25                                      & 0.30                                     & 1.00                                                                & 0.46                                     & 0.99                                                               & 0.58                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & Orders                                    & 1.00                                                               & 0.00                                      & 0.09                                     & 1.00                                                                & 0.96                                     & 0.97                                                               & 0.99                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & Payment                                   & 1.00                                                               & 0.40                                      & 0.10                                     & 1.00                                                                & 0.98                                     & 0.89                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & User                                      & 1.00                                                               & 0.62                                      & 0.11                                     & 1.00                                                                & 1.00                            & 0.51                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & \cellcolor{gray!20} Avg. & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20} 0.37 & \cellcolor{gray!20}0.20 & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20}0.83 & \cellcolor{gray!20}0.87 & \cellcolor{gray!20}0.93                           & \cellcolor{gray!20} 1.00                         \\ \hline
\multicolumn{1}{|c|}{\multirow{5}{*}{top-5}} & Carts                                     & 1.00                                                               & 0.80                                      & 0.66                                     & 1.00                                                                & 0.77                                     & 1.00                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & Catalogue                                 & 0.99                                                               & 0.52                                      & 0.60                                     & 1.00                                                                & 0.49                                     & 1.00                                                               & 0.69                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & Orders                                    & 1.00                                                               & 0.00                                      & 0.16                                     & 1.00                                                                & 1.00                                     & 0.97                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & Payment                                   & 1.00                                                               & 0.40                                      & 0.19                                     & 1.00                                                                & 0.96                                     & 0.89                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & User                                      & 1.00                                                               & 0.67                                      & 0.26                                     & 1.00                                                                & 1.00                                     & 0.68                                                               & 1.00                                                               & 1.00                                                                                        \\
\multicolumn{1}{|c|}{}                       & \cellcolor{gray!20} Avg. & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20} 0.48 & \cellcolor{gray!20}0.37 & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20}0.84 & \cellcolor{gray!20}0.91 & \cellcolor{gray!20} 0.94 & \cellcolor{gray!20} 1.00    \\ \hline
\end{tabular}
\caption{The table presents the top-$l$ accuracy of various baselines on data collected from the Sock-shop application after injecting a memory leak failure into a specific microservice.}
\label{tab:memory-leak}
\end{table}


% \begin{table}[H]
% \scriptsize
% \centering

% \label{tab:sockshop-mem}
% \begin{tabular}{|cl|rrrrrrrrr|}
% \hline
% \multicolumn{2}{|c|}{}                                                                   & \multicolumn{1}{l}{MI}                                             & \multicolumn{1}{l}{cRCA}                  & \multicolumn{1}{l}{RUN}                  & \multicolumn{1}{l}{BARO}                                            & \multicolumn{1}{l}{RCD}                  & \multicolumn{1}{c}{\name-0}                         & \multicolumn{1}{c}{\name-C} & \multicolumn{1}{c}{\name-1}                         & \multicolumn{1}{c|}{\begin{tabular}[c]{@{}c@{}}\name-\\ Expert\end{tabular}} \\ \hline
% \multicolumn{1}{|c|}{\multirow{5}{*}{top-1}} & Carts                                     & 0.87                                                               & 0.20                                      & 0.02                                     & 1.00                                                                & 0.58                                     & 1.00                                                               & 0.86                                       & 0.87                                                               & 0.36                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Catalogue                                 & 0.10                                                               & 0.20                                      & 0.00                                     & 1.00                                                                & 0.20                                     & 0.97                                                               & 0.24                                       & 0.12                                                               & 0.49                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Orders                                    & 1.00                                                               & 0.00                                      & 0.00                                     & 1.00                                                                & 1.00                                     & 0.95                                                               & 0.99                                       & 0.98                                                               & 0.95                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Payment                                   & 0.99                                                               & 0.40                                      & 0.00                                     & 1.00                                                                & 0.93                                     & 0.88                                                               & 0.99                                       & 0.99                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & User                                      & 0.98                                                               & 0.40                                      & 0.00                                     & 1.00                                                                & 1.00                                     & 0.44                                                               & 0.98                                       & 0.98                                                               & 0.97                                                                                        \\
% \multicolumn{1}{|c|}{}                       & \cellcolor{gray!20} Avg. & \cellcolor{gray!20}0.79                           & \cellcolor{gray!20} 0.24 & \cellcolor{gray!20}0.00 & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20}0.74 & \cellcolor{gray!20} 0.85 & \cellcolor{gray!20} 0.81  & \cellcolor{gray!20}0.79                           & \cellcolor{gray!20} 0.75                                                   \\ \hline
% \multicolumn{1}{|c|}{\multirow{5}{*}{top-3}} & Carts                                     & 1.00                                                               & 0.60                                      & 0.40                                     & 1.00                                                                & 0.76                                     & 1.00                                                               & 0.99                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Catalogue                                 & 0.98                                                               & 0.25                                      & 0.30                                     & 1.00                                                                & 0.46                                     & 0.99                                                               & 0.76                                       & 0.58                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Orders                                    & 1.00                                                               & 0.00                                      & 0.09                                     & 1.00                                                                & 0.96                                     & 0.97                                                               & 0.99                                       & 0.99                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Payment                                   & 1.00                                                               & 0.40                                      & 0.10                                     & 1.00                                                                & 0.98                                     & 0.89                                                               & 1.00                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & User                                      & 1.00                                                               & 0.62                                      & 0.11                                     & 1.00                                                                & 1.00                            & 0.51                                                               & 1.00                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & \cellcolor{gray!20} Avg. & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20} 0.37 & \cellcolor{gray!20}0.20 & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20}0.83 & \cellcolor{gray!20}0.87                           & \cellcolor{gray!20} 0.95  & \cellcolor{gray!20}0.93                           & \cellcolor{gray!20} 1.00                         \\ \hline
% \multicolumn{1}{|c|}{\multirow{5}{*}{top-5}} & Carts                                     & 1.00                                                               & 0.80                                      & 0.66                                     & 1.00                                                                & 0.77                                     & 1.00                                                               & 1.00                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Catalogue                                 & 0.99                                                               & 0.52                                      & 0.60                                     & 1.00                                                                & 0.49                                     & 1.00                                                               & 0.76                                       & 0.69                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Orders                                    & 1.00                                                               & 0.00                                      & 0.16                                     & 1.00                                                                & 1.00                                     & 0.97                                                               & 1.00                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & Payment                                   & 1.00                                                               & 0.40                                      & 0.19                                     & 1.00                                                                & 0.96                                     & 0.89                                                               & 1.00                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & User                                      & 1.00                                                               & 0.67                                      & 0.26                                     & 1.00                                                                & 1.00                                     & 0.68                                                               & 1.00                                       & 1.00                                                               & 1.00                                                                                        \\
% \multicolumn{1}{|c|}{}                       & \cellcolor{gray!20} Avg. & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20} 0.48 & \cellcolor{gray!20}0.37 & \cellcolor{gray!20} 1.00 & \cellcolor{gray!20}0.84 & \cellcolor{gray!20}0.91                           & \cellcolor{gray!20} 0.95  & \cellcolor{gray!20} 0.94 & \cellcolor{gray!20} 1.00    \\
% \hline
% \end{tabular}
% \caption{The table presents the top-$l$ accuracy of various baselines on data collected from the Sock-shop application after injecting a memory leak failure into a specific microservice.}\label{tab:memory-leak}
% \end{table}



% \section{Characteristics of Outages in Real World Application}\label{app:outage-summary}

% \begin{table}[h!]
%     \footnotesize
%     \centering
%         \begin{tabular}{c c c c c}
%         \toprule
%         Outage & Nodes & \begin{tabular}[c]{@{}c@{}}Normal\\ Samples\end{tabular} & \begin{tabular}[c]{@{}c@{}}Failure\\ Samples\end{tabular} & \begin{tabular}[c]{@{}c@{}}Duration\\ (hours)\end{tabular} \\
%         \midrule
%         A      & 152   & 4783                       & 918                         & 15        \\
%         B      & 141   & 4626                       & 1217                        & 20        \\
%         C      & 149   & 3464                       & 110                         & 2        \\
%         D      & 146   & 7165                       & 567                         & 5        \\
%         \bottomrule
%         \end{tabular}
%     \caption{Summary of outages from a real-world production application.}
%     \label{tab:outage-summary}
% \end{table}
