\documentclass[accepted]{uai2022}
\usepackage[american]{babel}
\usepackage{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{thmtools}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{tikz}
\usetikzlibrary{positioning}
\usepackage{pgfplotstable}
\usepackage{pgfplots}
\pgfplotsset{compat=1.8}
\usepgfplotslibrary{groupplots}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage{float}
\usepackage{subfig}
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\SetKwRepeat{Do}{do}{while}
\maxdeadcycles=200
\extrafloats{100}
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}
%   #1 = file name including its extension.
%   Announces <file> with \typeout in the form "(<file>)", records it via the
%   kernel's \@addtofilelist, and emits "No file <file>." when \IfFileExists
%   cannot find it.
%   NOTE(review): the "(<file>)" form presumably mimics TeX's own file-open
%   log lines so that log-scanning build tools treat <file> as a dependency
%   -- confirm against the build setup.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
% \myexternaldocument{<basename>}
%   #1 = base name of another document, without extension.
%   Calls xr's \externaldocument on <basename> and then registers both
%   <basename>.tex and <basename>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
% NOTE(review): lam_294 is presumably the main paper; this file \ref's labels
% (e.g. RU-theorem, ct-better, TSP=GRaSP0) that are not defined here -- confirm.
\myexternaldocument{lam_294}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Macros %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% --- Short aliases for math alphabets and accents ---------------------
\let\mc\mathcal
\let\mb\mathbf
\let\mt\mathtt
\let\mf\mathfrak
\let\bs\boldsymbol
\let\wt\widetilde

% --- Single-letter symbols --------------------------------------------
\newcommand{\Prob}{\mc{P}}
\newcommand{\G}{\mc{G}}
\newcommand{\A}{\mathsf{A}}
\newcommand{\I}{\mt{I}}
\newcommand{\E}{\mt{E}}
\newcommand{\D}{\mc{D}}

% --- Typewriter operator names for graph relations --------------------
\newcommand{\Pa}{\mt{Pa}}
\newcommand{\An}{\mt{An}}
\newcommand{\Ch}{\mt{Ch}}
\newcommand{\De}{\mt{De}}
\newcommand{\Nd}{\mt{Nd}}
\newcommand{\Pre}{\mt{Pre}}
\newcommand{\MB}{\mt{MB}}

% --- Typewriter names for classes of DAGs, conditions and scores ------
\newcommand{\MEC}{\mt{MEC}}
\newcommand{\CMC}{\mt{CMC}}
\newcommand{\CFC}{\mt{CFC}}
\newcommand{\SGS}{\mt{SGS}}
\newcommand{\Pm}{\mt{Pm}}
\newcommand{\uPm}{\mt{uPm}}
\newcommand{\Fr}{\mt{Fr}}
\newcommand{\uFr}{\mt{uFr}}
\newcommand{\BIC}{\mt{BIC}}
\newcommand{\DAG}{\mt{DAG}}

% --- Delimiters, relations and arrows ---------------------------------
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
% Double-perp built from two overlapped \perp glyphs; \nCI adds a slash.
\newcommand{\CI}{\perp\!\!\!\perp}
\newcommand{\nCI}{\perp\!\!\!\!/\!\!\!\!\!\perp}
\newcommand{\ot}{\leftarrow}

% --- Multi-letter italic identifiers ----------------------------------
\newcommand{\TP}{\mathit{TP}}
\newcommand{\FP}{\mathit{FP}}
\newcommand{\FN}{\mathit{FN}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Environments %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% `definition' owns the counter and is numbered within each section; every
% other theorem-like environment below shares that counter via [definition],
% so definitions, lemmas, theorems, etc. are numbered in one sequence.
\newtheorem{definition}{Definition}[section]
\newtheorem{corollary}[definition]{Corollary}
\newtheorem{lemma}[definition]{Lemma}
\newtheorem{example}[definition]{Example}
\newtheorem{theorem}[definition] {Theorem}
\newtheorem{conjecture}[definition]{Conjecture}
% Hand-rolled proof environment: italic "Proof." lead-in, and a \square
% pushed to the right with \hfill at the end.
\newenvironment{proof}{\textit{Proof.\hspace{0.1cm}}}{\hfill$\square$}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Front matter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Title and author block. All three authors carry affiliation index [1].
% The first author's mailto link has an empty "?Subject=" query; the other
% two links do not.
\title{Greedy Relaxations of the Sparsest Permutation Algorithm\\ (Supplementary material)}
\author[1]{\href{mailto:waiyinl@andrew.cmu.edu?Subject=}{Wai-Yin~Lam}}
\author[1]{\href{mailto:bjandrew@andrew.cmu.edu}{Bryan~Andrews}}
\author[1]{\href{mailto:jdramsey@andrew.cmu.edu}{Joseph~Ramsey}}
\affil[1]{
    Department of Philosophy\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\begin{document}
\onecolumn
\maketitle
\appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Background materials}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A: Background materials %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{app:bkgd}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A1: Graphical definitions %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Graphical Definitions}
\label{app:graph_def}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

A \textit{directed graph} $\G$ over a set of measured variables $\mb{V} = \{X_1,...,X_m\}$ consists of $m$ vertices $\mb{v} = \{1,...,m\}$ where each vertex $i \in \mb{v}$ associates to a variable $X_i \in \mb{V}$, and each edge in $\G$ is directed with the form $j \to k$ and no vertex has a directed edge to itself. A \textit{path} $\mf{p}$ is a sequence of vertices $\la \mb{i}_1, \mb{i}_2,..., \mb{i}_k\ra$ for some $k \geq 2$ where $\mb{i}_j \in \mb{v}$ for each $1 \leq j \leq k$, and $\mb{i}_j$ and $\mb{i}_{j+1}$ are connected by a directed edge (i.e., $\mb{i}_j \to \mb{i}_{j+1}$ or $\mb{i}_{j+1} \to \mb{i}_{j}$). A path $\mf{p}$ is \textit{directed} if $\mb{i}_{j} \to \mb{i}_{j+1}$ for each $1 \leq j < k$. A \textit{directed acyclic graph} (DAG) is a directed graph where no vertex can have a directed path to itself.

Denote $\E(\G)$ as the set of directed edges in $\G$. A pair of DAGs $\G_1, \G_2$ over the same set of variables $\mb{V}$ are equivalent if and only if $\E(\G_1) = \E(\G_2)$. Let $\Pa(j, \G) = \{k \in \mb{v}: (k \to j) \in \E(\G)\}$ be the set of \textit{parents} of $j$ in $\G$, and $\Ch(j, \G) = \{k \in \mb{v}: (j \to k) \in \E(\G)\}$ be the set of \textit{children} of $j$ in $\G$. $\An(j, \G)$, the \textit{ancestors} of $j$ in $\G$, is defined by the transitive closure of $\Pa(j, \G)$. Similarly, $\De(j, \G)$, the \textit{descendants} of $j$ in $\G$, is defined by the transitive closure of $\Ch(j, \G)$ and union with $\{j\}$ itself (i.e., $j$ is its own descendant). Further let $\Nd(j, \G) = \mb{v} \setminus \De(j, \G)$ be the set of $j$'s \textit{non-descendants}.

A pair of vertices $j, k \in \mb{v}$ are said to be \textit{adjacent} in $\G$ if $(j \to k) \in \E(\G)$ or $(k \to j) \in \E(\G)$. For any triple of pairwise distinct vertices $i, j, k \in \mb{v}$, we say that $(i, j, k)$ is \textit{unshielded} if $(i, j)$ and $(j, k)$ are adjacent pairs in $\G$, but not $(i, k)$. $(i, j, k)$ forms a \textit{triangle} if they are pairwise adjacent. If $(i, j, k)$ is an unshielded triple or is a triangle, $j$ is a \textit{collider} (on the path $\la i, j, k\ra$) if $(i \to j), (k \to j) \in \E(\G)$, and a \textit{non-collider} otherwise. A path $\mf{p}$ is a \textit{trek} if it contains no collider. 

For any $j, k \in \mb{v}$ and any $\mb{i} \subseteq \mb{v} \setminus \{j, k\}$, $j$ and $k$ are \textit{d-connected} given $\mb{i}$ in $\G$ if there exists a path $\mf{p}$ between $j$ and $k$ in $\G$ such that no non-collider on $\mf{p}$ is in $\mb{i}$, and, for each collider $l$ on $\mf{p}$, either $l$ or one of $l$'s descendants is in $\mb{i}$. $j$ and $k$ are \textit{d-separated} given $\mb{i}$ in $\G$ if $j$ and $k$ are not d-connected given $\mb{i}$. For any disjoint subsets of vertices $\mb{j}, \mb{k}, \mb{i} \subseteq \mb{v}$, $\mb{j}$ and $\mb{k}$ are d-separated given $\mb{i}$ in $\G$ if $j$ and $k$ are d-separated given $\mb{i}$ in $\G$ for every $j \in \mb{j}$ and every $k \in \mb{k}$. 

Given a model $(\G, \Prob)$ over $\mb{V}$, $\G$ is said to be \textit{local Markov} to $\Prob$ if $X_j \CI_\Prob \mb{X}_{\Nd(j, \G)} \setminus \mb{X}_{\Pa(j, \G)}\,|\,\mb{X}_{\Pa(j, \G)}$ for every $j \in \mb{v}$. It is a well-known fact that $\G$ is local Markov to $\Prob$ if and only if $\I(\G) \subseteq \I(\Prob)$ (i.e., global Markov as defined by d-separation).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A2: Graphoid axioms %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Graphoid Axioms} 
\label{app:graphoid}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

For any pairwise disjoint sets of variables $\mb{W}, \mb{X}, \mb{Y},$ and $\mb{Z}$,
\begin{align*}
\mb{X} \CI \mb{Y} \,|\,\mb{Z} &\,\,\Rightarrow\,\, \mb{Y} \CI \mb{X} \,|\,\mb{Z} & (\textit{symmetry})\\
\mb{X} \CI \mb{Y} \cup \mb{W} \,|\,\mb{Z} &\,\,\Rightarrow\,\, (\mb{X} \CI \mb{Y} \,|\,\mb{Z})  \wedge (\mb{X} \CI \mb{W}\,|\,\mb{Z}) &
(\textit{decomposition})\\
\mb{X} \CI \mb{Y} \cup \mb{W} \,|\,\mb{Z} &\,\,\Rightarrow\,\, \mb{X} \CI \mb{Y} \,|\,\mb{Z} \cup \mb{W} &
(\textit{weak union})\\
(\mb{X} \CI \mb{Y}\,|\,\mb{Z}) \wedge (\mb{X} \CI \mb{W}\,|\,\mb{Z} \cup \mb{Y}) &\,\,\Rightarrow\,\, \mb{X} \CI \mb{Y}\cup \mb{W}\,|\,\mb{Z} &(\textit{contraction})\\
(\mb{X} \CI \mb{Y}\,|\,\mb{Z} \cup \mb{W}) \wedge (\mb{X} \CI \mb{W}\,|\,\mb{Z} \cup \mb{Y}) &\,\,\Rightarrow\,\, \mb{X} \CI \mb{Y}\cup \mb{W}\,|\,\mb{Z} &(\textit{intersection})\\
(\mb{X} \CI \mb{Y}\,|\,\mb{Z}) \wedge (\mb{X} \CI \mb{W}\,|\,\mb{Z}) &\,\,\Rightarrow\,\, \mb{X} \CI \mb{Y}\cup \mb{W}\,|\,\mb{Z} & (\textit{composition})
\end{align*}

A distribution $\Prob$ is a \textit{semigraphoid} if $\I(\Prob)$ is closed under \textit{symmetry}, \textit{decomposition}, \textit{weak union}, and \textit{contraction}. A semigraphoid $\Prob$ is a \textit{graphoid} if $\I(\Prob)$ is closed under \textit{intersection}. A graphoid $\Prob$ is \textit{compositional} if $\I(\Prob)$ is closed under \textit{composition}. See Chapter 2 of \citep{Studeny10.5555/1044858} for a more comprehensive study of graphoid axioms. In addition, applications of \textit{symmetry} in our upcoming proofs will be done implicitly for the sake of simplicity. 

Additionally, \cite{Spohn1994}
notes that the following property necessarily holds in the independence models induced by positive discrete probability distributions. For any pairwise disjoint sets of variables $\mb{W}, \mb{X}, \mb{Y},$ and $\mb{Z}$,
\[
    (\mb{X} \CI \mb{Y}\,|\,\mb{W} \cup \mb{Z}) \wedge (\mb{W} \CI \mb{Z}\,|\,\mb{X} \cup \mb{Y}) \wedge (\mb{W} \CI \mb{Z}\,|\,\mb{X}) \,\,\Rightarrow\,\, [(\mb{W} \CI \mb{Z}\,|\,\mb{Y}) \Leftrightarrow (\mb{W} \CI \mb{Z}\,|\,\varnothing)] \qquad (\textit{Spohn condition})
\]

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A3: DAG induced from a permutation %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{DAG induced from a permutation}
\label{app:DAG_induce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{definition}
\label{MB}
Given a semigraphoid $\Prob$ over $\mb{V}$, for every $X \in \mb{V}$, we say that $\mb{M} \subseteq \mb{V}$ is a \textit{Markov blanket} of $X$ relative to $\mb{Z} \subseteq \mb{V} \setminus \{X\}$ if
\begin{enumerate}
    \item[(i)] $\mb{M} \subseteq \mb{Z}$;
    \item[(ii)] $X \CI_\Prob (\mb{Z}\setminus \mb{M})\,|\,\mb{M}$.
\end{enumerate}
Such a Markov blanket $\mb{M}$ is said to be a \textit{Markov boundary} if it further satisfies the following condition:
\begin{enumerate}
    \item[(iii)] there does not exist $\mathbf{M}' \subset \mathbf{M}$ s.t. $X \CI_\Prob (\mb{Z} \setminus \mb{M}')\,|\,\mb{M}'$.
\end{enumerate}
\end{definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{MB_unique}
\citep{verma1988causal_supp} Given a graphoid $\Prob$ over $\mb{V}$, for every $X \in \mb{V}$ and every $\mb{Z} \subseteq \mb{V} \setminus \{X\}$, there is a \textit{unique} Markov boundary of $X$ relative to $\mb{Z}$.
\end{lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In the following, we use $\MB_\Prob(X, \mb{Z})$ to refer to the unique Markov boundary of $X$ relative to $\mb{Z}$. The subscript $\Prob$ will be suppressed if the underlying graphoid is clear from context.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{MB_subset}
Given a graphoid $\Prob$  over $\mb{V}$, for every $X \in \mb{V}$ and every $\mb{Z} \subseteq \mb{V} \setminus \{X\}$, if $\mb{M}$ is a Markov blanket of $X$ relative to $\mb{Z}$, then $\MB(X, \mb{Z}) \subseteq \mb{M}$.
\end{lemma}
\begin{proof}
Immediate from \textbf{Definition \ref{MB}} and \textbf{Lemma \ref{MB_unique}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Next, we revisit the two methods of inducing a DAG from a permutation. Given a semigraphoid $\Prob$ over $\mb{V}$, each $\pi \in \Pi(\mb{v})$ induces a DAG satisfying the following condition:
\begin{align}\tag{VP}
X_j \in \mb{M} \Leftrightarrow (j \to k) \in \E(\G_\pi)
\end{align}
where $\mb{M}$ is a Markov boundary of $X_k$ relative to $\mb{X}_{\Pre(k, \pi)}$. (VP) is the construction of a \textit{boundary DAG} in \citep{verma1988causal_supp}. On the other hand, given a graphoid $\Prob$ over $\mb{V}$, each $\pi \in \Pi(\mb{v})$ induces a DAG satisfying the following condition:
\begin{align}\tag{RU}
    j \in \Pre(k, \pi) \text{ and } X_j \nCI_\Prob X_k\,|\,\mb{X}_{\Pre(k, \pi)\setminus \{j\}} \Leftrightarrow (j \to k) \in \E(\G_\pi). 
\end{align}

We want to show that the two DAG-inducing methods are equivalent when the underlying distribution is a graphoid.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
\label{VP=RU}
Given a graphoid $\Prob$ over $\mb{V}$, consider any $\pi \in \Pi(\mb{v})$. Let $\G_\pi$ be the DAG induced from $\pi$ by (VP), and $\mc{H}_\pi$ be the DAG induced from $\pi$ by (RU). Then $\G_\pi = \mc{H}_\pi$. 
\end{lemma}
\begin{proof}
We divide the proof into two directions: (VP) $\Rightarrow$ (RU), and (VP) $\Leftarrow$ (RU). Consider any $j, k \in \mb{v}$ where $\pi[j] < \pi[k]$ such that $j \in \Pre(k, \pi)$. Let $\mathbf{M}$ be the unique Markov boundary $\MB(X_k, \mb{X}_{\Pre(k, \pi)})$.

[$\Rightarrow$] Suppose that $(j \to k) \notin \E(\G_\pi)$. We have $X_j \notin \mb{M}$. By \textbf{Definition \ref{MB}} (ii), we then have,
\begin{align}
    X_k \CI_\Prob& ((\mb{X}_{\Pre(k, \pi)} \setminus \mb{M}) \setminus \{X_j\}) \cup \{X_j\}\,|\,\mb{M} &\because X_k \CI_\Prob \mb{X}_{\Pre(k, \pi)} \setminus \mb{M}\,|\,\mb{M}\\
    X_k \CI_\Prob& X_j\,|\,\mb{M} \cup ((\mb{X}_{\Pre(k, \pi)} \setminus \mb{M}) \setminus \{X_j\}) &\because (1), \text{weak union}\\
    X_k \CI_\Prob& X_j\,|\,\mb{X}_{\Pre(k, \pi) \setminus \{j\}} &\because (2)
\end{align}
where the last formula amounts to $(j \to k) \notin \E(\mc{H}_\pi)$ by (RU).

[$\Leftarrow$] Suppose that $(j \to k) \notin \E(\mc{H}_\pi)$. We have $X_k \CI_\Prob X_j\,|\,\mb{X}_{\Pre(k, \pi)\setminus \{j\}}$. Let $\mb{M}'$ be $\mb{X}_{\Pre(k, \pi) \setminus \{j\}}$. We have $X_k \CI_\Prob (\mb{X}_{\Pre(k, \pi)} \setminus \mb{M}')\,|\,\mb{M}'$ such that $\mb{M}'$ is a Markov blanket of $X_k$ relative to $\mb{X}_{\Pre(k, \pi)}$. By \textbf{Lemma \ref{MB_subset}}, $X_j \notin \mb{M} \subseteq \mb{M}'$ and therefore $(j \to k) \notin \E(\G_\pi)$ by (VP). 
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{VP-theorem}
\citep{Pearl_10.5555/534975} Given a semigraphoid $\Prob$ over $\mb{V}$, $\G_\pi$ induced by $\pi$ using (VP) is Markovian and SGS-minimal for any $\pi \in \Pi(\mb{v})$. 
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\textbf{Theorem \ref{RU-theorem}}
\hspace{0.2cm} \textit{Given a graphoid $\Prob$ over $\mb{V}$, $\G_\pi$ induced by $\pi$ using (RU) is Markovian and SGS-minimal for any $\pi \in \Pi(\mb{v})$.}\\

\begin{proof}
Immediate from \textbf{Lemma \ref{VP=RU}} and \textbf{Theorem \ref{VP-theorem}}.\footnote{The two DAG-inducing methods were not differentiated in \citep{raskutti2018learning_supp}. Thus, we provide a proof of \textbf{Theorem \ref{RU-theorem}}.}
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Correctness results}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% B: Correctness results %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{app:correct}
First, we introduce some permutation-based notations to facilitate our coming proofs. In this section, we use $\G_\pi$ to denote the DAG induced by $\pi$ from a graphoid $\Prob$ using (RU) unless specified otherwise.

Given a set of variables $\mb{V}$, consider any $\pi \in \Pi(\mb{v})$ and any pair $j, k \in \mb{v}$ where $\pi[j] < \pi[k]$. $\pi$ can be written as $\la \bs{\delta}_{<j}, j, \bs{\delta}_{j\sim k}, k, \bs{\delta}_{>k}\ra$ such that $\bs{\delta}_{<j} = \la \pi_i: 1 \leq i < \pi[j] \ra$, $\bs{\delta}_{j \sim k} = \la \pi_i: \pi[j] < i < \pi[k]\ra$, and $\bs{\delta}_{>k} = \la \pi_i: \pi[k] < i \leq |\pi|\ra$. When $\bs{\delta}_{j \sim k} = \varnothing$, we say that $j$ and $k$ are $\pi$-\textit{adjacent}. In that case, $\pi$ can be written as $\la \bs{\delta}_{<j}, j, k, \bs{\delta}_{>k}\ra$ instead.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{definition}
\label{AT}
Given a set of variables $\mb{V}$, for any $\pi, \tau \in \Pi(\mathbf{v})$, 
\begin{enumerate}
    \item[(a)] $\tau$ is said to be $(j, k)$-different from $\pi$ for some $j, k \in \mathbf{v}$ if $j$ and $k$ are $\pi$-adjacent (i.e., $\pi = \la \bs{\delta}_{<j}, j, k, \bs{\delta}_{>k}\ra$) and $\tau = \la \bs{\delta}_{<j}, k, j, \bs{\delta}_{>k} \ra$;
    \item[(b)] $\pi$ and $\tau$ are said to be in adjacent transposition (AT) if they are $(j, k)$-different for some $j, k \in \mathbf{v}$.
\end{enumerate}
\end{definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{SGS_same_DAG}
Given a graphoid $\Prob$ over $\mb{V}$, consider any $\mc{H} \in \CMC(\Prob)$. If $\pi \in \Pi(\mb{v})$ is a causal order of $\mc{H}$, then $\E(\G_\pi) \subseteq \E(\mc{H})$. Also, $\G_\pi = \mc{H}$ if $\mc{H} \in \SGS(\Prob)$. 
\end{lemma}
\begin{proof}
Consider any $k \in \mathbf{v}$ and $\mt{Nd}(k, \mc{H})$ (i.e., the set of $k$'s non-descendants in $\mc{H}$). Since $\mc{H} \in \mathtt{CMC}(\mathcal{P})$, it follows that $X_k \CI_\Prob \mb{X}_{\mt{Nd}(k, \mc{H})} \setminus \mb{X}_{\Pa(k, \mc{H})} \,|\,\mb{X}_{\Pa(k, \mc{H})}$. Also, we have $\Pa(k, \mc{H}) \subseteq \Pre(k, \pi) \subseteq \mt{Nd}(k, \mc{H})$ from $\pi$'s being a causal order of $\mc{H}$. By \textit{decomposition}, we have
$X_k \CI_\Prob \mb{X}_{\Pre(k, \pi)} \setminus \mb{X}_{\Pa(k, \mc{H})} \,|\,\mb{X}_{\Pa(k, \mc{H})}$ such that $\mb{X}_{\Pa(k, \mc{H})}$ is a Markov blanket of $X_k$ relative to $\mb{X}_{\Pre(k, \pi)}$. By \textbf{Lemma \ref{MB_subset}}, we have $\MB(X_k, \mb{X}_{\Pre(k, \pi)}) \subseteq \mb{X}_{\Pa(k, \mc{H})}$. Consider $\G_\pi$ induced by (VP). The above entails that $\E(\G_\pi) \subseteq \E(\mc{H})$ since $\mb{X}_{\Pa(k, \G_\pi)} = \MB(X_k, \mb{X}_{\Pre(k, \pi)}) \subseteq \mb{X}_{\Pa(k, \mc{H})}$ for each $k \in \mb{v}$. Due to \textbf{Lemma \ref{VP=RU}}, $\E(\G_\pi) \subseteq \E(\mc{H})$ still holds even if $\G_\pi$ is induced by (RU). Lastly, $\G_\pi = \mc{H}$ follows from \textbf{Definition \ref{SGS-minimal}} if $\mc{H} \in \SGS(\Prob)$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
\label{AT_iff}
\citep{solus2021consistency_supp} Given a graphoid $\Prob$ over $\mb{V}$, consider any $\pi, \tau \in \Pi(\mb{v})$ where $\tau$ is $(j, k)$-different from $\pi$ for some $j, k \in \mb{v}$. Then $\G_\pi = \G_\tau$ if and only if $X_j \CI_\Prob X_k\,|\,\mb{X}_{\Pre(j, \pi)}$.
\end{lemma}

\begin{proof}
Suppose that $X_j \nCI_\Prob X_k\,|\,\mb{X}_{\Pre(j, \pi)}$. By (RU), we have $(j \to k) \in \E(\G_\pi)$. Note that $(j \to k) \notin \E(\G_\tau)$ since $\tau[k] < \tau[j]$ and $\tau$ is a causal order of $\G_\tau$ by construction. Hence, $\G_\pi \neq \G_\tau$. 

On the other hand, suppose that $X_j \CI_\Prob X_k\,|\,\mb{X}_{\Pre(j, \pi)}$. Since $\tau$ is $(j, k)$-different from $\pi$, we have $\pi = \la \bs{\delta}_{<j}, j, k, \bs{\delta}_{>k}\ra$ and $\tau = \la \bs{\delta}_{<j}, k, j, \bs{\delta}_{>k}\ra$ according to \textbf{Definition \ref{AT}} (a). By (RU), we know that $(k \to j) \notin \E(\G_\tau)$. Hence, $\pi$ is a causal order of $\G_\tau$. By \textbf{Theorem \ref{RU-theorem}}, $\G_\tau \in \SGS(\Prob)$. Therefore, it follows from \textbf{Lemma \ref{SGS_same_DAG}} that $\G_\tau = \G_\pi$.
\end{proof} 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{covered_AT}
Given a graphoid $\Prob$ over $\mathbf{V}$, consider any $\pi \in \Pi(\mathbf{v})$. Suppose that $\G_\pi$ contains a covered edge $j \to k$ where $\pi = \la \bs{\delta}_{<j}, j, \bs{\delta}_{j \sim k}, k, \bs{\delta}_{>k}\ra$. If $\tau = \la \bs{\delta}_{<j}, j, k, \bs{\delta}_{j \sim k}, \bs{\delta}_{>k}\ra$, then $\G_\pi = \G_\tau$.
\end{lemma}
\begin{proof}
Since $j \to k$ is a covered edge in $\G_\pi$, it follows that $(i \to k) \notin \E(\G_\pi)$ for each $i \in \bs{\delta}_{j\sim k}$, and thus $X_i \CI_\Prob X_k\,|\,\mb{X}_{\Pre(i, \pi)}$ by (RU). Hence, $\G_\pi = \G_\tau$ can be obtained after $|\bs{\delta}_{j \sim k}|$ applications of \textbf{Lemma \ref{AT_iff}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{Jiji_thm}
\citep{zhang2013comparison_supp} Given a set of variables $\mb{V}$, for any $\G, \mc{H} \in \mt{DAG}(\mb{V})$, if $\E(\G) \subseteq \E(\mc{H})$, then $\I(\mc{H}) \subseteq \I(\G)$. 
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{covered_MEC}
\citep{10.5555/2074158.2074169} Consider any DAG $\G$. Let $\mc{H}$ be the result of reversing $(i \to j) \in \E(\G)$. Then $\mc{H} \in \MEC(\G)$ if and only if $i \to j$ is a covered edge. 
\end{lemma}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{covered_MEC_chain}
\citep{10.5555/2074158.2074169} Consider any pair of DAGs $\G$ and $\mc{H}$ over the same set of variables s.t. $\mc{H} \in \MEC(\G)$, and for which there are $k$ edges in $\G$ that have opposite orientation in $\mc{H}$. Then there exists a sequence of $k$ distinct covered edge reversals in $\G$ s.t. $\G$ becomes $\mc{H}$ after all reversals. 
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{tuck_lemma}
Given a graphoid $\Prob$ over $\mb{V}$, consider any $\pi \in \Pi(\mb{v})$. Suppose that $(j \to k) \in \E(\G_\pi)$ is a covered edge, and let $\mc{H}$ be the DAG resulting from reversing $(j \to k)$ in $\G_\pi$. If $\tau = \textit{tuck}(\pi, j, k)$, then
\begin{enumerate}
    \item[(a)] $\tau$ is a causal order of $\mc{H}$;
    \item[(b)] $\E(\G_\tau) \subseteq \E(\mc{H})$;
    \item[(c)] $|\E(\G_\tau)| \leq |\E(\G_\pi)|$;
    \item[(d)] $\I(\G_\pi) \subseteq \I(\G_\tau)$. 
\end{enumerate}
\end{lemma}
\begin{proof}
A similar lemma has been shown in \citep{solus2021consistency_supp}. First, we write $\pi = \la \bs{\delta}_{<j}, j, \bs{\delta}_{j \sim k}, k, \bs{\delta}_{>k} \ra$ as usual. Consider $\pi' = \la \bs{\delta}_{<j}, j, k, \bs{\delta}_{j \sim k}, \bs{\delta}_{>k} \ra$. By \textbf{Lemma \ref{covered_AT}}, we have $\G_\pi = \G_{\pi'}$. Note that $\tau = \textit{tuck}(\pi, j, k) = \la \bs{\delta}_{<j}, k, j, \bs{\delta}_{j \sim k}, \bs{\delta}_{>k}\ra$ because $\textit{tuck}(\pi, j, k)$ is a covered tuck. Thus, $\tau$ is $(j, k)$-different from $\pi'$. Also, since $\pi'$ is a causal order of $\G_\pi$, it follows that $\tau$ is a causal order of $\mc{H}$ and thus (a) is proven.

Next, observe that $\I(\G_\pi) = \I(\mc{H})$ from \textbf{Lemma \ref{covered_MEC}}. From $\G_\pi \in \CMC(\Prob)$ by \textbf{Theorem \ref{RU-theorem}}, we know that $\mc{H} \in \CMC(\Prob)$. Thus, (b) immediately follows from (a) and \textbf{Lemma \ref{SGS_same_DAG}}. Also, (c) is entailed by $|\E(\G_\tau)| \leq |\E(\mc{H})| = |\E(\G_\pi)|$. Finally, by \textbf{Theorem \ref{Jiji_thm}}, we have $\I(\G_\pi) = \I(\mc{H}) \subseteq \I(\G_\tau)$ as desired in (d).
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Before we compare TSP and unbounded GRaSP$_0$, we want to make an assumption related to how the set of covered edges in any particular DAG is ordered. To see the importance of such an assumption, observe that different orderings of $\E^0(\G_\pi)$ (i.e., the set of covered edges in an induced DAG $\G_\pi$) can alter the output of TSP and also GRaSP$_0$. For example, suppose that $(j \to k), (j' \to k') \in \E^0(\G_\pi)$. Say the DFS of GRaSP$_0$ starts with performing $\textit{tuck}(\pi, j, k)$ and leads to some permutation $\tau$. However, choosing to perform $\textit{tuck}(\pi, j', k')$ instead at the beginning of the DFS procedure can lead to some $\tau'$ where $\G_{\tau} \neq \G_{\tau'}$. Hence, we enforce the assumption that the ordering of $\E^0(\G)$ for any DAG $\G$ is fixed arbitrarily. For instance, $(j \to k)$ precedes $(j' \to k')$ in $\E^0(\G)$ if $j < j'$, or $j = j'$ and $k < k'$. Consequently, the issue of order-dependence can be avoided even when comparing a Chickering sequence found by TSP and a ct-sequence found by unbounded GRaSP$_0$. In the following, this assumption will be made implicitly.  

Now we revisit how TSP works. Given a graphoid $\Prob$ over $\mb{V}$ and an initial permutation $\pi \in \Pi(\mb{v})$, TSP begins with setting $\G$ as the induced $\G_\pi$. Starting with the root $\G$, TSP performs DFS to identify a SGS-minimal DAG $\mc{H}$ connected by a Chickering sequence from $\G$ such that $|\E(\G)| > |\E(\mc{H})|$. TSP returns $\G$ if no such $\mc{H}$ is found. Otherwise, it updates $\G$ as $\mc{H}$ and repeats the procedure.

The DFS procedure of TSP aims to traverse from one SGS-minimal DAG to another SGS-minimal DAG by the construction of a Chickering sequence. Though we know that a Chickering sequence is obtained by the reversals of covered edges and deletions of directed edges, \cite{solus2021consistency_supp} did not specify any ordering of these operations. Below we provide a more precise definition of the Chickering sequences considered by TSP. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{definition}
\label{TSP_Chickering}
Given a graphoid $\Prob$ over $\mb{V}$, a TSP-Chickering sequence $\mf{C} = \la \G^1,..., \G^m\ra$ is a Chickering sequence satisfying the following conditions:
\begin{enumerate}
    \item[(a)] $\G^1, \G^m \in \SGS(\Prob)$;
    \item[(b)] $\G^i$ and $\G^{i'}$ are pairwise distinct for $1 \leq i < i' \leq m$; 
    \item[(c)] if $|\E(\G^1)| = |\E(\G^m)|$, then $\G^1,..., \G^m \in \SGS(\Prob)$ where they differ by the reversals of some covered edges;
    \item[(d)] otherwise, there exists a turning index $1 < l < m$ such that (i) $\G^1,..., \G^{l-1} \in \SGS(\Prob)$, (ii) $\G^1,..., \G^l$ differ by the reversals of some covered edges, and (iii) $\G^{i+1}$ is obtained from deleting a directed edge in $\G^{i} \notin \SGS(\Prob)$ for each $l \leq i < m$. 
\end{enumerate}
\end{definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Readers are suggested to find the original pseudocode of TSP in \cite{solus2021consistency_supp} to verify that our \textbf{Definition \ref{TSP_Chickering}} is a fair description of the Chickering sequences considered by TSP. Conditions (a) and (b) are straightforward. (c) refers to the case where TSP cannot find a sparser SGS-minimal DAG. So if any $\G^i$ in $\mf{C}$ were non-SGS-minimal, then TSP would have obtained a proper subgraph of $\G^i$ which is SGS-minimal by a series of edge-deletion. (d) refers to the case where TSP manages to find a sparser SGS-minimal DAG. Notice that $\G^2$ must be obtained by a covered edge reversal from $\G^1$ since $\G^1 \in \SGS(\Prob)$. If $\G^2 \notin \SGS(\Prob)$, then TSP can obtain the desired SGS-minimal DAG by a series of edge-deletion from $\G^2$. But if $\G^2 \in \SGS(\Prob)$, the procedure above repeats until finding the turning index $l$ such that $\G^l \notin \SGS(\Prob)$ and then the sparser $\G^m \in \SGS(\Prob)$ can be obtained by a series of edge-deletion from $\G^l$. 

Now we compare TSP and unbounded GRaSP$_0$ by considering their respective sequences traversed in the DFS procedure.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{ct_2_Chickering}
Given a graphoid $\Prob$ over $\mb{V}$, consider any $\pi \in \Pi(\mb{v})$ and $\tau = \textit{tuck}(\pi, j, k)$ where $(j \to k) \in \E^0(\G_\pi)$. Given that $\mf{T} = \la \pi, \tau\ra$ is a ct-sequence,
\begin{itemize}
    \item[(a)] if $|\E(\G_\tau)| = |\E(\G_\pi)|$, then $\mf{C} = \la \G_\pi, \G_\tau\ra$ is a TSP-Chickering sequence where $\G_\tau$ is obtained from reversing $(j \to k) \in \E^0(\G_\pi)$;
    \item[(b)] otherwise, there exists a TSP-Chickering sequence $\mf{C} = \la \G_\pi = \G^1,..., \G^m = \G_\tau\ra$ s.t. $\G^2$ is obtained from reversing $(j \to k) \in \E^0(\G_\pi)$, and $\G^{i+1}$ is obtained from deleting a directed edge in $\G^i$ for each $2 \leq i < m$.
\end{itemize}
\end{lemma}
\begin{proof}
First, consider the DAG $\mc{H}$ obtained from reversing $(j \to k) \in \E^0(\G_\pi)$. We start with the case in (a) where $|\E(\G_\tau)| = |\E(\G_\pi)| = |\E(\mc{H})|$. We want to show that $\G_\tau = \mc{H}$. By \textbf{Lemma \ref{tuck_lemma}} (b), we have $\E(\G_\tau) \subseteq \E(\mc{H})$. If $\E(\G_\tau) \subset \E(\mc{H})$ holds, then $|\E(\G_\tau)| = |\E(\mc{H})|$ will be violated. Hence, we have $\G_\tau = \mc{H}$ and thus $\mf{C} = \la \G_\pi, \mc{H} = \G_\tau \ra$ is our desired TSP-Chickering sequence.

For (b), it follows from \textbf{Lemma \ref{tuck_lemma}} (c) that $|\E(\G_\tau)| < |\E(\mc{H})| = |\E(\G_\pi)|$. Let $\G_\pi$ and $\mc{H}$ be $\G^1$ and $\G^2$ respectively. By \textbf{Lemma \ref{tuck_lemma}} (b) again, we have $\E(\G_\tau) \subset \E(\G^2)$ such that we can remove directed edges from $\G^2$ one at a time until obtaining $\G_\tau$. Therefore, we have the desired TSP-Chickering sequence in (b).
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{Chickering_2_ct}
Given a graphoid $\Prob$ over $\mb{V}$, consider any TSP-Chickering sequence $\mf{C} = \la \G^1,...,\G^m\ra$. Let $\pi^1$ be a causal order of $\G^1$. Then
\begin{itemize}
    \item[(a)] if $|\E(\G^1)| = |\E(\G^m)|$, then $\G^{i+1} = \G_{\pi^{i+1}} = \G_{\textit{tuck}(\pi^i, j, k)}$ where $j \to k$ is the covered edge reversed to obtain $\G^{i+1}$ from $\G^i$ for each $1 \leq i < m$ s.t. $\mf{T} = \la \pi^1,...,\pi^m\ra$ is a ct-sequence;
    \item[(b)] otherwise, $\G^{i+1} = \G_{\pi^{i+1}} = \G_{\textit{tuck}(\pi^i, j, k)}$ where $j \to k$ is the covered edge reversed to obtain $\G^{i+1}$ from $\G^i$ for each $1 \leq i < l$ where $l$ is the turning index of $\mf{C}$ and $\G_{\pi^l} = \G^m$ s.t. $\mf{T} = \la \pi^1,...,\pi^l\ra$ is a ct-sequence.
\end{itemize}
\end{lemma}
\begin{proof}
(a) can be easily shown by \textbf{Lemma \ref{tuck_lemma}}(a) and \textbf{Lemma \ref{SGS_same_DAG}}. For (b), the proof of $\G^i = \G_{\pi^i}$ for each $1 \leq i < l$ is similar to that in (a). So we consider $l$ where $\G^l \notin \SGS(\Prob)$ according to \textbf{Definition \ref{TSP_Chickering}}(d). However, it follows from \textbf{Lemma \ref{tuck_lemma}} (a) that $\pi^l$ is a causal order of $\G^l$. Since $\E(\G^m) \subset \E(\G^l)$, we know that $\pi^l$ is also a causal order of $\G^m$. Lastly, given that $\G^m \in \SGS(\Prob)$, it follows from \textbf{Lemma \ref{SGS_same_DAG}} that $\G_{\pi^l} = \G^m$. 
\end{proof} 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\,\\
\textbf{Lemma \ref{ct-better}} \hspace{0.1cm} \textit{Given a graphoid $\Prob$, for any $\pi \in \Pi(\mb{v})$ and any Chickering sequence from $\G_\pi$ to some $\mc{H} \in \SGS(\Prob)$ considered by TSP, there exists a ct-sequence $\la \pi,...,\tau\ra$ s.t. $\G_\tau = \mc{H}$.}\\

\begin{proof}
Given that a Chickering sequence considered by TSP is simply a TSP-Chickering sequence defined in \textbf{Definition \ref{TSP_Chickering}}, the lemma follows immediately from \textbf{Lemma \ref{Chickering_2_ct}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\,\\
\textbf{Theorem \ref{TSP=GRaSP0}} \hspace{0.1cm} \textit{Given a graphoid $\Prob$ over $\mb{V}$ and any initial permutation $\pi \in \Pi(\mb{v})$, the DAG induced by the output of unbounded GRaSP$_0$ is equivalent to the DAG returned by TSP.}\\

\begin{proof}
Immediate from \textbf{Lemma \ref{ct_2_Chickering}} and \textbf{Lemma \ref{Chickering_2_ct}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Now we turn to the discussion on the correctness of GRaSP$_0$ under faithfulness.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{ct_lemma}
Given a graphoid $\Prob$ over $\mb{V}$ and any $\pi \in \Pi(\mb{v})$, if $\G_\pi \notin \Pm(\Prob)$, then there exists a ct-sequence $\mf{T} = \la \pi,..., \tau\ra$ s.t. $\I(\G_\pi) \subset \I(\G_\tau)$.
\end{lemma}
\begin{proof}
Suppose that $\G_\pi \notin \Pm(\Prob)$. By \textbf{Definition \ref{P-minimal}}, it follows that there exists $\mc{H} \in \CMC(\Prob)$ s.t. $\I(\G_\pi) \subset \I(\mc{H}) \subseteq \I(\Prob)$. By \textbf{Theorem \ref{Chickering_seq}}, we know that there exists a Chickering sequence $\mf{C}_0 = \la \G_\pi = \G^1, ..., \G^l = \mc{H}\ra$. Without loss of generality, suppose that $\mf{C}_0$ is the shortest Chickering sequence where each $\G^{i+1}$ differs from $\G^i \in \SGS(\Prob)$ by the reversal of a covered edge in $\E^0(\G^i)$ for each $1 \leq i < l-1$, and $\G^l$ is obtained from deleting a directed edge in $\G^{l-1}$. Notice that $|\E(\G^l)| < |\E(\G^1)|$ due to the edge deletion. If $\G^l \in \SGS(\Prob)$, then $\mf{C}_0$ is a TSP-Chickering sequence. Otherwise, we can easily construct a TSP-Chickering sequence $\mf{C} = \la \G^1,..., \G^m\ra$ with $l-1$ as the turning index and $\G^m \in \SGS(\Prob)$ obtained by repeated edge-deletion from $\G^l$ such that $\I(\G^1) \subset \I(\G^l) \subset \I(\G^m)$. By \textbf{Lemma \ref{Chickering_2_ct}} (b), we have the desired ct-sequence.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\,\\
\textbf{Theorem \ref{ct-theorem}} \hspace{0.1cm} \textit{Given a graphoid $\Prob$ over $\mb{V}$ and any $\pi \in \Pi(\mb{v})$, if $\G_\pi \notin \Pm(\Prob)$, then there exists a ct-sequence $\mf{T} = \la \pi,..., \tau\ra$ s.t. $\G_\tau \in \Pm(\Prob)$.}\\

\begin{proof}
Immediate from \textbf{Lemma \ref{ct_lemma}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{theorem}
\label{GRaSP0_correct_consistent}
Unbounded GRaSP$_0$ is correct and pointwise consistent under faithfulness.
\end{theorem}
\begin{proof}
We review the argument for the correctness of unbounded GRaSP$_0$ under faithfulness given in the main paper. Given a graphoid $\Prob$ over $\mb{V}$, consider any initial permutation $\pi \in \Pi(\mb{v})$. Given that unbounded GRaSP$_0$ greedily searches for a ct-sequence from $\pi$, it is guaranteed by \textbf{Theorem \ref{ct-theorem}} that $\tau$ returned by unbounded GRaSP$_0$ in \textbf{Algorithm \ref{alg:grasp}} induces a P-minimal DAG. Under faithfulness, we have $\G_\tau \in \MEC(\G^*)$ due to \textbf{Theorem \ref{razors}} and hence unbounded GRaSP$_0$ is correct.

Alternatively, the correctness and pointwise consistency of unbounded GRaSP$_0$ can also be proven directly from \textbf{Theorem \ref{TSP=GRaSP0}} and the corresponding results of TSP in \citep{solus2021consistency_supp}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\,\\
\textbf{Corollary \ref{correct_consistent}} \hspace{0.1cm} \textit{Unbounded GRaSP$_0$, GRaSP$_1$, and GRaSP$_2$ are correct and pointwise consistent under faithfulness.}\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In the following, we want to prove that faithfulness is not only sufficient, but also \textit{necessary} for the correctness of TSP and unbounded GRaSP$_0$. We first want to prove an interesting and novel equivalence between two causal razors: faithfulness and u-P-minimality.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{CMC_single_CI}
Given a joint probability distribution $\Prob$ over $\mb{V}$, for any $\la X_i, X_j\,|\,\mb{X}_{\mb{k}} \ra \in \I(\Prob)$, there exists $\G \in \mt{DAG}(\mb{V})$ s.t. $\I(\G) = \{\la X_i, X_j\,|\,\mb{X}_{\mb{k}} \ra\}$. 
\end{lemma}
\begin{proof}
Consider $\mb{V} = \{X_1,...,X_m\}$. An empty DAG suffices when $m = 2$. So assume that $m \geq 3$. Without loss of generality, consider $\la X_1, X_{k+2} \,|\,\mb{X}_{\mb{k}}\ra \in \I(\Prob)$ where $\mb{k} = \la 2,...,k+1\ra$, and the remaining vertices are $\la k+3,..., m \ra$. We propose a procedure which guarantees the existence of the desired DAG $\G$.

\begin{algorithm}
\DontPrintSemicolon
$\G \ot \text{a complete undirected graph over } \mb{v}$\;
remove the adjacency $1$ \textemdash $\,k+2$ in $\G$ \;
\ForEach{$(j, k)$ that are adjacent in $\G$}{
    \If{$j < k$}{
        orient $j \to k$ in $\G$
    }
}
return $\G$    
\end{algorithm}

Lines 3 to 5 guarantee that $\mathcal{G}$ is a DAG since all edges are directed and pointing from lower indices to higher indices such that no directed cycle can occur. Finally, $1 \perp_\G k+2\,|\,\mb{k}$ holds because all paths from $1$ to $k+2$ either contain a non-collider $i \in \mb{k}$ or contain a collider $i \notin \mb{k}$. Therefore, $\I(\G) = \{\la X_1, X_{k+2}\,|\,\mb{X}_\mb{k}\ra\}$ because no other d-separation relations hold in $\mathcal{G}$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{CFC-uPm}
For any joint probability distribution $\Prob$, $\CFC(\Prob) = \uPm(\Prob)$.
\end{theorem}

\begin{proof}
[$\subseteq$] Suppose that $\G \in \CFC(\Prob)$. It follows that $\G \in \Pm(\Prob)$ by \textbf{Definition \ref{P-minimal}}. For any $\G' \in \CMC(\Prob)$, if $\I(\G') \subset \I(\G)$, then $\G' \notin \Pm(\Prob)$. Hence, if $\G' \in \Pm(\Prob)$, then $\I(\G') = \I(\G)$. Hence, $\G \in \uPm(\Prob)$.

[$\supseteq$] Suppose that $\G \notin \CFC(\Prob)$. Since $\uPm(\Prob) \subseteq \Pm(\Prob)$ by \textbf{Definition \ref{P-minimal}}, if $\G \notin \Pm(\Prob)$, we have $\G \notin \uPm(\Prob)$ immediately. So consider the case where $\G \in \Pm(\Prob)$. It follows from $\G \notin \CFC(\Prob)$ that there exists a CI relation $\psi \in \I(\Prob) \setminus \I(\G)$. By \textbf{Lemma \ref{CMC_single_CI}}, we can construct a DAG $\G^0$ such that $\I(\G^0) = \{\psi\}$. Consequently, there exists $\G^1 \in \Pm(\Prob)$ such that $\I(\G^0) \subseteq \I(\G^1) \subseteq \I(\Prob)$. Since $\psi \in \I(\G^1)$, we know that $\G^1 \notin \MEC(\G)$. Given that both $\G, \G^1 \in \Pm(\Prob)$, we have $\G \notin \uPm(\Prob)$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\,\\
\textbf{Theorem \ref{TSP_necessary}} \hspace{0.1cm} \textit{Given a graphoid $\Prob$, faithfulness is necessary for the correctness of TSP.}\\

\begin{proof}
Suppose that $(\G^*, \Prob)$ is unfaithful. We consider the two kinds of unfaithfulness in \citep{zhang2008detection}: \textit{detectable} (i.e., $\CFC(\Prob) = \varnothing$) versus \textit{undetectable} (i.e., $\G' \in \CFC(\Prob)$ where $\G' \notin \MEC(\G^*)$). For the latter, TSP can identify $\G_\tau \in \Pm(\Prob) = \CFC(\Prob) = \MEC(\G')$. However, TSP is incorrect because $\G_\tau \notin \MEC(\G^*)$. 

On the other hand, consider the case that $\CFC(\Prob) = \varnothing$. By \textbf{Theorem \ref{CFC-uPm}}, there exists $\G \in \Pm(\Prob)$ such that $\G \notin \MEC(\G^*)$ even if $\G^* \in \Pm(\Prob)$. Recall that the Chickering algorithm only allows us to traverse from $\G$ to a DAG $\mc{H}$ satisfying $\I(\G) \subseteq \I(\mc{H})$. It entails that the Chickering algorithm can only obtain DAGs that are in $\MEC(\G)$ since $\G \in \Pm(\Prob)$, and hence will never be able to reach $\G^*$ where $\I(\G) \nsubseteq \I(\G^*)$. Therefore, by setting $\pi$ as the initial permutation to TSP where $\G_\pi = \G$, TSP will return $\G_\pi$ incorrectly.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Notice that \textbf{Theorem \ref{TSP_necessary}} is contrary to what \cite{solus2021consistency_supp} suggested. They proposed an example arguing that TSP can be correct even under (detectable) unfaithfulness.\footnote{See Figure 2 in the \href{https://academic.oup.com/biomet/article-abstract/108/4/795/6062392?redirectedFrom=fulltext\#supplementary-data}{\textcolor{blue}{supplementary materials}} of \citep{solus2021consistency_supp}.} However, the distribution used in the example is not a semigraphoid. This renders their example illegitimate because every joint probability distribution is a semigraphoid. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{ESP and GRaSP-1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% C: ESP and GRaSP-1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{app:ESP_GRaSP1}
As shown in \textbf{Theorem \ref{TSP_necessary}} in the last section, TSP cannot be correct under unfaithfulness by choosing an arbitrary initial permutation. Consequently, one important question is how to relax the search space of TSP to identify a sparser permutation under unfaithfulness. \cite{solus2021consistency_supp} proposed the \textit{Edge SP} (ESP) algorithm based on an assumption strictly weaker than that assumed by TSP. However, unlike TSP, they did not provide an operational version of ESP in their work. In this section, we are going to show a theorem similar to \textbf{Theorem \ref{TSP=GRaSP0}} but with respect to ESP and unbounded GRaSP$_1$. In other words, unbounded GRaSP$_1$ is an operational version of ESP. In the following, we first examine some technical notations used in \citep{mohammadi2018generalized} and \citep{solus2021consistency_supp}. Readers are strongly suggested to visit \citep{solus2021consistency_supp} for the full discussion of ESP and relevant notations. 

Given a set of measured variables $\mb{V}$, a \textit{permutohedron} on $\mb{v}$, denoted $\A_\mb{v}$, is the convex hull in $\mathbb{R}^{|\mb{v}|}$ of all permutations in $\Pi(\mb{v})$. In simpler terms, $\A_{\mb{v}}$ is the \textit{state space} with each \textit{state} being a permutation $\pi \in \Pi(\mb{v})$. The neighborhood of states in $\A_{\mb{v}}$ is defined by adjacent transpositions (ATs) as in \textbf{Definition \ref{AT}} (b). 

Notice that different states in $\A_\mb{v}$ can induce the same DAG given a graphoid $\Prob$. Thus, a natural way to narrow down the search space is to identify permutations inducing the same DAG. \textbf{Lemma \ref{AT_iff}} provides such a characterization. Construct $\A_{\mb{v}}(\Prob)$ by \textit{contracting} neighborhood in $\A_{\mb{v}}$ to ATs that correspond to the CI relations in $\I(\Prob)$ specified in \textbf{Lemma \ref{AT_iff}}. To be more specific, the contracted permutohedron $\A_{\mb{v}}(\Prob)$, also known as the \textit{DAG associahedron}, is the state space with each state being an induced DAG.\footnote{One can equivalently express each state in the DAG associahedron as the set of permutations which induce the same DAG. This is the original representation in \citep{mohammadi2018generalized}. However, we prefer the representation given in \citep{solus2021consistency_supp} in the sense that one can easily compare DAGs that are in neighborhood.} Two states $\G^1, \G^2$ in $\A_{\mb{v}}(\Prob)$ are neighbors if and only if there exist $\pi^1, \pi^2 \in \Pi(\mb{v})$ s.t. $\G_{\pi^1} = \G^1$, $\G_{\pi^2} = \G^2$, and $\pi^1$ and $\pi^2$ are neighbors in the permutohedron $\A_{\mb{v}}$. As shown by \cite{mohammadi2018generalized}, the DAG associahedron is a convex polytope where each vertex of $\A_{\mb{v}}(\Prob)$ corresponds to a different DAG. 

To draw a clearer picture, consider any $\pi, \tau \in \Pi(\mb{v})$ where $\tau$ is $(j, k)$-different from $\pi$ for some $j, k \in \mb{v}$. They are neighbors in $\A_\mb{v}$ but they do not necessarily induce the same DAG. If $X_j \CI_\Prob X_k\,|\,\mb{X}_{\Pre(j, \pi)}$ holds, they induce the same DAG and thus correspond to the same state $\G_\pi$ in the DAG associahedron $\A_{\mb{v}}(\Prob)$. But if the CI relation does not hold, then $\G_\pi$ and $\G_\tau$ are neighbors in $\A_{\mb{v}}(\Prob)$. See Figure \ref{fig:permutohedron} for an example from \citep{solus2021consistency_supp}.    

\begin{figure}[ht!]
\begin{center}
\subfloat{
\begin{tikzpicture}
\draw[fill=blue] (0,0) circle (3pt);
\draw[fill=blue] (2.5,0) circle (3pt);
\draw[fill=blue] (4,-2) circle (3pt);
\draw[fill=blue] (2.5,-4) circle (3pt);
\draw[fill=blue] (0,-4) circle (3pt);
\draw[fill=blue] (-1.5,-2) circle (3pt);
\node at (-1.3,0) {$\pi^1 = \la 1, 2, 3\ra$};
\node at (3.8,0) {$\pi^2 =\la 2, 1, 3\ra$};
\node at (2.8,-2) {$\pi^3 =\la 2, 3, 1\ra$};
\node at (3.8,-4) {$\pi^4 =\la 3, 2, 1\ra$};
\node at (-1.3,-4) {$\pi^5 =\la 3, 1, 2\ra$};
\node at (-0.2,-2) {$\pi^6 =\la 1, 3, 2\ra$};
\draw[thick] (0,0) -- (2.5,0) -- (4,-2) -- (2.5,-4) -- (0,-4) -- (-1.5,-2) -- (0,0);
\node at (0,-4.6) {};
\end{tikzpicture}}
\hspace{1.5cm}
\subfloat{
\begin{tikzpicture}[scale=0.8]
\draw[fill=red] (0,0) circle (3pt);
\draw[fill=red] (2.5,0) circle (3pt);
\draw[fill=red] (4,-2) circle (3pt);
\draw[fill=red] (2.5,-4) circle (3pt);
\draw[fill=red] (0,-4) circle (3pt);
\node at (-1,-0.5) {$\G_{\pi^1}$};
\node at (-1,0.3) {
% pi^1 = <1, 2, 3>
\scalebox{0.6}{\begin{tikzpicture}
\node(X1) at (0,0) {$1$};
\node(X2) at (1,-1) {$2$};
\node(X3) at (2,0) {$3$};
\path [->] (X1) edge (X3);
\path [->] (X1) edge (X2);
\path [->] (X2) edge (X3);
\end{tikzpicture}}};
\node at (4.3,-0.3) {$\G_{\pi^2}$};
\node at (3.4,0.3) {
% pi^2 = <2, 1, 3>
\scalebox{0.6}{\begin{tikzpicture}
\node(X1) at (0,0) {$1$};
\node(X2) at (1,-1) {$2$};
\node(X3) at (2,0) {$3$};
\path [->] (X1) edge (X3);
\path [->] (X2) edge (X1);
\path [->] (X2) edge (X3);
\end{tikzpicture}}};
\node at (5.9,-2.6) {$\G_{\pi^3}$};
\node at (5,-2){
% pi^3 = <2, 3, 1>
\scalebox{0.6}{\begin{tikzpicture}
\node(X1) at (0,0) {$1$};
\node(X2) at (1,-1) {$2$};
\node(X3) at (2,0) {$3$};
\path [->] (X3) edge (X1);
\path [->] (X2) edge (X1);
\path [->] (X2) edge (X3);
\end{tikzpicture}}};
\node at (4.7,-5.1) {$\G_{\pi^4}$};
\node at (3.8,-4.5) {
% pi^4 = <3, 2, 1>
\scalebox{0.6}{\begin{tikzpicture}
\node(X1) at (0,0) {$1$};
\node(X2) at (1,-1) {$2$};
\node(X3) at (2,0) {$3$};
\path [->] (X3) edge (X2);
\path [->] (X3) edge (X1);
\path [->] (X2) edge (X1);
\end{tikzpicture}}};
\node at (0.2,-5.1) {$\G_{\pi^5} = \G_{\pi^6}$};
\node at (-1.3,-4.5) {
% pi^5 = <3, 1, 2>, pi^6 = <1, 3, 2>
\scalebox{0.6}{\begin{tikzpicture}
\node(X1) at (0,0) {$1$};
\node(X2) at (1,-1) {$2$};
\node(X3) at (2,0) {$3$};
\path [->] (X3) edge (X2);
\path [->] (X1) edge (X2);
\end{tikzpicture}}};
\draw[thick] (0,0) -- (2.5,0) -- (4,-2) -- (2.5,-4) -- (0,-4) -- (0,0);
\end{tikzpicture}}
\end{center}
\caption{Given $\mb{V} = \{X_1, X_2, X_3\}$, consider $\I(\Prob) = \{\la X_1, X_3\,|\,\varnothing\ra\}$. The diagram on the left is the permutohedron $\A_{\mb{v}}$ where each state is a permutation in $\Pi(\mb{v})$. The one on the right is the DAG associahedron $\A_{\mb{v}}(\Prob)$ where each state is a different DAG in $\SGS(\Prob)$. In particular, the two states $\pi^5$ and $\pi^6$ in $\A_{\mb{v}}$ are collapsed into a single state in $\A_{\mb{v}}(\Prob)$ because they induce the same DAG.}  
\label{fig:permutohedron}
\end{figure}

Observe that each state in the DAG associahedron $\A_{\mb{v}}(\Prob)$ corresponds to an SGS-minimal DAG according to \textbf{Theorem \ref{RU-theorem}}. ESP performs a greedy DFS in $\A_{\mb{v}}(\Prob)$. Given an initial permutation $\pi \in \Pi(\mb{v})$, set $\G$ as the induced $\G_\pi$ and traverse through $\A_{\mb{v}}(\Prob)$ by a \textit{weakly decreasing walk} to obtain $\mc{H}$ where $|\E(\mc{H})| < |\E(\G_\pi)|$.\footnote{In \citep{solus2021consistency_supp}, their pseudocode does not indicate that such a walk needs to be weakly decreasing but such a requirement is imposed in the description of the algorithm.} If no such $\mc{H}$ exists, ESP returns $\mc{G} = \mc{G}_\pi$; otherwise, $\mc{G}$ is reset as $\mc{H}$ and the procedure repeats. 

As noted by \cite{solus2021consistency_supp}, the construction of $\A_\mb{v}(\Prob)$ is inefficient since one is only required to know the neighboring states instead of the entire $\A_\mb{v}(\Prob)$ to perform the traversal. Below we show that unbounded GRaSP$_1$ can efficiently learn the neighbors of each state in $\A_\mb{v}(\Prob)$ by our permutation-based operation \textit{tuck} performed on singular edges. Before examining this claim, we introduce some useful definitions.

Given the permutohedron $\A_\mb{v}$, a \textit{walk} $\mf{W} = \la \pi^1,...,\pi^m\ra$ is a sequence of neighboring states in $\A_\mb{v}$ such that $\pi^i, \pi^{i+1} \in \Pi(\mb{v})$ are in AT for each $1 \leq i < m$. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{definition}
\label{walk_prop}
Given a graphoid $\Prob$ over $\mb{V}$, for any walk $\mf{W} = \la \pi^1,...,\pi^m \ra$ in $\A_\mb{v}$,
\begin{enumerate}
    \item[(a)] $\mf{W}$ is said to be \textit{DAG-preserving} if $\G_{\pi^1} = ... = \G_{\pi^m}$;
    \item[(b)] $\mf{W}$ is said to be \textit{DAG-changing} if $\la \pi^1,...,\pi^{m-1}\ra$ is DAG-preserving and $\G_{\pi^{m-1}} \neq \G_{\pi^m}$.
\end{enumerate}
\end{definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In addition, for each DAG-changing walk $\mf{W} = \la \pi^1,..., \pi^m\ra$, we say that $\mf{W}$ is relative to $(j,k)$ if $\pi^{m}$ is $(j, k)$-different from $\pi^{m-1}$ for some $j, k \in \mb{v}$. Thus, each DAG-changing walk is relative to a pair of vertices corresponding to the last AT performed in the walk.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{definition}
\label{rev_DAGs}
Given a set of variables $\mb{V}$, consider any $j, k \in \mb{v}$. Two DAGs $\G, \mc{H} \in \DAG(\mb{V})$ are said to be $(j,k)$-reverse if $(j \to k) \in \E(\G)$ and $(k \to j) \in \E(\mc{H})$, and there does not exist any other $j', k' \in \mb{v}$ s.t. $(j' \to k') \in \E(\G)$ and $(k' \to j') \in \E(\mc{H})$. 
\end{definition}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{walk_iff}
Given a graphoid $\Prob$ over $\mb{V}$, consider any $j, k \in \mb{v}$. Then there exists a DAG-changing walk $\mf{W} = \la \pi^1,...,\pi^m\ra$ relative to $(j,k)$ in $\A_\mb{v}$ if and only if $\G_{\pi^1}$ and $\G_{\pi^m}$ are neighbors in $\A_{\mb{v}}(\Prob)$ that are $(j,k)$-reverse. 
\end{lemma}
\begin{proof}
For the forward direction, given that $\pi^{m-1}$ and $\pi^m$ are $(j, k)$-different but induce different DAGs, it follows from the definition of $\A_\mb{v}(\Prob)$ that $\G_{\pi^1} = \G_{\pi^{m-1}}$ and $\G_{\pi^m}$ are neighbors in $\A_\mb{v}(\Prob)$. Also, we know that $(j \to k) \in \E(\G_{\pi^{m-1}})$ and $(k \to j) \in \E(\G_{\pi^m})$ by \textbf{Lemma \ref{AT_iff}} and (RU). The fact that $\G_{\pi^1} = \G_{\pi^{m-1}}$ and $\G_{\pi^m}$ are $(j,k)$-reverse follows immediately from (RU) and the assumption that $\pi^{m-1}$ is $(j,k)$-different from $\pi^m$. 

For the backward direction, suppose that $\G_\pi$ and $\G_\tau$ are neighbors in $\A_{\mb{v}}(\Prob)$ that are $(j, k)$-reverse. It follows from (RU) that there exist $\pi', \tau' \in \Pi(\mb{v})$ such that $\pi'$ and $\tau'$ are $(j,k)$-different where $\G_\tau = \G_{\tau'}$ and $\G_{\pi} = \G_{\pi'}$. Hence, $\la \pi', \tau'\ra$ is our desired DAG-changing walk relative to $(j, k)$ in $\A_\mb{v}$.   
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{4_perm_lemma}
Given a graphoid $\Prob$ over $\mb{V}$, consider any pair $\pi^1, \tau^1 \in \Pi(\mb{v})$ such that $\pi^1 = \la \bs{\delta}_1, j, k, \bs{\delta}_2\ra$ for some sub-sequences $\bs{\delta}_1$, $\bs{\delta}_2$ of $\pi^1$, and $\tau^1 = \la \bs{\zeta}_1, j, k, \bs{\zeta}_2\ra$ for some sub-sequences $\bs{\zeta}_1$, $\bs{\zeta}_2$ of $\tau^1$. Further consider $\pi^2 = \la \bs{\delta}_1, k, j, \bs{\delta}_2\ra$ and $\tau^2 = \la \bs{\zeta}_1, k, j, \bs{\zeta}_2\ra$. If $\G_{\pi^1} = \G_{\tau^1}$, then $\G_{\pi^2} = \G_{\tau^2}$. 
\end{lemma}
\begin{proof}
Notice that $\G_{\pi^2} \in \SGS(\Prob)$ by \textbf{Theorem \ref{RU-theorem}}. If we can show that $\tau^2$ is a causal order of $\G_{\pi^2}$, it follows from \textbf{Lemma \ref{SGS_same_DAG}} that $\G_{\pi^2} = \G_{\tau^2}$. To do so, it suffices to show the following. For any $i \in \mb{v} \setminus \{j, k\}$,
\begin{enumerate}
    \item[(i)] if $(i \to j) \in \E(\G_{\pi^2})$, then $i \in \bs{\zeta}_1$;
    \item[(ii)] if $(i \to k) \in \E(\G_{\pi^2})$, then $i \in \bs{\zeta}_1$;
    \item[(iii)] if $(j \to i) \in \E(\G_{\pi^2})$, then $i \in \bs{\zeta}_2$;
    \item[(iv)] if $(k \to i) \in \E(\G_{\pi^2})$, then $i \in \bs{\zeta}_2$.
\end{enumerate}
For (i), suppose that $(i \to j) \in \E(\G_{\pi^2})$. If $(i \to j) \in \E(\G_{\pi^1})$ as well, then $(i \to j) \in \E(\G_{\tau^1})$ since $\G_{\pi^1} = \G_{\tau^1}$. This entails that $i \in \bs{\zeta}_1$. On the other hand, consider the case that $(i \to j) \notin \E(\G_{\pi^1})$. Then
\begin{align}
    X_i \nCI_\Prob X_j \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} \cup \{X_k\} &\because (i \to j) \in \E(\G_{\pi^2}) \label{eq:4perm_i_a}\\
    X_i \nCI_\Prob \{X_j, X_k\} \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} &\because \eqref{eq:4perm_i_a}, \textit{weak union} \label{eq:4perm_i_b}\\
    X_i \CI_\Prob X_j \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} &\because (i \to j) \notin \E(\G_{\pi^1}) \label{eq:4perm_i_c}\\
    X_i \nCI_\Prob X_k \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} \cup \{X_j\} &\because \eqref{eq:4perm_i_b}, \eqref{eq:4perm_i_c}, \textit{contraction} \label{eq:4perm_i_d}
\end{align}
By (RU), \eqref{eq:4perm_i_d} entails that $(i \to k) \in \E(\G_{\pi^1}) = \E(\G_{\tau^1})$. Since $\tau^1$ is a causal order of $\G_{\tau^1}$, we have $i \in \bs{\zeta}_1$.

For (ii), suppose that $(i \to k) \in \E(\G_{\pi^2})$. Similar to (i), the case for $(i \to k) \in \E(\G_{\pi^1})$ is simple. So consider the case where $(i \to k) \notin \E(\G_{\pi^1})$.
\begin{align}
    X_i \nCI_\Prob X_k \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} &\because (i \to k) \in \E(\G_{\pi^2}) \label{eq:4perm_ii_a}\\
    X_i \nCI_\Prob \{X_j, X_k\} \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} &\because \eqref{eq:4perm_ii_a}, \textit{decomposition} \label{eq:4perm_ii_b}\\
    X_i \CI_\Prob X_k \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} \cup \{X_j\} &\because (i \to k) \notin \E(\G_{\pi^1}) \label{eq:4perm_ii_c}\\
    X_i \nCI_\Prob X_j \,&|\,\mb{X}_{\bs{\delta}_1 \setminus \{i\}} &\because \eqref{eq:4perm_ii_b}, \eqref{eq:4perm_ii_c}, \textit{contraction} \label{eq:4perm_ii_d}
\end{align}
By (RU), \eqref{eq:4perm_ii_d} entails that $(i \to j) \in \E(\G_{\pi^1}) = \E(\G_{\tau^1})$ and hence $i \in \bs{\zeta}_1$.

For (iii), suppose that $(j \to i) \in \E(\G_{\pi^2})$. Then we have $(j \to i) \in \E(\G_{\pi^1})$ by (RU) because $\Pre(i, \pi^1) = \Pre(i, \pi^2)$. Hence $(j \to i) \in \E(\G_{\tau^1})$ since $\G_{\pi^1} = \G_{\tau^1}$. Given that $\tau^1$ is a causal order of $\G_{\tau^1}$, we have $i \in \bs{\zeta}_2$. (iv) is analogous to (iii).
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{two_walks}
Given a graphoid $\Prob$ over $\mb{V}$, consider any two DAG-changing walks $\mf{W} = \la \pi^1,..., \pi^m\ra$ and $\mf{W}' = \la \tau^1,..., \tau^n\ra$ in $\A_\mb{v}$ where $\pi^1 = \tau^1$. If $\mf{W}$ and $\mf{W}'$ are both relative to the same $(j, k)$ for some $j, k \in \mb{v}$, then $\G_{\pi^m} = \G_{\tau^n}$.
\end{lemma}
\begin{proof}
Immediate from \textbf{Definition \ref{walk_prop}} and \textbf{Lemma \ref{4_perm_lemma}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{singular_lemma}
Given a graphoid $\Prob$ over $\mb{V}$, consider any DAG-changing walk $\mf{W} = \la \pi^1,..., \pi^m\ra$ in $\A_\mb{v}$ which is relative to $(j, k)$ for some $j, k \in \mb{v}$. Then $j \to k$ is a singular edge in $\G_{\pi^1}$. 
\end{lemma}
\begin{proof}
Let $\mf{W}_0$ denote the DAG-preserving walk $\la \pi^1,..., \pi^{m-1}\ra$. Given that $\pi^{m}$ is $(j, k)$-different from $\pi^{m-1}$, it follows from \textbf{Lemma \ref{AT_iff}} and (RU) that $(j \to k) \in \E(\G_{\pi^{m-1}})$. Since $\mf{W}_0$ is a DAG-preserving walk in $\A_\mb{v}$, we have $(j \to k) \in \E(\G_{\pi^{1}}) = \E(\G_{\pi^{m-1}})$. 

Next, suppose by reductio that $j \to k$ is not a singular edge in $\G_{\pi^1}$. Then there is a directed path from $j$ to $k$ other than $j \to k$ in $\G_{\pi^1}$. So there exists $l \in \mb{v}$ such that $l \in \De(j, \G_{\pi^1}) \cap \An(k, \G_{\pi^1})$. In order to ensure that $j$ and $k$ are $\pi^{m-1}$-adjacent, either $\pi^{m-1}[l] < \pi^{m-1}[j]$ or $\pi^{m-1}[l] > \pi^{m-1}[k]$ holds. However, either case will violate that $\pi^{m-1}$ is a causal order of $\G_{\pi^{m-1}} = \G_{\pi^1}$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{singular_tuck_lemma}
Given a graphoid $\Prob$ over $\mb{V}$, consider $\pi \in \Pi(\mb{v})$ where $(j \to k) \in \E(\G_\pi)$ is a singular edge for some $j, k \in \mb{v}$. Then there exists a DAG-changing walk $\mf{W} = \la \pi,...,\tau\ra$ in $\A_\mb{v}$ relative to $(j, k)$ where $\tau = \textit{tuck}(\pi, j, k)$.
\end{lemma}
\begin{proof}
First, we rewrite $\pi = \la \bs{\delta}_{<j}, j, \bs{\delta}_{j \sim k}, k, \bs{\delta}_{>k}\ra$ as usual. Then we partition $\bs{\delta}_{j \sim k}$ as follows: $\bs{\zeta}_a = \la i \in \bs{\delta}_{j \sim k}: i \in \An(k, \G_{\pi})\ra$, and $\bs{\zeta}_b = \la i \in \bs{\delta}_{j \sim k}: i \notin \An(k, \G_{\pi})\ra$. Given that $(j \to k)$ is a singular edge, we know that $\De(j, \G_{\pi}) \cap \An(k, \G_{\pi}) = \varnothing$. In other words, we know that (i) each vertex in $\bs{\zeta}_a$ has no ancestor in $\bs{\delta}_{j \sim k} \setminus \bs{\zeta}_a$ in $\G_{\pi}$ and (ii) each vertex in $\bs{\zeta}_b$ has no descendant in $\bs{\delta}_{j \sim k} \setminus \bs{\zeta}_b$ in $\G_{\pi}$. 

Now consider the permutation $\tau' = \la \bs{\delta}_{<j}, \bs{\zeta}_a, j, k, \bs{\zeta}_{b}, \bs{\delta}_{>k}\ra$ in particular. We want to show that there exists a DAG-preserving walk from $\pi$ to $\tau'$. Such a walk is easy to construct. First, perform repeated ATs by moving each $i \in \bs{\zeta}_a$ prior to $j$ from left to right, and then repeated ATs by moving each $i \in \bs{\zeta}_{b}$ behind $k$ from right to left. The two sets of ATs are licensed by (i) and (ii) respectively. Hence, we have $\G_{\tau'} = \G_{\pi}$. Finally, consider $\tau = \textit{tuck}(\pi, j, k) = \la \bs{\delta}_{<j}, \bs{\zeta}_a, k, j, \bs{\zeta}_b, \bs{\delta}_{>k}\ra$ which is $(j, k)$-different from $\tau'$. By (RU) and \textbf{Lemma \ref{AT_iff}}, we know that $\G_{\tau} \neq \G_{\tau'}$ and thus $\la \pi ,..., \tau', \tau\ra$ is a DAG-changing walk in $\A_\mb{v}$ relative to $(j, k)$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{singular_tuck_thm}
Given a graphoid $\Prob$ over $\mb{V}$, consider any DAG-changing walk $\mf{W} = \la \pi^1,...,\pi^m\ra$ in $\A_\mb{v}$ which is relative to $(j, k)$ for some $j, k \in \mb{v}$. Then $\G_{\pi^{m}} = \G_\tau$ where $\tau = \textit{tuck}(\pi^1, j, k)$.
\end{theorem}
\begin{proof}
By \textbf{Lemma \ref{singular_lemma}}, $j \to k$ is a singular edge in $\G_{\pi^1}$. Hence, we obtain a DAG-changing walk $\mf{W}' = \la \pi^1, ...,\tau\ra$ in $\A_\mb{v}$ relative to $(j, k)$ by \textbf{Lemma \ref{singular_tuck_lemma}}. Since both $\mf{W}$ and $\mf{W}'$ start from $\pi^1$ and are relative to the same $(j, k)$, it follows from \textbf{Lemma \ref{two_walks}} that $\G_{\pi^{m}} = \G_\tau$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Similar to the discussion in Appendix \textcolor{blue}{\textbf{\ref{app:correct}}}, we want to fix the ordering of the set of singular edges in any DAG. This ensures that ESP and unbounded GRaSP$_1$ will not yield different DAGs simply due to the issue of order-dependence. Below we prove that ESP and unbounded GRaSP$_1$ are equivalent algorithms.    
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{ESP=GRaSP1}
Given a graphoid $\Prob$ and any initial permutation $\pi \in \Pi(\mb{v})$, the DAG induced by the output of unbounded GRaSP$_1$ is equivalent to the DAG returned by ESP.
\end{theorem}
\begin{proof}
Consider any $j, k \in \mb{v}$. By \textbf{Lemma \ref{singular_lemma}} and \textbf{Lemma \ref{singular_tuck_lemma}}, every DAG-changing walk $\mf{W} = \la \pi^1,..., \pi^m\ra$ in $\A_\mb{v}$ relative to $(j, k)$ corresponds to a \textit{tuck} operation of the singular edge $j \to k$ in $\E(\G_{\pi^1})$. Hence, by \textbf{Lemma \ref{walk_iff}}, we know that $\textit{tuck}(\pi^1, j, k)$ corresponds to the neighboring relation between $\G_{\pi^1}$ and $\G_{\pi^m}$ in $\A_\mb{v}(\Prob)$ that are $(j, k)$-reverse. Therefore, every step taken by ESP to move to a neighboring state in $\A_\mb{v}(\Prob)$ (relative to a unique pair of vertices) is equivalent to the tuck operation taken by GRaSP$_1$ over the same pair of vertices.  
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Causal Razors and GRaSP}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% D. Causal Razors and GRaSP %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{app:tiers}
In this section, we first provide a logical analysis of the causal razors discussed in the main text.\footnote{There are other causal razors discussed in the literature, including, but not limited to, \textit{adjacency-faithfulness} and \textit{orientation-faithfulness} in \citep{ramsey2006}, and \textit{triangle-faithfulness} in \citep{zhang2013comparison_supp}. But they do not have a strong connection with our discussion of GRaSP and so will not be analyzed in this work.} Then we construct new causal razors with respect to each tier of GRaSP, and show how a higher tier of GRaSP requires a strictly weaker causal razor. 

\begin{theorem}
\label{razors_hierarchy}
The following statements are true:
\begin{enumerate}
    \item[(a)] For any joint probability distribution $\Prob$, $\uPm(\Prob) = \CFC(\Prob) \subseteq \uFr(\Prob) \subseteq \Fr(\Prob) \subseteq \Pm(\Prob) \subseteq \SGS(\Prob)$.
    \item[(b)] For any joint probability distribution $\Prob$, if faithfulness is satisfied, $\CFC(\Prob) = \uFr(\Prob) = \Fr(\Prob) = \Pm(\Prob).$
    \item[(c)] There exists a joint probability distribution s.t. $\CFC(\Prob) \subset \uFr(\Prob)$.
    \item[(d)] There exists a joint probability distribution s.t. $\uFr(\Prob) \subset \Fr(\Prob)$.
    \item[(e)] There exists a joint probability distribution s.t. $\Fr(\Prob) \subset \Pm(\Prob)$.
    \item[(f)] There exists a joint probability distribution s.t. $\Pm(\Prob) \subset \SGS(\Prob)$.
\end{enumerate}
\end{theorem}
\begin{proof}
For (a), $\uPm(\Prob) = \CFC(\Prob)$ is our result in \textbf{Theorem \ref{CFC-uPm}}. $\CFC(\Prob) \subseteq \uFr(\Prob)$ is proven in \citep{raskutti2018learning_supp}, $\uFr(\Prob) \subseteq \Fr(\Prob)$ is true by \textbf{Definition \ref{frugal}}, $\Fr(\Prob) \subseteq \Pm(\Prob)$ in \citep{forster2020frugal_supp}, and $\Pm(\Prob) \subseteq \SGS(\Prob)$ in \citep{zhang2013comparison_supp}. (b) is a direct consequence of (a) and \textbf{Theorem \ref{razors}}.

For (c), see \citep[Theorem 2.4]{raskutti2018learning_supp}. For (d), see \citep[Figure 6]{forster2020frugal_supp}. For (e), see \citep[Theorem 2.5]{raskutti2018learning_supp}. For (f), see \citep[Figure 2]{zhang2013comparison_supp}. Additionally, the example in \textbf{Theorem \ref{2_not_1}} and its corresponding Figure \ref{fig:GRaSP2_better} verify (c) and (e): $\G^* \in \uFr(\Prob) \setminus \CFC(\Prob)$ and $\G_\pi \in \Pm(\Prob) \setminus \Fr(\Prob)$. On the other hand, each of $\G_{\pi^1}, \G_{\pi^2}, \G_{\pi^3}$, and $\G_{\pi^4}$ in the DAG-associahedron in Figure \ref{fig:permutohedron} is in $\SGS(\Prob)\setminus \Pm(\Prob)$, verifying (f).
\end{proof}

\begin{definition}
(TSP-razor and ESP-razor) Given a graphoid $\Prob$ over $\mb{V}$, let $\textit{tsp}(\Prob, \pi)$ be the DAG returned by TSP on $\Prob$ by setting $\pi$ as the initial permutation. Define $\mt{TSP}(\Prob) = \{\G \in \DAG(\mb{V}): \pi \in \Pi(\mb{v}) \text{ and } \G = \textit{tsp}(\Prob, \pi)\}$ as the set of DAGs returned by TSP on $\Prob$ over each initial permutation in $\Pi(\mb{v})$. Further define
\begin{align*}
    \mt{TSPr}(\Prob) = \{\G \in \mt{TSP}(\Prob): \neg \exists \G' \in \mt{TSP}(\Prob) \text{ s.t. } \G' \notin \MEC(\G)\}.
\end{align*}
$(\G^*, \Prob)$ satisfies the TSP-razor if $\G^* \in \mt{TSPr}(\Prob)$. Similarly for ESP, $\textit{esp}$, $\mt{ESP}$, $\mt{ESPr}$, and ESP-razor. 
\end{definition}

One can observe that $\mt{TSPr}(\Prob) = \mt{TSP}(\Prob)$ if every DAG in $\mt{TSP}(\Prob)$ belongs to the same MEC, and $\mt{TSPr}(\Prob) = \varnothing$ otherwise. The same is also true for $\mt{ESPr}(\Prob)$ and $\mt{ESP}(\Prob)$. These definitions will prove useful when we compare them with the classes of DAGs discussed in \textbf{Theorem \ref{razors_hierarchy}}. Below we provide a similar definition for each tier of GRaSP. 

\begin{definition}
(GRaSP$_t$-razor) Given a graphoid $\Prob$ over $\mb{V}$, for $t \in \{0, 1, 2\}$, define $\mt{GRaSP}_t(\Prob) = \{\G_\tau \in \DAG(\mb{V}): \pi \in \Pi(\mb{v}) \text{ and } \tau = \textit{grasp}(\Prob, \pi, |\mb{v}|, t)\}$ as the set of DAGs returned by unbounded GRaSP$_t$ on $\Prob$ over each initial permutation in $\Pi(\mb{v})$. Further define
\begin{align*}
    \mt{GRaSP}_t\mt{r}(\Prob) = \{\G \in \mt{GRaSP}_t(\Prob): \neg \exists \G' \in \mt{GRaSP}_t(\Prob) \text{ s.t. } \G' \notin \MEC(\G)\}.
\end{align*}
$(\G^*, \Prob)$ satisfies the GRaSP$_t$-razor if $\G^* \in \mt{GRaSP}_t\mt{r}(\Prob)$.
\end{definition}

\begin{theorem}
Given a graphoid $\Prob$, the following statement is true:
\begin{align*}
    \CFC(\Prob) = \mt{TSPr}(\Prob) = \mt{GRaSP}_0\mt{r}(\Prob) \subseteq \mt{ESPr}(\Prob) = \mt{GRaSP}_1\mt{r}(\Prob) \subseteq \mt{GRaSP}_2\mt{r}(\Prob) \subseteq \uFr(\Prob).
\end{align*}
\end{theorem}
\begin{proof}
$\CFC(\Prob) = \mt{TSPr}(\Prob) = \mt{GRaSP}_0\mt{r}(\Prob)$ is directly entailed by \textbf{Theorem \ref{TSP=GRaSP0}} and \textbf{Theorem \ref{TSP_necessary}}. \cite{solus2021consistency_supp} showed that $\mt{TSPr}(\Prob) \subseteq \mt{ESPr}(\Prob)$. $\mt{ESPr}(\Prob) = \mt{GRaSP}_1\mt{r}(\Prob)$ is entailed by \textbf{Theorem \ref{ESP=GRaSP1}}. 

Next, to show that $\mt{GRaSP}_1\mt{r}(\Prob) \subseteq \mt{GRaSP}_2\mt{r}(\Prob)$, notice that $\mt{GRaSP}_1\mt{r}(\Prob) = \mt{GRaSP}_1(\Prob)$ when all DAGs in $\mt{GRaSP}_1(\Prob)$ belong to the same MEC, and $\mt{GRaSP}_1\mt{r}(\Prob) = \varnothing$ otherwise. The latter case validates $\mt{GRaSP}_1\mt{r}(\Prob) \subseteq \mt{GRaSP}_2\mt{r}(\Prob)$ trivially. Now consider the former case where all DAGs in the non-empty $\mt{GRaSP}_1(\Prob)$ belong to the same MEC and so they have the same number of edges. Now consider any $\pi \in \Pi(\mb{v})$ satisfying $\G_\pi \in \Fr(\Prob)$ (where $\Fr(\Prob)$ is necessarily non-empty). We know that $\G_\pi \in \mt{GRaSP}_1(\Prob)$. This is because every initial permutation in $\Pi(\mb{v})$ is considered and unbounded GRaSP$_1$ will never return a denser permutation than its initial permutation. Hence, every DAG in $\mt{GRaSP}_1(\Prob)$ is the sparsest Markovian DAG. (The same also holds when $\mt{GRaSP}_2\mt{r}(\Prob) \neq \varnothing$.) Then the construction of \textbf{Algorithm \ref{alg:grasp}} entails that GRaSP$_2$ will return the same permutation as GRaSP$_1$. Hence, $\mt{GRaSP}_1\mt{r}(\Prob) = \mt{GRaSP}_2\mt{r}(\Prob)$ when all DAGs in $\mt{GRaSP}_1(\Prob)$ belong to the same MEC.

Lastly, to show that $\mt{GRaSP}_2\mt{r}(\Prob) \subseteq \uFr(\Prob)$, we use a proof similar to the above. First, the case where $\mt{GRaSP}_2\mt{r}(\Prob) = \varnothing$ is trivial. Consider the case where $\mt{GRaSP}_2\mt{r}(\Prob) = \mt{GRaSP}_2(\Prob)$ s.t. all DAGs in $\mt{GRaSP}_2(\Prob)$ are in the same MEC. Using a similar inference used in the last paragraph, we know that every DAG in $\mt{GRaSP}_2(\Prob)$ is the sparsest Markovian DAG. Therefore, $\mt{GRaSP}_2\mt{r}(\Prob) = \uFr(\Prob)$ when all DAGs in $\mt{GRaSP}_2(\Prob)$ belong to the same MEC.
\end{proof}

\begin{theorem}
\label{ESP_not_TSP}
There exists a graphoid $\Prob$ s.t. $\mt{GRaSP}_0\mt{r}(\Prob) \subset \mt{GRaSP}_1\mt{r}(\Prob)$.
\end{theorem}
\begin{proof}
Given the equivalence between TSP and unbounded GRaSP$_0$ shown in \textbf{Theorem \ref{TSP=GRaSP0}}, and that between ESP and unbounded GRaSP$_1$ in \textbf{Theorem \ref{ESP=GRaSP1}}, we can borrow the example from \citep{solus2021consistency_supp} on how ESP requires a strictly weaker causal razor than TSP. We refer the readers to Figure 3 in the \href{https://academic.oup.com/biomet/article-abstract/108/4/795/6062392?redirectedFrom=fulltext#supplementary-data}{\textcolor{blue}{supplementary materials}} of \citep{solus2021consistency_supp}.
\end{proof}

In the remainder of this section, we discuss two examples: how unbounded GRaSP$_2$ requires a strictly weaker causal razor than unbounded GRaSP$_1$, and how unbounded GRaSP$_2$ requires a strictly stronger causal razor than u-frugality. The joint distribution of each example below is a compositional graphoid. For the sake of simplicity, we only include CI relations that hold between two \textit{singleton} sets of variables such that all other CI relations entailed by each of the graphoid axioms discussed in Appendix \textcolor{blue}{\textbf{\ref{app:graphoid}}} are understood.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{2_not_1}
There exists a graphoid $\Prob$ s.t. $\mt{GRaSP}_1\mt{r}(\Prob) \subset \mt{GRaSP}_2\mt{r}(\Prob)$.
\end{theorem}
\begin{proof}
Given $\mb{V} = \{X_1,...,X_4\}$, consider the unfaithful model $(\G^*, \Prob)$ where the true DAG $\G^*$ is shown on the left in Figure \ref{fig:GRaSP2_better}, and $\I(\Prob) = \Phi \cup \Psi$ where $\Phi$ is the set of faithful CI relations and $\Psi$ is the set of unfaithful CI relations as listed below:
\begin{align*}
    \Phi = \{&\phi_1 = \la X_1, X_3\,|\,\{X_2\}\ra, \phi_2 = \la X_2, X_4\,|\,\{X_1, X_3\}\ra\};\\
    \Psi = \{&\psi_1 = \la X_2, X_4\,|\,\varnothing\ra\}.
\end{align*}
For every $\G \in \CMC(\Prob)$ where $\psi_1 \in \I(\G)$, we have $5 = |\E(\G)| > |\E(\G^*)| = 4$. Also, all 4-edge Markovian DAGs are in the same MEC. Hence, u-frugality is satisfied. Consider feeding the initial permutation $\pi = \la 2, 4, 1, 3\ra$ to unbounded GRaSP$_1$. It will return the same $\pi$ after the DFS procedure and the induced $\G_\pi$, as shown on the right in Figure \ref{fig:GRaSP2_better}, contains 5 edges. Therefore, unbounded GRaSP$_1$ fails to return the sparsest permutation under some initial permutation and $\mt{GRaSP}_1\mt{r}(\Prob) = \varnothing$. 

On the contrary, $|\mb{v}|! = 24$ initial permutations have been tested on unbounded GRaSP$_2$ and it returns $\hat{\tau}$ where $\G_{\hat{\tau}} \in \MEC(\G^*)$ for each initial permutation. Hence, $\mt{GRaSP}_2\mt{r}(\Prob) \neq \varnothing$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{center}
\begin{figure}[H]
    \centering
    \subfloat{
\begin{tikzpicture}[scale=1.5, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node (X1) at (0,1) {$1$};
\node [fill=lightgray, rounded corners](X2) at (0,0) {$2$};
\node (X3) at (-1,0) {$3$};
\node [fill=lightgray, rounded corners](X4) at (-1,1) {$4$};
\node (label) at (-0.5, -0.5) {$\G^*$};
\path [->,line width=0.5mm] (X1) edge (X2);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X4);
\end{tikzpicture}}
\hspace{3cm}
\subfloat{
\begin{tikzpicture}[scale=1.5, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node (X1) at (0,1) {$1$};
\node (X2) at (0,0) {$2$};
\node (X3) at (-1,0) {$3$};
\node (X4) at (-1,1) {$4$};
\node (label) at (-0.5, -0.5) {$\G_\pi$};
\path [->,line width=0.5mm] (X4) edge (X1);
\path [->,line width=0.5mm] (X4) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X1);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X3);
\end{tikzpicture}}
    \caption{An unfaithful model satisfying u-frugality. The true DAG $\G^*$ is shown on the left where the two shaded vertices indicate the unfaithful marginal independence $X_2 \CI_\Prob X_4\,|\,\varnothing$. Unbounded GRaSP$_1$ returns its initial permutation $\pi = \la 2, 4, 1, 3\ra$. The induced DAG $\G_\pi$ is shown on the right with 5 edges. However, unbounded GRaSP$_2$ manages to return one of the sparsest permutations under every initial permutation.}
    \label{fig:GRaSP2_better}
\end{figure}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{uFr_not_2}
There exists a graphoid $\Prob$ s.t. $\mt{GRaSP}_2\mt{r}(\Prob) \subset \uFr(\Prob)$.
\end{theorem}
\begin{proof}
The example below is one of the uDAGs studied in Table \ref{tab:unit_test} in Section \textcolor{blue}{\textbf{\ref{u_fru_unfaithful}}} where GRaSP$_2$ fails to return one of the sparsest permutations under u-frugality. Given $\mb{V} = \{X_1,...,X_5\}$, consider the unfaithful model $(\G^*, \Prob)$ where the true DAG $\G^*$ is shown on the left in Figure \ref{fig:uFruNotGRaSP2}, and $\I(\Prob) = \Phi \cup \Psi$ where $\Phi$ is the set of faithful CI relations and $\Psi$ is the set of unfaithful CI relations as listed below:
\begin{align*}
    \Phi = \{&\phi_1 = \la X_1, X_2\,|\,\varnothing\ra, \phi_2 = \la X_1, X_2\,|\,\{X_3\}\ra,\\
    &\phi_3 = \la X_2, X_3\,|\,\varnothing\ra, \phi_4 = \la X_2, X_3\,|\,\{X_1\}\ra,\\
    &\phi_5= \la X_2, X_5\,|\,\{X_1, X_3, X_4\}\ra\};\\
    \Psi = \{&\psi_1 = \la X_1, X_5\,|\,\varnothing\ra\}.
\end{align*}
For every $\G \in \CMC(\Prob)$ where $\psi_1 \in \I(\G)$, we have $|\E(\G)| > |\E(\G^*)| = 7$. Also, all 7-edge Markovian DAGs are in the same MEC and there exists no sparser Markovian DAG. Hence, u-frugality is satisfied s.t. $\uFr(\Prob) \neq \varnothing$. 

Next, consider feeding the initial permutation $\pi = \la 5, 1, 3, 4, 2\ra$ to unbounded GRaSP$_2$. It will return the same $\pi$ after the DFS procedure and the induced $\G_\pi$, as shown on the right in Figure \ref{fig:uFruNotGRaSP2}, contains 8 edges. Therefore, unbounded GRaSP$_2$ fails to return one of the sparsest permutations under some initial permutation and $\mt{GRaSP}_2\mt{r}(\Prob) = \varnothing$. 
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{center}
\begin{figure}[H]
    \centering
    \subfloat{
\begin{tikzpicture}[scale=1.2, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
\node (label) at (0, -1.2) {$\G^*$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}}
\hspace{3cm}
\subfloat{
\begin{tikzpicture}[scale=1.2, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node (X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node (X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
\node (label) at (0, -1.2) {$\G_\pi$};
\path [->,line width=0.5mm] (X1) edge (X2);
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X2);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X4) edge (X2);
\path [->,line width=0.5mm] (X5) edge (X3);
\path [->,line width=0.5mm] (X5) edge (X4);
\end{tikzpicture}}
    \caption{An unfaithful model satisfying u-frugality. The true DAG $\G^*$ is shown on the left where the two shaded vertices indicate the unfaithful marginal independence $X_1 \CI_\Prob X_5\,|\,\varnothing$. Unbounded GRaSP$_2$ returns its initial permutation $\pi = \la 5, 1, 3, 4, 2\ra$. The induced DAG $\G_\pi$ is shown on the right with 8 edges. Hence, GRaSP$_2$ is not correct under u-frugality alone.}
    \label{fig:uFruNotGRaSP2}
\end{figure}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\textbf{Corollary \ref{coro_GRaSP_hierarchy}} \hspace{0.1cm} \textit{Given a graphoid $\Prob$, unbounded GRaSP$_2$ is correct under a strictly weaker causal razor than unbounded GRaSP$_1$, which is correct under a strictly weaker causal razor than unbounded GRaSP$_0$.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Grow-Shrink Algorithm and its properties}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% E. Grow-Shrink Algorithm and its properties %%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{app:gs}
\begin{definition}
\label{BIC}
Given an observational dataset $\D$ with $n$ i.i.d. observations from a joint probability distribution $\Prob$ over $\mb{V}$ that belongs to a curved exponential family\footnote{See \citep{Kass:1437490} for an in-depth analysis of curved exponential families.}, for every $X \in \mb V$ and every $\mb M \subseteq \mb V \setminus \{X\}$,
\[
    \BIC_\D(X, \mb{M}) = \ell_{X \mid \mb{M}}(\hat{\theta}_\mt{mle} \mid \D) - c \, \frac{|\hat{\theta}_\mt{mle}|}{2} \log(n)
\]
where $\ell_{X \mid \mb M}$ is the conditional log likelihood function, $|\hat{\theta}_\mt{mle}|$ is the number of free parameters (i.e., the dimension of the maximum likelihood estimate $\hat{\theta}_\mt{mle}$), and $c$ is a multiplier for the parameter penalty.
\end{definition}

BIC score is a \textit{decomposable} scoring function in the sense that the BIC score of any DAG $\G$ (over the same set of variables $\mb{V}$ as the observational dataset $\D$), denoted as $\BIC_\D(\G)$, satisfies the following:
\[
    \BIC_\D(\G) = \sum_{i \in \mb{v}} \BIC_\D(X_i, \mb{X}_{\Pa(i, \G)}).
\]

In addition, since we will be using BIC throughout this appendix, we assume that every joint probability distribution $\Prob$ belongs to a curved exponential family in this section.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\DontPrintSemicolon
\caption{\textsc{Grow: }$\textit{grow}(\mc{D}, X, \mb{Z})$}
\label{alg:grow}
\KwIn{(a) $\D$: an observational dataset over $\mb{V}$; (b) $X \in \mb{V}$; (c) $\mb{Z} \subseteq \mb{V} \setminus \{X\}$}
\KwOut{$\mb{M}_{gr}\subseteq \mb{Z}$}
$s \ot \BIC_\D(X, \varnothing)$\;
$s' \ot s$\;
$\mb{M}_{gr} \ot \varnothing$\;
\Do{$s' > s$}{
    $s \ot s'$\;
    $s' \ot \max_{Y \in \mathbf{Z}\setminus \mathbf{M}_{gr}} \mathtt{BIC}_\mathcal{D}(X, \mathbf{M}_{gr} \cup \{Y\})$\;
    $Y' \ot \text{argmax}_{Y \in \mathbf{Z}\setminus \mathbf{M}_{gr}} \mathtt{BIC}_\mathcal{D}(X, \mathbf{M}_{gr} \cup \{Y\})$\;
    \If{$s' > s$}{
    $\mb{M}_{gr} \ot \mb{M}_{gr} \cup \{Y'\}$}
}
return $\mb{M}_{gr}$
\end{algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\DontPrintSemicolon
\caption{\textsc{Shrink: }$\textit{shrink}(\mc{D}, X, \mb{Z})$}
\label{alg:shrink}
\KwIn{(a) $\D$: an observational dataset over $\mb{V}$; (b) $X \in \mb{V}$; (c) $\mb{Z} \subseteq \mb{V} \setminus \{X\}$}
\KwOut{(i) $\mb{M}_{sh} \subseteq \mb{Z}$; (ii) $s = \BIC_\D(X, \mb{M}_{sh})$}
$s \ot \BIC_\D(X, \mb{Z})$\;
$s' \ot s$\;
$\mb{M}_{sh} \ot \mb{Z}$\;
\Do{$s' > s$}{
    $s \ot s'$\;
    $s' \ot \max_{Y \in \mathbf{M}_{sh}} \mathtt{BIC}_\mathcal{D}(X, \mathbf{M}_{sh} \setminus \{Y\})$\;
    $Y' \ot \text{argmax}_{Y \in \mathbf{M}_{sh}} \mathtt{BIC}_\mathcal{D}(X, \mathbf{M}_{sh} \setminus \{Y\})$\;
    \If{$s' > s$}{
    $\mb{M}_{sh} \ot \mb{M}_{sh} \setminus \{Y'\}$}
}
return $\mb{M}_{sh}, s$
\end{algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{theorem}
\label{BIC_local_DAG}
\citep{chickering2002optimal_supp} Given an observational dataset $\D$ with $n$ i.i.d. observations from a joint probability distribution $\Prob$ over $\mb{V}$, consider $\G, \G' \in \DAG(\mb{V})$ where $\G'$ results from adding the edge $j \to k$ to $\G$. In the large sample limit of $n$,
\begin{enumerate}
    \item[(a)] if $X_j \nCI_\Prob X_k\,|\,\mb{X}_{\Pa(k, \G)}$, then $\BIC_\D(\G') > \BIC_\D(\G)$;
    \item[(b)] if $X_j \CI_\Prob X_k\,|\,\mb{X}_{\Pa(k, \G)}$, then $\BIC_\D(\G') < \BIC_\D(\G)$.
\end{enumerate}
\end{theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

The theorem above is known as the \textit{local consistency} of BIC score over DAGs. We can easily derive a lemma which concerns the BIC score of a variable (relative to a set of variables). 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}
\label{BIC_local_var}
Given an observational dataset $\D$ with $n$ i.i.d. observations from a joint probability distribution $\Prob$ over $\mb{V}$, consider any distinct $j, k \in \mb{v}$ and $\mb{i} \subseteq \mb{v} \setminus \{j, k\}$. In the large sample limit of $n$,
\begin{enumerate}
    \item[(a)] if $X_j \nCI_\Prob X_k\,|\,\mb{X}_\mb{i}$, then $\BIC_\D(X_k, \mb{X}_\mb{i} \cup \{X_j\}) > \BIC_\D(X_k, \mb{X}_\mb{i})$;
    \item[(b)] if $X_j \CI_\Prob X_k\,|\,\mb{X}_\mb{i}$, then $\BIC_\D(X_k, \mb{X}_\mb{i} \cup \{X_j\}) < \BIC_\D(X_k, \mb{X}_\mb{i})$.
\end{enumerate}
\end{lemma}
\begin{proof}
Construct a DAG $\G \in \DAG(\mb{V})$ by drawing all and only directed edges from each vertex in $\mb{i}$ to $k$, and another DAG $\G' \in \DAG(\mb{V})$ by adding $j \to k$ in $\G$. Then the lemma immediately follows from \textbf{Theorem \ref{BIC_local_DAG}} and the decomposable feature of BIC scores.  
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{grow_lemma}
Consider an observational dataset $\D$ with $n$ i.i.d. observations from a compositional graphoid $\Prob$ over $\mb{V}$. In the large sample limit of $n$, for any $X \in \mb{V}$ and any $\mb{Z} \subseteq \mb{V} \setminus \{X\}$, $\MB(X, \mb{Z}) \subseteq \mb{M}_{gr}$ where $\mb{M}_{gr} = \textit{grow}(\D, X, \mb{Z}) \subseteq \mb{Z}$.
\end{lemma}
\begin{proof}
First, \textbf{Algorithm \ref{alg:grow}} requires that $\mb{M}_{gr} \subseteq \mb{Z}$, and $\BIC_\D(X, \mb{M}_{gr} \cup \{Y\}) < \BIC_\D(X, \mb{M}_{gr})$ for every $Y \in \mb{Z} \setminus \mb{M}_{gr}$. By \textbf{Lemma \ref{BIC_local_var}}, we have $X \CI_\Prob Y\,|\,\mb{M}_{gr}$ for each $Y \in \mb{Z} \setminus \mb{M}_{gr}$. By composition, we have $X \CI_\Prob (\mb{Z} \setminus \mb{M}_{gr})\,|\,\mb{M}_{gr}$. Therefore, by \textbf{Definition \ref{MB}} and \textbf{Lemma \ref{MB_unique}}, we have $\MB(X, \mb{Z}) \subseteq \mb{M}_{gr}$.   
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{lemma}
\label{shrink_lemma}
Consider an observational dataset $\D$ with $n$ i.i.d. observations from a graphoid $\Prob$ over $\mb{V}$. In the large sample limit of $n$, for any $X \in \mb{V}$ and any $\mb{Z} \subseteq \mb{V} \setminus \{X\}$, $\MB(X, \mb{Z}) = \mb{M}_{sh}$ where $\mb{M}_{sh} = \textit{shrink}(\D, X, \mb{Z}) \subseteq \mb{Z}$.
\end{lemma}
\begin{proof}
We show the lemma by $\mb{M}_{sh} \subseteq \MB(X, \mb{Z})$ and $\mb{M}_{sh} \supseteq \MB(X, \mb{Z})$.

[$\subseteq$] By reductio, suppose that there exists $Y \in \mb{M}_{sh} \subseteq \mb{Z}$ but $Y \notin \MB(X, \mb{Z})$. Let $\mb{S}$ be $\mb{M}_{sh}\setminus \{Y\}$. \textbf{Algorithm \ref{alg:shrink}} requires that $\BIC_\D(X, \mb{M}_{sh} \setminus \{Y\}) < \BIC_\D(X, \mb{M}_{sh})$. In other words, we have $\BIC_\D(X, \mb{S}) < \BIC_\D(X, \mb{S} \cup \{Y\})$. By \textbf{Lemma \ref{BIC_local_var}}, we have $X \nCI_\Prob Y\,|\,\mb{S}$. 

Let $\mb{W} = \mb{S} \setminus \MB(X, \mb{Z})$. From $Y \notin \MB(X, \mb{Z})$ and $Y \notin \mb{S}$, we have $\{Y\} \cup \mb{W} \subseteq \mb{Z} \setminus \MB(X, \mb{Z})$. Recall \textbf{Definition \ref{MB}} that $X \CI_\Prob \mb{Z} \setminus \MB(X, \mb{Z}) \,|\,\MB(X, \mb{Z})$. Thus, 
\begin{align}
    X \CI_\Prob&\,\{Y\} \cup \mb{W}\,|\,\MB(X, \mb{Z}) &\because X \CI_\Prob \mb{Z} \setminus \MB(X, \mb{Z}) \,|\,\MB(X, \mb{Z}),  \textit{ decomposition}\\
    X \CI_\Prob&\,Y \,|\,\MB(X, \mb{Z}) \cup \mb{W} &\because (12),  \textit{ weak union}\\
    X \CI_\Prob&\,Y\,|\,\mb{S} &\because (13), \mb{W} = \mb{S} \setminus \MB(X, \mb{Z})
\end{align}
Contradiction arises with $X \nCI_\Prob Y\,|\,\mb{S}$.

[$\supseteq$] Observe that \textbf{Algorithm \ref{alg:shrink}} repeatedly removes variables from $\mb{Z}$ one at a time to form $\mb{M}_{sh}$. Thus, the shrink-procedure corresponds to a sequence of sets of variables $\la \mb{M}^0, ..., \mb{M}^k\ra$ and a sequence of variables $\mb{W} = \la W_1,..., W_k\ra = \mb{Z} \setminus \mb{M}_{sh}$ such that $\mb{M}^0 = \mb{Z}$, $\mb{M}^k = \mb{M}_{sh}$, and $\mb{M}^i = \mb{M}^{i-1} \setminus \{W_i\}$ (where $W_i \in \mb{M}^{i-1}$) for each $1 \leq i \leq k$.

Notice that $\mb{M}^{i-1} = \mb{M}^i \cup \{W_i\}$. \textbf{Algorithm \ref{alg:shrink}} requires that $\BIC_\D(X, \mb{M}^i) > \BIC_\D(X, \mb{M}^{i-1}) = \BIC_\D(X, \mb{M}^i \cup \{W_i\})$. We then have 
\begin{align}
    X \CI_\Prob&\,W_1\,|\,\mb{M}^1 &\because \BIC_\D(X, \mb{M}^1) > \BIC_\D(X, \mb{M}^{0}), \textbf{Lemma \ref{BIC_local_var}}\\
    X \CI_\Prob&\,W_{2}\,|\,\mb{M}^{2} &\because \BIC_\D(X, \mb{M}^{2}) > \BIC_\D(X, \mb{M}^{1}), \textbf{Lemma \ref{BIC_local_var}}\\
    X \CI_\Prob&\,W_1\,|\,\mb{M}^{2} \cup \{W_{2}\} & \because (15), \mb{M}^1 = \mb{M}^{2} \cup \{W_{2}\}\\
    X \CI_\Prob&\,\{W_1, W_{2}\}\,|\,\mb{M}^{2} & \because (16), (17), \textit{contraction}\\
    &\vdots \nonumber\\
    X \CI_\Prob&\,\{W_1,...,W_k\}\,|\,\mb{M}^k & \because ..., \textit{contraction}\\
    X \CI_\Prob&\,\mb{W}\,|\,\mb{M}_{sh} &\because (19), \mb{W} = \la W_1,...,W_k\ra \text{ and } \mb{M}^k = \mb{M}_{sh}\\
    X \CI_\Prob&\,\mb{Z} \setminus \mb{M}_{sh}\,|\,\mb{M}_{sh} &\because (20), \mb{W} = \mb{Z} \setminus \mb{M}_{sh}
\end{align}
Hence, it follows from \textbf{Definition \ref{MB}} that $\mb{M}_{sh} \supseteq \MB(X, \mb{Z})$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
\label{gs_thm_app}
Consider an observational dataset $\D$ with $n$ i.i.d. observations from a compositional graphoid $\Prob$ over $\mb{V}$. In the large sample limit of $n$, for any $X \in \mb{V}$ and any $\mb{Z} \subseteq \mb{V} \setminus \{X\}$, $\MB(X, \mb{Z}) = \mb{M}_{gs}$ where $\mb{M}_{gs} = \textit{shrink}(\D, X, \textit{grow}(\D, X, \mb{Z}))$.
\end{theorem}
\begin{proof}
Immediate from \textbf{Lemma \ref{grow_lemma}} and \textbf{Lemma \ref{shrink_lemma}}.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{theorem}
Consider an observational dataset $\D$ with $n$ i.i.d. observations from a (compositional) graphoid $\Prob$ over $\mb{V} = \{X_1,...,X_m\}$, and any $\pi \in \Pi(\mb{v})$. Let $s_{i}$ and $\mb{M}_i$ be the score and the set of variables returned by \textit{shrink}$(\D, X_i, \mb{X}_{\Pre(i, \pi)})$ (or \textit{shrink}$(\D, X_i,  \textit{grow}(\D, X_i, \mb{X}_{\Pre(i, \pi)}))$ if $\Prob$ is a compositional graphoid) respectively. Denote $s_\pi$ as $\sum_{i \in \mb{v}} s_i$. In the large sample limit of $n$, $\BIC_\D(\G_\pi) = s_\pi$ where $\G_\pi$ is induced from $\pi$ by (VP).
\end{theorem}
\begin{proof}
Immediate from the decomposable feature of BIC scores, \textbf{Lemma \ref{shrink_lemma}} and \textbf{Theorem \ref{gs_thm_app}}. 
\end{proof}

Lastly, though a compositional graphoid is a sufficient condition for the correct identification of the unique Markov boundary using the grow-shrink algorithm, we are aware of an assumption weaker than compositional graphoid to validate such an identification. Nevertheless, this discussion will be beyond the scope of this paper and we will leave the formal proof to future work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Additional Examples}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% F. Additional Examples %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{EmpiricalAppendix}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Lu et al. Comparison}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% F1. Lu et al. Comparison %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{luetal-comparison}
Reported below are average statistics obtained by running GRaSP$_2$ on the published datasets used to generate Figure 6 in \citep{lu2021improving_supp}\footnote{\url{https://github.com/ninalu/urlearning-cpp/tree/master/triplet_data}}. We cannot compare these results to Lu et al. precisely, since their statistics are given in figures and not exactly in tables, though judging from their figures it appears that GRaSP$_2$ is dominating for adjacency precision and recall, arrowhead recall, and most results for arrowhead precision. Timing results are not reported by Lu et al.; we include these to show that GRaSP$_2$ returns quickly for all of these examples, where we know (personal communication) that some of the results for Triplet A$^*$ take much longer. Adjacencies in these graphs are sampled with uniform probability, ``Edge-prob''.

\begin{table}[H]
    \centering
    \begin{tabular}{r|c|c|c|c|c|c}
    	Edge-prob & 0.03 & 0.04 & 0.05 & 0.06 & 0.07 & 0.08 \\
    	\hline
        Precision & 0.964 & 0.976 & 0.979 & 0.980 & 0.982 & 0.976 \\
        Recall & 0.985 & 0.982 & 0.986 & 0.986 & 0.985 & 0.985 \\
        F1 & 0.974 & 0.979 & 0.983 & 0.983 & 0.983 & 0.980
    \end{tabular}
    \caption{GRaSP$_2$ Adjacency Statistics}
    \label{tab:grasp_on_lu_adj}
\end{table}

\begin{table}[H]
    \centering
    \begin{tabular}{r|c|c|c|c|c|c}
    	Edge-prob & 0.03 & 0.04 & 0.05 & 0.06 & 0.07 & 0.08 \\
    	\hline
        Precision & 0.907 & 0.914 & 0.933 & 0.949 & 0.946 & 0.945 \\
        Recall & 0.897 & 0.916 & 0.933 & 0.952 & 0.952 & 0.955 \\
        F1 & 0.898 & 0.913 & 0.932 & 0.950 & 0.948 & 0.950
    \end{tabular}
    \caption{GRaSP$_2$ Arrowhead Statistics}
    \label{tab:grasp_on_lu_arr}
\end{table}

\begin{table}[H]
    \centering
    \begin{tabular}{r|c|c|c|c|c|c}
    	Edge-prob & 0.03 & 0.04 & 0.05 & 0.06 & 0.07 & 0.08 \\
    	\hline
        Seconds & 0.405 & 0.755 & 1.403 & 2.703 & 4.795 & 7.161
    \end{tabular}
    \caption{GRaSP$_2$ Timing Statistics}
    \label{tab:grasp_on_lu_time}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Airfoil Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% F2. Airfoil Example %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{airfoil-example}
Figure \ref{fig:airfoil} gives the results of running GRaSP$_2$, PC, and fGES on the Airfoil empirical example described in Section \textcolor{blue}{\textbf{\ref{sec:empirical}}}. GRaSP$_2$ gets the same uniquely frugal result as SP. To improve readability, we use the names of the variables (instead of numerals) to label the vertices. 

\begin{figure}[ht!]
\begin{center}
\subfloat{
\begin{tikzpicture}[scale=1.0, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node (l) at (3,-1) {(a) GRaSP$_2$ result};
\node (a) at (3,3) {\textit{Attack}};
\node (v) at (1.75,2) {\textit{Velocity}};
\node (c) at (4.25,2) {\textit{Chord}};
\node (p) at (1.75,1) {\textit{Pressure}};
\node (d) at (4.25,1) {\textit{Displacement}};
\node (f) at (3,0) {\textit{Frequency}};
\path [->,line width=0.5mm] (a) edge (p);
\path [->,line width=0.5mm] (a) edge (d);
\path [->,line width=0.5mm] (v) edge (a);
\path [->,line width=0.5mm] (v) edge (p);
\path [-,line width=0.5mm] (v) edge (f);
\path [->,line width=0.5mm] (c) edge (a);
\path [->,line width=0.5mm] (c) edge (d);
\path [->,line width=0.5mm] (c) edge (p);
\path [->,line width=0.5mm] (d) edge (p);
\path [->,line width=0.5mm] (f) edge (a);
\path [->,line width=0.5mm] (f) edge (p);
\end{tikzpicture}}
\hspace{1.5cm}
\subfloat{
\begin{tikzpicture}[scale=1.0, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node (l) at (3,-1) {(b) fGES result};
\node (a) at (3,3) {\textit{Attack}};
\node (v) at (1.75,2) {\textit{Velocity}};
\node (c) at (4.25,2) {\textit{Chord}};
\node (p) at (1.75,1) {\textit{Pressure}};
\node (d) at (4.25,1) {\textit{Displacement}};
\node (f) at (3,0) {\textit{Frequency}};
\path [->,line width=0.5mm] (a) edge (p);
\path [->,line width=0.5mm] (a) edge (f);
\path [->,line width=0.5mm] (v) edge (a);
\path [->,line width=0.5mm] (v) edge (p);
\path [->,line width=0.5mm] (v) edge (f);
\path [->,line width=0.5mm] (c) edge (a);
\path [-,line width=0.5mm] (c) edge (d);
\path [->,line width=0.5mm] (c) edge (p);
\path [->,line width=0.5mm] (c) edge (f);
\path [->,line width=0.5mm] (d) edge (p);
\path [->,line width=0.5mm] (d) edge (a);
\path [->,line width=0.5mm] (f) edge (p);
\end{tikzpicture}}
\hspace{1.5cm}
\subfloat{
\begin{tikzpicture}[scale=1.0, roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node (l) at (3,-1) {(c) PC result};
\node (a) at (3,3) {\textit{Attack}};
\node (v) at (1.75,2) {\textit{Velocity}};
\node (c) at (4.25,2) {\textit{Chord}};
\node (p) at (1.75,1) {\textit{Pressure}};
\node (d) at (4.25,1) {\textit{Displacement}};
\node (f) at (3,0) {\textit{Frequency}};
\path [->,line width=0.5mm] (a) edge (c);
\path [->,line width=0.5mm] (a) edge (f);
\path [-,line width=0.5mm] (a) edge (d);
\path [->,line width=0.5mm] (v) edge (p);
\path [->,line width=0.5mm] (v) edge (f);
\path [->,line width=0.5mm] (p) edge (c);
\path [->,line width=0.5mm] (d) edge (p);
\path [->,line width=0.5mm] (d) edge (c);
\path [->,line width=0.5mm] (f) edge (p);
\end{tikzpicture}}
\end{center}
\caption{Results of algorithms on NASA airfoil experiment.}
\label{fig:airfoil}
\end{figure}

Note that both the GRaSP$_2$ and fGES results use the linear, Gaussian BIC score with a penalty multiplier of 2. For the GRaSP$_2$ result in (a), \textit{Attack} is not exogenous, which is counter-intuitive, since it is experimentally controlled. Allowing for latent variables could resolve this issue. However, we leave the development of such an algorithm to future work. On the other hand, the fGES result in (b) is notably not the same as the SP result and so is not frugal. Also, the orientation between \textit{Attack} and \textit{Displacement} is reversed.

The PC result in (c), which uses the zero partial correlation test with a significance level of 0.01, in fact has fewer edges than the frugal result and makes \textit{Chord}, another experimental variable, endogenous. Causally, PC is giving incorrect and incomplete information.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Unit tests}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% G. Unit tests %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\label{app:unit_tests}
We consider path cancellations in DAGs between pairs of vertices, one of which is exogenous, connected by two or more unique treks. Furthermore, the path cancellations we consider elicit a marginal independence between the two vertices in question. Below, we enumerate all possible path cancellations of this type (up to vertex relabeling). Each graph illustrates a case where an unfaithful marginal independence is elicited between the two gray vertices due to path cancellation. A complete list of all unfaithful CI relations (symmetry assumed) where the independent sets are singletons is also provided for each graph.

\begin{figure}[H]
\centering
\hspace{0.4cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_2,X_3,X_5 &\\X_1 \CI X_2,X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3,X_5 \; | \; X_2&\\X_1 \CI X_2,X_5 \; | \; X_3&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3,X_5 &\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.4cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_2,X_3,X_5 &\\X_1 \CI X_2,X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3,X_5 \; | \; X_2&\\X_1 \CI X_2,X_5 \; | \; X_3&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3,X_5 &\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.4cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_2,X_3,X_5 &\\X_1 \CI X_2,X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3,X_5 \; | \; X_2&\\X_1 \CI X_2,X_5 \; | \; X_3&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3,X_5 &\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_2,X_3,X_5 &\\X_1 \CI X_2,X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3,X_5 \; | \; X_2&\\X_1 \CI X_2,X_5 \; | \; X_3&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3,X_5 &\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_2,X_3,X_5 &\\X_1 \CI X_2,X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3,X_5 \; | \; X_2&\\X_1 \CI X_2,X_5 \; | \; X_3&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3,X_5 &\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_2,X_3,X_5 &\\X_1 \CI X_2,X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3,X_5 \; | \; X_2&\\X_1 \CI X_2,X_5 \; | \; X_3&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3,X_5 &\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 \; | \; X_2,X_3&\\X_1 \CI X_2 \; | \; X_3,X_5&\\X_1 \CI X_3 \; | \; X_2,X_5&\\X_1 \CI X_3 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_3&\\X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X2) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X3) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X2) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[roundnode/.style={circle, draw=black!60, very thick, minimum size=5mm}]
\node [fill=lightgray, rounded corners](X1) at (0.0,1.0) {$1$};
\node (X2) at (0.9510565162951535,0.30901699437494745) {$2$};
\node (X3) at (0.5877852522924732,-0.8090169943749473) {$3$};
\node (X4) at (-0.587785252292473,-0.8090169943749476) {$4$};
\node [fill=lightgray, rounded corners](X5) at (-0.9510565162951536,0.30901699437494723) {$5$};
% \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2,X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 &\\X_1 \CI X_2 \; | \; X_5&\\X_1 \CI X_5 \; | \; X_2& \end{aligned}$};
\path [->,line width=0.5mm] (X1) edge (X3);
\path [->,line width=0.5mm] (X1) edge (X4);
\path [->,line width=0.5mm] (X2) edge (X4);
\path [->,line width=0.5mm] (X1) edge (X5);
\path [->,line width=0.5mm] (X3) edge (X5);
\path [->,line width=0.5mm] (X4) edge (X5);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% In every picture the vertices lie on a circle of radius 1 at 72-degree steps
% (polar coordinates, vertex 1 on top), vertices 1 and 5 receive a light-gray
% fill, and the list of CI statements is typeset at (2.5,0).
% A disabled variant of that list additionally contained the row
%   X_1 \CI X_2,X_5 &
% between the first and second rows; it is kept here for reference only.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned}
    X_1 \CI X_5 &\\
    X_1 \CI X_2 \; | \; X_5&\\
    X_1 \CI X_5 \; | \; X_2&
  \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/3, 1/4, 2/4, 3/4, 1/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned}
    X_1 \CI X_5 &\\
    X_1 \CI X_2 \; | \; X_5&\\
    X_1 \CI X_5 \; | \; X_2&
  \end{aligned}$};
  \foreach \parent/\child in {1/3, 1/4, 2/4, 3/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned}
    X_1 \CI X_5 &\\
    X_1 \CI X_2 \; | \; X_5&\\
    X_1 \CI X_5 \; | \; X_2&
  \end{aligned}$};
  \foreach \parent/\child in {1/3, 1/4, 2/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 3/4, 1/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 3/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 3/4, 1/5, 2/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 3/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 2/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 2/4, 3/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 2/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 2/4, 3/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 1/4, 3/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 1/4, 3/4, 2/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 3/4, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 3/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 3/4, 2/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 3/4, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top);
% vertices 1 and 5 are filled light gray and the statement X_1 \CI X_5 is
% typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 1/4, 3/4, 1/5, 2/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 3/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top).
% The first and third pictures fill vertices 1 and 5 light gray and state
% X_1 \CI X_5; the middle picture fills vertices 1 and 4 and states X_1 \CI X_4.
% The statement is typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 2/3, 1/4, 2/4, 1/5, 2/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node [graybox] (X4) at (-126:1) {$4$};
  \node (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_4 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 1/3, 2/4, 3/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 1/3, 2/4, 3/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% Three five-vertex DAGs set side by side, 0.5cm apart.
% Vertices lie on a circle of radius 1 at 72-degree steps (vertex 1 on top).
% The first two pictures fill vertices 1 and 4 light gray and state
% X_1 \CI X_4; the third fills vertices 1 and 5 and states X_1 \CI X_5.
% The statement is typeset at (2.5,0) in every picture.
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node [graybox] (X4) at (-126:1) {$4$};
  \node (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_4 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 1/3, 2/4, 3/4, 1/5, 3/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node [graybox] (X4) at (-126:1) {$4$};
  \node (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_4 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 1/3, 2/4, 3/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\hspace{0.5cm}
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  \foreach \parent/\child in {1/2, 1/3, 2/4, 3/4, 1/5, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}

\begin{figure}[H]
\centering
% A single five-vertex DAG. Vertices lie on a circle of radius 1 at 72-degree
% steps (vertex 1 on top); vertices 1 and 5 are filled light gray and the
% statement X_1 \CI X_5 is typeset at (2.5,0).
\begin{tikzpicture}[graybox/.style={fill=lightgray, rounded corners}, dagedge/.style={->, line width=0.5mm}]
  \node [graybox] (X1) at (90:1) {$1$};
  \node (X2) at (18:1) {$2$};
  \node (X3) at (-54:1) {$3$};
  \node (X4) at (-126:1) {$4$};
  \node [graybox] (X5) at (162:1) {$5$};
  \node (CIs) at (2.5,0) {$\begin{aligned} X_1 \CI X_5 & \end{aligned}$};
  % One arrow per parent/child pair, drawn in the listed order.
  \foreach \parent/\child in {1/2, 1/3, 1/4, 2/4, 3/4, 3/5, 4/5}
    \path [dagedge] (X\parent) edge (X\child);
\end{tikzpicture}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Bibliography %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The database is named without its ".bib" extension: BibTeX appends ".bib"
% itself, and implementations that do not strip a user-supplied extension
% would otherwise search for "lam_294-supp.bib.bib".
\bibliography{lam_294-supp}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}