\documentclass[accepted]{uai2023} 
 
\usepackage[british]{babel}

\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{pgf,tikz}
\usepackage{mathrsfs}
\usetikzlibrary{arrows, fit, shapes, arrows.meta, positioning, matrix}
\usepackage{tkz-graph}
\usepackage[linesnumbered, ruled]{algorithm2e}

\newcommand{\G}{\mathcal{G}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\I}{\mathcal{I}}
\newcommand{\K}{\mathcal{K}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\E}{\mathbf{E}}
\newcommand{\R}{\mathcal{R}}
\newcommand{\F}{\mathcal{F}}
\newcommand{\adj}[2]{\mathrm{adj}_{{#1}}({#2})}
\newcommand{\nb}[2]{\mathrm{ne}_{{#1}}({#2})}
\newcommand{\pa}[2]{\mathrm{pa}_{{#1}}({#2})}
\newcommand{\indep}{\perp\mkern-10mu\perp}
\newcommand{\any}{*\mkern-7mu-\mkern-7mu*}

\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}

\definecolor{silver}{RGB}{225, 225, 225}

\title{Do we become wiser with time? \\
On causal equivalence with tiered background knowledge}

\author[1,2]{\href{mailto:<bang@uni-bremen.de>?Subject=Your UAI 2023 paper}{Christine~W.~Bang}{}}
\author[1,2]{Vanessa~Didelez}

  \affil[1]{
    Faculty of Mathematics and Computer Science\\
    University of Bremen\\
    Bremen, Germany
}
\affil[2]{
    Leibniz Institute for Prevention Research and Epidemiology – BIPS\\
    Bremen, Germany
}
  
\begin{document}
  
\maketitle


\begin{abstract}
Equivalence classes of DAGs (represented by CPDAGs) may be too large to provide useful causal information.
Here, we address incorporating {\em tiered} background knowledge yielding restricted equivalence classes represented by `tiered MPDAGs'. Tiered knowledge leads to considerable gains in informativeness and computational efficiency: We show that construction of tiered MPDAGs only requires application of Meek's 1st rule, and that tiered MPDAGs (unlike general MPDAGs) are chain graphs with chordal components. This entails simplifications e.g. of determining valid adjustment sets for causal effect estimation. Further, we characterise when one tiered ordering is more informative than another, providing insights into useful aspects of background knowledge.
\end{abstract}

\section{Introduction}
We consider equivalence classes of DAGs represented by completed partially directed acyclic graphs (CPDAGs), occurring as outputs of causal discovery algorithms. A first characterisation of equivalent DAGs was given by \citet{verma1990} and a full characterisation of CPDAGs by \citet{andersson1997}. Often, domain expertise provides additional information about shared features of the graphs in a class. Restricting the equivalence class by background knowledge yields a type of partially directed acyclic graph (PDAG) that is potentially much more informative due to  more induced edge orientations. \citet{meek1995} provided a set of orientation rules to obtain a graph encoding the maximal implied information, and the resulting graph is then a maximally oriented partially directed acyclic graph (MPDAG) \citep{perkovic2017}. While a CPDAG represents an independence model common to all DAGs in the equivalence class, an MPDAG represents an independence model as well as additional causal or directional information that is common to all DAGs in a restricted equivalence class. DAGs and CPDAGs are special cases of MPDAGs; DAGs are MPDAGs with full (or sufficient) background knowledge, while CPDAGs are MPDAGs with no (or redundant) background knowledge. A general characterisation of MPDAGs was given by \citet{fang2022representation}. The interpretation of MPDAGs was described in  detail by \citet{perkovic2017} and is considerably more involved than that of  CPDAGs. 

Background knowledge can  be induced by, e.g., well-established causal or logical relations. Some kinds of knowledge, e.g. temporal or sequential structures, imply that the nodes can be partitioned into ordered tiers. This is the case in many settings where longitudinal data is collected, e.g. cohort or panel studies, common in sociology, epidemiology etc. In particular, this kind of data structure is used in the field of life course epidemiology \citep{kuh2004life}. Tiered background knowledge is typically unambiguous and it is intuitively obvious that it must be useful. Indeed, implementations of algorithms for constraint-based causal discovery with  a given tiered structure exist \citep[e.g.][]{scheines1998tetrad}, and have been applied to cohort data for life course analyses \citep{petersen2021data,foraita2022}, but the general properties of these restricted model classes have not yet been investigated. Here, we provide the first formal in-depth analysis of equivalence classes restricted by tiered knowledge. We show that there are several desirable properties, distinguishing tiered from other kinds of background knowledge. Thus, we focus on MPDAGs arising from imposing a tiered ordering, which we term `tiered MPDAGs'. We show that under the key properties of completeness and transitivity, tiered background knowledge cannot induce partially directed cycles,  and, moreover, that tiered MPDAGs are chain graphs with chordal chain components. This allows us to, e.g., apply common methods for identifying causal effects using CPDAGs  to tiered MPDAGs without any additional processing. 

While temporal structures will often be the main source for tiered background knowledge, tiers are slightly more general. Information about logical causal directions may be available, for instance between environment and individual or between cells and molecules.
When eliciting such background knowledge, there may be more effort involved to achieve more detail; or it may be possible but more costly to achieve more detail by the design of a, say, cohort study with finer waves. Existing data from cohort studies are organised in a readily available tiered structure. But within the same wave it may be possible to further subdivide the nodes using logical, temporal or similar expertise; e.g. the first wave of a children's cohort may be composed of variables  before and after birth, pertaining to mother or child etc. With view to eliciting such details or designing  a cohort study, it is therefore interesting to characterise when different tiered restrictions are redundant versus when they are most informative.

In Section \ref{sec:background} we formalise the concepts of (tiered) background knowledge and restricted equivalence classes. We then provide a formal characterisation of tiered MPDAGs:  Section \ref{sec:properties} describes some of their properties, and Section \ref{sec:character}  compares different tiered orderings in terms of informativeness. In Section \ref{sec:sim}, we illustrate which types of tiered knowledge are particularly informative via simulation, and in Section \ref{sec:example} we provide a practical example. Section \ref{sec:comparisons} addresses how tiered information structurally differs from other types of background knowledge. Throughout, we rely on standard notation for (causal) graphical models; an overview of relevant definitions can be found in Section A of the Supplement, and all proofs can be found in Sections D and E of the Supplement.

\section{Background knowledge} \label{sec:background}


While the DAGs in an equivalence class have exactly the same conditional independencies, they can still have vastly different causal implications. In this section we  introduce a smaller, and possibly more informative, subclass of causal graphs using background knowledge. 

We define that \emph{background knowledge} $\K=(\R, \F)$ consists of a set of \emph{required edges} $\R$ and a set of \emph{forbidden edges} $\F$. 

\begin{definition}[Encoding background knowledge]
A graph $\G$ encodes background knowledge $\K=(\R,\F)$ if all of the edges in $\R$ and none of the edges in $\F$ are present in $\G$. 
\end{definition}

\subsection{Restricted equivalence classes}

In our setup we only consider correct background knowledge in the sense that it agrees with an underlying (unknown) true DAG:

\begin{assumption}\label{assumptionknowledge}
The given background knowledge is correct.
\end{assumption}

Let $\C$ be a CPDAG, then by $[\C]$ we denote the equivalence class of DAGs represented by $\C$.

\begin{definition}[\cite{meek1995}]
A CPDAG $\C$ and background knowledge $\K=(\R,\F)$ are \emph{consistent} if and only if there exists a DAG $\D\in [\C]$ such that all of the edges in $\R$ and none of the edges in $\F$ are in $\D$.
\end{definition} 

Since our focus is on equivalence classes, we will assume throughout:

\begin{assumption}\label{assumptionCPDAG}
The given CPDAG is correct.
\end{assumption}

Combining Assumption \ref{assumptionknowledge} and \ref{assumptionCPDAG}, background knowledge will be  consistent with the CPDAG. In actual practice, inconsistencies  might occur, e.g. due to statistical errors when first learning the CPDAG. These are separate issues which we address elsewhere.


\SetKwInOut{Input}{input} 
\SetKwInOut{Output}{output}

\begin{algorithm}[!htbp]

\caption{Constructing $\C^\K$} \label{alg:ck}

\Input{CPDAG $\C=(\V,\E)$ and consistent background knowledge $\K=(\R,\F)$.} 

\Output{PDAG $\C^\K=(\V,\E')$}

$\E'=\E$

\ForAll{$\{ V_i- V_j\}\in\E$}{

\uIf{$\{ V_i\rightarrow V_j\}\in\F$}{

replace $\{ V_i- V_j\}$ with $\{ V_i\leftarrow V_j\}$ in $\E'$

}% if
\ElseIf{$\{ V_i\rightarrow V_j\}\in\R$}{

replace $\{ V_i- V_j\}$ with $\{ V_i\rightarrow V_j\}$ in $\E'$

}% else if

} % for

\end{algorithm}

Consistent background knowledge is imposed on a CPDAG by orienting the corresponding undirected edges. In turn, this may allow us to  orient further undirected edges, e.g. to avoid directed cycles. \cite{meek1995} showed that maximal edge orientations implied by given background knowledge are obtained under a set of four orientation rules, also known as Meek's rules (see Figure B.1 in the Supplement).
The resulting graph then no longer represents an equivalence class, but rather a \emph{restricted equivalence class}.  

More formally, the construction is as follows: Let $\C=(\V,\E)$ be a CPDAG and let $\K=(\R,\F)$ be background knowledge consistent with $\C$. First, orient edges in $\C$ according to $\K$ as in Algorithm \ref{alg:ck}, and let $\C^\K$ denote the PDAG obtained from this procedure. Second, orient additional edges by repeated application of Meek's rules 1-4 until no further change; let $\G$ denote the resulting PDAG. Then $\G$ is the \emph{maximally oriented partially directed acyclic graph} (MPDAG) obtained from $\C$ relative to $\K$ \citep{meek1995}. 

For a given CPDAG $\C$ and background knowledge $\K$, the MPDAG obtained from $\C$ relative to $\K$ is unique. However, the origin of an MPDAG is not unique: Let $\C$ be a CPDAG, and  $\K_1$ and $\K_2$ two distinct sets of background knowledge. If repeated applications of Meek's rules to $\C^{\K_1}$ and $\C^{\K_2}$ lead to the same MPDAG, then $\K_1$ and $\K_2$ are \emph{equivalent} given $\C$. We say that $\K_2$ is \emph{redundant} relative to $\K_1$ if $\K_1\subseteq\K_2$ and $\K_1$ and $\K_2$ are equivalent. We say that a PDAG $\G_1$ is \emph{contained in} another PDAG $\G_2$ if they have the same skeleton and every directed edge in $\G_2$ is also in $\G_1$. We say that $\K_1$ is more \emph{informative} than $\K_2$ if the MPDAG relative to $\K_1$ is contained in the MPDAG relative to $\K_2$.

\subsection{Tiered background knowledge}

In this work we focus on background knowledge about tiered structures.

\begin{definition}[Partial tiered ordering]
Let $\G$ be a PDAG with node set $\mathbf{V}$ of size $p$, and let $T\in\mathbb{N}$, $T \leq p$. A  (partial) tiered ordering of the nodes in $\mathbf{V}$ is a map $\tau\colon \mathbf{V}\to \{ 1,\ldots ,T\}$ that assigns each node $V\in\mathbf{V}$ to a unique tier $t\in\{ 1,\ldots ,T\}$.
\end{definition}

A tiered ordering is partial if multiple nodes are assigned to the same tier. The following properties are implied by the definition and reflect what kind of background knowledge is encoded in a tiered ordering:
\begin{itemize}[align=left]
    \item[(Uniqueness)] A node is assigned to no more than one tier.
    \item[(Completeness)] Every node belongs to a tier.
    \item[(Transitivity)] If $\tau (A)\leq\tau (B)$ and $\tau (B)\leq\tau (C)$ then $\tau (A)\leq\tau (C)$.
\end{itemize}

A tiered ordering imposes background knowledge on a graph by demanding that no directed edges point from later tiers into  earlier tiers, i.e. specifying the forbidden edges accordingly  $\F=\{\{A\leftarrow B\}: \tau(A)<\tau(B), A,B\in\V\}$. Thus,  a tiered ordering provides information on the absence of ancestral relations, but not on their presence. Combining tiered knowledge with a PDAG, it might be possible to construct some ancestral relations which allow us to orient undirected edges as illustrated in Example \ref{smallexample}. 

\begin{example}\label{smallexample}
 Assume that we are given $\V=\{ A, B\}$ and tiered ordering $\tau$ with $\tau (A)<\tau (B)$. This corresponds to the background knowledge $\K$ with forbidden set $\F =\{A\leftarrow B\}$ and no required edges: In this case it could be possible that $A$ is a parent of $B$ or that there is no edge between them. However, if we additionally knew that $A$ and $B$ are adjacent, then our background knowledge would result in the edge orientation $\{A\rightarrow B\}$. 
\end{example}

As illustrated, the cross-tier edges play an important role and are defined as follows.

\begin{definition}[Cross-tier edge]
Let $\G=(\V,\E)$ be a PDAG and $\tau$ a tiered ordering of $\V$. An edge $\{A\rightarrow B\}\in\E$ is a cross-tier edge (relative to $\tau$) if $\tau(A)<\tau(B)$.
\end{definition}

With tiered knowledge $\tau$, all cross-tier edges of a PDAG will be directed. Conversely, $A-B$ only occurs if $\tau(A)=\tau(B)$. Since a tiered ordering $\tau$ of a node set $\V$ unambiguously implies a forbidden edge set we can refer to the MPDAG obtained from a CPDAG relative to a tiered ordering $\tau$, rather than referring to the forbidden edges implied by $\tau$. 
In view of Assumption \ref{assumptionknowledge} and \ref{assumptionCPDAG}, any tiered ordering that does not contradict the directed edges of the CPDAG will be consistent; to establish that there is no contradiction we therefore  only need to verify the cross-tier edges. 


We will refer to MPDAGs relative to exclusively tiered  background knowledge as `tiered MPDAGs'. This is in contrast to general MPDAGs, which can arise from any kind of background knowledge. 

\begin{example}[Equivalence class restricted by tiered ordering] \label{example:constructTMPDAG}

\begin{figure}[!htbp]
\centering
\begin{tikzpicture}[state/.style={thick}]

\node[state] (x1) at (0,1.25) {$A$};
\node[state] (x2) at (0,0) {$B$};
\node[state] (x3) at (2.25,1.875) {$C$};
\node[state] (x4) at (2.25,0.625) {$D$};
\node[state] (x5) at (2.25,-0.625) {$E$};
\node[state] (x6) at (4.5,1.25) {$F$};
\node[state] (x7) at (4.5,0) {$G$};

\node (t0) at (0,-1.35) {$\tau = 1$};
\node (t1) at (2.25,-1.35) {$\tau = 2$};
\node (t2) at (4.5,-1.35) {$\tau = 3$};

\tikzset{dir/.style = {->, -{To[length=6, width=7]}, thick}}
\draw[dir]
(x1) edge (x2)
(x1) edge [bend left] (x3)
(x2) edge [bend right] (x5)
(x3) edge (x4)
(x4) edge (x5)
(x3) edge [bend left] (x6)
(x6) edge (x7)
;

\draw[dashed] 
(1.125,2.625) -- (1.125,-1.375)
(3.375,2.625) -- (3.375,-1.375)
;

\end{tikzpicture}
\caption{DAG $\D=(\V,\E)$ with tiered ordering $\tau$ and three tiers: $A$ and $B$ are assigned to tier 1, while $C$, $D$ and $E$ are assigned to tier 2, and $F$ and $G$ are assigned to tier 3.}
\label{fig.dag1}
\end{figure}

Figure \ref{fig.dag1} shows a DAG $\D$ and a tiered ordering $\tau$ of the nodes in $\D$. The differences between the equivalence class of $\D$ and the restricted equivalence class of $\D$ relative to  $\tau$ are illustrated in Figure \ref{fig.est1}. The CPDAG $\C$ represents the equivalence class of $\D$; as only two out of seven edges are directed the conditional independencies alone do not contain much causal information. Meanwhile, the restricted equivalence class represented by tiered MPDAG $\G$ relative to $\tau$  is much smaller, here six out of seven edges are oriented. $\G$ will naturally contain the same adjacencies and v-structures as $\C$,  the dashed cross-tier edges $A\rightarrow C$ and $C\rightarrow F$ are implied by the forbidden directions, and the dotted edges $C\rightarrow D$ and $F\rightarrow G$ are consequences of Meek's 1st rule which prohibits new v-structures. 
Due to these last additionally implied orientations of previously undirected edges, restricted equivalence classes given tiered background knowledge might be even smaller, and thus more informative, than one might initially expect.

\begin{figure}[!htbp]
\centering
\begin{tikzpicture}[state/.style={thick}]

\node[state] (x1') at (0,1.25) {$A$};
\node[state] (x2') at (0,0) {$B$};
\node[state] (x3') at (2.25,1.875) {$C$};
\node[state] (x4') at (2.25,0.625) {$D$};
\node[state] (x5') at (2.25,-0.625) {$E$};
\node[state] (x6') at (4.5,1.25) {$F$};
\node[state] (x7') at (4.5,0) {$G$};

\node[state] (x1) at (0,4.75) {$A$};
\node[state] (x2) at (0,3.5) {$B$};
\node[state] (x3) at (2.25,5.375) {$C$};
\node[state] (x4) at (2.25,4.125) {$D$};
\node[state] (x5) at (2.25,2.875) {$E$};
\node[state] (x6) at (4.5,4.75) {$F$};
\node[state] (x7) at (4.5,3.5) {$G$};

\node (t1) at (6, 4.5) {CPDAG $\C$};
\node (t2) at (6, 1) {MPDAG $\G$};

\tikzset{undir/.style = {-, thick}}
\tikzset{dir/.style = {->, -{To[length=6, width=7]}, thick}}
\draw[dir]
(x2) edge [bend right] (x5)
(x4) edge (x5)
(x1') edge [bend left] [dashed] (x3')
(x2') edge [bend right] (x5')
(x3') edge [dotted] (x4')
(x4') edge (x5')
(x3') edge [bend left] [dashed] (x6')
(x6') edge [dotted] (x7')
;
\draw[undir]
(x1) edge (x2)
(x1) edge [bend left] (x3)
(x3) edge (x4)
(x3) edge [bend left] (x6)
(x6) edge (x7)
(x1') edge (x2')
;

\end{tikzpicture}
\caption{The CPDAG $\C$ representing the equivalence class of $\D$, and the tiered MPDAG $\G$ representing the restricted equivalence class of $\D$ relative to $\tau$.}
\label{fig.est1}
\end{figure}


\end{example}

\section{Properties of tiered MPDAGs}\label{sec:properties}

When incorporating general background knowledge into a CPDAG, it might be necessary to apply all of Meek's rules 1-4 in order to obtain a maximally informative graph. Meek's 1st rule ensures that no new v-structures are created, while rules 2-4 all concern preventing  directed cycles. By construction,  tiered knowledge imposes an ordering of the nodes; using that this ordering is transitive and complete, the following lemma shows that Meek's 1st rule is sufficient to construct a maximally informative graph. This is a strong result, which will help us prove further results in this and the following sections.
 
\begin{lemma}\label{mainlemma}
Let $\C=(\mathbf{V},\mathbf{E})$ be a CPDAG and let $\tau$ be a tiered ordering of the nodes $\V$. Let $\C^\tau$ be the PDAG obtained according to Algorithm \ref{alg:ck} and let $\G$ be the PDAG obtained by repeatedly orienting edges in $\C^\tau$ according to Meek's rule 1 until no further change occurs. Then $\G$ is the MPDAG obtained from $\C$ relative to $\tau$. 
\end{lemma}

Note that so far we have taken a given CPDAG as the starting point to which tiered background knowledge is added. Alternatively, we can start at an earlier stage with a PDAG $\G$ that contains directed edges only if they belong to v-structures with other edges being undirected. In this case, the tiered ordering can be incorporated into $\G$ by orienting the cross-tier edges and then applying all four of Meek's rules to achieve maximality. Lemma \ref{mainlemma} therefore highlights the extra orientations implied by tiered knowledge on top of the usual orientations. 


While general MPDAGs might contain partially directed cycles, it turns out that when background knowledge arises from tiered structures the completeness and transitivity of tiered orderings ensure that no partially directed cycles can occur:

\begin{theorem}\label{theorem:cycles}
Let $\G=(\V,\E)$ be a tiered MPDAG, then $\G$ does not have any partially directed cycles.
\end{theorem}

The implications of Theorem \ref{theorem:cycles}  allow us to work with tiered MPDAGs in a similar way as  with CPDAGs. The same does not hold for MPDAGs in general  \citep{perkovic2017}; we will elaborate on this in the section below.

It was shown by \cite{andersson1997} that CPDAGs are chain graphs with chordal chain components, which is useful for many purposes. Given Theorem \ref{theorem:cycles} it becomes straightforward to show that the same holds for tiered MPDAGs:

\begin{corollary} \label{corollary:chain}
Let $\G=(\V,\E)$ be a tiered MPDAG, then $\G$ is a chain graph with chordal chain components.
\end{corollary}

Similarly, \citet{wang2022sound}  obtain graphs with chordal  undirected components under a different type of background knowledge:  
Local background knowledge for a node $A\in\V$ is defined as the knowledge of whether $A$ is a cause of $V$, for each $V\in\adj{}{A}$. Although not explicit, this induces a transitivity among the nodes in $\adj{}{A}$.


\subsection{Interpretation of undirected paths}\label{sec:interpret}

In a CPDAG $\C=(\V,\E)$, an undirected path $\pi$ between two nodes $A,B\in\V$ indicates that there exist a DAG $\D_1\in[\C]$ and another DAG $\D_2\in[\C]$  such that $A$ is an ancestor of $B$ in $\D_1$ and $B$ is an ancestor of $A$ in $\D_2$. This is not necessarily the case in a general MPDAG $\G=(\V,\E')$: If there is an edge $A\rightarrow B$ in $\G$ not on $\pi$, i.e. $\G$ has a partially directed cycle, then there exists a DAG $\D_1$ in the class represented by $\G$ in which the path corresponding to $\pi$ is directed from $A$ to $B$, but there cannot be a DAG $\D_2$ in the class represented by $\G$ in which $B$ is an ancestor of $A$ since this would create a cycle. Hence, an undirected path between two nodes in an MPDAG does not necessarily mean that the path can be directed either way. Consequently, it is necessary to check multiple paths in the graph in order to determine whether one path can be directed. Since partially directed cycles do not occur in tiered MPDAGs, the above issue does not occur and, hence, the interpretation of undirected paths in tiered MPDAGs is the same as in CPDAGs. 

\subsection{Adjustment in tiered MPDAGs}\label{sec:adjust}

The \emph{generalised adjustment criterion} \citep{perkovic2018complete} determines whether a set of nodes in a CPDAG constitutes a valid adjustment set in every DAG in the equivalence class. This criterion checks for possibly causal paths, i.e. paths for which there are DAGs in the equivalence class in which these paths are causal. Hence, any undirected path is possibly causal. However, as described in Section \ref{sec:interpret}, this interpretation does not hold for general MPDAGs, and the notion of possibly causal does not transfer directly from CPDAGs to MPDAGs. In order to tackle this issue, \citet{perkovic2017} introduced the notion of \emph{b-possibly causal} paths, which is a stronger requirement but ensures that there are in fact paths in the equivalence class that are causal. With this notion an adjustment criterion for general MPDAGs, the \emph{b-adjustment criterion}, can be given \citep{perkovic2017}. To determine whether a path is b-possibly causal, one needs to check multiple paths in the graph, which can be computationally heavy for large and dense graphs. For tiered MPDAGs, the definition of b-possibly causal paths simplifies to the definition of possibly causal paths, and the generalised adjustment criterion for CPDAGs is valid for tiered MPDAGs as well:

\begin{corollary}\label{corollary:possibly}
    Let $\G=(\V,\E)$ be a tiered MPDAG, and let $\mathbf{X}, \mathbf{Y},\mathbf{Z}\subseteq\V$ be pairwise disjoint node sets. Then $\mathbf{Z}$ satisfies the generalised adjustment criterion relative to $(\mathbf{X}, \mathbf{Y})$ in $\G$ if and only if it satisfies the b-adjustment criterion relative to $(\mathbf{X}, \mathbf{Y})$ in $\G$.
\end{corollary}

A related result is shown in \cite{van2016separators}: They introduce a class of graphs called \emph{restricted chain graphs}, which are chain graphs with (1) chordal chain components, and (2) no unshielded triples of the form $A\rightarrow B-C$. They give a sound and complete adjustment criterion for graphs of this type, and they provide an algorithm to find adjustment sets. Clearly, a tiered MPDAG is a type of restricted chain graph, and the results of \cite{van2016separators} hold for tiered MPDAGs.

\subsection{IDA for tiered MPDAGs}

Covariate adjustment in a CPDAG (MPDAG) requires an adjustment set to be valid in every DAG in the (restricted) equivalence class. The IDA-algorithm \citep{maathuis2009estimating}, instead, finds an adjustment set for each DAG in the class. Enumerating all DAGs in an equivalence class is a computationally heavy task, but it can be done in polynomial time for chain graphs with chordal chain components \citep{wienobst2021polynomial}, hence also for tiered MPDAGs. 


The local IDA-algorithm utilises the fact that if a valid adjustment set exists, then the parent set is always valid \citep{pearl2009causality} and considers the possible parents, i.e. all sets that are parent sets in some DAG in the equivalence class. However, as general MPDAGs can contain partially directed cycles, it cannot be verified locally whether a set of nodes is a possible parent set, and a semi-local version  was introduced to tackle this \citep{perkovic2017}. The joint IDA-algorithm  determines the joint parent sets semi-locally by orienting subgraphs \citep{nandy2017estimating}. Similar to the local IDA-algorithm, this approach fails for general MPDAGs due to potential partially directed cycles. To tackle this issue \citet{perkovic2017} introduced an additional step to check whether the oriented subgraphs are valid. In contrast, for tiered MPDAGs no additional steps are needed, and the original local and joint IDA both remain valid:

\begin{corollary}\label{corollary:ida}
    Let $\G=(\V,\E)$ be a tiered MPDAG. Let $\mathbf{PA}_{\G}(X)$ denote the multiset of parent sets of $X$ in all DAGs represented by $\G$, and let $\mathbf{PA}^{\mathrm{local}}_{\G}(X)$ denote the multiset of parent sets of $X$ obtained from the local IDA algorithm. Then $\mathbf{PA}_{\G}(X)$ and $\mathbf{PA}^{\mathrm{local}}_{\G}(X)$ contain the same distinct elements. Moreover, let $\mathbf{PA}^{\mathrm{joint}}_{\G}(X)$ denote the multiset of parent sets of $X$ obtained from the joint IDA algorithm. Then $\mathbf{PA}_{\G}(X)$ and $\mathbf{PA}^{\mathrm{joint}}_{\G}(X)$ contain the same distinct elements and the ratios of multiplicities of any two elements are the same.
\end{corollary}

In order to adapt the local IDA-algorithm to general MPDAGs, \citet{fang2020ida} introduced a set of local orientation rules to verify whether a set of nodes is a possible parent set of a given node, yielding a fully local version of the IDA that can handle general MPDAGs. While this reduces computation time for general MPDAGs,  it is not necessary for tiered MPDAGs due to Corollary \ref{corollary:ida}.

Finally, the optimal IDA-algorithm \citep{witte2020efficient}  is only semi-local and it includes an additional step to check other parts of the graph: This algorithm is essentially not local as the optimal adjustment set is not likely to be the parent set \citep{witte2020efficient, henckel2022graphical}. In this case, tiered MPDAGs do not have an advantage over general MPDAGs. However, the definition of the optimal adjustment set does simplify in a similar fashion as in Section \ref{sec:adjust}. A minimal version of the IDA-algorithm is proposed by \citet{guo2021minimal}; like the optimal IDA, this method is  non-local by construction, and tiered MPDAGs do not provide an advantage in this case.


\section{Comparing tiered background knowledge}\label{sec:character}

In this section, we investigate how different tiered background knowledge can be compared. This is relevant in situations where different experts are consulted or where eliciting more detailed knowledge may require more effort. It also provides insights into what kind of background knowledge is especially valuable. In case of a cohort study spanning a whole life-time, we have a clear tiered structure due to the time-ordering, but the tiers might be large, and we need to consult different experts in order to refine the tiers depending on their expertise in e.g. children’s health, life style factors, or diseases common among the elderly. This can be very costly and time consuming to obtain, and it is then beneficial to know how to prioritise. Moreover, time-ordering has the advantage of being correct, whereas other ways of motivating tiers might be less certain. The results of this section could therefore also be used at the design stage of a, say, cohort study: As our results will show, the finer we can reliably partition early variables into tiers by the design alone, the more informative it will be for causal structure learning.

Throughout, we will only compare tiered orderings that are compatible in the sense that they do not contradict each other on the ordering of the nodes; this is in line with Assumption \ref{assumptionknowledge}. Hence, the orderings can only disagree on the status of an edge being a cross-tier edge, not the direction of it.

Consider two different tiered orderings $\tau_i$ and $\tau_j$. If for all $A,B\in\V:$ $\tau_i (A)<\tau_i (B)\Rightarrow\tau_j (A)<\tau_j (B)$, then $\tau_j$ is \emph{finer} than $\tau_i$, and $\tau_i$ is \emph{coarser} than $\tau_j$. In this case we have for the respective sets of forbidden edges that $\F_i\subseteq\F_j$.

Note that a finer tiered ordering can be redundant compared to the coarser one; otherwise  it must be more informative. However, two tiered orderings can be different without one being finer or coarser than the other. They can then either be equivalent, or one can be more informative than the other, or they are incomparable.

To compare tiered orderings on a given CPDAG, we must compare the resulting tiered MPDAGs. We give a graphical criterion for their  equivalence in Section \ref{sec:equivalent}. Evidently, this provides a criterion for redundancy. More interestingly, this also provides insight into when one ordering is more informative than another as addressed in Section \ref{sec:compareorderings}.


\subsection{Equivalence of tiered MPDAGs}\label{sec:equivalent}

First, we need some further terminology: 

\begin{definition}[Earlier path]
Let $\G=(\V,\E)$ be a PDAG, let $\tau$ be a tiered ordering of the nodes $\V$, and let $\pi_1$ and $\pi_2$ be two arbitrary paths in $\G$. If $\pi_1$ contains a node $V$ with $\tau(V)<\tau(W)$ for all nodes $W$ on $\pi_2$, we say that $\pi_1$ is earlier than $\pi_2$; correspondingly,  $\pi_2$ is later than $\pi_1$. A path is earliest if it does not contain subpaths of any earlier paths.
\end{definition}

We say that an edge is \emph{fully shielded} if it does not occur on any unshielded path. Hence, $A\any B$ is a fully shielded edge in $\G$ iff $\adj{\G}{A}\setminus\{ B\}=\adj{\G}{B}\setminus\{ A\}$.


In the following, $\C_u$ denotes the undirected subgraph obtained by omitting every directed edge in a graph $\C$, and $\C_u^\tau$ refers to the graph obtained by orienting edges in $\C_u$ according to the tiered ordering $\tau$. 

\begin{theorem}\label{mainthm}
Let $\C$ be a CPDAG and let $\tau_1$ and $\tau_2$ be distinct tiered orderings. Let $\G_1$ be the tiered MPDAG obtained from $\C$ relative to $\tau_1$ and $\G_2$ the tiered MPDAG obtained from $\C$ relative to $\tau_2$. Then $\G_1=\G_2$ if and only if

\begin{itemize}
    \item[(i)] $\C^{\tau_1}_u$ and $\C^{\tau_2}_u$ agree on the first cross-tier edge on any earliest unshielded path and
    \item[(ii)] $\C^{\tau_1}_u$ and $\C^{\tau_2}_u$ agree on any fully shielded cross-tier edge.
\end{itemize}
\end{theorem}

    
A path can have at most two first cross-tier edges. If a path contains two first cross-tier edges, the above condition requires $\C^{\tau_1}_u$ and $\C^{\tau_2}_u$ to agree on both cross-tier edges. 


\begin{example}[Equivalent tiered orderings]\label{example:mainthm}

We again consider the DAG $\D$ and the tiered ordering $\tau$ in Figure \ref{fig.dag1} from Example \ref{example:constructTMPDAG}. Now we compare $\tau$ to the tiered ordering $\tau'$ with
\begin{align*}
    \tau'(A)&=\tau'(B)=1\\
    \tau'(C)&=\tau'(D)=\tau'(E)=\tau'(F)=\tau'(G)=2
\end{align*}

\begin{figure}[!htbp]
\centering
\begin{tikzpicture}[state/.style={thick}]

\node[state] (x1) at (0,1.25) {$A$};
\node[state] (x2) at (0,0) {$B$};
\node[state] (x3) at (2.25,1.875) {$C$};
\node[state] (x4) at (2.25,0.625) {$D$};
\node[state] (x5) at (2.25,-0.625) {$E$};
\node[state] (x6) at (4.5,1.25) {$F$};
\node[state] (x7) at (4.5,0) {$G$};

\tikzset{undir/.style = {-, thick}}
\draw[undir]
(x2) edge (x1)
(x1) edge [bend left] (x3)
(x3) edge (x4)
(x3) edge [bend left] (x6)
(x6) edge (x7)
;

\end{tikzpicture}
\caption{The undirected subgraph $\C_u$ of the CPDAG $\C$ from Figure \ref{fig.est1}.}
\label{fig:cu}
\end{figure}

In Figure \ref{fig:cu} we have the undirected subgraph $\C_u$ of the CPDAG $\C$ of $\D$.   There are two earliest unshielded paths in $\C_u$: $B-A-C-F-G$ and $B-A-C-D$. Clearly, $\C_u^{\tau}$ and $\C_u^{\tau'}$ agree on the first cross-tier edges on these paths: On $\langle B, A, C, F, G\rangle $ the first cross-tier edge is $A\rightarrow C$, and on $\langle B, A, C, D\rangle$ the first cross-tier edge is $A\rightarrow C$. Note that in these graphs, no edge is fully shielded. Orienting edges in $\C_u^{\tau}$ and $\C_u^{\tau'}$ repeatedly according to Meek's 1st rule results in the same MPDAG. Since $\tau$ is finer than $\tau'$, it follows that $\tau$ is redundant relative to $\tau'$ given $\C$. 
\end{example}


\subsection{Comparing tiered orderings}\label{sec:compareorderings}

We have now characterised when two tiered orderings produce the same MPDAG; as a consequence of this, Theorem \ref{mainthm} also gives rise to useful insights concerning when tiered knowledge is more informative. 

\begin{corollary}\label{corollary:informative}
Let $\C=(\V, \E)$ be a CPDAG  and let $\tau_1$ and $\tau_2$ be two distinct tiered orderings of $\V$. Assume that 
\begin{itemize}
    \item[(i)] every first cross-tier edge on an earliest unshielded path in $\C^{\tau_2}_u$ is also a cross-tier edge in $\C^{\tau_1}_u$, and 
    \item[(ii)] every fully shielded cross-tier edge in $\C^{\tau_2}_u$ is also a cross-tier edge in $\C^{\tau_1}_u$,
\end{itemize}
then $\tau_1$ is more informative than $\tau_2$ given $\C$ if 
\begin{itemize}
    \item[(iii)] $\C^{\tau_1}_u$ has a first cross-tier edge on an earliest unshielded path, that is not a cross-tier edge in $\C^{\tau_2}_u$, or
    \item[(iv)] $\C^{\tau_1}_u$ has more fully shielded cross-tier edges than $\C^{\tau_2}_u$.
\end{itemize}
\end{corollary}

In Corollary \ref{corollary:informative}, (i) and (ii) ensure that the MPDAG $\G_1$ relative to $\tau_1$ 
is contained in the MPDAG $\G_2$ relative to $\tau_2$. Additional information can be obtained in two ways: Either directly from the tiered ordering, reflected in condition (iv), or indirectly through Meek's 1st rule, reflected in condition (iii).

Consider first condition (iii). This condition does not mention the number of cross-tier edges; it is only important to know the earliest.

\begin{example}\label{example:unshielded}
Consider again the DAG $\D=(\V,\E)$ from Example \ref{example:constructTMPDAG} and Example \ref{example:mainthm}. Consider now the tiered ordering $\tau_1$ with 
\begin{align*}
\tau_1(A)&=\tau_1(B)=1\qquad\qquad\qquad\qquad\qquad \\ 
\tau_1(C)&=2\\
\tau_1(D)&=\tau_1(E)=3\\
\tau_1(F)&=4 \textit{, and}\\ 
\tau_1(G)&=5
\end{align*}

Clearly, $\tau_1$ results in the same MPDAG as $\tau$ and $\tau'$; since $\tau_1$ is finer than both $\tau$ and $\tau'$ it is redundant given $\C$. Consider now the tiered ordering $\tau_2$ of $\V$ with 
\begin{align*}
\tau_2(A)&=\tau_2(B)=\tau_2(C)=1\qquad\qquad\qquad \\ 
\tau_2(D)&=\tau_2(E)=2\\
\tau_2(F)&=3 \textit{, and}\\ 
\tau_2(G)&=4
\end{align*}

Let $\G'$ be the MPDAG relative to $\tau_2$ given $\C$. Then $\G'$ has the same edge orientations as the MPDAG relative to $\tau$, $\tau'$ and $\tau_1$, except for $\{A\rightarrow C\}$ which remains undirected in $\G'$. Here, $\tau_1$ is finer than $\tau_2$, but $\tau_1$ is not redundant. In fact, $\tau_1$ is more informative than $\tau_2$ due to condition (iii) of Corollary \ref{corollary:informative}. In addition, $\tau$ and $\tau'$ are both more informative than $\tau_2$, even though $\tau_2$ assigns the nodes to more tiers. 
\end{example}

Consider condition (iv) in Corollary \ref{corollary:informative}. This suggests that every fully shielded cross-tier edge provides unique information. Hence, there is an immediate gain in information from each additional fully shielded cross-tier edge in $\C_u$, since edges of this type cannot be oriented by Meek's 1st rule. Hence, in the complete subgraphs of $\C_u$, no tiered background knowledge is redundant. 

\begin{example}[Fully shielded edges]
Consider the simple case of a CPDAG $\C=(\V,\E)$ with three nodes $\V=\{ A, B, C\}$, where $\C$ is complete: $\E=\{\{A-B\},\{B-C\},\{A-C\}\}$. Assume that the true ordering $\tau_\alpha$ assigns the nodes to individual tiers with $\tau_\alpha(A)<\tau_\alpha(B)<\tau_\alpha(C)$. There are then three types of partial orderings that are compatible with $\tau_\alpha$: Orderings that assign the nodes to the same tier, e.g. an ordering $\tau_\beta$ with  $\tau_\beta(A)=\tau_\beta(B)=\tau_\beta(C)$, or orderings that assign two nodes to the same tier and the third to an individual tier, e.g. $\tau_\gamma$ and $\tau_\delta$ with $\tau_\gamma(A)<\tau_\gamma(B)=\tau_\gamma(C)$ and $\tau_\delta(A)=\tau_\delta(B)<\tau_\delta(C)$.


\begin{figure}[!htbp]
\centering
\begin{tikzpicture}[state/.style={thick}]

\node[state] (A1) at (0,0) {$A$};
\node[state] (B1) at (1.5,0) {$B$};
\node[state] (C1) at (3,0) {$C$};

\node[state] (A2) at (4,0) {$A$};
\node[state] (B2) at (5.5,0) {$B$};
\node[state] (C2) at (7,0) {$C$};

\node[state] (A3) at (0,-2.75) {$A$};
\node[state] (B3) at (1.5,-2.75) {$B$};
\node[state] (C3) at (3,-2.75) {$C$};

\node[state] (A4) at (4,-2.75) {$A$};
\node[state] (B4) at (5.5,-2.75) {$B$};
\node[state] (C4) at (7,-2.75) {$C$};

\node (t1) at (1.5, -0.75) {$\G_\alpha$};
\node (t2) at (5.5, -0.75) {$\G_\beta$};
\node (t3) at (1.5, -3.5) {$\G_\gamma$};
\node (t4) at (5.5, -3.5) {$\G_\delta$};

\tikzset{undir/.style = {-, thick}}
\tikzset{dir/.style = {->, -{To[length=6, width=7]}, thick}}
\draw[dir]
(A1) edge (B1)
(A1) edge [bend left=60] (C1)
(B1) edge (C1)

(A3) edge (B3)
(A3) edge [bend left=60] (C3)

(A4) edge [bend left=60] (C4)
(B4) edge (C4)
;
\draw[undir]
(A2) edge (B2)
(A2) edge [bend left=60] (C2)
(B2) edge (C2)

(B3) edge (C3)

(A4) edge (B4)
;

\end{tikzpicture}
\caption{Top left: MPDAG $\G_\alpha$ relative to $\tau_\alpha$; note that this is equal to the underlying DAG. Top right: MPDAG $\G_\beta$ relative to $\tau_\beta$; note that this is equal to the CPDAG $\C$. Bottom left: MPDAG $\G_\gamma$ relative to $\tau_\gamma$. Bottom right: MPDAG $\G_\delta$ relative to $\tau_\delta$.}
\label{fig:ex3}
\end{figure}

Figure \ref{fig:ex3} shows that the MPDAGs $\G_\alpha$ (relative to $\tau_\alpha$), $\G_\beta$ (relative to $\tau_\beta$), $\G_\gamma$ (relative to $\tau_\gamma$), and $\G_\delta$ (relative to $\tau_\delta$) are distinct. All oriented edges are implied by the tiered background knowledge and no edge has been oriented as a consequence of Meek's 1st rule. Here, $\tau_\delta$ and $\tau_\gamma$ are incomparable, and they are both more informative than $\tau_\beta$. Moreover, $\tau_\alpha$ is more informative than the other orderings.

\end{example}

\subsection{Simulation study}\label{sec:sim}

Corollary \ref{corollary:informative} shows that in graphs with many unshielded paths we can potentially obtain a large amount of additional information, and the  earlier we are able to identify the direction of a causal path, the more information we can gain. In summary: (1) early knowledge is in general more beneficial than late knowledge, even if the late knowledge is more detailed (cf.\ Example \ref{example:unshielded}), and (2) since we expect unshielded paths to occur more frequently in sparse graphs, the effect of Meek's 1st rule is expected to be more pronounced in sparse than in dense graphs. In order to investigate to which degree these two features occur in practice, we conducted a simulation study. 

\begin{figure}[!htbp]
    \centering
    \includegraphics{sim.pdf}
    \caption{Results of one setting of the simulation. 6000 random DAGs with 25 nodes were generated; half of them sparse, the other half dense. For each DAG and tiered ordering, the tiered MPDAG was constructed and the difference in number of directed edges to its corresponding CPDAG was computed, divided by the total number of edges.}
    \label{fig:sim_main}
\end{figure}

To adhere to Assumption \ref{assumptionknowledge}, we generated random DAGs and for each random DAG, we considered five different, consistent tiered orderings: full knowledge, early detailed knowledge, late detailed knowledge, early simple knowledge and late simple knowledge. For each DAG, we constructed its CPDAG, and for each combination of DAG and tiered ordering, we constructed the tiered MPDAG. To adhere to Assumption \ref{assumptionCPDAG}, the CPDAGs and MPDAGs were constructed based on the independence models encoded by the DAGs; hence, finite sample issues did not occur. For each MPDAG, we counted the difference in number of directed edges between the MPDAG and its corresponding CPDAG, and divided this by the total number of edges; this  measures the fraction of edges that cannot be oriented in the CPDAG, but can be oriented in the tiered MPDAG. Since we compare oracle CPDAGs to oracle MPDAGs, this  measures exactly the (relative) gain in informativeness. A detailed description of the study can be found in Section C in the Supplement.

 Figure \ref{fig:sim_main} shows that using the full knowledge of the orderings unsurprisingly provides most new orientations. Moreover, we see that early knowledge is more beneficial than late, even though they are equally detailed, which is in line with (1) above. Interestingly, in dense graphs early knowledge can also induce more oriented edges than late knowledge, even if the later knowledge is more detailed, which is also in line with (1). Additionally, the advantage of adding tiered background knowledge is relatively larger in sparse graphs than dense graphs, which is in line with (2). 
 
 We performed the same procedure for random DAGs with node sets of size 10, 50 and 100, for which we found analogous results. These results are  depicted in Figure C.2 in the Supplement.

 \subsection{Practical example}\label{sec:example}


\begin{figure}[!htbp]
\centering
\begin{tikzpicture}[state/.style={thick}, scale=0.75]

\filldraw[silver] (-.875,4.5) rectangle +(1.75,-7) ;

\filldraw[silver] (1.125,4.5) rectangle +(1.75,-7) ;

\filldraw[silver] (3.125,4.5) rectangle +(1.75,-7) ;

\node[align=center] (A) at (0,2.5) {\footnotesize mother's\\ age};
\node[align=center] (B) at (2,0.25) {\footnotesize smoking\\ \footnotesize during\\ \footnotesize pregnancy};
\node[align=center] (C) at (4,2.5) {\footnotesize weeks\\ \footnotesize pregnant};
\node[align=center] (D) at (4,-1.5) {\footnotesize birth\\ \footnotesize weight};

\node[align=center] (E) at (7,2.5) {\footnotesize sleep\\ \footnotesize quality};
\node[align=center] (F) at (7,-.5) {\footnotesize insulin\\ \footnotesize resistance};

\node[align=center] (G) at (9,4) {\footnotesize media\\ \footnotesize consumption};
\node (H) at (9,1) {\footnotesize well being};
\node (I) at (9,-2) {\footnotesize BMI};

\tikzset{dir/.style = {->, -{To[length=5.5, width=6.5]}, line width = 0.5pt}}
\draw[dir]
(A) edge [bend left=35] [dashed] (E)

(D) edge (F)

(E) edge (F)
(E) edge [bend left] [dotted] (G)
(E) edge (H)

(F) edge  (H)
(F) edge [bend right] (I)

(G) edge (H)

(H) edge (I)

;

\draw[dashed] 
(5.375,4.5) -- (5.375,-3)
;

\tikzset{undir/.style = {-, line width = 0.5pt}}
\draw[undir]
(A) edge (B)
(A) edge (C)

(B) edge (C)

(C) edge (D)
;

\node (t1) at (2,-3) {\footnotesize wave 1};
\node (t2) at (8,-3) {\footnotesize wave 2};

\end{tikzpicture}
\caption{Simplified example of a cohort study. Early life factors are measured at wave 1, childhood health factors  at wave 2.  Edges are oriented by the v-structures, time-ordering and Meek's rules. Expert knowledge  allows the first tier to be subdivided into three new tiers.}
\label{fig:practicalexample}
\end{figure}

Figure \ref{fig:practicalexample} is a simplified example based on cohort data analysed in \citet{foraita2022}; it shows the MPDAG obtained from the time-ordering of wave 1 and 2. The corresponding CPDAG only has two fewer directed edges (the dashed and dotted), so the time-ordering does not provide much new information. The induced subgraph over the first wave remains undirected. In order to obtain a more informative graph, we should consult early life experts rather than children's life style experts; alternatively, the cohort study should have been designed such that early life factors were measured at different time points. While mother's age naturally is determined before smoking during pregnancy, which again occurs before birth, experts could disagree on the causal order of pregnancy duration and birth weight, which are defined at the exact same time. However, it is not necessary to order these particular two nodes, here: Any ordering $\tau$ with $\tau(\text{mother's age})<\tau(\text{smoking during pregnancy})<\tau(\text{remaining nodes})$ allows for the {\em entire} graph to be oriented in this case.


\section{Relation to other work} \label{sec:comparisons}

Subject matter background knowledge can come from different sources and take different forms, and previous work provides results for types of knowledge other than tiered knowledge. A distinct type of causal background knowledge is, for instance, obtainable when experimentation is possible, see \cite{hauser2012characterization} for a characterisation of interventional equivalence classes of DAGs. While a tiered ordering is given before learning a graph, experiments can be performed iteratively, and the choice of most informative interventions might depend on the given intermediate graphical structure. It has been shown in \cite{eberhardt2008almost} and \cite{hauser2014two} that the most informative strategy is to intervene on nodes in the largest undirected complete subgraphs: this yields most new edge orientations, including those following from  Meek's rules. Since the knowledge obtained from an intervention is local, this type of background knowledge lacks the completeness of tiered knowledge. This means that all four of Meek's rules might apply after orienting edges according to interventional knowledge, resulting in additional orientations within complete subgraphs, in contrast to tiered background knowledge. 

\cite{mooij2020joint} considered context variables. These can be seen as a special case of tiered knowledge: Some of the variables, the context variables, form an earlier tier, and others, the system variables, form later tiers. However, there can be additional knowledge about presence/absence of relations between context variables, or their causal relations may not be of interest: In these cases we are no longer in the tiered framework. 

Background knowledge about non-ancestral (pairwise) relations,  considered by \cite{fang2020ida},  can be seen as a non-complete version of tiered knowledge. They show that knowledge of non-ancestral relations can be translated to a set of direct causal relations, i.e. directed edges. In contrast, the completeness and transitivity of tiered knowledge subsumes such relations through the orientations of cross-tier edges. A more general representation of background knowledge is provided in \cite{fang2022representation}, where   ancestral background knowledge on node pairs is considered; this is surprisingly different from tiers and cannot necessarily be encoded  graphically. The authors provide a criterion for checking equivalence of background knowledge, which is more general than the one provided here since tiered background knowledge can be considered a complete version of pairwise causal constraints. However, unlike \citet {fang2022representation}, our criterion can be checked on the graph, and due to the properties of tiered orderings, it is rather simple. 

Multivariate time series, as repeated measurements of the same variables over time \citep{malinsky2018causal, runge2019inferring}, have a very obvious and unambiguous tiered ordering, and in this sense our results extend to time series. But because time series are observations on a single unit over a long time instead of multiple i.i.d.\ observations, the models typically impose additional  structure: for instance, that each variable depends on its own past, thus forcing edges, which is a restriction that we have not considered; and that there is a limited memory (e.g. $k$-th order), thus disallowing edges, which we have also not considered; and importantly, for time series a stationarity or slow/smooth change of the structure is enforced which is also not covered by tiered background knowledge. 
Our results on the ensuing equivalence classes, such as absence of partially directed cycles, still apply under these additional structural assumptions as they restrict the  skeleton  of the true CPDAG, and the tiered (in this case temporal) background knowledge complements them.  
Considering informativeness, sometimes it may be possible to impose additional tiers within time-slices if there is a known order for the contemporaneous variables, e.g. due to biological processes underlying medical time series; this could be useful in obtaining more edge orientations. 

A different line of work on using background knowledge relaxes the assumption of causal sufficiency. Latent variables can be accommodated in maximal ancestral graphs (MAGs) \citep{richardson2002ancestral}, and the corresponding equivalence classes are represented by partial ancestral graphs (PAGs), see the characterisation by \cite{ali2009markov}. Different orientation rules are needed, and a set of ten rules was introduced by \cite{zhang2008} ensuring the maximally informative PAG. For added background knowledge it has not yet been shown, in general, that these ten rules yield a maximally informative graph. However, this has been shown for tiered background knowledge under the extra assumptions of no cross-tier confounding and no selection bias \citep{andrews2020completeness}. Moreover, in this case it turns out that not all ten orientation rules are needed,  similarly to our Lemma \ref{mainlemma}. We therefore conjecture that analogous results to those of Section \ref{sec:properties} extend to PAGs with tiered background knowledge under the assumptions of no selection bias and no cross-tier confounding. Further work is still needed to relax the often implausible assumption of no cross-tier confounding. 


\section{Discussion}

By formalising equivalence classes restricted by tiered orderings, we provided some new insights: Tiered MPDAGs do not have partially directed cycles and are chain graphs with chordal chain components; this makes them easier to handle and interpret, e.g. for causal effect estimation. We have given a characterisation of tiered MPDAGs which clarified what can be gained by adding tiered knowledge and what will still remain unknown. Sparse graphs, in particular, will benefit much from edge orientations implied by the tiered ordering; further, eliciting background knowledge to separate out early tiers is especially informative. Hence, this is when we do become `wiser with time'.

In summary, we believe that oftentimes background knowledge comes in the form of a tiered ordering. Moreover, tiered knowledge can be expected to  be reliable especially when based on temporal information. A benefit of the tiered orderings is that in case of doubts or disagreements about  the ordering, these may be resolved by  coarsening the tiers and thus arrive at, say, a consensus among differing expert opinions. Tiered MPDAGs are, therefore, at least as plausible as their corresponding CPDAGs without background knowledge. In addition, tiered MPDAGs will also be at least as informative as their corresponding CPDAGs -- in practice they will often be much more informative. While it is self-evident that any background knowledge should be exploited for causal structure learning, we have illustrated how specific aspects of the background knowledge, and how much of it, result in an information gain that goes well beyond the orientation of cross-tier edges. 

\begin{acknowledgements} 
This project was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) – Project 281474342/GRK2224/2. Support from DFG (SFB 1320 EASE) is also acknowledged. We would like to thank the reviewers for their helpful and constructive comments. 
\end{acknowledgements}

\bibliography{bang_753}

\end{document}