\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsthm}
 \usepackage{algorithm2e}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\red}[1]{\textcolor[rgb]{1,0.1,0.1}{#1}} 
\newcommand{\blue}[1]{\textcolor[rgb]{0.,0.,0.75}{#1}} 

\newtheorem{definition}{Definition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
 \newcommand{\indep}{\perp\!\!\!\!\perp} 
\newcommand{\oo}{{\circ\!{\--}\!\circ}}

\setcounter{assumption}{-1}

\title{Causal Discovery for time series from multiple datasets with latent contexts}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<wiebke.guenther@dlr.de>?Subject=UAI 2023 CD from multiple datasets with latent contexts}{Wiebke~G\"{u}nther}{}}
\author[2,1]{Urmi~Ninad}
\author[1,2]{Jakob~Runge}
% Add affiliations after the authors
\affil[1]{%
    German Aerospace Center\\
    Institute of Data Science\\
    07745 Jena, Germany
}
\affil[2]{%
    Technische Universit\"{a}t Berlin\\
    Dept. of Electrical Engineering and Computer Science\\
    10623 Berlin, Germany
}
  

\begin{document}
\maketitle

\begin{abstract}
Causal discovery from time series data is a typical problem setting across the sciences. Often, multiple datasets of the same system variables are available, for instance, time series of river runoff from different catchments. The local catchment systems then share certain causal parents, such as time-dependent large-scale weather over all catchments, but differ in other catchment-specific drivers, such as the altitude of the catchment. These drivers can be called temporal and spatial contexts, respectively, and are often partially unobserved. Pooling the datasets and considering the joint causal graph among system, context, and certain auxiliary variables enables us to overcome such latent confounding of system variables. In this work, we present a non-parametric time series causal discovery method, \emph{J(oint)-PCMCI$^+$}, that efficiently learns such joint causal time series graphs when both observed and latent contexts are present, including time lags. We present asymptotic consistency results and numerical experiments demonstrating the utility and limitations of the method.
\end{abstract}

\section{Introduction}\label{sec:intro}
Causal discovery from observational data has gained widespread interest in recent years. Next to score-based methods~\cite{chickering2002learning}, Granger causality~\cite{granger1969investigating}, and the more recent restricted structural causal models (SCM) framework~\cite{peters2017elements,spirtes2016causal}, the constraint-based approach~\citep{spirtes2000causation} to this discovery task exploits conditional independencies in the data to constrain causal graphs and can flexibly handle nonlinear dependencies.

Most real world data comes in the form of \emph{time series}, which provide opportunities and challenges for causal discovery~\citep{runge2019inferring}. While the inherent time order implies certain causal directions, time-series data typically violates the \emph{i.i.d.} assumption usually made in  conditional independence testing. Therefore, specific algorithms that target the challenges of time-series data have become an increasingly popular sub-category within causal discovery, for instance, versions of the PC algorithm and FCI \citep{entner2010causal,malinsky2018causal}, or the PCMCI framework~\citep{runge2019detecting,runge2020discovering,gerhardus2020high}. 

The methods mentioned above consider single multivariate time series datasets and aggregate samples across time. Another relevant development has been the incorporation of multiple datasets and modeling their different contexts~\citep{mooij2020joint,huang2020causal}, which can also be framed as a data-fusion problem~\citep{pearl2011transportability,bareinboim2016causal}. 

In the following, we illustrate the main ideas of this paper on the example of time series datasets of river runoff from different catchment systems~\citep{wagener2007catchment}. If we assume all of these to come from the same (stationary) distribution, we can just concatenate (pool) the data to obtain a larger sample and, hence, more reliable causal discoveries among the system variables. But multiple datasets can also be used to de-confound relationships: The local catchment systems often share certain causal parents, such as time-dependent large-scale weather dynamics over all catchments, that can be called temporal contexts and cause latent confounding between two or more system variables, if they are unobserved. If we now assume that such a time-dependent latent confounder is the same across all datasets, we can condition on the time index (or add a so-called \emph{time-dummy variable}) by aggregating samples across datasets, instead of aggregating across time. This then yields a joint graph across all time points.

But datasets not only share common causal drivers, they also differ in other dataset-specific drivers, such as the altitude or vegetation-type of the catchment, which can be called spatial contexts. As spatial contexts are constant across time, they do not constitute a confounding \emph{within} each dataset. However, in the pooled data they vary across datasets and lead to confounding in the joint graph across datasets. If spatial contexts are observed, they can be included as variables in the analysis and if they are unobserved, they can be de-confounded by the same idea as time-dependent confounders by assuming that they are constant across all time points and conditioning on the dataset index (or adding a so-called \emph{space-dummy variable}, not to be confused with physical space). The idea of conditioning on time and space is heavily employed in \emph{fixed-effect panel regression models} in econometrics~\citep{angrist2009mostly} and here we consider these for causal discovery. 

Next to deconfounding system variables, observed context variables can help to orient causal links: Consider two system variables $X\oo Y$ whose causal direction cannot be identified due to Markov equivalence. In the joint graph we could add a context variable $C$ and assume (or learn) that $C\rightarrow X \oo Y$. Then applying the collider or orientation rules~\citep{meek1995causal} allows us to infer the causal direction between $X$ and $Y$.


Our approach partially follows \citet{mooij2020joint}. Their approach is to pool data from different contexts, for instance, observational and interventional data, and do causal discovery on the pooled dataset, called \emph{joint causal inference} (JCI). In particular, they established a general framework to (i)~interpret contexts as auxiliary variables that describe the context of each dataset, (ii)~pool all the data from different contexts while keeping the contextual information of the data by including the auxiliary context variables into a single dataset, and (iii)~apply standard causal discovery to all data jointly, incorporating appropriate background knowledge on the causal relationships involving the context variables.

Our aim is to extend the PCMCI$^+$ time series causal discovery algorithm~\citep{runge2020discovering} to the case of datasets from multiple dataset- or time-dependent contexts with potentially unobserved context confounders of the system variables. We term this technique \emph{J(oint)-PCMCI$^+$} because it combines the two JCI-ideas mentioned above, i.e., pooling datasets from multiple contexts, and adding observed context variables to the graph. We go beyond JCI by providing a specific algorithmic implementation of it in the time-series setting and by using time- and space-dummy variables, which are conceptually similar to the surrogate variables of \citet{huang2020causal}, to account for latent context variables that confound system variables. Therefore, we are faced with the additional challenge of dealing with observed context as well as dummy variables simultaneously, which requires caution due to the fact that they are deterministically related to one another~\cite{lemeire2012conservative}. This approach combines the advantages of PCMCI$^+$ regarding detection power and false positive control in the presence of strong autocorrelation~\citep{runge2020discovering} with the advantage of the JCI framework.
To summarize, we present a consistent causal discovery algorithm J-PCMCI$^+$ that can:
\begin{enumerate}
    \item[(i)] de-confound those system nodes that are confounded by latent contexts without having any knowledge of the latent contexts themselves;
    \item[(ii)] retain as much information about the causal links between the observed context and system variables as possible by checking conditional independencies appropriately;
    \item[(iii)] discover the correct induced causal graph between the system nodes.
\end{enumerate}


\section{Related Work}\label{sec:related}

In causal inference, the idea of context variables has been explored under different names: ``policy variables''~\cite{spirtes2000causation}, ``force variables''~\cite{pearl1993comment}, ``decision variables'' in influence diagrams~\cite{dawid2002influence}, ``selection variables'' in selection diagrams~\cite{bareinboim2013general}, and ``environment variables''~\cite{peters2016causal}. \cite{mooij2020joint} established a general framework to combine data from multiple contexts with traditional causal discovery techniques.

In \cite{huang2020causal}, the problem of \emph{heterogeneous} data, which might correspond to varying dataset collection conditions (analogous to contexts in JCI), as well as the problem of \emph{non-stationary} data was addressed in a framework called CD-NOD.
Here, changing causal mechanisms across time or datasets were interpreted as confounding of the system by an unobserved \emph{pseudo-confounder}, so named because it can be written as a function of the dataset or the time index. This confounding was then addressed by introducing a \emph{surrogate variable} that captures changes of causal mechanisms, thereby deconfounding the pseudo-confounded system variables. Further, they employed the information of changing causal mechanisms to infer more causal directions than standard causal discovery allows for by formalizing independence of cause and mechanism~\cite{peters2017elements}. Note that explicitly known context variables cannot be included in this setup. It also focuses on non-stationarity that can be modeled as a smooth function of time. This is a restriction we do not impose.
 

The concept of changing regimes over time is similar to the presence of a temporal context. In particular, \citet{saggioro2020reconstructing} consider regimes that vary over time and assume that these regimes are not known a priori. This is in contrast to our assumptions that the context variables are constant over time or across the data sets which implies that we know from domain knowledge when a context change might happen. They present the  Regime-PCMCI algorithm that learns the regimes together with the causal graphs within each regime.

Certain Bayesian methods can also deal with heterogeneous data, i.e.\ data from different contexts, e.g.,~\cite{zhou2022causal}. Naturally, identifying the causal structure by a Bayesian method requires strong model assumptions. Instead of using dummy or surrogate variables, the authors suggest imputing possible latent covariates using an embedding method, and they also provide a way to infer the latent covariates jointly with the causal graph.

Constraint-based causal discovery methods for time-series data historically began with Granger causality~\cite{granger1969investigating}, and have since been extended by \citet{entner2010causal, malinsky2018causal, runge2019detecting, runge2020discovering, gerhardus2020high} to cover non-linear relationships, contemporaneous as well as lagged links, latent confounders and highly auto-correlated data. For an overview, see~\cite{runge2019inferring}.


\section{Theoretical Foundations} \label{sec:found}
\begin{figure}[t]
    \centering
    \includegraphics[width=0.75\linewidth]{./intro_after_rebuttal.pdf}
    \caption{\textbf{Causal discovery with temporal- and spatial-contexts.} \textbf{(A)} Two datasets of system variables $X^1,X^2$ may be confounded by the same temporal context $\tilde{C}^1_{\rm time}$, but differ in dataset-specific characteristics that are constant over time, here autocorrelation, which can be represented by a spatial context $\tilde{C}^1_{\rm space}$. J(oint)-PCMCI$^+$ learns the causal relations which can be represented in \textbf{(B)} a summary causal graph (link labels denote time lags) or \textbf{(C)} a time series graph. Samples of the nodes are pooled from both datasets over the (user-defined) stationary part of the time series (grey dotted lines) leading to a repeating structure of the time series graph (grey links). Context nodes can also help orienting links since they can create colliders. If contexts are unobserved, J-PCMCI$^+$ can utilize temporal- or spatial dummy variables to de-confound system variable relationships.}
    \label{fig:intro}
\end{figure}

Within the JCI framework~\citep{mooij2020joint}, the causal relations of a system and its context are represented by a joint (or meta) structural causal model (SCM).
A \emph{system variable} is a time-dependent random variable whose distribution can change across datasets. In the following, a \emph{temporal context variable} is a time-dependent random variable\footnote{The term \emph{random variable} is used for context variables in the same sense as in~\cite{mooij2020joint}.} that remains the same across datasets. A \emph{spatial context variable} is a random variable that is constant over time and within a dataset but can change across datasets. See Fig.~\ref{fig:intro} for an example. The information on which variables belong to the system or to the context is given as a domain assumption. We now formulate an assumption on the data-generating mechanism that is assumed to hold throughout, unless stated otherwise.


\begin{assumption}[Joint time-dependent SCM]\label{ass:SCM}
    The underlying data-generating mechanism across datasets $d\in \mathcal{D}$ with $|\mathcal{D}|=M$ is an acyclic time-dependent structural causal model (SCM) involving the time-dependent system variables $\mathbf{X}_t=\{X_t^i\}_{i\in \mathcal{I}}$ at time $t$ as well as context variables $\tilde{\mathbf{C}}=\tilde{\mathbf{C}}_{time}\dot{\cup} \ 
    \tilde{\mathbf{C}}_{space}$ with temporal contexts $\tilde{\mathbf{C}}_{time,t}=\{\tilde{C}_t^k\}_{k\in \mathcal{K}_{\rm time}}$ and (time-independent) spatial-contexts $\tilde{\mathbf{C}}_{space}=\{\tilde{C}^l\}_{l\in \mathcal{K}_{\rm space}}$, for $i\in \mathcal{I}$:
    
        \begin{equation} \label{eq:scm}
            \begin{split}
            &\mathbf{X}^{d}_t:= \mathbf{f}(\operatorname{Pa}_X(\mathbf{X}^{d}_t),\operatorname{Pa}_{\tilde{C}_{\rm time}}(\mathbf{X}^{d}_t), \operatorname{Pa}_{\tilde{C}_{\rm space}}(\mathbf{X}^{d}_t), \boldsymbol{\eta}^{d}_t) \\
            &\mathbf{\tilde{C}}_{{\rm time}, t} := \mathbf{g}(\operatorname{Pa}_{\tilde{C}_{\rm time}}(\mathbf{\tilde{C}}_{{\rm time}, t}), \boldsymbol{\eta}_{{\rm time}, t}) \\
            &\mathbf{\tilde{C}}^{d}_{\rm space} := \mathbf{h}(\operatorname{Pa}_{\tilde{C}_{\rm space}}(\mathbf{\tilde{C}}^{d}_{\rm space}), \boldsymbol{\eta}_{\rm space}^{d})
            \end{split}
        \end{equation}
    where the exogenous noise variables $(\boldsymbol{\eta}^{d}_t, \boldsymbol{\eta}_{{\rm time}, t}, \boldsymbol{\eta}_{\rm space}^{d})$ are jointly independent and $\eta^{i,d}_{t}$ are identically distributed across time and space, $\eta^k_{{\rm time},t}$ are identically distributed across time, and $\eta^{l,d}_{\rm space}$ are identically distributed across space. $\operatorname{Pa}_X$ denotes the causal parents within $\mathbf{X}$, and analogously for $\operatorname{Pa}_{\tilde{C}_{\rm time}}$ and $\operatorname{Pa}_{\tilde{C}_{\rm space}}$.
\end{assumption}

%Here an SCM is called \emph{simple} if it is uniquely solvable with respect to any subset of variables in the SCM~\citep{bongers2021foundations}. In our setup, this implies that the time-dependent system is stable.
Note that we restrict our exploration to the case of acyclic SCMs. However, since the PC algorithm has been proven to be consistent in the presence of cycles \citep{mooij2020constraint}, we expect that the consistency of our method extends straightforwardly to the case of SCMs with contemporaneous cycles.
As discussed above, we allow for latent context variables. Non-stationary data could be modeled by intervening on a temporal context node simultaneously across all time points.

\begin{assumption}[Markov, Faithfulness and partial causal sufficiency]\label{ass:sufficiency}
    Let $M$ be an SCM of the form~\eqref{eq:scm} with graph $\mathcal{G}$. The joint distribution $P_M(X, \tilde{C})$ induced by the SCM satisfies the Markov property with respect to the graph $\mathcal{G}$.
    Additionally, $P_M(X, \tilde{C})$ is faithful to the graph $\mathcal{G}$ of $M$.
    The collection of datasets from the joint SCM~\eqref{eq:scm} is assumed to contain data from all system nodes $\mathbf{X}_t$, but may contain unobserved context nodes $\mathbf{L}$ and observed context nodes $\mathbf{C}$,
    \begin{equation}
        \tilde{\mathbf{C}}=\mathbf{L} ~\dot{\cup}~ \mathbf{C}.
    \end{equation}
    Subsequently, $\mathbf{L}$ and $\mathbf{C}$ can also be written as a disjoint union over their spatial and temporal components.
\end{assumption}

Note, that in the setup of \eqref{eq:scm} we have included the assumption that context variables are exogenous to the system, see \emph{JCI assumption 1} in \cite{mooij2020joint}.
Furthermore, we assume that no latent context confounds an observed context and a system variable. Since a latent context is the only possible latent confounder in our setup, this assumption is equivalent to \emph{JCI assumption 2} \cite{mooij2020joint}. We make an additional assumption in view of efficiency:

\begin{assumption}[Context-system links]\label{ass:JCI}
    JCI assumption 1 holds.
    Further, a causal link between an observed context and a system variable is not mediated by latent context variables.
\end{assumption}

\begin{assumption}[No context-system confounders]\label{ass:no_conf}
    JCI assumption 2 holds, i.e.\ no latent context confounds an observed context and a system variable.
\end{assumption}

Finally, we adapt the \emph{pseudo-causal sufficiency assumption} of \cite{huang2020causal} for our case as follows:
\begin{assumption}[Context-determinism]\label{ass:asymmetry}
    The spatial context variables are deterministic functions of the dataset index.
    The temporal context variables are deterministic functions of the time-index.
    We assume these functions to be non-invertible.
\end{assumption}

The SCM \eqref{eq:scm} yields a joint time series causal graph $\mathcal{G}$ (see Fig.~\ref{fig:intro}) over all datasets $d$. Note that the spatial context variables appear as a single node in the time series graph. This representation is chosen to denote that they are constant in time and therefore do not have a time-dimension associated with them.
The joint graph $\mathcal{G}$ is related to the target of our discovery task:
\begin{definition}[Target graph]\label{def:target_graph}
   The target graph of J-PCMCI$^+$ is the induced subgraph of $\mathcal{G}$ over the system nodes together with the observed context nodes and their edges to the system nodes.
\end{definition}


\section{Method}\label{sec:method}
The general idea of our method is to include context nodes in the time series graph motivated by the JCI approach. In order to deal with latent context variables, we introduce dummy variables (Sect.~\ref{subsec:def}) before presenting J-PCMCI$^+$ (Sect.~\ref{subsec:alg}) and state consistency results in Sect.~\ref{subsec:theorems}.


\subsection{Dummy variables as proxies for latent confounders}\label{subsec:def}
Let $\mathcal{G}=(V, E)$ be the time series graph corresponding to SCM~\eqref{eq:scm}, where $V$ denotes the set of vertices and $E$ the set of edges between them.
Here, $V = \mathbf{X} \cup \mathbf{C} \cup \mathbf{L}$, where $\mathbf{X}$ (resp.~$\mathbf{C}$ and $\mathbf{L}$) refers to vertices at all time points.
The set of edges $E$ can be written as a disjoint union between edges $E_L$, where at least one of the corresponding nodes is in $\mathbf{L}$, and its complement $E_O$, which consists of edges that only connect observed (system and/or context) nodes. That is, $E = E_L \dot{\cup} E_O.$ Further, $E_O$ is itself a disjoint union of $E_{S}$, which are the edges where at least one of the two nodes is in $\mathbf{X}$ and its complement $E_C$, i.e., $E_O = E_S \dot{\cup} E_C$. 
%In the following definitions, we focus on the space dummy first to ease notation.
Using this notation, we can express the definition of the target graph (definition \ref{def:target_graph}) more formally: Based on a given ground truth graph $\mathcal{G} = (\mathbf{X} \dot{\cup} \mathbf{C} \dot{\cup} \mathbf{L}, E_L \dot{\cup} E_S \dot{\cup} E_C)$, we define the target graph $\Tilde{\mathcal{G}}$ as $\Tilde{\mathcal{G}} = (\mathbf{X} \dot{\cup} \mathbf{C}, E_S)$. See figure 1 in the SM for an example.

\begin{definition}[Space dummy variable]\label{def:space_dummy}
    The space dummy variable $D_{\rm space} $, henceforth referred to as space dummy, is a variable that labels datasets.
\end{definition}

Without prior expert knowledge, this labelling is arbitrary. 
For instance,  the simplest embedding is $D_{\rm space} \in \{1, \ldots, M\}$.
Alternatively, in a one-hot-encoded embedding, the dataset index $i \in \{1, \ldots, M\}$ denotes the position of the $1$ in an $M$-dimensional vector whose other entries are all $0$.
In the following, we work with a one-hot-encoded space dummy. However, we note that the question of which embedding to choose for the space dummy is far from settled, and requires further expert knowledge about the particular setup and the type of conditional independence test to be used in the causal discovery algorithm. Refer to the SM for further details.

Also note that there is no one-to-one relation between datasets and spatial contexts, i.e., two datasets can have the same value for a spatial context.

\begin{definition}[Time dummy variable]\label{def:time_dummy}
    The time dummy variable $D_{\rm time}$, henceforth referred to as time dummy, is a variable that labels each time-step in the time-series data. 
\end{definition}

Here too, we arbitrarily choose the embedding for $D_{\rm time} $ to be a one-hot-encoding into a $T$-dimensional vector, where $T$ is the length of the time-series.


\begin{definition}[Dummy projection]\label{def:dummy_proj}
    We define the dummy-projection of the graph $\mathcal{G}=(\mathbf{X} \cup \mathbf{C} \cup \mathbf{L}, E_L \cup E_O)$ to be the graph $\mathcal{G}_D = (\mathbf{X} \cup \mathbf{C} \cup \{D_{\rm space}, D_{\rm time}\}, \tilde{E})$, where the edges $\tilde{E}$ are defined as:
    \begin{equation*}
        \begin{split}
            \tilde{E} &= \{ (D_{\rm space}, v) \mid \exists\, u \in \mathbf{L}_{space} \colon (u,v) \in E, \ v \in \mathbf{X} \} \\
            \cup & \{ (D_{\rm time}, v) \mid \exists\, u \in \mathbf{L}_{time} \colon (u,v) \in E, \ v \in \mathbf{X}_t \}\cup E_S.
        \end{split}
    \end{equation*}
\end{definition}

%We further specify that $D_{space}$ and $D_{time}$ nodes only have contemporaneous links to system variables, see figure 2 in SM for details. 
Further, note that in the dummy projection, we have omitted the edges $E_C$, i.e., the edges between the observed context variables, since these relationships are not of interest for the target graph (definition \ref{def:target_graph}). 


Finally, we introduce the dummy-deleted graph. For a visualization of the dummy projection and deletion operations, see figure 3 in SM. 

\begin{definition}[Dummy deletion]\label{def:dummy_del}
    Let $\mathcal{G}_D$ be the dummy projection of the graph $\mathcal{G}$. The dummy-deleted graph $\mathcal{G}_{D_{del}}$ is the graph obtained from $\mathcal{G}_D$ by removing the dummy variables and all of their outgoing edges.
\end{definition}

Under assumption \ref{ass:asymmetry}, for $C_{s} \in \mathbf{C}_{space}$ and $L_s \in \mathbf{L}_{space}$, we can always find (not necessarily unique) functions $g_C$ and $g_L$ with $C_s = g_C(D_{\rm space} )$ and $L_s = g_L(D_{\rm space} )$, and analogously for the temporal counterparts.

Such mappings can be assumed to exist, since the dummy $D$ takes a unique value within each dataset (def. \ref{def:space_dummy}), and each spatial context variable $C$ is assumed constant within each dataset (and analogously for the temporal version). Therefore, there exists a mapping $g$ with $C=g(D)$.
There would exist no mapping from the dummy variable to the context variable if the context variable would take two different values within one dataset, but this case is excluded by assumption.

The introduction of the dummy variables into the causal discovery task is for the purpose of removing the confounding effect of latent context variables on a pair of system variables. Latent system variables and their confounding cannot be handled with this method. As we will see in Section \ref{subsec:alg}, our method yields a graph between $\mathbf{X}$, $\mathbf{C}$ and $D$, which is not exactly the dummy projection of the true graph $\mathcal{G}$, but whose dummy deletion is the target graph (definition \ref{def:target_graph}). 


\paragraph{Interpretation of the dummy-projection}
Some caution has to be applied in the interpretation of links between the dummy and system nodes in the dummy-projected graph. These links are only placeholders for the links between unobserved context nodes and system nodes in the ground truth graph. The dummy is not a causal variable itself! 
Further, note that in definition \ref{def:dummy_proj}, we did not include links between the dummy and observed context nodes in the dummy projection since these links are deterministic by assumption~\ref{ass:asymmetry} and always present and thus not informative.


\paragraph{Including both observed context and dummy variables}
A natural question at this point is why one should include the observed context variables at all in the causal discovery task, given that the dummy can remove all influence of context variables because of the general way in which it is defined (Definitions~\ref{def:space_dummy} and~\ref{def:time_dummy}). Theoretically, there would indeed be nothing wrong with excluding contexts altogether. However, the dummy variable is not interpretable, and not useful when the goal is to learn the causal relationship between specific context and system variables.
Further, as we will see below, in the first step our causal discovery algorithm learns the context-system adjacencies, whose underlying relationships may be mathematically simpler than those between the highly general dummy and system variables. In the second step, it learns the dummy-system adjacencies given the context parents learnt in the first step. This helps infer the influence of the dataset label on the system variable that cannot be explained by the context variables, i.e., in essence we learn hidden contexts. Finally, as also pointed out by \citet{mooij2020joint}, the separate observed context nodes help in orienting adjacencies between system variables. See also Fig.~\ref{fig:intro} for a visualization.
 
A naive implementation to learn the causal relationships between context-system and dummy-system, where the context and dummy variables are treated on the same footing when testing adjacencies to the system, would not be correct as the \emph{causal faithfulness assumption} would be violated. This is because the relationship between context and dummy variables is deterministic.
The two-step procedure outlined above circumvents erroneous inferences of adjacencies due to faithfulness violation. For details, see Sect.~\ref{subsec:alg} and the SM.


\subsection{Algorithm}\label{subsec:alg}
In the pooled dataset, we include one variable for the space and time dummy each, as well as for spatial context variables, at time $t$. These nodes can only have contemporaneous links to system nodes since they either do not change over time or contain no information about the temporal structure.
That is, only temporal context variables can have a lagged influence on the system variables; see also Fig.~\ref{fig:intro} and the SM (figure 2).
To be able to deal with observed contexts and dummy variables, that are essentially placeholders for the unobserved context variables, our method first discovers links between system and observed context nodes while ignoring the dummy nodes, and in the next step discovers links between dummy and system nodes. Finally, using the information on the contextual parents of each system node, we do causal discovery on the system node pairs. In the following, we detail this procedure for the non-time series and time-series case. 

\paragraph{Non-time series case:}
To ease the explanations for the time-series case, where both spatial and temporal context variables can occur, we first focus on the non-time-series case, where only spatial context variables can occur. Consequently, we only have to consider the space dummy. Note that Assumption~\ref{ass:SCM} can be simplified to the non-time-series case straightforwardly. We will combine the well-known PC algorithm~\cite{spirtes2000causation} with both observed and dummy context variables. Pseudocode for this method is provided in Algorithm~\ref{alg:nonts}. In the standard setting, the PC algorithm is a constraint-based causal discovery algorithm for the causally sufficient case that relies on the Markov and faithfulness assumptions. In its first stage (skeleton discovery), adjacencies are learned based on iteratively testing conditional independence of pairs of variables at some significance level $\alpha$. Afterwards, the links are oriented based on a set of rules. We will focus on how its skeleton phase needs to be adapted.
 \begin{enumerate}
     \item In the first step, we discover context-system links. We iteratively test independence between the following node pairs $(X^i, C^j)$, and $(C^j, X^i)$ for all $i,j$ while conditioning on subsets of the union over system and observed context nodes $\mathbf{X} \cup \mathbf{C}$. In other words, we initialize a fully connected graph between the system and context variables, eliminate the edges between context variables, and run the skeleton phase. By ignoring the dummy node in this step, we are able to circumvent the faithfulness violation that stems from the fact that every observed context node is a deterministic function of the dummy. 
     By the exogeneity of the context to the system (assumption \ref{ass:JCI}), we already know that any link between context variable $C$ and system variable $X$ is oriented as $C \rightarrow X$. Therefore, we construct the set of observed contextual parents $\text{Pa}_C(X^i)$ of each system variable $X^i$ from all observed context variables that are found to be adjacent to $X^i$.
    \item In the second step, we focus on the discovery of dummy-system links. In particular, we test independence between $D$ and each $X^i \in \mathbf{X}$ conditional on subsets of $\mathbf{X}$ and the found contextual parents $\text{Pa}_C(X^i)$. Combined with the expert knowledge that the dummy cannot be a descendant of a system variable, this gives us the dummy parents of $X^i$. We denote the set of dummy and contextual parents of $X^i$ by $\text{Pa}_{CD}(X^i)$.
    \item Finally, we run the skeleton phase of the full PC algorithm on $\mathbf{X} \cup \mathbf{C} \cup \{D\}$ while incorporating the background knowledge of the links from $\text{Pa}_{CD}(X^i)$ to $X^i$, and no context-context and context-dummy links.

 \end{enumerate}

Since context-system and dummy-system links are oriented by assumption, the orientation phase, see \cite{meek1995causal} for rules, needs to be applied to orient the links between system variables only. Note, however, that we take context-system and dummy-system edges into account whenever they form an unshielded triple with two system variables, i.e., $C \rightarrow X^i \oo X^j$ or $D \rightarrow X^i \oo X^j$. 
This allows us to orient more edges than when only considering triples of system variables.

\RestyleAlgo{ruled}
\begin{algorithm}
\caption{J-PC (for non-time series), pseudocode for poolData, and partialSkeletonPC is provided in SM}\label{alg:nonts}
\KwData{Background knowledge on context-system link orientation $\mathcal{E}$, observational data $(\mathbf{X}^{(m)})_{m=1, \ldots, M}$ in $M$ datasets, observed context variables $(\mathbf{C}^{(m)})_{m=1, \ldots, M}$ for each dataset, dummy variable $D$ with distinct values for each dataset, significance level $\alpha$}
\KwResult{graph $\mathcal{G}$}
$(\mathbf{X}, \mathbf{C}, D) \leftarrow \operatorname{poolData}((\mathbf{X}^{(m)}, \mathbf{C}^{(m)})_{m=1, \ldots, M}, D)$ \\
Set $\mathcal{P}_C := \{ (X, C), (C, X) | X \in \mathbf{X}, C \in \mathbf{C} \}$,\\ $\mathcal{P}_D := \{ (X, D) | X \in \mathbf{X}\}, \mathcal{P}_S := \{ (X, Y) | X, Y \in \mathbf{X} \}$ \\
Set $data_C := (\mathbf{X}, \mathbf{C})$ and $data_D := (\mathbf{X}, \mathbf{C}, D) $ \\
Set $\mathcal{C} = \emptyset$ \\
\For{index in [C, D]}{
    $\mathcal{G} \leftarrow \operatorname{partialSkeletonPC}(data_\text{index}, \alpha, \mathcal{P}_\text{index}, \mathcal{C})$ \\
    \For{$X$ in $\mathbf{X}$}{
        Orient context-system edge as in $\mathcal{E}$\\
        Add contextual parents $\text{Pa}_\text{index}(X)$ as in $\mathcal{G}$ to $\mathcal{C}$
    }
}
%Set \\
$\mathcal{G} \leftarrow \operatorname{partialSkeletonPC}(data_D, \alpha, \mathcal{P}_S, \mathcal{C})$ \\
Orient system edges using PC-orientation rules\\
\textbf{return} $\mathcal{G}$
\end{algorithm}


\paragraph{Time series case:}
\RestyleAlgo{ruled}
\begin{algorithm*}
\caption{J-PCMCI$^+$ (for time series), 
laggedSkeletonPCMCI$^+$ refers to Algorithm 1 in \cite{runge2020discovering}, partialContempSkeletonPCMCI$^+$ is a small adaption of Algorithm 2 of \cite{runge2020discovering} which is further described in the SM, colliderPhase and rulePhase refer to Algorithms 3 and 4 in \cite{runge2020discovering}}\label{alg:ts}
\KwData{Background knowledge on context-system link orientation $\mathcal{E}$, $M$ observational system time series datasets $\textbf{X}=(\textbf{X}^i)_{i \in \mathcal{I}}$, observed temporal context variables $\mathbf{C}_\text{time}$, observed spatial context variables $\mathbf{C}_\text{space}$, temporal and spatial dummy variables $ D_\text{time}, D_\text{space}$, significance level $\alpha$, maximal time lag $\tau_\text{max}$, $CI(X, Y, \mathbf{Z})$}%, 
\KwResult{graph $\mathcal{G}$}
$\{\hat{\mathcal{B}}^-_t(X) | X \in \mathbf{X} \cup \mathbf{C}_\text{time} \} \leftarrow \text{laggedSkeletonPCMCI$^+$}(\mathbf{X} \cup \mathbf{C}_\text{time})$ \red{} \\
Initialize $\mathcal{C}(X) \leftarrow \emptyset$ for all $X \in \mathbf{X}$\\
Set %\red{The indices for $C$ and $X$ should be different for better readability. And why do we need $(X_t, C_t)$ here, isn't that included in the second item in $P_C$?} 
$\mathcal{P}_C := \{ (C^k_{t-\tau}, X^i_t), (X_t^i, C_t^k) | \tau \geq 0, \forall i,k \}$, $\mathcal{P}_D := \{ (X_t^i, D), (D, X_t^i) | \text{for } D \in \{ D_\text{time}, D_\text{space}  \}, ~ \forall i \}$,  $\mathcal{P}_S = \{ ((X^j_{t-\tau}, X_t^i))_{\tau > 0}, (X_t^i, X_t^j) | i,j\}$\\
Set $data_C := \mathbf{X} \cup \mathbf{C}_\text{time} \cup \mathbf{C}_\text{space}$, and $data_D := \mathbf{X} \cup \mathbf{C}_\text{time} \cup \mathbf{C}_\text{space} \cup \{ D_\text{time}, D_\text{space}  \}$\\
\For{index in [C, D]}{
    $\mathcal{G}$ $\leftarrow$ partialContempSkeletonPCMCI$^+$($data_\text{index}$, $|\mathcal{I}|$, $\tau_\text{max}$, $\alpha$, $\hat{\mathcal{B}}^-_t(X)$, $\mathcal{C}(X)$, $\mathcal{P}_\text{index}$)\\
    \For{$X$ in $\mathbf{X}$}{
    orient context-system edge as in $\mathcal{E}$\\
    add all context nodes that are adjacent to $X$ in $\mathcal{G}$ to $\mathcal{C}(X)$\\}   
}

$\mathcal{G}$ $\leftarrow$ partialContempSkeletonPCMCI$^+$($data_D$, $|\mathcal{I}|$, $\tau_\text{max}$, $\alpha$,  $\hat{\mathcal{B}}^-_t(X)$, $\mathcal{C}(X)$, $\mathcal{P}_S$)\\
$\mathcal{G},$ sepset, ambiguous triples, conflicting links $\leftarrow$ colliderPhase($data_D$), i.e., on all unshielded triples $X_{t-\tau}^i \rightarrow X_{t}^k \oo X_{t}^j $ ($\tau > 0$) or $X_{t}^i \oo X_{t}^k \oo X_{t}^j $ or $K \rightarrow X_t^i \oo X_t^j$ with $K \in \mathbf{C}_\text{time} \cup \mathbf{C}_\text{space} \cup \{ D_\text{time}, D_\text{space} \}$\\
$\mathcal{G}$, conflicting links $\leftarrow$ rulePhase($\mathcal{G},$ ambiguous triples, conflicting links)\\
%orientation
\textbf{return} $\mathcal{G}$
\end{algorithm*}

Next, we combine the PCMCI$^+$ algorithm \cite{runge2020discovering} with observed context and dummy variables. To recall briefly, PCMCI$^+$ is a causal discovery algorithm for time series data that allows for both contemporaneous and lagged links and assumes causal sufficiency. It consists of two steps. The first \emph{$\text{PC}_1$ lagged phase} infers a superset of the lagged parents together with the parents of contemporaneous ancestors. Next, the \emph{MCI contemporaneous phase} starts with the links found in the previous step and all possible contemporaneous links; it then conducts momentary conditional independence (MCI) tests with a modified conditioning set learned in the previous step to increase detection power. 

Our method consists of four main steps: one $\text{PC}_1$ lagged phase and three MCI phases. In the first step, supersets of the lagged parents of the system and observed temporal context nodes are discovered by running the $\text{PC}_1$ lagged phase on this subset of nodes.
Next, the MCI test is run on pairs of system and context nodes conditional on subsets of system and context, i.e.\ perform MCI tests for pairs $((C^j_{t-\tau}, X^i_t))_{\tau > 0}$,  $(C_t^j, X_t^i)$, $(X_t^i, C_t^j)$ for all $i,j$, 
    \[
        C_{t-\tau}^i \indep X_t^j | \mathbf{S}, \hat{\mathcal{B}}^-_t(X_t^j)  \setminus \{ C_{t-\tau}^i \}, \hat{\mathcal{B}}^-_{t-\tau}(C_{t-\tau}^i)
    \]
with $\mathbf{S}$ being a subset of the contemporaneous adjacencies $\mathcal{A}_t(X_t^j)$ and $\hat{\mathcal{B}}^-_t(X_t^j)$ being the lagged adjacencies from step one. If $C$ is a spatial context variable, we only have to test the contemporaneous pairs $(C_t^j, X_t^i)$, $(X_t^i, C_t^j)$ for all $i,j$. If $C_t^j$ and $X_t^i$ are conditionally independent, all lagged links between $C_t^j$ and $X^i_{t-\tau}$ are also removed for all $\tau$.
In the third step, MCI tests on all system-dummy pairs conditional on the superset of lagged links, the discovered contemporaneous context adjacencies, as well as on subsets of contemporaneous system links, are performed, i.e.\ test for $(D, X_t^i)$, $(X_t^i, D)$ for all $i$, i.e.\ 
    \[
    D \indep X_t^j | \mathbf{S}, \hat{\mathcal{B}}^C_t(X_t^j)
    \]
    where $\mathbf{S} \subset \mathcal{A}_t(X_t^j)$ and $\hat{\mathcal{B}}^C_t(X_t^j)$ are the lagged and contextual adjacencies found in the previous step.
If $D$ and $X_t^j$ are found to be conditionally independent, links between $D$ and $X^j_{t-\tau}$ are removed for all $\tau$.
Using assumption \ref{ass:JCI}, the context node is the parent in all system-context links.
Finally, in the fourth step, we perform MCI tests on all system pairs conditional on discovered lagged, context and dummy adjacencies, as well as on subsets of contemporaneous system links, followed by the orientation phase. In more detail, we perform MCI tests for pairs $((X^j_{t-\tau}, X_t^i))_{\tau > 0}$, $(X_t^i, X_t^j)$ for all $i, j$, i.e.\ 
    \[
     X^i_{t-\tau} \indep X_t^j | \mathbf{S}, \hat{\mathcal{B}}^{CD}_t(X_t^j)  \setminus \{ X_{t-\tau}^i \},  \hat{\mathcal{B}}^{CD}_{t-\tau}(X_{t-\tau}^i) 
    \]
    where $\mathbf{S} \subset \mathcal{A}_t(X_t^j)$ and $\hat{\mathcal{B}}^{CD}_t(X_t^j)$ are the lagged, contextual, and dummy adjacencies found in the previous steps.
Finally, all remaining edges (without expert knowledge) are oriented using the PCMCI$^+$ orientation phase while making use of all triples involving one context or dummy variable and two system variables as in the non-time series case.


\subsection{Theoretical Results}\label{subsec:theorems}
Proofs for the following statements are provided in SM.

\begin{theorem}[Non-time series consistency result] \label{thm:nonts}
    Denote the output of J-PC (Algorithm \ref{alg:nonts}) as $\mathcal{G}_{alg}$.
    Under assumptions \ref{ass:sufficiency}, \ref{ass:JCI}, \ref{ass:no_conf}, \ref{ass:asymmetry}, and assuming consistent conditional independence tests are used, the dummy deletion of $\mathcal{G}_{alg}$ corresponds to the dummy-deleted ground truth graph as the number of datasets $M$ tends to infinity.
\end{theorem}

Note that here the dummy-deleted ground truth graph is the target graph (definition \ref{def:target_graph}) adapted to the non-time series case.

\begin{theorem}[Time series consistency result] \label{thm:ts}
    Denote the time series graph output of J-PCMCI$^+$ (Algorithm \ref{alg:ts}) as $\mathcal{G}_{alg}$.
    Under assumptions \ref{ass:sufficiency}, \ref{ass:JCI}, \ref{ass:no_conf}, \ref{ass:asymmetry}, and assuming consistent conditional independence tests are used, the dummy deletion of $\mathcal{G}_{alg}$ corresponds to the target graph (definition \ref{def:target_graph}) as the number of datasets $M$ and the number of time steps $T$ tend to infinity.
\end{theorem}

The following consequence of theorem \ref{thm:ts} allows us to relax the rather strong assumption that latent context variables cannot mediate or confound an observed context variable and a system variable.

\begin{corollary}
    If some of the observed context variables are treated as unobserved, and assumptions \ref{ass:sufficiency}--\ref{ass:asymmetry} still hold, our method J-PCMCI$^+$ will recover the correct system-system adjacencies.
\end{corollary}

In particular, even if all context variables are treated as unobserved, our algorithm yields the correct induced graph over the system variables.


\section{Numerical Experiments}\label{sec:num}
\paragraph{Data simulation}
We generate toy data from the SCM \ref{eq:scm} where we assume the functions $f_i, g_k$, and $h_l$ to be linear. We also evaluate the method on data where the mechanisms $f_i$ are nonlinear. For a more detailed description of this setup, see the SM. In particular, in the linear setting, for system variables $\mathbf{X}=\{X^i\}_{i\in \mathcal{I}}$ and temporal context variables $(C^{\text{time},k})_{k \in \mathcal{K}_\text{time}}$ and spatial context variables $(C^{\text{space},l})_{l \in \mathcal{K}_\text{space}}$, we consider the following ground truth SCM $ X^{i,m}_t = a_i X^{i,m}_{t-1} + \sum_{j} b_j X^{j,m}_{t-\tau_j} + \sum_{j} c_jC^{\text{time},j}_{t-\tau_j} + \sum_{j} d_jC^{j, m}_\text{space} + \eta^{i,m}_t$,
where $i \in \mathcal{I}$, $t= 1, \ldots, T$, and $m=1, \ldots, M$, $C^{\text{time},k} \sim \mathcal{N}(0,1)$, $C^{\text{space}} \sim \mathcal{N}(0,1)$. Furthermore, $\eta^i \sim \mathcal{N}(0,1)$ i.i.d., $a_i$ autocorrelation parameter uniformly drawn from $[0.3, 0.8]$, coefficients $b_j,c_j, d_j$ are uniformly drawn from $[0.5, 0.9]$, $50\%$ of the links are contemporaneous, the remaining lags are drawn uniformly from $[1,3]$. After the data has been generated, its variance is rescaled to one across all datasets to avoid varsortability \cite{reisach2021beware}. In the numerical experiments, we make the restriction that one system node can have at most one contextual parent.
After the time series for the ground truth model has been generated, a certain fraction (indicated by parameter frac\_observed) of the context nodes is selected to be observed; the others are unobserved.
Note that in our simulated data all context nodes are exogenous (they do not even have other context nodes as parents). We decided to set the experiments up in this way to put the focus on the discovery of the system-context links and also the deconfounding property of the dummy nodes. 


\paragraph{Setup}
We evaluate the performance of our method using the true positive rate (TPR) and false positive rate (FPR) for the adjacencies, which are calculated only on the system-system links. Separately, we report the TPR and FPR on system-observed context links. All metrics and their standard deviations are computed on the estimated graphs of $50$ realizations of the model from time series with length $T$.

We compare our method to PCMCI$^+$ run on the data of the system nodes only by simply concatenating the data, as well as to PCMCI$^+$ where we have only included the observed (context and system) nodes. In this variant, we took care to include spatial context nodes only once in the time series graph. We build upon the implementation of PCMCI$^+$ algorithm within the Tigramite software package \citep{runge2019detecting} published under the GNU General Public License.

 We used the following model parameters in our experiments: Number of system nodes $|\mathcal{I}|=5$, number of context nodes $|\mathcal{K}_\text{time}| + |\mathcal{K}_\text{space}| = 3$, %\red{Confusing with the coefficient in the SCM above?}
 maximal time lag $\tau_{\operatorname{max}}=2$, significance level $\alpha=0.05$. We use an extension of the (component-wise) partial correlation conditional independence test.
We vary the value of the time sample size $T$, and number of datasets $M$. Results for other fractions of observed context nodes can be found in the SM.


\paragraph{Benefit of including expressive context nodes and dummy}
In figure \ref{fig:results_cnt}, we report the TPR and FPR for observed context-system links, respectively. We see that our method finds the links between all observed context nodes and the system nodes as well as the PCMCI$^+$ variant with observed context nodes included in the graph. In figure \ref{fig:results}, we observe that the performance of our method is comparable to only including dummy variables when evaluated only on the system-system links. These two observations illustrate the benefit of using our method: If we only relied on the dummy context variables, we would achieve deconfounding of the system variables but, naturally, we would not find any of the links between the observed context nodes and the system. In other words, we would not be able to learn which parts of the system are dependent on the properties of the context. On the other hand, if we only included the observed context nodes, we would not be able to remove the confounding that could be introduced by the latent context variables. The effects of the confounding latent context variables are visible in the rise in FPR on system-system links in figure \ref{fig:results}. We also see lower edgemark recall and precision when only using system data, see the figures in the SM.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{./frac1-cnt_nodes-10dom-5-3_adj.png}
    \caption{Discovery results of context-system links for varying sample sizes $T$, $M=10$. All other setup parameters are set as the defaults described in the main text. In this setting all context nodes are observed. Here, we compare our method (J-PCMCI$^+$) to PCMCI$^+$ using all data of observed nodes. Note that the maximal TPR that can be reached is equal to frac\_observed.}
    \label{fig:results_cnt}
\end{figure}

\paragraph{Convergence analysis (time and space dimension)}
We want to numerically study the finite sample properties of our method. For that, we look at the TPR and FPR on the system links for varying time sample size $T$ while keeping the number of spatial contexts $M$ fixed, and the other way around, see figure \ref{fig:results}. In these experiments half of the context nodes are unobserved. In the SM, we have also included 3D-plots for TPR and FPR on all pairs of $T$ and $M$.

\begin{figure}[htb!]
    \centering
    \includegraphics[width=\linewidth]{./frac0_5-sys_nodes-10dom-5-3_adj.png}
    \includegraphics[width=\linewidth]{./frac0_5-sys_nodes-10dom-5-3_adj_inset.png}
    \caption{Discovery results of system-system links for varying sample sizes $T$, and fixed $M=10$ (top row), and varying number of contexts $M$, and fixed $T=10$ (bottom row). All other setup parameters are set as the defaults described in the main text. In this setting half of the context nodes are observed. We compare our method (J-PCMCI$^+$) to PCMCI$^+$ using all data of observed nodes (PCMCI$^+$ with C), using all data of system variables and including dummies (PCMCI$^+$ with D), and only using data of system variables (PCMCI$^+$). %\red{Keep y-axis limits the same for top and bottom UN: But this makes the results in Fig. 2 look less nice. } 
    The inset shows the adj-FPR-surface with the contour of the $\alpha$-level in a simplified experimental setup to visualize the convergence of the method. Refer to the SM for details. 
    }
    \label{fig:results}
\end{figure}
It is a well-established result in the econometrics literature that, when considering fixed effects models, a bias is introduced in the OLS estimator of the slope parameter \cite{nickell1981biases}. A similar inconsistency problem can be observed in our method whenever $T$ is kept fixed and is small compared to $M$: Even when the number of spatial contexts $M$ goes to infinity, the links are not discovered correctly, see figure \ref{fig:results} (in particular the FPR plots) and the SM. %\red{Jakob: Does this go away for $T=200, M=100$ or so?}


\section{Discussion and Conclusions}\label{sec:discussion}

We presented an algorithm (J-PCMCI$^+$) for causal discovery from a collection of multivariate time series datasets that is able to deal with observed and unobserved context variables underlying the datasets.  We established its asymptotic consistency and studied its convergence properties numerically. 

The \textbf{main strengths} of J-PCMCI$^+$ are that it combines the efficient algorithm from \citet{runge2020discovering} (handling highly autocorrelated time series) with ideas in \citet{mooij2020joint,huang2020causal} to model observed as well as unobserved contexts. Pooling data from multiple datasets and adding observed contexts as well as dummies has several important benefits: (1) pooling increases sample size, (2) adding observed contexts and dummies allows us to learn context-system relations and can help to orient system-system links, (3) dummy variables allow us to remove confounding at least from latent context variables.  J-PCMCI$^+$ inherits the benefits of PCMCI$^+$ for high recall and accounting for autocorrelation in the conditional independence tests. We find numerically that J-PCMCI$^+$ has good performance for sufficiently large sample sizes and moderate numbers of datasets.

The \textbf{main weaknesses} are that some assumptions might be strong and unrealistic. In particular, JCI assumption 2 and the assumption that prohibits latent context mediation of the observed context to system link can be hard to justify depending on the setup, see Assumption \ref{ass:JCI}. In the SM, we discuss ramifications of partially relaxing this assumption. Our numerical experiments indicate that for too small sample sizes, we get inflated false positives due to missing latent confounders. We also cannot overcome the fundamental finite-sample bias in the OLS estimator~\citep{nickell1981biases} for small sample sizes, and dummy variables carry the disadvantage of increasing dimensionality.

In \textbf{future work}, we plan to extend the method to weaken the partially strong assumptions and to account for latent system variable confounding \cite{gerhardus2020high}. Furthermore, while our method only uses high-dimensional dummy variables where it finds dependence, we have not yet looked into adapting CI tests specifically for dummy variables. In this work, we also only considered acyclic SCMs. However, an extension to include the cyclic case is possible based on the consistency of the PC algorithm in the cyclic setting \cite{mooij2020constraint}. Moreover, an in-depth analysis of finite sample properties of the presented method is needed.


\begin{acknowledgements}
W.G. was supported by the Helmholtz AI project CausalFlood (grant no. ZT-I-PF-5-11). 
J.R. and U.N.  received funding from the European Research Council (ERC) Starting Grant CausalEarth under the European Union’s Horizon 2020 research and innovation program (Grant Agreement No. 948112).
This work used resources of the Deutsches Klimarechenzentrum (DKRZ) granted by its Scientific Steering Committee (WLA) under project ID bd1083.

We thank the anonymous reviewers for their helpful comments.  
\end{acknowledgements}

% References
\bibliography{gunther_390}
\end{document}