\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets the two mandatory arguments in reversed
% order with the optional separator in between, i.e., <b><sep><a>.
% The separator defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\usepackage{bm}
\usepackage{subcaption}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{wrapfig}
\usepackage{enumitem}
\usepackage[ruled,linesnumbered]{algorithm2e}
%\RestyleAlgo{ruled}

% Edge symbols for use in math mode. The extra pair of braces in each
% definition makes the whole symbol a single ordinary math atom.
%
% Edges with thin spaces on both sides:
\newcommand*{\tailhead}{{\,\rightarrow\,}}     % directed edge pointing right
\newcommand*{\headtail}{{\,\leftarrow\,}}      % directed edge pointing left
\newcommand*{\headhead}{{\,\leftrightarrow\,}} % bidirected edge
% Wildcard edges, where an asterisk replaces an edge mark:
\newcommand*{\asthead}{{\ast\!\!\!\rightarrow}} % asterisk at the tail, arrowhead right
\newcommand*{\headast}{{\leftarrow\!\!\!\ast}}  % arrowhead left, asterisk at the tail
\newcommand*{\astast}{{\ast\!{-}\!\ast}}        % asterisks at both ends


% \xqed{<symbol>}: end-of-block marker helper. Removes a preceding space
% (\unskip), then sets <symbol> in an hbox after an \hfill and a \quad, so that
% the symbol is separated from the preceding text by at least a quad.
\newcommand\xqed[1]{%
	\leavevmode\unskip\penalty9999 \hbox{}\nobreak\hfill
	\quad\hbox{#1}}
% \demo: \xqed with a triangle as the marker symbol.
\newcommand\demo{\xqed{$\triangle$}}
% \ind: independence symbol, built from two \perp glyphs pulled together with
% negative thin spaces. \nind: the same symbol struck through with \not.
\newcommand{\ind}{\perp\!\!\!\perp}
\newcommand{\nind}{\not\!\perp\!\!\!\perp}
% \numberthis: numbers one line of an otherwise unnumbered amsmath display by
% incrementing the equation counter and tagging the line with the new number.
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
% Environments declared in the theorem style that is active at this point;
% each one is numbered with its own independent counter.
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{remark}{Remark}
\newtheorem{theorem}{Theorem}
% Environments declared in the "definition" style, again with separate counters.
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{example}{Example}

\makeatletter
% Box register holding the arrow glyph: a dotted TikZ arrow of length 1.5em,
% typeset once here and reused by the commands below.
\newbox\dottedarrow@box
\setbox\dottedarrow@box\hbox
  {%
    \begin{tikzpicture}
      \draw[dotted,->] (0,0) -- (1.5em,0);
    \end{tikzpicture}%
  }
% \dottedarrow[<width>]: user-level command. Dispatches to the math-mode
% variant (\dottedarrow@m) or the text-mode variant (\dottedarrow@t) depending
% on the mode in which it is called.
\newcommand*\dottedarrow
  {\relax\ifmmode\expandafter\dottedarrow@m\else\expandafter\dottedarrow@t\fi}
% \dottedarrow@t[<width>]: raises the stored box by .5ex and scales the result
% to the requested width (default 1.5em), keeping the aspect ratio.
\newcommand*\dottedarrow@t[1][1.5em]
  {\resizebox{#1}{!}{\raisebox{.5ex}{\usebox\dottedarrow@box}}}
% \dottedarrow@m[<width>]: if no width is given (empty optional argument), picks
% a width per math style via \mathchoice: the default width in display and text
% style, 1.1em in script style and 0.9em in scriptscript style. Otherwise the
% explicitly given width is used.
\newcommand*\dottedarrow@m[1][]
  {%
    \if\relax\detokenize{#1}\relax
      \mathchoice% values are trial and error based\ldots
        {\dottedarrow@t}
        {\dottedarrow@t}
        {\dottedarrow@t[1.1em]}
        {\dottedarrow@t[0.9em]}%
    \else
      \dottedarrow@t[#1]%
    \fi
  }
\makeatother

\title{A Global Markov Property for Solutions of Stochastic Difference Equations and the corresponding Full Time Graphs}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<tom.hochsprung@dlr.de>?Subject=Your UAI 2024 paper}{Tom~Hochsprung}}
\author[1,2]{Jakob Runge$^*$}
\author[1]{Andreas Gerhardus$^*$}


% Add affiliations after the authors
\affil[1]{%
German Aerospace Center (DLR)\\
    Institute of Data Science\\
    Jena, Germany
}
\affil[2]{%
    Technische Universität Berlin\\
    Berlin, Germany
}

  
  \begin{document}
\maketitle
\def\thefootnote{$*$}\footnotetext{Equal supervision}
\def\thefootnote{\arabic{footnote}}
\begin{abstract}
  Structural Causal Models (SCMs) are an important tool in causal inference. They induce a graph and if the graph is acyclic, a unique observational distribution. A standard result states that in this acyclic case, the induced observational distribution satisfies a \textit{d}-separation global Markov property relative to the induced graph.
  Time series can also be modelled like SCMs: One just interprets the stochastic difference equations that a time series solves as structural equations. However, technical problems arise when time series ``start'' at minus infinity. In particular, a \textit{d}-separation global Markov property for time series and the corresponding infinite graphs, the so-called full time graphs, has thus far only been shown for stable vector autoregressive processes with independent finite-second-moment noise.
  In this paper, we prove a much more general version of this Markov property. We discuss our assumptions and study violations of them. 
  Doing so hints at several pitfalls at the intersection of time series analysis and causal inference.
  Moreover, we introduce a new projection procedure for these infinite graphs which might be of independent interest.
\end{abstract}

\section{Introduction}\label{sec:intro}

Structural Causal Models (SCMs), also known as Structural Equation Models (SEMs), are widely used in causal inference \citep{pearl2009causality, peters2017elements, bongers2021foundations}. An SCM consists of finitely many exogenous and endogenous random variables, a joint probability distribution over the exogenous random variables, and assignment functions that causally relate the random variables to each other. SCMs induce a graph where each variable is represented by a node and each direct causal relation by a directed edge. When the induced graph is acyclic, an SCM induces a unique observational distribution \citep[Proposition 3.4]{bongers2021foundations}.
A standard result states that in this acyclic case, the induced observational distribution satisfies a \textit{d}-separation global Markov property relative to the induced graph (Theorem 1.4.1 in \citet{pearl2009causality} combined with Proposition 4 in \citet{fields1990independence}). This result, also known as the causal Markov property, allows one to read off conditional independencies from the induced graph and hence is highly important for SCM-based causal reasoning.

 SCMs frequently rely on independent and identically distributed (i.i.d.) observations. However, many real-world observations are not i.i.d.\ and instead come as time series.
 Nevertheless, one can still model many time series like SCMs:
One just interprets the stochastic difference equations that time series often solve as structural equations. This approach runs under various names and assumptions: Structural vector autoregressive (SVAR) processes
  \citep{demiralp2003searching, hyvarinen2010estimation,moneta2011causal,malinsky2018causal, pamfil2020dynotears}, additive nonlinear time series causal models \citep{chu2008search}, time series models with independent noise \citep{peters2013causal, peters2017elements}, vector autoregressive (VAR) processes \citep{dahlhaus2003causality,eichler2010graphical, entner2010causal, thams2022identifying}, structural causal processes \citep{pmlr-v124-runge20a} or just SCMs \citep{gerhardus2020high}.

When the time points $t$ are in some finite index set $I$, no new technical difficulties arise and one can directly translate the stochastic difference equations into structural equations. Similarly in the case $t\in\mathbb{N}_{\geq 0}$: Even though the stochastic difference equations continue infinitely to the future, the fact that one can push-forward a given distribution over the noise variables still allows for a typical SCM-like interpretation. Technically more problematic is the case $t\in\mathbb{Z}$: Here, the stochastic process ``starts'' at $-\infty$. Therefore, it is not immediately clear what the induced distribution of a ``time-series SCM'' is. In fact, the stochastic difference equations might have no solution, one solution or several solutions. Moreover, these solutions might behave quite differently and carry quite different causal interpretations: For example, solutions might depend on future noise variables, which does not fit well to the idea that causes precede their effect.

 One might be tempted to just ignore the case $t\in\mathbb{Z}$. However, several reasons speak against doing so (especially when modelling equilibriums): First, thinking about stochastic difference equations that continue infinitely to the past is often more convenient than to additionally also think about initial distributions. Second, the notion of stationarity, on which most of the existing time series literature relies \citep[among others]{fan2003nonlinear, lutkepohl2005new, brockwell2009time}, fits much more naturally to $t\in\mathbb{Z}$: When time series do not ``start'' at $-\infty$, then they usually only converge to a stationary state instead of always being in it (unless one particularly crafts initial distributions).

In this paper, we focus on one particular issue for $t\in\mathbb{Z}$: In spirit of Theorem 1.4.1 in \citet{pearl2009causality}, we show that certain solutions of stochastic difference equations  satisfy a $d$-separation global Markov property with respect to the induced infinite graph when $t\in\mathbb{Z}$. To the best of our knowledge, such a result has so far only been shown for stable vector autoregressive (VAR) processes with independent finite-second-moment noise \citep{ thams2022identifying}.\footnote{
 Such a result also appears in \citet{dahlhaus2003causality}. However, as also remarked by an anonymous reviewer, \citet{dahlhaus2003causality} apply a result for finite graphs, namely the AMP Markov property from \citet{andersson2001alternative}, to  infinite graphs without giving further justification on why this is possible.
 
 Besides, there are also several Markov properties for finite graphical representations of time series, see, e.g., \citet{eichler2012graphical}.}
 We also discuss the required assumptions and study violations of them.
 Doing so hints at several pitfalls at the intersection of time series analysis and causal inference. Moreover, we thus also provide a better theoretical foundation for much existing and future work.


 
We structure our paper as follows: In \textbf{Section 2}, we introduce our notation and setup. \textbf{Section 3} is about proving the global Markov property. In \textbf{Section 4}, we show that our result applies to stable VAR processes with independent finite-second-moment noise that has a density with respect to Lebesgue measure. In \textbf{Section 5}, we provide a conclusion and an outlook. The \textbf{Supplementary Material (SM)} contains several of our proofs and further lemmas.

We require that the reader is familiar with basic terminology from causal inference. In particular, we use concepts such as graphs, $d$- and $m$-separation, conditional independence and SCMs. For a brief overview of graphs, $d$- and $m$-separation, see Section \ref{sec_basic_graphical_defs} in the SM, and for further reference see \citet{peters2017elements}, \citet{pearl2009causality} and \citet{richardson2003markov}. 

\section{Setup}
In \textbf{Section \ref{subsec_notation}}, we introduce notation that is relevant to the main paper (Section \ref{sec_further_notation} in the SM contains further notation that is just relevant to the SM). In \textbf{Section \ref{Sec_Stochastic_processes}}, we connect causality and stochastic difference equations and introduce several concepts along the way. In that section, we also formally state the $d$-separation global Markov property. Finally, \textbf{Section \ref{sec:futher_technical_things}} focuses on further technical assumptions that we need in order to prove the $d$-separation global Markov property.

\subsection{Notation}
\label{subsec_notation}
For two sets $A$ and $B$, we write $A\times B$ to denote their Cartesian product. We write $\mathbb{Z}$ to denote the integers. 
For some $d\in \{0,1,\ldots\}$, we write $\mathbb{N}_{\geq d}$ to denote the set $\{d,d+1,\ldots\}$.  For some $k\in\mathbb{N}_{\geq 1}$ and some set $A$, we write $A^k$ to denote the $k$-times Cartesian product $A\times \ldots \times A.$ For some $k\in\mathbb{N}_{\geq 1}$, we write $[k]_1$ to  abbreviate the set $\{1,\ldots,k\}$. Similarly, for some $k\in\mathbb{N}_{\geq 0}$, we write $[k]_0$ to  abbreviate the set $\{0,\ldots,k\}$.  For some $k\in\mathbb{N}_{\geq 1}$ and sets $A_1,\ldots,A_k$, we write $\prod_{i\in [k]_1}A_i$ to denote the set $A_1\times \ldots \times A_k$; analogously, for some $k\in\mathbb{N}_{\geq 0}$ and sets $A_0,\ldots,A_k$, we write $\prod_{i\in [k]_0}A_i$ to denote the set $A_0\times \ldots \times A_k$.
To denote a vector with potentially more than one component (with respect to some base space), we use \textbf{bold font}.

For random vectors $\bm{X}$ and $\bm{Y}$ that are defined on the same underlying probability space $(\Omega,\mathcal{F},P)$, we write $P_{\bm{XY}}$ to denote the joint distribution of $\bm{X}$ and $\bm{Y}$. For the marginal distributions of $\bm{X}$ and $\bm{Y}$, we write $P_{\bm{X}}$ and $P_{\bm{Y}}$, respectively. To denote unconditional independence between $\bm{X}$ and $\bm{Y}$, we write $\bm{X}\ind \bm{Y}.$ To denote conditional independence between $\bm{X}$ and $\bm{Y}$ given some further random vector $\bm{Z}$, we write $\bm{X}\ind \bm{Y}\mid \bm{Z}$. In slight abuse of notation, we consider a set $\bm{A}$ of random vectors as a random vector whose components are the in some way ordered elements of $\bm{A}$. Correspondingly, we write these sets of random vectors in bold font as well.

For the graphical part, we follow the convention that a vertex $v$ is never a parent or a child of itself (unless there is a self-edge, which we do not consider in this paper). However, we follow the convention that a vertex $v$ is always an ancestor or a descendant of itself. Moreover, for two vertices $a$ and $b$, we write $a\astast b$ to represent $a\rightarrow b$, $a\leftarrow b$ and $a\leftrightarrow b$ all at once. For more details and more precise explanations, see Section \ref{sec_basic_graphical_defs} in the SM.












\subsection{Causality and Stochastic Difference Equations}
\label{Sec_Stochastic_processes}
Let $(\Omega,\mathcal{F},P)$ be a probability space and let $d\in\mathbb{N}_{\geq 1}$. For all $i\in[d]_1$, let $(\mathcal{X}_i,\Sigma^X_i)$ and $(\mathcal{E}_i,\Sigma^\epsilon_i)$ be measurable spaces such that all $\mathcal{X}_i$ and all $\mathcal{E}_i$ are either real finite-dimensional vector spaces or finite discrete sets and such that $\Sigma^X_i$ and $\Sigma^\epsilon_i$ are the corresponding Borel $\sigma$-algebras.
We consider a given exogenous noise process $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}:=\{(\epsilon^{1}_t,\ldots,\epsilon^{d}_t)\}_{t\in\mathbb{Z}}$ and a given endogenous process $\{\bm{X}_t\}_{t\in\mathbb{Z}}:=\{(X^{1}_t,\ldots\allowbreak,X^{d}_t)\}_{t\in\mathbb{Z}}$.
Here, each $X^{i}_t$ and each $\epsilon^{i}_t$ is a random variable defined on $(\Omega,\mathcal{F},P)$ and taking values in $(\mathcal{X}_i,\Sigma^X_i)$ and $(\mathcal{E}_i,\Sigma^\epsilon_i)$, respectively.
	Similarly to existing work \citep{peters2013causal, thams2022identifying}, we assume that
	\begin{itemize}
		\item \textbf{(A0)}: $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$ is an i.i.d.\ process with independent components, that is,
		the distribution of any finite subset of $\{\epsilon^{i}_{t}:i\in[d]_1,\;t\in\mathbb{Z}\}$ factorizes into its individual components and each $\bm{\epsilon}_t$ has the same distribution.\footnote{Note that the factorization assumption is equivalent to all finite subsets of
		$\{\epsilon^{i}_{t}:i\in[d]_1,\;t\in\mathbb{Z}\}$ being mutually independent.
		Also note that Assumption (A0) \textit{does not} state that two different components $\epsilon^i_t$ and $\epsilon^j_t$ need to have the same distribution.}
	\end{itemize}
	 


We further assume that $\{(\bm{X}_t, \bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ is a solution of given stochastic difference equations: For each $i\in[d]_1$, let $q_i\in \mathbb{N}_{\geq 0}$ be a fixed number (which is the order of the stochastic difference equation for $X^i_t$). Now, for each $i\in[d]_1$ and for each $s\in[q_i]_0$, let $\textbf{PA}^i_s$ denote a tuple of fixed component time series, say the first and second component time series, and let $\mathcal{X}_{\text{pa}^i_s}$ denote the corresponding product space. Furthermore, write $\mathcal{X}_{\text{pa}^i}:=\prod_{s\in [q_i]_0}\mathcal{X}_{\text{pa}^i_s}$ and let $(\textbf{PA}^i_s)_{t-s}$ denote the respective component time series evaluated at time point $t-s$.\footnote{This notation is partly from \citet{peters2013causal} and Chapter 10 of \citet{peters2017elements}.} Additionally, for all $i\in[d]_1$, let $f_i:\mathcal{X}_{\text{pa}^i}\times \mathcal{E}_i \rightarrow \mathcal{X}_i$ be a measurable function. Now, for all $i\in[d]_1$ and $t\in\mathbb{Z}$, we assume that $\{(\bm{X}_t, \bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$
solves $P$-almost surely the stochastic difference equation
\begin{align}
\label{sde}
X^{i}_t=f_i\biggl((\textbf{PA}^i_{q_i})_{t-q_i},\ldots,(\textbf{PA}^i_0)_{t},\epsilon^{i}_t\biggr).
\end{align}
\begin{remark}
On the distribution level, a solution $\{(\bm{X}_t, \bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ or for simplicity $\{\bm{X}_t\}_{t\in\mathbb{Z}}$
corresponds to the collection of all joint and marginal distributions of the respective random vectors.
\end{remark}
\textbf{A causal interpretation:} So far, the stochastic difference equation in \eqref{sde} is just ``algebraic''. One can give \eqref{sde} a causal interpretation by saying that all non-superfluous variables on the right-hand side of the equation cause the variable on the left-hand side of the equation. In resemblance to SCMs, one can emphasize this structural character of \eqref{sde} by replacing the ``$=$'' with a ``$:=$''. In the next definition, we formally introduce the notion of non-superfluous random variables which we from now on call \textit{causal parents} \citep[cf.][Definition 2.6]{bongers2021foundations}.
\begin{definition}[Causal parent]
	\label{causal_parent}
	 For $i,j\in[d]_1$ and $s,t\in \mathbb{Z}$, we call $X^j_s$ a (causal) parent of $X^i_t$ if and only if there does not exist a measurable function\footnote{The notation $\mathcal{X}_{\text{pa}^i\setminus X^j_s}$ means that the space corresponding to $X^j_s$ is removed from $\mathcal{X}_{\text{pa}^i}$. Similarly, for $\bm{x}\in \mathcal{X}_{\text{pa}^i}$, writing $\bm{x}_{\setminus X^j_s}$ means that the element corresponding to $X^j_s$ is removed.} $\Tilde{f_i}:\mathcal{X}_{\text{pa}^i\setminus X^j_s}\times \mathcal{E}_i\rightarrow \mathcal{X}_i$ such that for $P_{\epsilon^i_t}$-almost every $e\in \mathcal{E}_i$ and for all $\bm{x}\in \mathcal{X}_{\text{pa}^i}$,
\begin{align*}
X^i_t&:=f_i\bigl(\bm{x},e\bigr)\\
\Longleftrightarrow
X^i_t&:=\Tilde{f_i}\bigl(\bm{x}_{\setminus X^j_s},e\bigr).
\end{align*}



We similarly define parentship of a noise variable $\epsilon^j_t$ on $X^i_{t}$.
\end{definition}
One can graphically represent \eqref{sde} using the \textit{(augmented) full time graph}.
\begin{definition} [(Augmented) full time graph]
	The \textit{full time graph} is an infinite graph with vertex set $\{X^{i}_t:\;t\in\mathbb{Z},\;i\in[d]_1\}$.\footnote{For notational simplicity, we do not make a distinction between random variables and their representation by vertices in the (augmented) full time graph. For us, variables = vertices.} The full time graph only has directed edges: There is a directed edge from $X^{j}_{s}$ to $X^{i}_t$ if and only if $X^{j}_{s}$ is a causal parent of $X^{i}_t$. 
	
	The \textit{augmented full time graph} consists of further vertices $\{\epsilon^{i}_t:\;t\in\mathbb{Z},\;i\in[d]_1\}$ and a directed edge from $\epsilon^{i}_t$ to $X^{i}_t$ if the former is a causal parent of the latter.
\end{definition}
\begin{remark}
	In our setting, there never is a directed edge from $X^{j}_{s}$ to $X^{i}_t$ when $s> t$. Moreover, there  never is a directed edge from $\epsilon^{i}_s$ to $X^{i}_t$ when $s\neq t$.
\end{remark}
\begin{remark}
   As long as Assumption (A1) below is satisfied, we allow contemporaneous edges in the full time graph, that is, edges between vertices at the same time point (the term $(\textbf{\textup{PA}}^i_0)_t$ in Equation \eqref{sde} hints at this option).
\end{remark}

\begin{example}
	\label{ex1}
	Consider a stochastic process $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}=\{(\epsilon^{1}_t,\epsilon^{2}_t)\}_{t\in\mathbb{Z}}$ satisfying (A0). In addition, consider a stochastic process $\{\bm{X}_t\}_{t\in\mathbb{Z}}=\{(X^{1}_t,X^{2}_t)\}_{t\in\mathbb{Z}}$ satisfying
	\begin{align*}
	\label{ex1_eq}
	X^{1}_t&:=0.4 \cdot X^{1}_{t-1} + 0.2 \cdot X^{2}_{t-2} + \epsilon^{1}_t\;\;\;\;\text{ and}\\
	X^{2}_t&:=0.3 \cdot X^{2}_{t-1} + \epsilon^{2}_t.\numberthis
	\end{align*}
	Figure \ref{ftg_example} shows the full time graph and the augmented full time graph.
\end{example}
\begin{figure*}[t]
	\centering
	\scalebox{0.6}{		
		\begin{tikzpicture}
		\node[draw = none] (U5) at (-6,0) {\Large $X^{2}_{t-4}$};
		\node[draw=white, minimum size=0.8cm] (e5) at (-6,-2) {};
		\node[draw = none] (X5) at (-6,2) {\Large$X^{1}_{t-4}$};
		
		\node[draw = none] (U4) at (-4,0) {\Large $X^{2}_{t-3}$};
		\node[draw = none] (X4) at (-4,2) {\Large$X^{1}_{t-3}$};
		
		\node[draw = none] (U3) at (-2,0) {\Large $X^{2}_{t-2}$};
		\node[draw = none] (X3) at (-2,2) {\Large$X^{1}_{t-2}$};
		
		\node[draw = none] (U) at (0,0) {\Large$X^{2}_{t-1}$};
		\node[draw = none] (X) at (0,2) {\Large$X^{1}_{t-1}$};
		
		\node[draw = none] (U2) at (2,0) {\Large $X^{2}_{t}$};
		\node[draw = none] (X2) at (2,2) {\Large$X^{1}_{t}$};
		
		\node[] (U6) at (4,0) {\Large$\ldots$};
		\node[] (X6) at (4, 2) {\Large$\ldots$};
		
		\node[] (U7) at (-8,0) {\Large$\ldots$};
		\node[] (X7) at (-8, 2) {\Large$\ldots$};
		
			%	\node[draw = none] (L1) at (-5,-3.2) {\Large{auxiliary nodes}};
	%	\node[draw = none] (L1) at (0,-3.2) {\Large{proper nodes}};
		%\draw [dashed, color = brown] (-3,-1) -- (-3,3);
		%\draw [dashed, color = brown] (-7.7,-1) -- (-7.7,3);
		%\draw [dashed, color = brown] (3.7,-1) -- (3.7,3);
		
		\path [->, line width = 0.5mm] (U) edge node[left] {} (U2);
		\path [->, line width = 0.5mm] (U3) edge node[left] {} (U);
		\path [->, line width = 0.5mm] (U4) edge node[left] {} (U3);
		\path [->, line width = 0.5mm] (U5) edge node[left] {} (U4);
		\path [->, line width = 0.5mm] (U2) edge node[left] {} (U6);
		\path [->, line width = 0.5mm] (U7) edge node[left] {} (U5);
		
		\path [->, line width = 0.5mm] (X) edge node[left] {} (X2);
		\path [->, line width = 0.5mm] (X3) edge node[left] {} (X);
		\path [->, line width = 0.5mm] (X4) edge node[left] {} (X3);
		\path [->, line width = 0.5mm] (X5) edge node[left] {} (X4);
		\path [->, line width = 0.5mm] (X2) edge node[left] {} (X6);
		\path [->, line width = 0.5mm] (X7) edge node[left] {} (X5);
		
		
		\path [->, line width = 0.5mm, color = blue] (U) edge node[left] {} (X6);
		\path [->, line width = 0.5mm, color = blue] (U3) edge node[left] {} (X2);
		\path [->, line width = 0.5mm, color = blue] (U4) edge node[left] {} (X);
		\path [->, line width = 0.5mm, color = blue] (U5) edge node[left] {} (X3);
		\path [->, line width = 0.5mm, color = blue] (U7) edge node[left] {} (X4);
		
		
		\end{tikzpicture}
	}
	\scalebox{0.6}{		
		\begin{tikzpicture}
		\node[draw = none] (U5) at (-6,0) {\Large $X^{2}_{t-4}$};
		\node[draw = none] (e5) at (-6,-2) {\Large $\epsilon^{2}_{t-4}$};
		\node[draw = none] (X5) at (-6,2) {\Large $X^{1}_{t-4}$};
		\node[draw = none] (f5) at (-6,4) {\Large $\epsilon^{1}_{t-4}$};
		
		\node[draw = none] (U4) at (-4,0) {\Large $X^{2}_{t-3}$};
		\node[draw = none] (e4) at (-4,-2) {\Large $\epsilon^{2}_{t-3}$};
		\node[draw = none] (X4) at (-4,2) {\Large $X^{1}_{t-3}$};
		\node[draw = none] (f4) at (-4,4) {\Large $\epsilon^{1}_{t-3}$};
		
		\node[draw = none] (U3) at (-2,0) {\Large $X^{2}_{t-2}$};
		\node[draw = none] (e3) at (-2,-2) {\Large $\epsilon^{2}_{t-2}$};
		\node[draw = none] (X3) at (-2,2) {\Large $X^{1}_{t-2}$};
		\node[draw = none] (f3) at (-2,4) {\Large $\epsilon^{1}_{t-2}$};
		
		\node[draw = none] (U) at (0,0) {\Large $X^{2}_{t-1}$};
		\node[draw = none] (e) at (0,-2) {\Large $\epsilon^{2}_{t-1}$};
		\node[draw = none] (X) at (0,2) {\Large $X^{1}_{t-1}$};
		\node[draw = none] (f) at (0,4) {\Large $\epsilon^{1}_{t-1}$};
		
		\node[draw = none] (U2) at (2,0) {\Large $X^{2}_{t}$};
		\node[draw = none] (e2) at (2,-2) {\Large $\epsilon^{2}_{t}$};
		\node[draw = none] (X2) at (2,2) {\Large $X^{1}_{t}$};
		\node[draw = none] (f2) at (2,4) {\Large $\epsilon^{1}_{t}$};
		
		\node[] (U6) at (4,0) {\Large$\ldots$};
		\node[] (e6) at (4,-2) {\Large$\ldots$};
		\node[] (X6) at (4, 2) {\Large$\ldots$};
		\node[] (f6) at (4, 4) {\Large$\ldots$};
		
		\node[] (U7) at (-8,0) {\Large$\ldots$};
		\node[] (e7) at (-8,-2) {\Large$\ldots$};
		\node[] (X7) at (-8, 2) {\Large$\ldots$};
		\node[] (f7) at (-8, 4) {\Large$\ldots$};
		
						%\node[draw = none] (L1) at (-5,-3.2) {\Large{auxiliary nodes}};
		%\node[draw = none] (L1) at (0,-3.2) {\Large{proper nodes}};
		%\draw [dashed, color = brown] (-3,-1) -- (-3,3);
		%\draw [dashed, color = brown] (-7.7,-1) -- (-7.7,3);
		%\draw [dashed, color = brown] (3.7,-1) -- (3.7,3);
		
		\path [->, line width = 0.5mm] (U) edge node[left] {} (U2);
		\path [->, line width = 0.5mm] (U3) edge node[left] {} (U);
		\path [->, line width = 0.5mm] (U4) edge node[left] {} (U3);
		\path [->, line width = 0.5mm] (U5) edge node[left] {} (U4);
		\path [->, line width = 0.5mm] (U2) edge node[left] {} (U6);
		\path [->, line width = 0.5mm] (U7) edge node[left] {} (U5);
		
		\path [->, line width = 0.5mm, color = orange] (e3) edge node[left] {} (U3);
		\path [->, line width = 0.5mm, color = orange] (e2) edge node[left] {} (U2);
		\path [->, line width = 0.5mm, color = orange] (e) edge node[left] {} (U);
		\path [->, line width = 0.5mm, color = orange] (e4) edge node[left] {} (U4);
		\path [->, line width = 0.5mm, color = orange] (e5) edge node[left] {} (U5);
		
		
		\path [->, line width = 0.5mm] (X) edge node[left] {} (X2);
		\path [->, line width = 0.5mm] (X3) edge node[left] {} (X);
		\path [->, line width = 0.5mm] (X4) edge node[left] {} (X3);
		\path [->, line width = 0.5mm] (X5) edge node[left] {} (X4);
		\path [->, line width = 0.5mm] (X2) edge node[left] {} (X6);
		\path [->, line width = 0.5mm] (X7) edge node[left] {} (X5);
		
		\path [->, line width = 0.5mm, color = orange] (f3) edge node[left] {} (X3);
		\path [->, line width = 0.5mm, color = orange] (f2) edge node[left] {} (X2);
		\path [->, line width = 0.5mm, color = orange] (f) edge node[left] {} (X);
		\path [->, line width = 0.5mm, color = orange] (f4) edge node[left] {} (X4);
		\path [->, line width = 0.5mm, color = orange] (f5) edge node[left] {} (X5);
		
		\path [->, line width = 0.5mm, color = blue] (U) edge node[left] {} (X6);
		\path [->, line width = 0.5mm, color = blue] (U3) edge node[left] {} (X2);
		\path [->, line width = 0.5mm, color = blue] (U4) edge node[left] {} (X);
		\path [->, line width = 0.5mm, color = blue] (U5) edge node[left] {} (X3);
		\path [->, line width = 0.5mm, color = blue] (U7) edge node[left] {} (X4);
		
		
		\end{tikzpicture}
	}
	
	
	\caption{The full time graph (left) and the augmented full time graph (right) from Example \ref{ex1}. The ``$\ldots$'' indicate that nodes and edges repeat infinitely to the past and future. We used color for better clarity.}
	\label{ftg_example}
\end{figure*}

\textbf{Acyclicity assumption:}  For the remainder of the paper, we assume that
\begin{itemize}
    \item \textbf{(A1):} the full time graph does not contain directed cycles.
\end{itemize}
Note that directed cycles in the full time graph (and hence in the augmented full time graph) never occur when there are no contemporaneous edges.

\textbf{A global Markov property}: For SCMs whose induced graph is a DAG (and which by definition have finitely many variables), the induced observational distribution satisfies a $d$-separation global Markov property with respect to the induced graph (Theorem 1.4.1\ in \citet{pearl2009causality} combined with Proposition 4 in \citet{fields1990independence}). In this paper, we show that such a result also holds for the distributions of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ respectively $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ and the corresponding (augmented) full time graph. In the next theorem, we present this result; we defer the remaining technical assumptions to Section \ref{sec:futher_technical_things}.



\begin{theorem}[A global Markov property for solutions of stochastic difference equations and the corresponding (augmented) full time graphs]
	\label{Propo_Global_Markov_Property}
	Every solution of the stochastic difference equations in \eqref{sde} that satisfies (A0)--(A4) is globally Markov in a $d$-separation sense with respect to the full time graph. That is, for any finite disjoint index sets $J_1,J_2,J_3\subseteq [d]_1\times\mathbb{Z}$ such that $\bm{S}_3:=\{X^{i}_t:(i,t)\in J_3\}$ $d$-separates $\bm{S}_1:=\{X^{i}_t:(i,t)\in J_1\}$ and $\bm{S}_2:=\{X^{i}_t:(i,t)\in J_2\}$ in the full time graph, $\bm{S}_1\ind \bm{S}_2\mid \bm{S}_3$.
	
	An analogous result including noise variables $\{\epsilon^{i}_t:\;t\in\mathbb{Z},\;i\in[d]_1\}$ holds for the augmented full time graph.
\end{theorem}

\subsection{Further Technical Assumptions}
\label{sec:futher_technical_things}
\textbf{Further technical assumptions}:	In this paper, we further assume that
\begin{itemize}
	\item \textbf{(A2):} $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is strictly stationary\footnote{The part of Assumption (A0) which states that each $\bm{\epsilon}_t$ has the same distribution ensures that Assumption (A2) can hold for non-pathological cases. However, this part of (A0) is not required for the proofs.}, that is, all distributions of finitely many vectors are time shift-invariant,
	\item \textbf{(A3):} $\{\bm{X}_s\}_{s\leq t}$ is independent of $\{\bm{\epsilon}_s\}_{s>t}$ for all $t\in\mathbb{Z}$ (meaning that all finite subsets of these two sets are independent), and
	\item \textbf{(A4):} $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is $\alpha$-strongly mixing, that is,
	\begin{align*}
	\alpha(m)&:=\sup_{A\in \mathcal{A}_0,\;B\in\mathcal{A}^m} \lvert P(A\cap B)-P(A)P(B)\rvert\\
	&\overset{m\rightarrow \infty}{\longrightarrow} 0,
	\end{align*}
	where $\mathcal{A}_0:=\sigma(\bm{X}_t:\;t\leq 0)$ and $\mathcal{A}^m:=\sigma(\bm{X}_t:\;t\geq m)$.\footnote{Due to Assumption (A2), namely strict stationarity, we could take any other reference point than $0$ (see, e.g., \citet{bradley2005basic} for this statement and Lemma \ref{lemma_ref_point} in the SM for a formal proof).}
\end{itemize}	


\textbf{Discussion of the assumptions:} 
We decided to make assumptions that often appear in the time series (and causal inference) literature and from there on prove the global Markov property.

Assumptions similar to (A2), so other forms of stationarity, are common in the time series literature \citep[among others]{brockwell2009time, fan2003nonlinear} and the time series causal inference literature \citep[among others]{malinsky2018causal, pamfil2020dynotears}. Furthermore, existing Markov properties for time series also assume some form of stationarity \citep{dahlhaus2003causality, eichler2012graphical, thams2022identifying}. \textit{Strict} stationarity in particular, so Assumption (A2), is the most natural stationarity-notion for the nonlinear setting as other weaker notions such as covariance stationarity neglect higher moments \citep[Section 2.1.1]{fan2003nonlinear}. The existing Markov property closest to our work from \citet{thams2022identifying} implicitly also assumes strict stationarity as we explain in Section \ref{SectionApplications}.

Also note that nonstationarity is usually better modelled by allowing the functions in equation \eqref{sde} to be time-variant instead of considering ``artificial'' nonstationary solutions to time-invariant equations. So while stationarity (in whatever form) is a strong assumption and many real-world time series arguably violate it, stationarity within the realm of our time-invariant difference equations looks rather natural. The strong assumption rather is equation \eqref{sde}; we consider it future work to generalize it.


 

 Assumption (A3) is also common in the time series literature and, as we argue in Section \ref{SectionApplications}, implicitly made by \citet{thams2022identifying}. Assumption (A3) states that noise variables only influence present and future instances of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$. Processes that do not satisfy Assumption (A3) are known as ``non-causal'' or ``future-dependent'' \citep[Section 3.1]{brockwell2009time}. Note that a violation of (A3) also immediately implies a violation of the global Markov property for the augmented full time graph. Besides, Assumption (A3) fits well to the existing causal inference literature: In SCMs, noise variables also only influence endogenous variables further down the causal chain.

Lastly, Assumption (A4) is typical in the \textit{nonlinear} time series literature and used to prove several limit theorems \citep[Section 2.6]{fan2003nonlinear}. Assumption (A4) states that past and future instances of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ are asymptotically independent.
In terms of the functions $f_i$, the mixing condition roughly states that the noise variables play a non-negligible role. Assumption (A4) is thus similar to the structural minimality assumption from the causal inference literature \citep[Definition 2.10]{bongers2021foundations}. In comparison to other mixing conditions, Assumption (A4) is rather weak \citep{bradley2005basic}: Many other mixing conditions imply $\alpha$-strong mixing. Besides, as we will see in Section \ref{SectionApplications} and further discuss in Remark \ref{remark_var_assumptions}, $\alpha$-strong mixing is also implicitly assumed by \citet{thams2022identifying} if the noise variables have a density with respect to Lebesgue measure.

Establishing mixing conditions is rather difficult. However, there are several papers that prove mixing conditions for various processes \citep[among others]{ibragimov1978gaussian, mokkadem1988mixing,chan1994note, xia1999projection,cline1999geometric,bradley2005basic}.






\textbf{Assumption violations:} We now illustrate that one generally requires assumptions similar to (A3) and (A4) to ensure that a $d$-separation global Markov property holds: We present counterexamples for which just one assumption is violated while the other assumptions are satisfied. Doing so shows that none of Assumptions (A0)--(A4) alone suffices to establish a $d$-separation global Markov property.\footnote{For (A2), we are not able to write down such a counterexample --- for the examples we could think of at least one of the other assumptions was also always violated (see Example \ref{ex_A2_violation} in the SM for one such example).}


\underline{Violation of Assumption (A3):}\footnote{See Example \ref{ex_numerical_illus} in the SM for a numerical illustration of this assumption violation.} Let $\phi\in \mathbb{R}$ with $|\phi|>1$.
Furthermore, let $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}:=\{\epsilon^1_t,\epsilon^2_t\}_{t\in\mathbb{Z}}$ be a stochastic process satisfying (A0) and where each $\bm{\epsilon}_t$ has the same bivariate Gaussian distribution with mean zero and identity covariance matrix.
 Define the stochastic process $\{\bm{X}_t\}_{t\in \mathbb{Z}}:=\{X^{1}_t,X^{2}_t\}_{t\in \mathbb{Z}}$ for each $t\in\mathbb{Z}$ by
\begin{align*}
\label{eq_nonstable_sol}
X^{1}_t&=-\sum_{i=1}^\infty \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})\;\;\;\;\text{ and}\\
X^{2}_t&=\epsilon^{2}_{t}.\numberthis
\end{align*}
Note that $\{\phi^{-i}\}_{i\in \mathbb{N}_{\geq 1}}$ is absolutely summable and hence $X^1_t$, interpreted as a limit in mean-square, exists and is uniquely defined up to $P$-nullsets \citep[Section C.3]{lutkepohl2005new}.
Also note that Assumption (A3) is violated: For example, just consider the covariance between $X^1_t$ and $\epsilon^2_{t+1}$ which equals $-\phi^{-2}$ to see that $X^1_t$ and $\epsilon^2_{t+1}$ are dependent.

Some calculations (see Lemma \ref{lemma_solv_sde} in the SM) show that $\{\bm{X}_t\}_{t\in \mathbb{Z}}$ and $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$ solve the stochastic difference equations
\begin{align*}
\label{eq_counter_a3}
X^{1}_{t}&=\phi \cdot X^{1}_{t-1}+X^{2}_{t-1}+\epsilon^{1}_{t}\;\;\;\;\text{ and}\\
X^{2}_{t}&=\epsilon^{2}_{t}\numberthis
\end{align*}
$P$-almost surely for all $t\in\mathbb{Z}$. There are clearly no contemporaneous edges in the induced full time graph, so Assumption (A1) holds.
To verify the other assumptions, we define time reversed processes $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ and $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$ by
$(\Tilde{X}^1_t,\Tilde{X}^2_t)=(X^1_{-t},-X^2_{-t-1}/\phi)$ and $(\Tilde{\epsilon}^1_t,\Tilde{\epsilon}^2_{t})=(-\epsilon^1_{-t+1}/\phi,-\epsilon^2_{-t-1}/\phi)$.
Again, some calculations (see Lemma \ref{lemma_solv_sde} in the SM) show that $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ and $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$ satisfy the VAR stochastic difference equations
\begin{align*}
\Tilde{X}^1_{t}&=\frac{1}{\phi} \Tilde{X}^1_{t-1}+\Tilde{X}^2_{t-1}+\Tilde{\epsilon}^1_t\;\;\;\;\text{ and}\\
\Tilde{X}^2_t&=\Tilde{\epsilon}^2_{t}
\end{align*}
$P$-almost surely for all $t\in\mathbb{Z}$.
Clearly, $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$ is bivariate Gaussian with mean zero and diagonal covariance matrix.
 Moreover, $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ is a stable VAR process with independent Gaussian noise which implies that $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ is strictly stationary \citep[p.16 and Proposition 2.1]{lutkepohl2005new}. Therefore, $\{\bm{X}_t\}_{t\in \mathbb{Z}}$ is strictly stationary and so Assumption (A2) is satisfied.

Furthermore, \citet{mokkadem1988mixing} shows that $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ is $\alpha$-strongly mixing, which, together with Assumption (A2), implies that $\{\bm{X}_t\}_{t\in \mathbb{Z}}$ is $\alpha$-strongly mixing, so Assumption (A4) is satisfied (see Lemmas \ref{lemma_reversing} and \ref{lemma_shifting} in the SM for a formal proof of this implication).

In the corresponding full time graph, $X^{1}_t$ and $X^{2}_{t+1}$
are $d$-separated by the empty set, however, $X^{1}_t$ depends on $X^{2}_{t+1}=\epsilon^{2}_{t+1}$. Thus, the $d$-separation global Markov property for this solution of \eqref{eq_counter_a3} and the full time graph does not hold.

\underline{Violation of Assumption (A4):}

Take any two random variables $Z_1$ and $Z_2$ that are dependent and take any $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$ that satisfies Assumption (A0) and  $\{\bm{\epsilon}_t\}_{t\in \mathbb{Z}}\ind(Z_1,Z_2)$. Define $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ by
\begin{align*}
X^{1}_{t}&=Z_1\;\;\;\;\text{ and}\\
X^{2}_{t}&=Z_2.
\end{align*}
One can see that $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ and $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$ satisfy the stochastic difference equations
\begin{align*}
\label{eq_counter_a4}
X^{1}_{t}&=X^{1}_{t-1}\;\;\;\;\text{ and}\\
X^{2}_{t}&=X^{2}_{t-1}.\numberthis
\end{align*}
One can also see that Assumption (A1) holds because the induced full time graph clearly contains no contemporaneous edges. Also, by construction, Assumptions (A2) and (A3) hold.
However, Assumption (A4) does not hold because every $\alpha(m)$ is lower bounded by $\sup_{A\in \Sigma^X_1,B\in \Sigma^X_2}|P(Z_1\in A,\;Z_2\in B)-P(Z_1\in A)P(Z_2\in B)|$
 which is positive because $Z_1$ and $Z_2$ are dependent.

In the corresponding full time graph, $X^{1}_t$ and $X^{2}_t$ are $d$-separated by the empty set, however, by construction, they are dependent. Thus, the $d$-separation global Markov property for this solution of \eqref{eq_counter_a4} and the full time graph does not hold.


\section{Proving the Global Markov Property}
 In \textbf{Section \ref{subsec_cutoff_graphs}}, we define finite graphs that represent the stochastic processes $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ and $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ in a given finite-length time interval.
In the same section, we then show that the distribution of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ respectively $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ in this finite-length time interval is globally Markov in an $m$-separation sense with respect to the corresponding finite graph. Finally, we show in \textbf{Section \ref{subsection:remaining_proof}} that $d$-separation in the (augmented) full time graph implies $m$-separation in these finite graphs.

Our proof thus works as follows: A $d$-separation statement in the (augmented) full time graph implies an $m$-separation statement in a certain finite graph (\textbf{Lemma \ref{lemma_sep_impli}}). Due to the $m$-separation global Markov property (\textbf{Lemma \ref{m-sep_glob_markov_property}}), the respective conditional independence holds.








\subsection{Finite cutoff graphs}
\label{subsec_cutoff_graphs}
We now define cutoff graphs and augmented cutoff graphs.
\begin{definition}[(Augmented) $(t_0,n)$-cutoff graph]
		Write $q:=\max_{i\in[d]_1}q_i$.
		For given $t_0\in\mathbb{Z}$ and $n\in\mathbb{N}_{\geq 0}$,
		the $(t_0,n)$-cutoff graph is a finite graph with vertex set $\{X^{i}_t:\;t\in[t_0-n-q,t_0]\,\cap\,\mathbb{Z},\;i\in[d]_1\}$. The $(t_0,n)$-cutoff graph has directed and bidirected edges: There is a directed edge from $X^{j}_{s}$ to $X^{i}_t$ if and only if $t\geq t_0-n$ and $X^{j}_{s}$ is a causal parent of $X^{i}_t$. Moreover, there is a bidirected edge between $X^{j}_{s}$ and $X^{i}_t$ if and only if $s,t< t_0-n$ and $X^{j}_{s}$ and $X^{i}_t$ have a common ancestor in the full time graph (which might equal $X^{j}_{s}$ or $X^{i}_t$).
	
	The augmented $(t_0,n)$-cutoff graph consists of further vertices $\{\epsilon^{i}_t:\;t\in[t_0-n,t_0]\,\cap\,\mathbb{Z},\;i\in[d]_1\}$ and a directed edge from $\epsilon^{i}_t$ to $X^{i}_t$ if the former is a causal parent of the latter.
\end{definition}


\begin{remark}
	These cutoff graphs very much look like a finite version of the (augmented) full time graph: The directed edges in the (augmented) $(t_0,n)$-cutoff graph are exactly as the directed edges in the (augmented) full time graph that point to a vertex in $[t_0-n,t_0]$. Only the edges with both endpoints in the interval $[t_0-n-q,t_0-n-1]$ differ. 
\end{remark}



\begin{figure*}[t]
	\centering
	\scalebox{0.6}{		
		\begin{tikzpicture}
		\node[draw = none] (U5) at (-6,0) {\Large $X^{2}_{t_0-4}$};
	\node[draw=white, minimum size=0.8cm] (e5) at (-6,-2) {};
		\node[draw = none] (X5) at (-6,2) {\Large$X^{1}_{t_0-4}$};
		
		\node[draw = none] (U4) at (-4,0) {\Large $X^{2}_{t_0-3}$};
		\node[draw = none] (X4) at (-4,2) {\Large$X^{1}_{t_0-3}$};
		
		\node[draw = none] (U3) at (-2,0) {\Large $X^{2}_{t_0-2}$};
		\node[draw = none] (X3) at (-2,2) {\Large$X^{1}_{t_0-2}$};
		
		\node[draw = none] (U) at (0,0) {\Large$X^{2}_{t_0-1}$};
		\node[draw = none] (X) at (0,2) {\Large$X^{1}_{t_0-1}$};
		
		\node[draw = none] (U2) at (2,0) {\Large $X^{2}_{t_0}$};
		\node[draw = none] (X2) at (2,2) {\Large$X^{1}_{t_0}$};
		
		\node[draw = none] (L1) at (-5,-3.2) {\Large{auxiliary nodes}};
		\node[draw = none] (L1) at (0,-3.2) {\Large{proper nodes}};
		\draw [dashed, color = brown] (-3,-1) -- (-3,3);
		
		
		\path [->, line width = 0.5mm] (U) edge node[left] {} (U2);
		\path [->, line width = 0.5mm] (U3) edge node[left] {} (U);
		\path [->, line width = 0.5mm] (U4) edge node[left] {} (U3);
		\path [<->, line width = 0.5mm, color = gray] (U5) edge node[left] {} (U4);
		
		
		
		\path [->, line width = 0.5mm] (X) edge node[left] {} (X2);
		\path [->, line width = 0.5mm] (X3) edge node[left] {} (X);
		\path [->, line width = 0.5mm] (X4) edge node[left] {} (X3);
		\path [<->, line width = 0.5mm, color = gray] (X5) edge node[left] {} (X4);
		
		
		
		
		\path [<->, line width = 0.5mm, color = gray] (U5) edge node[left] {} (X5);
		\path [<->, line width = 0.5mm, color = gray] (U4) edge node[left] {} (X4);
		\path [<->, line width = 0.5mm, color = gray] (U5) edge node[left] {} (X4);
		\path [<->, line width = 0.5mm, color = gray] (U4) edge node[left] {} (X5);
		
		\path [->, line width = 0.5mm, color = blue] (U3) edge node[left] {} (X2);
		\path [->, line width = 0.5mm, color = blue] (U4) edge node[left] {} (X);
		\path [->, line width = 0.5mm, color = blue] (U5) edge node[left] {} (X3);
		
		
		
		\end{tikzpicture}
	}
\hspace{0.5cm}
	\scalebox{0.6}{		
		\begin{tikzpicture}
		\node[draw = none] (U5) at (-6,0) {\Large $X^{2}_{t_0-4}$};
		
		\node[draw = none] (X5) at (-6,2) {\Large$X^{1}_{t_0-4}$};
		
		
		\node[draw = none] (U4) at (-4,0) {\Large $X^{2}_{t_0-3}$};
		
		\node[draw = none] (X4) at (-4,2) {\Large$X^{1}_{t_0-3}$};
		
		
		\node[draw = none] (U3) at (-2,0) {\Large $X^{2}_{t_0-2}$};
		\node[draw = none] (e3) at (-2,-2) {\Large $\epsilon^{2}_{t_0-2}$};
		\node[draw = none] (X3) at (-2,2) {\Large$X^{1}_{t_0-2}$};
		\node[draw = none] (f3) at (-2,4) {\Large$\epsilon^{1}_{t_0-2}$};
		
		\node[draw = none] (U) at (0,0) {\Large$X^{2}_{t_0-1}$};
		\node[draw = none] (e) at (0,-2) {\Large$\epsilon^{2}_{t_0-1}$};
		\node[draw = none] (X) at (0,2) {\Large$X^{1}_{t_0-1}$};
		\node[draw = none] (f) at (0,4) {\Large$\epsilon^{1}_{t_0-1}$};
		
		\node[draw = none] (U2) at (2,0) {\Large $X^{2}_{t_0}$};
		\node[draw = none] (e2) at (2,-2) {\Large $\epsilon^{2}_{t_0}$};
		\node[draw = none] (X2) at (2,2) {\Large$X^{1}_{t_0}$};
		\node[draw = none] (f2) at (2,4) {\Large$\epsilon^{1}_{t_0}$};
		
		\node[draw = none] (L1) at (-5,-3.2) {\Large{auxiliary nodes}};
\node[draw = none] (L1) at (0,-3.2) {\Large{proper nodes}};
\draw [dashed, color = brown] (-3,-1) -- (-3,3);
		
		
		\path [->, line width = 0.5mm] (U) edge node[left] {} (U2);
		\path [->, line width = 0.5mm] (U3) edge node[left] {} (U);
		\path [->, line width = 0.5mm] (U4) edge node[left] {} (U3);
		\path [<->, line width = 0.5mm, color = gray] (U5) edge node[left] {} (U4);
		
		
		\path [->, line width = 0.5mm, color = orange] (e3) edge node[left] {} (U3);
		\path [->, line width = 0.5mm, color = orange] (e2) edge node[left] {} (U2);
		\path [->, line width = 0.5mm, color = orange] (e) edge node[left] {} (U);
		
		
		
		\path [->, line width = 0.5mm] (X) edge node[left] {} (X2);
		\path [->, line width = 0.5mm] (X3) edge node[left] {} (X);
		\path [->, line width = 0.5mm] (X4) edge node[left] {} (X3);
		\path [<->, line width = 0.5mm, color = gray] (X5) edge node[left] {} (X4);
		
		
		\path [->, line width = 0.5mm, color = orange] (f3) edge node[left] {} (X3);
		\path [->, line width = 0.5mm, color = orange] (f2) edge node[left] {} (X2);
		\path [->, line width = 0.5mm, color = orange] (f) edge node[left] {} (X);
		
		
		
		\path [->, line width = 0.5mm, color = blue] (U3) edge node[left] {} (X2);
		\path [->, line width = 0.5mm, color = blue] (U4) edge node[left] {} (X);
		\path [->, line width = 0.5mm, color = blue] (U5) edge node[left] {} (X3);
		
		\path [<->, line width = 0.5mm, color = gray] (U5) edge node[left] {} (X5);
		\path [<->, line width = 0.5mm, color = gray] (U4) edge node[left] {} (X4);
		\path [<->, line width = 0.5mm, color = gray] (U5) edge node[left] {} (X4);
		\path [<->, line width = 0.5mm, color = gray] (U4) edge node[left] {} (X5);
		
		\end{tikzpicture}
	}
	
	
	\caption{$(t_0,2)$-cutoff graph (left) and augmented $(t_0,2)$-cutoff graph (right) from Example \ref{ex1}.}
	\label{ex_cutoff}
\end{figure*}




We now introduce some terminology that applies both to $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ and $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$: For given $t_0\in\mathbb{Z}$ and $n\in\mathbb{N}_{\geq 0}$, we call the variables in the interval $[t_0-n-q,t_0-n-1]$ \textit{auxiliary} variables. Also, we call the variables in the interval $[t_0-n,t_0]$ \textit{proper} variables. Furthermore, we call the variables in the interval $(-\infty,t_0-n-q-1]$ \textit{ancient} variables and in the interval $[t_0+1,\infty)$ \textit{future} variables.

\addtocounter{example}{-1}
\begin{example}(continued)
	Let $t_0\in\mathbb{Z}$ and $n=2$. The auxiliary variables that are represented in the (augmented) $(t_0,n)$-cutoff graph are $X^{1}_{t_0-4}$, $X^{1}_{t_0-3}$, $X^{2}_{t_0-4}$ and $X^{2}_{t_0-3}$. The proper variables that are represented in the (augmented) $(t_0,n)$-cutoff graph are $X^{1}_{t_0-2}$, $X^{1}_{t_0-1}$, $X^{1}_{t_0}$, $X^{2}_{t_0-2}$, $X^{2}_{t_0-1}$, $X^{2}_{t_0}$ (and $\epsilon^{1}_{t_0-2}$, $\epsilon^{1}_{t_0-1}$, $\epsilon^{1}_{t_0}$, $\epsilon^{2}_{t_0-2}$, $\epsilon^{2}_{t_0-1}$, $\epsilon^{2}_{t_0}$).
The (augmented) $(t_0,2)$-cutoff graph is as in Figure \ref{ex_cutoff}.
\end{example}

\textbf{Districts and dependence:} One can write every proper variable as a function of the auxiliary variables and noise variables. Thus, to understand $m$-separation between proper variables in a cutoff graph, it is crucial to understand $m$-separation between the auxiliary variables.
 Similarly, to understand the dependence structure between the proper variables, it is crucial to understand the dependence structure between the auxiliary variables.

To understand $m$-separation between the auxiliary variables, the notion of \textit{districts} is important \citep[Section 3]{richardson2003markov}. 
Districts are the connected components into which one can partition the auxiliary variables of a cutoff graph:
Two auxiliary variables are in the same connected component if and only if there is a path between them in the respective cutoff graph just consisting of bidirected edges.
 
 As an important sub-result, we first show that each district is independent of the other districts. As we will see later, this sub-result then enables us to prove a global $m$-separation Markov property for the (augmented) $(t_0,n)$-cutoff graph and the corresponding distribution. The following lemma leads us to this sub-result.

\begin{lemma}[Extended Reichenbach's common cause principle for (augmented) full time graphs]
	\label{lemma_reichenbach}
	Let Assumptions (A0)--(A4) hold.
	Let $J_1,J_2\subseteq [d]_1\times\mathbb{Z}$ be finite disjoint index sets and define $\bm{S}_1:=\{X^{i}_t:(i,t)\in J_1\}$ and $\bm{S}_2:=\{X^{i}_t:(i,t)\in J_2\}$. If $\bm{S}_1$ and $\bm{S}_2$ are dependent, then $\bm{S}_1$ and $\bm{S}_2$ have a common ancestor in the full time graph (which might be in either $\bm{S}_1$ or $\bm{S}_2$).
\end{lemma}
\begin{proof}
	We here just provide a proof idea; for the details, see Section \ref{sm_proof_3} in the SM.
	
	Notational remark: For $\Delta\in\mathbb{N}_{\geq 0}$, let $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ be the sets of variables that are constructed by time-shifting all elements of $\bm{S}_1$ respectively $\bm{S}_2$ by $\Delta$ time steps to the future.
	
	Proof by contradiction: Suppose that $\bm{S}_1$ and $\bm{S}_2$ are dependent but do not have a common ancestor in the full time graph. Now because of the acyclicity assumption (A1), for all $\Delta\in\mathbb{N}_{\geq 0}$, we can write $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ as functions of the auxiliary and noise variables (with respect to some $t_0\in\mathbb{Z}$ and $n\in\mathbb{N}_{\geq 0}$ such that all elements of $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ are proper). Dependence between $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ can just be due to dependence between the auxiliary variables, because otherwise, either Assumption (A3) would be violated or $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$, and hence, $\bm{S}_1$ and $\bm{S}_2$ would have a common ancestor in the full time graph, contradiction. 
	Thus, schematically, the dependence between $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ is the dependence between $\bm{S}_{1,\Delta}$ and the auxiliary variables combined with the dependence between $\bm{S}_{2, \Delta}$ and the auxiliary variables. Now, $\alpha$-strong mixing implies that the dependence strength between $\bm{S}_{1,\Delta}$ respectively $\bm{S}_{2, \Delta}$ and the auxiliary variables, and thus, the dependence strength between $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ eventually decreases for large enough $\Delta$. That fact, however, is a contradiction, because by strict stationarity the dependence strength between $\bm{S}_{1,\Delta}$ and $\bm{S}_{2, \Delta}$ is the same as that between $\bm{S}_1$ and $\bm{S}_2$ for all $\Delta\in\mathbb{N}_{\geq 0}$.
\end{proof}
Lemma \ref{lemma_reichenbach} immediately implies that each district is independent of the other districts: If this fact had not been true, then there would be a common ancestor between at least two districts in the (augmented) full time graph and hence, a bidirected edge between two districts in the (augmented) $(t_0,n)$-cutoff graph. This bidirected edge implies that these two separate districts are in the same district, contradiction.

Equipped with this fact about districts, we now prove an $m$-separation global Markov property for the (augmented) $(t_0,n)$-cutoff graph and the corresponding distribution.

\begin{lemma}[Global $m$-separation Markov property for the (augmented) $(t_0,n)$-cutoff graphs and corresponding distributions]
	\label{m-sep_glob_markov_property}
	For all $t_0\in\mathbb{Z}$ and for all $n\in\mathbb{N}_{\geq 0}$, a global $m$-separation Markov property holds for the $(t_0,n)$-cutoff graph and corresponding distribution of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$.
	That is, for any finite disjoint index sets $J_1,J_2,J_3\subseteq [d]_1\times([t_0-n-q,t_0]\cap \mathbb{Z})$ such that $\bm{S}_3:=\{X^{i}_t:(i,t)\in J_3\}$ $m$-separates $\bm{S}_1:=\{X^{i}_t:(i,t)\in J_1\}$ and $\bm{S}_2:=\{X^{i}_t:(i,t)\in J_2\}$ in the $(t_0,n)$-cutoff graph, $\bm{S}_1\ind \bm{S}_2\mid \bm{S}_3$.
	
	An analogous result holds for the augmented $(t_0,n)$-cutoff graph and corresponding distribution of $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$.
\end{lemma}

\begin{proof}
	We here again just provide a proof idea; for the details, see Section \ref{sm_proof_2} in the SM.
	
	The proof is similar to the proof for SCMs with mutually independent exogenous variables \citep{pearl2009causality, janzing2010causal}: First, we prove an $m$-separation local Markov property for the augmented $(t_0,n)$-cutoff graph \citep[Section 3]{richardson2003markov}. This local Markov property requires us to show that a proper vertex is independent of its nondescendants without its parents given its parents. This case works as in the original SCM-proof and is rather immediate. Moreover, we need to show that an auxiliary vertex is independent of its nondescendants given the other auxiliary variables in its district (roughly speaking). As one can write the nondescendants of that auxiliary vertex as a function of the other auxiliary vertices, this independence question boils down to whether separate districts are independent of each other. Lemma \ref{lemma_reichenbach} answers that question.
	
	Now, this local Markov property implies an $m$-separation global Markov property for the augmented graph and corresponding distribution \citep[Theorem 2]{richardson2003markov}. Finally, because each noise variable is only adjacent to at most one endogenous variable, we conclude that $m$-separation in the not-augmented graph implies $m$-separation in the augmented graph. From this fact, the $m$-separation global Markov property for the not-augmented graph follows.
\end{proof}







\subsection{$d$-separation in the full time graph implies $m$-separation in the finite cutoff-graphs}
\label{subsection:remaining_proof}


In the following lemma, we show that $d$-separation in the (augmented) full time graph implies $m$-separation in all sufficiently large (augmented) cutoff graphs.
\begin{lemma}
	\label{lemma_sep_impli}
	Let Assumptions (A0)--(A4) hold. For any finite disjoint index sets $J_1,J_2,J_3\subseteq [d]_1\times\mathbb{Z}$ such that $\bm{S}_3:=\{X^{i}_t:(i,t)\in J_3\}$ $d$-separates $\bm{S}_1:=\{X^{i}_t:(i,t)\in J_1\}$ and $\bm{S}_2:=\{X^{i}_t:(i,t)\in J_2\}$ in the full time graph, it holds that 
	$\bm{S}_3$ $m$-separates $\bm{S}_1$ and $\bm{S}_2$ in all $(t_0,n)$-cutoff graphs for which all occurring vertices are proper.
	
		An analogous result including noise variables $\epsilon^{i}_t$ holds true for the augmented full time graph and the augmented cutoff-graphs.
\end{lemma}

\begin{proof}
The following proof works for both the non-augmented and the augmented case:
Let $t_0\in\mathbb{Z}$ and $n\in\mathbb{N}_{\geq 0}$ be such that
all elements of $\bm{S}_1$, $\bm{S}_2$ and $\bm{S}_3$ are proper.

Proof by contraposition: Assume that $\bm{S}_1$ and $\bm{S}_2$ are $m$-connected given $\bm{S}_3$ in the (augmented) $(t_0,n)$-cutoff graph.
Then, there exists an $m$-connecting path $\pi^{t_0,n}$ between $\bm{S}_1$ and $\bm{S}_2$ given $\bm{S}_3$ in the (augmented) $(t_0,n)$-cutoff graph. We now construct a $d$-connecting path $\pi$ between $\bm{S}_1$ and $\bm{S}_2$ given $\bm{S}_3$ in the (augmented) full time graph: All directed edges and all vertices incident to these directed edges are copied from $\pi^{t_0,n}$ to $\pi$. All remaining auxiliary vertices in $\pi^{t_0,n}$ are also taken to $\pi$. The bidirected edges between two auxiliary vertices in $\pi^{t_0,n}$ are replaced by collider-free subpaths which only consist of auxiliary and ancient vertices (which is possible by construction of these bidirected edges); it does not matter which particular subpaths one takes. 


The proper vertices in $\pi$ are distinct because the proper vertices in $\pi^{t_0,n}$ are distinct and because the collider-free subpaths that replace the bidirected edges do not contain proper vertices. However, the auxiliary and ancient vertices in $\pi$ are not necessarily distinct. To make $\pi$ a path, we repeatedly apply the following procedure until all vertices in $\pi$ are unique:  Take some non-distinct auxiliary or ancient vertex $v$ in $\pi$ (the order of the non-distinct vertices does not matter). Delete all vertices and corresponding edges after the first occurrence of $v$ in $\pi$ until the last occurrence of $v$ in $\pi$. Then, stitch together the two remaining parts of $\pi$. That is, if $u\astast v$ is the edge corresponding to the first occurrence of $v$ and $v\astast w$ is the edge corresponding to the last occurrence of $v$, then $\pi$ will contain the triplet $u\astast v\astast w$ with all vertices and edges in between deleted.

If $v$ is a collider in $u\astast v\astast w$, so $u\rightarrow v\leftarrow w$, then $v$ is an ancestor of a vertex $\Tilde{v}$ in the (augmented) full time graph such that $\Tilde{v}$ is a collider in $\pi^{t_0,n}$: The vertex $v$ is only non-unique in the initial (so before deletion) $\pi$ if it occurs on at least two of the collider-free common ancestor subpaths.  Consider the two collider-free common ancestor subpaths corresponding to the first and last occurrence of $v$ in the initial $\pi$, and consider the two corresponding bidirected edges $\Tilde{u}_1 \leftrightarrow \Tilde{u}_2$ and $\Tilde{w}_1 \leftrightarrow \Tilde{w}_2$ (see Figure \ref{Fig_schem_illus}a for a schematic illustration).
Without loss of generality, assume that $\pi^{t_0,n}$ first visits $\Tilde{u}_2$, then $\Tilde{u}_1$, then $\Tilde{w}_1$ (which might equal $\Tilde{u}_1$) and then $\Tilde{w}_2$. Suppose now that $v$ is not an ancestor of $\Tilde{u}_1$ in the (augmented) full time graph. Then, on the collider-free subpath from $\Tilde{u}_2$ to $\Tilde{u}_1$, the initial $\pi$ first comes to $v$ and then to $u$ (Figure \ref{Fig_schem_illus}b), because otherwise, there would either not be the edge $u\rightarrow v$ or $v$ would be an ancestor of $\Tilde{u}_1$ (note that both $v$ and $u$ occur \textit{exactly} once on the collider-free subpath between $\Tilde{u}_2$ and $\Tilde{u}_1$ because that subpath is a \textit{path}). Thus, the last edge before the first occurrence of $v$ is not $u\rightarrow v$ (in Figure  \ref{Fig_schem_illus}b it is $v\rightarrow r\neq u$) and hence, after deletion, the triplet $u\astast v \astast w$ cannot occur in $\pi$, contradiction. Thus, $v$ is an ancestor of $\Tilde{u}_1$ (Figure \ref{Fig_schem_illus}c). In $\pi^{t_0,n}$, the vertex $\Tilde{u}_1$ is either incident to another bidirected edge (Figure \ref{Fig_schem_illus}d), in which case it is a collider, or it is connected to $\Tilde{w}_1$ via a path containing proper vertices and thus, at least one collider (Figure \ref{Fig_schem_illus}e). In the first case, let $\Tilde{v}:=\Tilde{u}_1$, in the second case, let $\Tilde{v}$ be the first collider on $\pi^{t_0,n}$ occurring after $\Tilde{u}_1$.


We now show that no triplet $a\astast b\astast c$ in $\pi$ is $d$-blocked by $\bm{S}_3$.\newline
 \noindent \textbf{Case 1: $b$ is a proper vertex:} In this case, $a$ and $c$ are proper or auxiliary vertices as there are no edges from ancient to proper vertices. Thus, $a$, $b$ and $c$ exist in the (augmented) $(t_0,n)$-cutoff graph.
 The edges between $a$, $b$ and $c$ in the (augmented) full time graph are exactly as the edges between $a$, $b$ and $c$ in the (augmented) $(t_0,n)$-cutoff graph because edges incident to a proper vertex are always directed. Also, $a\astast b\astast c$ occurs in $\pi^{t_0,n}$ by construction of $\pi$. Hence, $b$ is a collider in this triplet in $\pi$ if and only if $b$ is a collider in this triplet in $\pi^{t_0,n}$.
 
 
 If $b$ is a collider in this triplet in $\pi^{t_0,n}$, then a descendant of $b$ in the (augmented) $(t_0,n)$-cutoff graph is in $\bm{S}_3$. Hence, a descendant of $b$ in the (augmented) full time graph is in $\bm{S}_3$. 
 If $b$ is a non-collider in this triplet in $\pi^{t_0,n}$, then $b\notin \bm{S}_3$.
 
 \noindent \textbf{Case 2: $b$ is an auxiliary or ancient vertex:}
 Note that $\bm{S}_3$ only contains proper vertices, so $b\notin \bm{S}_3$. Thus, if $b$ is a non-collider in $a\astast b\astast c$ in $\pi$, then $a\astast b\astast c$ is not $d$-blocked by
 $\bm{S}_3$.
 If $b$ is a collider in $a\astast b\astast c$ in $\pi$, then $a$ and $c$ are also auxiliary or ancient.
 Moreover, $b$ is either at the end- respectively startpoint of two collider-free common ancestor subpaths, in which case $b$ is incident to two bidirected edges and hence a collider in $\pi^{t_0,n}$ (see Figure \ref{Fig_schem_illus_Case2}a), or $b$ has been introduced when applying the above-mentioned procedure for removing non-unique vertices (see Figure \ref{Fig_schem_illus_Case2}b). In the first case, $b$ has a descendant in $\bm{S}_3$ in the (augmented) $(t_0,n)$-cutoff graph and thus in the (augmented) full time graph. In the latter case, as already illustrated in Figure \ref{Fig_schem_illus} and the corresponding paragraph, $b$ is an ancestor of a vertex $\Tilde{b}$ in the (augmented) full time graph such that $\Tilde{b}$ is a collider in $\pi^{t_0,n}$. Because $\Tilde{b}$ has a descendant in $\bm{S}_3$ in the (augmented) $(t_0,n)$-cutoff graph, $\Tilde{b}$ and hence $b$ has a descendant in $\bm{S}_3$ in the (augmented) full time graph.
\end{proof}

\section{Application to VAR processes}
\label{SectionApplications}
In this section, we illustrate that (A0)--(A4) hold  for stable finite-order VAR processes with independent finite-second-moment noise that has a density with respect to Lebesgue measure (in Remark \ref{remark_var_assumptions} we compare these assumptions to \citet{thams2022identifying}). For the basics on VAR processes, we refer to Chapter 2 and Section C.3 of \citet{lutkepohl2005new}.


	Let  $q\in\mathbb{N}_{\geq 0}$ be the order of the VAR process.	
	Let $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$ be a stochastic process satisfying (A0) and for which $P_{\bm{\epsilon}_t}$  has mean zero, finite second moments, and is absolutely continuous with respect to Lebesgue measure.
	Moreover, consider $\mathbb{R}^{d\times d}$-matrices $\bm{A}^{(1)},\ldots,\bm{A}^{(q)}$ which satisfy the \textit{stability condition}, that is, for which
	$\det(\bm{I_d} - \bm{A}^{(1)}\lambda-\bm{A}^{(2)}\lambda^2-\cdots -\bm{A}^{(q)}\lambda^q)=0$ implies
	  $|\lambda| >  1$.
Now, let $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ be a stochastic process such that
	\begin{align}
	\label{var_process}
	\bm{X}_t	= \bm{A}^{(1)}\bm{X}_{t-1}+\cdots+\bm{A}^{(q)}\bm{X}_{t-q} + \bm{\epsilon}_t
	\end{align}
$P$-almost surely for all $t\in\mathbb{Z}$.
	
By definition, $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is a stable VAR process and thus has a moving average (MA) representation which just depends on past and present noise terms: That is, one can write $\bm{X}_t=\sum_{i=0}^\infty \bm{\Phi}_i\bm{\epsilon}_{t-i}$ where each $\bm{\Phi}_i\in\mathbb{R}^{d\times d}$ and such that 
	$\{(\bm{\Phi}_i)_{k,l}\}_{i\in \mathbb{N}_{\geq 0}}$ is absolutely summable for all $k,l\in[d]_1$. Here, the infinite sum is meant as a limit in mean-square which is uniquely defined up to $P$-nullsets.
	
This MA representation allows us to verify Assumption (A2): By Assumption (A0), for all finite index sets $J\subseteq \mathbb{Z}$ and for all $N\in\mathbb{N}_{\geq 0}$, the joint distribution of $\{\bm{\epsilon}_{t-i}:\;t\in J,\;i\in[N]_0\}$ is time-shift invariant. Therefore,  for all finite index sets $J\subseteq \mathbb{Z}$ and for all $N\in\mathbb{N}_{\geq 0}$, the distribution of $\{\sum_{i=0}^N \bm{\Phi}_i\bm{\epsilon}_{t-i}:\;t\in J\}$ is also time-shift invariant. Because these individual sums converge in mean-square,
	$\{\sum_{i=0}^N \bm{\Phi}_i\bm{\epsilon}_{t-i}:\;t\in J\}$ also converges in mean square  and thus in distribution to $\{\bm{X}_t:\;t\in J\}$. Therefore, the distribution of $\{\bm{X}_t:\;t\in J\}$ is also time-shift invariant (see Lemma \ref{lemma_equal_dist} in the SM for a formal proof of this implication). 	

 The MA representation also allows us to verify Assumption (A3):
 By Assumption (A0), for all finite index sets $J\subseteq \mathbb{Z}$ and for all $N\in\mathbb{N}_{\geq 0}$, the set $\{\bm{\epsilon}_{t-i}:\;t\in J,\;i\in[N]_0\}$
 is independent of any set of $\bm{\epsilon}_t$-variables that lie strictly in its future. Thus, also $\{\sum_{i=0}^N \bm{\Phi}_i\bm{\epsilon}_{t-i}:\;t\in J\}$ is independent of any set of $\bm{\epsilon}_t$-variables that all lie strictly in its future. By Lemma \ref{lemma_convergence} in the SM, the same also holds for $\{\bm{X}_t:\;t\in J\}$. 

Assumptions (A1) and (A4) are more immediate: (A1) follows because the full time graph does not have contemporaneous edges. (A4) follows from \citet{mokkadem1988mixing}. 

\begin{remark}
\label{remark_nonstability}
There are stationary nonstable VAR processes for which the global Markov property does not hold --- just recall our example for a violation of Assumption (A3) from Section \ref{sec:futher_technical_things}. To see that this $2$-dimensional VAR(1) process is not stable, note that
\begin{align*}
 \bm{A}^{(1)}   = \begin{pmatrix}
 \phi & 1 \\
 0 & 0
 \end{pmatrix}
\end{align*}
and thus,  $\det(\bm{I_d} - \bm{A}^{(1)}\lambda)=(1-\lambda\phi)$. Because $(1-\lambda\phi)$ has the root $\lambda=1/\phi$ and because $|\phi|>1$ by assumption, this VAR(1) process is not stable.
\end{remark}	

\begin{remark}
\label{remark_var_assumptions}
The global Markov property for VAR processes from \citet{thams2022identifying} requires stability and finite second moments and independence of the noise variables, as we do. As explained in Section \ref{SectionApplications} above, these assumptions together with equation \eqref{var_process} imply Assumptions (A0)--(A3).

However, \citet{thams2022identifying} do not require that $P_{\bm{\epsilon}_t}$ is absolutely continuous with respect to Lebesgue measure, which we do (but only for VAR processes and not for our general result!). We only need absolute-continuity to establish $\alpha$-strong mixing (note that there are stable VAR processes for which this absolute-continuity assumption does not hold and which are not strongly mixing \citep[Section 2.3]{doukhan1994mixing}).  \citet{thams2022identifying} get around this absolute-continuity assumption because they directly employ the moving average representation of VAR processes in their proof. For nonlinear processes, such a moving average representation is typically not available, and a natural generalization is to impose mixing assumptions \citep[Section 2.6]{fan2003nonlinear}. Our absolute-continuity assumption for VAR processes thus seems to be inherent to the more general nonlinear setting we consider.
\end{remark}
	
	\section{Conclusion and Outlook}
	
	In this paper, we proved a $d$-separation global Markov property for certain solutions of stochastic difference equations and the corresponding (augmented) full time graphs when $t\in\mathbb{Z}$. To the best of our knowledge, we presented the most general version of this result so far. The assumptions we made are natural and typical in the causal inference and time series literature. Furthermore, we illustrated why such assumptions are needed by investigating assumption violations.
Our work is thus relevant whenever one causally models time series with $t\in\mathbb{Z}$ --- a case that naturally arises when modelling equilibriums, for example.

	In the future, loosening the assumptions, especially strict stationarity (A2), would be good. Here, we imagine that our result for the stationary case helps in establishing results for the nonstationary case as nonstationarity is often reduced to the stationary case (e.g., by assuming piecewise stationarity or by considering transformations of the nonstationary time series). One could also study the notion of faithfulness or even introduce the notion of time series SCMs and define what interventions and counterfactuals mean, similar in spirit to \citet{bongers2021foundations}. 
	




\begin{figure*}
\center
	\begin{subfigure}{.33\linewidth}
	\center
	\scalebox{0.7}{		
		\begin{tikzpicture}

		\node[draw = none] (v) at (0,0) {\Large$v$};
		\node[draw = none] (u) at (-1,-1) {\Large$u$};
		\node[draw = none] (w) at (-1,1) {\Large$w$};
		
		\node[draw = none] (Tu1) at (2,-1) {\Large$\Tilde{u}_1$};
		\node[draw = none] (Tu2) at (2,-2.41) {\Large$\Tilde{u}_2$};
		\node[draw = none] (Tw2) at (2,2.41) {\Large$\Tilde{w}_2$};
		\node[draw = none] (Tw1) at (2,1) {\Large$\Tilde{w}_1$};
		
		\node[draw = none] (Q1) at (0.75,-1) {\Large$?$};
		\node[draw = none] (Q2) at (0.75,1) {\Large$?$};
		\node[draw = none] (Q3) at (2,0) {\Large$?$};
		
		\path [->, line width = 0.5mm, color = red] (u) edge node[left] {} (v);
		\path [->, line width = 0.5mm, color = red] (w) edge node[left] {} (v);
	
		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Tu2);
		\path [<->, line width = 0.5mm, color = blue] (Tw1) edge node[left] {} (Tw2);
		\end{tikzpicture}
	}
	\caption{}
	\end{subfigure}
%\hspace{0.5cm}
\begin{subfigure}{.33\linewidth}
\center
	\scalebox{0.7}{		
	\begin{tikzpicture}

		\node[draw = none] (v) at (0,0) {\Large$v$};
		\node[draw = none] (u) at (-1,-1) {\Large$u$};
		\node[draw = none] (w) at (-1,1) {\Large$w$};
		
		\node[draw = none] (r) at (0.900444,-1.08503) {\Large$r$};
		
		\node[draw = none] (Tu1) at (2,-1) {\Large$\Tilde{u}_1$};
		\node[draw = none] (Tu2) at (2,-2.41) {\Large$\Tilde{u}_2$};
		\node[draw = none] (Tw2) at (2,2.41) {\Large$\Tilde{w}_2$};
		\node[draw = none] (Tw1) at (2,1) {\Large$\Tilde{w}_1$};
		
        \node[draw = none] (Q2) at (0.75,1) {\Large$?$};
		\node[draw = none] (Q3) at (2,0) {\Large$?$};
		
		\path [->, line width = 0.5mm, color = red] (u) edge node[left] {} (v);
		\path [->, line width = 0.5mm, color = red] (w) edge node[left] {} (v);
	
		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Tu2);
		\path [<->, line width = 0.5mm, color = blue] (Tw1) edge node[left] {} (Tw2);
		
		\path [->, line width = 0.5mm, color = red, loosely dotted] (r) edge node[left] {} (Tu2);
		\path [->, line width = 0.5mm, color = red] (v) edge node[left] {} (r);
		\end{tikzpicture}
	}
	\caption{}
\end{subfigure}	
\begin{subfigure}{.33\linewidth}
\center
	\scalebox{0.7}{		
	\begin{tikzpicture}

		\node[draw = none] (v) at (0,0) {\Large$v$};
		\node[draw = none] (u) at (-1,-1) {\Large$u$};
		\node[draw = none] (w) at (-1,1) {\Large$w$};
		

		
		\node[draw = none] (Tu1) at (2,-1) {\Large$\Tilde{u}_1$};
		\node[draw = none] (Tu2) at (2,-2.41) {\Large$\Tilde{u}_2$};
		\node[draw = none] (Tw2) at (2,2.41) {\Large$\Tilde{w}_2$};
		\node[draw = none] (Tw1) at (2,1) {\Large$\Tilde{w}_1$};
		
        \node[draw = none] (Q2) at (0.75,1) {\Large$?$};
		\node[draw = none] (Q3) at (2,0) {\Large$?$};
		
		\path [->, line width = 0.5mm, color = red] (u) edge node[left] {} (v);
		\path [->, line width = 0.5mm, color = red] (w) edge node[left] {} (v);
	
		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Tu2);
		\path [<->, line width = 0.5mm, color = blue] (Tw1) edge node[left] {} (Tw2);
		
		\path [->, line width = 0.5mm, color = red, loosely dotted] (v) edge node[left] {} (Tu1);

		\end{tikzpicture}
	}
	\caption{}
\end{subfigure}	
\center
\begin{subfigure}{.33\linewidth}
\center
	\scalebox{0.7}{		
	\begin{tikzpicture}

		\node[draw = none] (v) at (0,0) {\Large$v$};
		\node[draw = none] (u) at (-1,-1) {\Large$u$};
		\node[draw = none] (w) at (-1,1) {\Large$w$};
		

		
		\node[draw = none] (Tu1) at (2,-1) {\Large$\Tilde{u}_1 =: \Tilde{v}$};
		\node[draw = none] (Tu2) at (2,-2.41) {\Large$\Tilde{u}_2$};
		\node[draw = none] (Tw2) at (2,2.41) {\Large$\Tilde{w}_2$};
		\node[draw = none] (Tw1) at (2,1) {\Large$\Tilde{w}_1$};
		
        \node[draw = none] (Q2) at (0.75,1) {\Large$?$};
		\node[draw = none] (Q3) at (2,0) {};
		
		\path [->, line width = 0.5mm, color = red] (u) edge node[left] {} (v);
		\path [->, line width = 0.5mm, color = red] (w) edge node[left] {} (v);
	
		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Tu2);
		\path [<->, line width = 0.5mm, color = blue] (Tw1) edge node[left] {} (Tw2);
		
		\path [->, line width = 0.5mm, color = red, loosely dotted] (v) edge node[left] {} (Tu1);

		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Q3);
		\path [-, line width = 0.5mm, color = blue, loosely dotted] (Tw1) edge node[left] {} (Q3);
		\end{tikzpicture}
	}
	\caption{}
\end{subfigure}	
%\hspace{2cm}
\begin{subfigure}{.33\linewidth}
\center
	\scalebox{0.7}{		
	\begin{tikzpicture}

		\node[draw = none] (v) at (0,0) {\Large$v$};
		\node[draw = none] (u) at (-1,-1) {\Large$u$};
		\node[draw = none] (w) at (-1,1) {\Large$w$};
		

		
		\node[draw = none] (Tu1) at (2,-1) {\Large$\Tilde{u}_1$};
		\node[draw = none] (Tu2) at (2,-2.41) {\Large$\Tilde{u}_2$};
		\node[draw = none] (Tw2) at (2,2.41) {\Large$\Tilde{w}_2$};
		\node[draw = none] (Tw1) at (2,1) {\Large$\Tilde{w}_1$};
		
        \node[draw = none] (Q2) at (0.75,1) {\Large$?$};
		\node[draw = none] (Q3) at (2,0) {};
		
        \node[draw = none] (Tv) at (4,0) {\Large$\Tilde{v}$};
         \node[draw = none] (i) at (3,0.5) {};
		
		\path [->, line width = 0.5mm, color = red] (u) edge node[left] {} (v);
		\path [->, line width = 0.5mm, color = red] (w) edge node[left] {} (v);
	
		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Tu2);
		\path [<->, line width = 0.5mm, color = blue] (Tw1) edge node[left] {} (Tw2);
		
		\path [->, line width = 0.5mm, color = red, loosely dotted] (v) edge node[left] {} (Tu1);

		\path [->, line width = 0.5mm, color = violet, loosely dotted] (Tu1) edge node[left] {} (Tv);
		\path [->, line width = 0.5mm, color = violet] (i) edge node[left] {} (Tv);
		\path [-, line width = 0.5mm, color = violet, loosely dotted] (Tw1) edge node[left] {} (i);

		\end{tikzpicture}
	}
	\caption{}
\end{subfigure}	

	\caption{Schematic illustrations for the proof of Lemma \ref{lemma_sep_impli}: These schematic plots illustrate bidirected edges in the cutoff-graph and the snippets in the full time graph that replace them. Here, red edges/paths just occur in the (augmented) full time graph, blue edges/paths just in the (augmented) cutoff graph, and purple edges/paths appear in both.  Also, $\dottedarrow$ stands for directed paths, $\cdots$ stands for paths, $\cdots\rightarrow$ stands for a path where the last edge is $\rightarrow$ and $\cdots\leftrightarrow$ stands for a path where the last edge is $\leftrightarrow$. A question mark indicates that the corresponding graphical structure around it is unknown. More detailed explanations are given in the proof.}
	\label{Fig_schem_illus}
\end{figure*}


\begin{figure*}
\center
\begin{subfigure}{.5\linewidth}
\center
\scalebox{0.7}{		
	\begin{tikzpicture}
	
			\node[draw = none] (z1) at (0,-2.41) {};
			\node[draw = none] (z2) at (0,2.41) {};
	
			\node[draw = none] (x1) at (0,-1.41) {};
		\node[draw = none] (b) at (0,0) {\Large$b$};
		\node[draw = none] (x2) at (0,1.41) {};
	
		\node[draw = none] (a) at (-1,-1) {\Large$a$};	
		\node[draw = none] (c) at (-1,1) {\Large$c$};	
		
		\node[draw = none] (S3) at (2,0) {\Large$\bm{S}_3$};

		\path [<->, line width = 0.5mm, color = blue] (x1) edge node[left] {} (b);
		\path [<->, line width = 0.5mm, color = blue] (b) edge node[left] {} (x2);
		
		\path [->, line width = 0.5mm, color = red] (a) edge node[left] {} (b);
		\path [->, line width = 0.5mm, color = red] (c) edge node[left] {} (b);
		
		\path [->, line width = 0.5mm, color = purple, loosely dotted] (b) edge node[left] {} (S3);
	\end{tikzpicture}
}
\caption{}
\end{subfigure}
\hspace{-1cm}
\begin{subfigure}{.5\linewidth}
\center
	\scalebox{0.7}{		
	\begin{tikzpicture}

		\node[draw = none] (b) at (0,0) {\Large$b$};
		\node[draw = none] (a) at (-1,-1) {\Large$a$};
		\node[draw = none] (c) at (-1,1) {\Large$c$};
		

		
		\node[draw = none] (Tu1) at (2,-1) {};
		\node[draw = none] (Tu2) at (2,-2.41) {};
		\node[draw = none] (Tw2) at (2,2.41) {};
		\node[draw = none] (Tw1) at (2,1) {};
		
        \node[draw = none] (Q2) at (0.75,1) {\Large$?$};
		\node[draw = none] (Q3) at (2,0) {};
		
        \node[draw = none] (Tv) at (4,0) {\Large$\Tilde{b}$};
         \node[draw = none] (i) at (3,0.5) {};
		
		\node[draw = none] (S3) at (6,0) {\Large$\bm{S}_3$};		
		
		\path [->, line width = 0.5mm, color = red] (a) edge node[left] {} (b);
		\path [->, line width = 0.5mm, color = red] (c) edge node[left] {} (b);
	
		\path [<->, line width = 0.5mm, color = blue] (Tu1) edge node[left] {} (Tu2);
		\path [<->, line width = 0.5mm, color = blue] (Tw1) edge node[left] {} (Tw2);
		
		\path [->, line width = 0.5mm, color = red, loosely dotted] (b) edge node[left] {} (Tu1);

		\path [->, line width = 0.5mm, color = violet, loosely dotted] (Tu1) edge node[left] {} (Tv);
		\path [->, line width = 0.5mm, color = violet] (i) edge node[left] {} (Tv);
		\path [-, line width = 0.5mm, color = violet, loosely dotted] (Tw1) edge node[left] {} (i);
		
		\path [->, line width = 0.5mm, color = violet, loosely dotted] (Tv) edge node[left] {} (S3);

		\end{tikzpicture}
	}
	\caption{}
\end{subfigure}	

\caption{Schematic illustration for Case $2$ in the proof of Lemma \ref{lemma_sep_impli}. The notation is as in Figure \ref{Fig_schem_illus}. Further explanations are again given in the proof.}
\label{Fig_schem_illus_Case2}
\end{figure*}



%\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
   % Briefly list author contributions. 
    %This is a nice way of making clear %who did what and to give proper %credit.
   % This section is optional.

   %% H.~Q.~Bovik conceived the idea and wrote the paper.
    %5Coauthor One created the code.
    %5Coauthor Two created the figures.
%\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
We thank the anonymous reviewers for their helpful feedback and suggestions.
J.R.\ received funding from the European Research Council (ERC) Starting Grant CausalEarth under the European Union’s Horizon 2020 research and innovation program (Grant Agreement No.\ 948112) and J.R.\ also from No.\ 101003469 (XAIDA).
\end{acknowledgements}

% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{A Global Markov Property for Solutions of Stochastic Difference Equations and the corresponding Full Time Graphs\\(Supplementary Material)}
\maketitle





\appendix

\section{Basic graphical definitions}
\label{sec_basic_graphical_defs}
In this section, we review some basic graphical notions that we use throughout the paper. We mainly follow \citet{richardson2002ancestral}, \citet{richardson2003markov} and \citet{foygel2012half}.

\textbf{Mixed graphs:} A mixed graph $\mathcal{G}$ is a tuple $(V,D,B)$ where $V$ is a set of countably many vertices and $D,B\subseteq V\times V$ are the sets of directed and bidirected edges, respectively. For the set of directed edges $D$, we identify $(v,w)\in D$ with $v\rightarrow w$. Note that $(v,w)\in D$ does not necessarily imply $(w,v)\in D$. For the set of bidirected edges, $(v,w)\in B$ if and only if $(w,v)\in B$ and we identify these two tuples with the bidirected edge $v\leftrightarrow w$. There are no self-loops in either the directed or bidirected edge set, that is, for all $v\in V$, it holds that $(v,v)\notin D\cup B.$ For an edge $e=v\rightarrow w$ or $e=v\leftrightarrow w,$ we say that $v$ and $w$ are incident to $e$ and that $e$ has endpoints $v$ and $w$. If two vertices $v$ and $w$ are connected by a directed or bidirected edge, then we say that $v$ and $w$ are adjacent. For a directed edge $v\rightarrow w$, we also say that $v$ points towards $w$. For a subset of vertices $A\subseteq V$ and a graph $G$, the induced subgraph $G_A$ has vertex set $A$ and all edges from $G$ with both endpoints in $A$. If there is the edge $v\rightarrow w$, then we call $v$ a parent of $w$ and $w$ a child of $v$. For a vertex $v$, we abbreviate the set of all its parents and all its children by $\text{pa}(v)$ and $\text{ch}(v)$, respectively. For a set of vertices $A\subseteq V$, we define $\text{pa}(A):=\bigcup_{v\in A}\text{pa}(v)$ and $\text{ch}(A):=\bigcup_{v\in A}\text{ch}(v)$. 

\textbf{Walks and paths:} A walk $\pi$ from a vertex $v$ to a vertex $w$ is a \textit{finite} sequence of edges, that is, $\pi=(e_1,\ldots,e_n)$ for some $n\in \mathbb{N}_{\geq 0}$, such that there exists a sequence of (not necessarily distinct) vertices $(v=v_1,\ldots,v_{n+1}=w)$ such that the edge $e_i$ has endpoints $v_i$ and $v_{i+1}$. If all vertices on $\pi$ are distinct, then we call $\pi$ a path. If $n=0$, then we say that $\pi$ is trivial. A path $\pi$ is called a directed path if $e_1,\ldots,e_n$ are directed and point from $v_i$ to $v_{i+1}$. A walk $\pi$ is called a directed cycle if $\pi$ is non-trivial, $e_1,\ldots,e_n$ are directed and point from $v_i$ to $v_{i+1}$, and $v_1=v_{n+1}$. For two sets of vertices $A,B\subseteq V$, we say that there is a path between them if there is a vertex $v\in A$ and a vertex $w\in B$ such that there is a path between $v$ and $w$.

\textbf{Ancestors and descendants:} A vertex $v$ is called an ancestor of another vertex $w$ and $w$ is called a descendant of $v$ if $v=w$ or if there is a directed path from $v$ to $w$. The set of all ancestors and descendants of a vertex $v$ is denoted by $\text{an}(v)$ and $\text{dec}(v)$, respectively. For a set of vertices $A\subseteq V$, we write $\text{an}(A):=\bigcup_{v\in A}\text{an}(v)$  and $\text{dec}(A):=\bigcup_{v\in A}\text{dec}(v)$. The set of nondescendants $\text{nd}(A)$ equals $V\setminus \text{dec}(A)$. We say that two sets of vertices $A$ and $B$ have a common ancestor if there is a vertex $v$ that both has a descendant in $A$ and a descendant in $B$.


\textbf{Types of graphs:} A mixed graph is called acyclic directed mixed graph (ADMG) if it does not contain directed cycles.
If a mixed graph does not contain bidirected edges, that is $B=\emptyset$, then in slight abuse of notation, we identify it with the tuple $(V,D)$ instead of $(V,D,\emptyset)$ and call the graph a directed graph. If a directed graph does not contain directed cycles, then we call it a directed acyclic graph (DAG).

\textbf{Separation:} Consider a given ADMG. A vertex $v$ in a path $\pi$ is a collider (in $\pi$) if it is not an endpoint of $\pi$ and if the edges in $\pi$ to which $v$ is incident are either $\rightarrow v\leftarrow$ or $\rightarrow v \leftrightarrow$ or $\leftrightarrow v \leftarrow$ or $\leftrightarrow v \leftrightarrow$. We call $v$ a non-collider in $\pi$ if it is not an endpoint of $\pi$ and not a collider in $\pi$.
We say that a path $\pi$ between two vertices $v$ and $w$ is $m$-connecting given a third set of vertices $C$ for which $v,w\notin C$ if
\begin{itemize}
    \item (Condition 1): every collider on $\pi$ has a descendant in $C$ and if
    \item (Condition 2): every non-collider on $\pi$ is not in $C$.
\end{itemize}
We say that a triplet  $u\astast v \astast w$ in $\pi$ is $m$-connected given $C$ if both (Condition 1) and (Condition 2) hold, otherwise, we call that triplet $m$-blocked given $C$.
We say that two sets of vertices $A$ and $B$ are $m$-connected given $C$ if there is an $m$-connecting path between them given $C$. If there is no $m$-connecting path $\pi$ between $A$ and $B$ given $C$, then we say that $A$ and $B$ are $m$-separated given $C$.
If the ADMG is a DAG, then we call $m$-separation $d$-separation.



\section{Proofs}

\subsection{Further notation}
\label{sec_further_notation}
For a system of sets $\mathcal{M}$, we write $\sigma(\mathcal{M})$ for the generated $\sigma$-algebra of $\mathcal{M}$. For random vectors $\bm{X}_1,\bm{X}_2,\ldots$, we write $\sigma(\bm{X}_1,\bm{X}_2,\ldots)$ for the generated $\sigma$-algebra thereof.
For two $\sigma$-algebras $\Sigma_1$ and $\Sigma_2$, we write $\Sigma_1\otimes \Sigma_2$ to denote the product $\sigma$-algebra of $\Sigma_1$ and $\Sigma_2$.  For some fixed $n\in\mathbb{N}$, we write $\Sigma_1^{\otimes n}$ to denote the $n$-times product $\sigma$-algebra $\Sigma_1\otimes \ldots \otimes \Sigma_1.$
For two probability spaces $(\Omega_1,\mathcal{F}_1,P_1)$ and $(\Omega_2,\mathcal{F}_2,P_2)$, we write $P_1\otimes P_2$ to denote the unique\footnote{For uniqueness see Theorem B in \S 35 in \citet{halmos1974measure}.} product measure on $(\Omega_1\times \Omega_2,\mathcal{F}_1\otimes \mathcal{F}_2)$ that equals $P_1(E)\cdot P_2(F)$ for all rectangular sets $E\times F$ where $E\in \mathcal{F}_1$ and $F\in \mathcal{F}_2$.
\subsection{Proof of Lemma \ref{lemma_reichenbach}}
\label{sm_proof_3}
\begin{proof}
	
	Notational remark: Let $t_{\text{min}}(\bm{S}_1)$ and $t_{\text{max}}(\bm{S}_1)$ be the minimal and maximal time indices of $\bm{S}_1$. Similarly, let $t_{\text{min}}(\bm{S}_2)$ and $t_{\text{max}}(\bm{S}_2)$ be the respective indices for $\bm{S}_2$. Moreover, write $(\mathcal{X}_{\bm{S}_1},\Sigma_{\bm{S}_1})$ and $(\mathcal{X}_{\bm{S}_2},\Sigma_{\bm{S}_2})$	for the measurable spaces into which $\bm{S}_1$ and $\bm{S}_2$ map, respectively. Also, write $\bm{S}_{1,\Delta}$ and $\bm{S}_{2,\Delta}$ for the time-shifted versions of $\bm{S}_1$ and $\bm{S}_2$, respectively: That is, the sets $\bm{S}_{1,\Delta}$ and $\bm{S}_{2,\Delta}$ contain all variables from the original sets $\bm{S}_1$ and $\bm{S}_2$ time-shifted to the future by $\Delta$ time steps. Note that by construction and the fact that all time instances of a component time series map into the same measurable space, $(\mathcal{X}_{\bm{S}_{1,\Delta}},\Sigma_{\bm{S}_{1,\Delta}})=(\mathcal{X}_{\bm{S}_1},\Sigma_{\bm{S}_1})$ and $(\mathcal{X}_{\bm{S}_{2,\Delta}},\Sigma_{\bm{S}_{2,\Delta}})=(\mathcal{X}_{\bm{S}_2},\Sigma_{\bm{S}_2})$, so we will always write $(\mathcal{X}_{\bm{S}_1},\Sigma_{\bm{S}_1})$ and $(\mathcal{X}_{\bm{S}_2},\Sigma_{\bm{S}_2})$.
	
	Proof by contradiction: Assume that $\bm{S}_1$ and $\bm{S}_2$ are dependent but have no common ancestor in the full time graph.
	Let 
	\begin{align*}
	\varepsilon:=\sup_{A\in \Sigma_{\bm{S}_1},\;B\in \Sigma_{\bm{S}_2}}
	\biggr|P\biggr(\bm{S}_1\in A,\; \bm{S}_2 \in B\biggr)-P\biggr(\bm{S}_1\in A\biggr)P\biggr(\bm{S}_2\in B\biggr)\biggr|.
	\end{align*}
	By assumption, $\bm{S}_1$ and $\bm{S}_2$ are dependent, so $\varepsilon>0.$
	Because $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is $\alpha$-strongly mixing, $\alpha(m)\rightarrow 0$ for $m\rightarrow \infty$. Thus, there is an $M\in \mathbb{N}_{\geq 0}$ such that for all $m\in\mathbb{N}_{\geq M}$ it holds that $\alpha(m)<\varepsilon$.
	Now, choose some $\Delta\in\mathbb{N}_{\geq M}$. Furthermore, choose $n\in\mathbb{N}_{\geq 0}$ and $t_0\in\mathbb{Z}$ such that $t_{\text{min}}(\bm{S}_1)+\Delta= t_0$, such that $\min\{t_{\text{min}}(\bm{S}_1),t_{\text{min}}(\bm{S}_2)\}\geq t_0-n$, and such that $n\geq \Delta$.
	
	
Because of the acyclicity assumption (A1), we can write $\bm{S}_{1,\Delta}$ and $\bm{S}_{2,\Delta}$ as functions of the auxiliary variables and noise variables, that is
	\begin{align*}
	\bm{S}_{1,\Delta}&=g_1(\bm{X}_{\text{aux}},\bm{\epsilon}_{1, \rightarrow}),\;\;\;\text{and}\\
	\bm{S}_{2,\Delta}&=g_2(\bm{X}_{\text{aux}},\bm{\epsilon}_{2, \rightarrow}).
	\end{align*}
	Here, $\bm{X}_{\text{aux}}$ stands for the auxiliary variables (with respect to $t_0$ and $n$) and
	$\bm{\epsilon}_{1,\rightarrow}$ and $\bm{\epsilon}_{2,\rightarrow}$ stand for all proper and future noise variables that are ancestors of $\bm{S}_{1,\Delta}$ respectively $\bm{S}_{2,\Delta}$ in the full time graph. Moreover, let $(\mathcal{X}_{\text{aux}},\Sigma_{\text{aux}})$ and $(\mathcal{E}_{1,\rightarrow},\Sigma_{1,\rightarrow})$ and $(\mathcal{E}_{2,\rightarrow},\Sigma_{2,\rightarrow})$ denote the corresponding measurable spaces. The functions $g_1$ and $g_2$ are measurable with respect to $(\mathcal{X}_{\text{aux}}\times \mathcal{E}_{1,\rightarrow},\Sigma_{\text{aux}}\otimes\Sigma_{1,\rightarrow})$ and $(\mathcal{X}_{\text{aux}}\times \mathcal{E}_{2,\rightarrow},\Sigma_{\text{aux}}\otimes\Sigma_{2,\rightarrow})$, respectively, because $g_1$ and $g_2$ are each compositions of the measurable functions $f_i$.
	
	
	Note that $\bm{\epsilon}_{1,\rightarrow}\ind \bm{\epsilon}_{2,\rightarrow}$ because otherwise by Assumption (A0), there needs to be a noise variable that is both in $\bm{\epsilon}_{1,\rightarrow}$ and  $\bm{\epsilon}_{2,\rightarrow}$, which would in turn imply that $\bm{S}_{1,\Delta}$ and $\bm{S}_{2,\Delta}$ have a common ancestor in the full time graph, which would imply that $\bm{S}_1$ and $\bm{S}_2$ have a common ancestor in the full time graph, a contradiction. 
	
	Moreover, by Assumption (A3) and because $\bm{\epsilon}_{1,\rightarrow}$ and $\bm{\epsilon}_{2,\rightarrow}$ only contain proper and future variables and hence occur strictly after $\bm{X}_{\text{aux}}$, it holds that $(\bm{\epsilon}_{1,\rightarrow}, \bm{\epsilon}_{2,\rightarrow})\ind \bm{X}_{\text{aux}}$. By the decomposition property of conditional independence\footnote{See, e.g., Section 1.5 in \citet{studeny2018conditional} for this and other properties of conditional independence.}, it particularly also holds that $\bm{\epsilon}_{1,\rightarrow}\ind \bm{X}_{\text{aux}}$.
	
	Therefore,
	\begin{align*}
	P_{\bm{\epsilon}_{2,\rightarrow}, \bm{\epsilon}_{1,\rightarrow},\bm{X}_{\text{aux}}}&=P_{\bm{\epsilon}_{2,\rightarrow}, \bm{\epsilon}_{1,\rightarrow}}\otimes P_{\bm{X}_{\text{aux}}}\\
	&=P_{\bm{\epsilon}_{2,\rightarrow}}\otimes P_{ \bm{\epsilon}_{1,\rightarrow}}\otimes P_{\bm{X}_{\text{aux}}}\\
	&=P_{\bm{\epsilon}_{2,\rightarrow}}\otimes P_{ \bm{\epsilon}_{1,\rightarrow},\bm{X}_{\text{aux}}},
	\end{align*}
	and thus,
 $\bm{\epsilon}_{2,\rightarrow}\ind (\bm{\epsilon}_{1,\rightarrow},\bm{X}_{\text{aux}})$ and hence,
	$\bm{\epsilon}_{2,\rightarrow}\ind (\bm{S}_{1,\Delta},\bm{X}_{\text{aux}})$.
	
	Now, note that 
	\begin{align*}
	&\sup_{A\in \Sigma_{\bm{S}_1},\;B\in \Sigma_{\bm{S}_2}}\biggr|P\biggr(\bm{S}_{1,\Delta}\in A, \bm{S}_{2,\Delta}\in B\biggr)-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr(\bm{S}_{2,\Delta}\in B\biggr)\biggr|\\
	&=\sup_{A\in \Sigma_{\bm{S}_1},\;B\in \Sigma_{\bm{S}_2}}\biggr|P\biggr(\bm{S}_{1,\Delta}\in A,\; g_2(\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B\biggr)\\
	&\hspace{2.4cm}-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr(g_2(\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B\biggr)\biggr|\\
	&=\sup_{A\in \Sigma_{\bm{S}_1},\;B\in \Sigma_{\bm{S}_2}}\biggr|P\biggr(\bm{S}_{1,\Delta}\in A, (\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in g_2^{-1}(B)\biggr)\\
	&\hspace{2.4cm}-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr((\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in g_2^{-1}(B)\biggr)\biggr|\\
	&=\sup_{\substack{ A\in \Sigma_{\bm{S}_1}, \; B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}:\\\exists B \in\Sigma_{\bm{S}_2}:\;g_2^{-1}(B)=B'}}
	\hspace{1cm}\biggr|P\biggr(\bm{S}_{1,\Delta}\in A, (\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\\
	&\hspace{3cm}-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr((\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\biggr|\\
	&\leq\sup_{A\in \Sigma_{\bm{S}_1}, \; B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}}
	\biggr|P\biggr(\bm{S}_{1,\Delta}\in A, (\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\\
	&\hspace{3cm}-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr((\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\biggr|.
	\end{align*}
	Here, the second last equality follows because $g_2$ is measurable and thus $\{g_2^{-1}(B):B\in \Sigma_{\bm{S}_2} \}\subseteq \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}$.
	
	For an arbitrary $y\in \mathcal{E}_{2,\rightarrow}$, define the $y$-section of $B'$ by $B'_y:=\{x\in \mathcal{X}_{\text{aux}}:\;(x,y)\in B'\}$. A standard result in measure theory (see, e.g., Theorem A in \S 34 in \citet{halmos1974measure}) states that if $B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}$, then $B'_y\in \Sigma_{\text{aux}}$ for all $y\in \mathcal{E}_{2,\rightarrow}$.
	Now, because $\bm{\epsilon}_{2,\rightarrow}\ind (\bm{S}_{1,\Delta},\bm{X}_{\text{aux}})$, we have $P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow}}=P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}\otimes P_{\bm{\epsilon}_{2,\rightarrow}}$. Moreover, recall that probability measures are $\sigma$-finite measures. Therefore, we can apply Theorem B in \S 35 in \citet{halmos1974measure} which yields
	\begin{align*}
	&P\biggr(\bm{S}_{1,\Delta}\in A, (\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr((\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\\
	&=\int P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A\times B'_y)dP_{\bm{\epsilon}_{2,\rightarrow}}(y)-\int P_{\bm{S}_{1,\Delta}}(A)P_{\bm{X}_{\text{aux}}}(B'_y)dP_{\bm{\epsilon}_{2,\rightarrow}}(y)\\
	&=\int \biggr(P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A\times B'_y)- P_{\bm{S}_{1,\Delta}}(A)P_{\bm{X}_{\text{aux}}}(B'_y)\biggr)dP_{\bm{\epsilon}_{2,\rightarrow}}(y).
	\end{align*}
	Therefore, 
	\begin{align*}
	&\sup_{A\in \Sigma_{\bm{S}_1},\;B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}}
	\biggr|P\biggr(\bm{S}_{1,\Delta}\in A, (\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\\
	&\hspace{3cm}-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr((\bm{X}_{\text{aux}},\bm{\epsilon}_{2,\rightarrow})\in B'\biggr)\biggr|\\
	&=\sup_{A\in \Sigma_{\bm{S}_1},\;B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}}
	\biggr|\int\biggr( P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A\times B'_y)\\
	&\hspace{3.5cm}- P_{\bm{S}_{1,\Delta}}(A)P_{\bm{X}_{\text{aux}}}(B'_y)\biggr)dP_{\bm{\epsilon}_{2,\rightarrow}}(y)\biggr|\\
	&\leq \sup_{A\in \Sigma_{\bm{S}_1},\;B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}}\int \biggr|P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A\times B'_y)\\
	&\hspace{3.7cm}- P_{\bm{S}_{1,\Delta}}(A)P_{\bm{X}_{\text{aux}}}(B'_y)\biggr|dP_{\bm{\epsilon}_{2,\rightarrow}}(y)\\
	&\leq \sup_{A\in \Sigma_{\bm{S}_1},\;B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}}
	\int \sup_{A'\in \Sigma_{\bm{S}_1},\;B''\in   \Sigma_{\text{aux}}}\biggr|P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A'\times B'')\\
	&\hspace{6.6cm}- P_{\bm{S}_{1,\Delta}}(A')P_{\bm{X}_{\text{aux}}}(B'')\biggr|dP_{\bm{\epsilon}_{2,\rightarrow}}(y)\\
	&\leq \sup_{A\in \Sigma_{\bm{S}_1},\;B'\in \Sigma_{\text{aux}}\otimes \Sigma_{2,\rightarrow}}\;\sup_{A'\in \Sigma_{\bm{S}_1},\;B''\in   \Sigma_{\text{aux}}}\biggr|P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A'\times B'')\\
	&\hspace{5cm}- P_{\bm{S}_{1,\Delta}}(A')P_{\bm{X}_{\text{aux}}}(B'')\biggr|\cdot \int dP_{\bm{\epsilon}_{2,\rightarrow}}(y)\\
	&=\sup_{A'\in \Sigma_{\bm{S}_1},\;B''\in  \Sigma_{\text{aux}}}\biggr|P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A'\times B'')
	-P_{\bm{S}_{1,\Delta}}(A')P_{\bm{X}_{\text{aux}}}(B'')\biggr|.
	\end{align*}
	By construction, every variable in $\bm{X}_{\text{aux}}$ occurs before $t_0-n$ and hence before $t_0-\Delta$. Also by construction, every variable in $\bm{S}_{1,\Delta}$ occurs after $t_{\text{min}}(\bm{S}_1)+\Delta$. Thus,
	\begin{align*}
	    &\sup_{A'\in \Sigma_{\bm{S}_1},\;B''\in  \Sigma_{\text{aux}}}\biggr|P_{\bm{S}_{1,\Delta},\bm{X}_{\text{aux}}}(A'\times B'')
	-P_{\bm{S}_{1,\Delta}}(A')P_{\bm{X}_{\text{aux}}}(B'')\biggr|\\
	&\leq \alpha(t_{\text{min}}(\bm{S}_1)+\Delta-(t_0-\Delta))\\
	&= \alpha(t_0-(t_0-\Delta))\\
	& = \alpha(\Delta)\\
	&<\varepsilon.
	\end{align*}
	This inequality is a contradiction, because by strict stationarity (Assumption (A2)), the distribution of $(\bm{S}_{1},\bm{S}_{2})$ equals the distribution of $(\bm{S}_{1,\Delta},\bm{S}_{2,\Delta})$ and thus,
	\begin{align*}
	&\sup_{A\in \Sigma_{\bm{S}_1},\;B\in \Sigma_{\bm{S}_2}}\biggr|P\biggr(\bm{S}_{1,\Delta}\in A, \bm{S}_{2,\Delta}\in B\biggr)-P\biggr(\bm{S}_{1,\Delta}\in A\biggr)P\biggr(\bm{S}_{2,\Delta}\in B\biggr)\biggr|\\
	&=\varepsilon.
	\end{align*}
	
	
	
\end{proof}



\subsection{Proof of Lemma \ref{m-sep_glob_markov_property}}
\label{sm_proof_2}
\begin{proof}
	Let $t_0\in\mathbb{Z}$ and $n\in\mathbb{N}_{\geq 0}$ be arbitrary but fixed. 
	
	\textbf{Outline:}
	
	To prove Lemma \ref{m-sep_glob_markov_property}, we follow a similar outline as the proof for SCMs with mutually independent exogenous variables (see, for example, Lemma 2 in \citet{janzing2010causal} and Theorem 1.4.1\ in \citet{pearl2009causality}): 
	\begin{itemize}
		\item 	First, we prove the ordered local $m$-separation Markov property \citep[Section 3]{richardson2003markov} for the augmented $(t_0,n)$-cutoff graph and the corresponding distribution of $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ .
		\item  Using Theorem 2 in \citet{richardson2003markov}, we then obtain the global $m$-separation Markov property for the augmented $(t_0,n)$-cutoff graph and the corresponding distribution of $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$.
		\item From the global $m$-separation Markov property for the augmented case, we then deduce the global $m$-separation Markov property for the not-augmented $(t_0,n)$-cutoff graph and the corresponding distribution of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$.
	\end{itemize}
	
\textbf{Notation and terminology:}
	
	The notation and terminology in this proof largely follow Section 3 in \citet{richardson2003markov}: Abbreviate the augmented $(t_0,n)$-cutoff graph by $\mathcal{M}^a$ and the not-augmented $(t_0,n)$-cutoff graph by $\mathcal{M}$. Write $\prec$ to denote a total ordering of the variables in $\mathcal{M}^a$ such that $x\prec y$ implies that $y\notin \textbf{an}(x)$ (we call such an ordering consistent with respect to $\mathcal{M}^a$). Let 
	\begin{align*}
	\textbf{pre}_{\mathcal{M}^a,\prec}(x):=\{v\mid v\prec x \text{ or } v = x\}
	\end{align*}
	and let the district of $x$ be
	\begin{align*}
	\textbf{dis}_{\mathcal{M}^a}(x):=\{v\mid v\leftrightarrow \cdots \leftrightarrow x \text{ in } \mathcal{M}^a \text{ or } v=x\}.
	\end{align*}
	Note that $\textbf{dec}_{\mathcal{M}^a}(x)\cap \textbf{pre}_{\mathcal{M}^a,\prec}(x)=\{x\}.$
	
	Let $\bm{A}$ be an ancestral set of vertices, that is, $\textbf{an}_{\mathcal{M}^a}(\bm{A})=\bm{A}$.
	Let $x$ be a vertex in $\bm{A}$ such that no children of $x$ in $\mathcal{M}^a$ are in $\bm{A}$. Now, let $\mathcal{M}^a_{\bm{A}}$ denote the induced subgraph on $\bm{A}$. The Markov blanket of a vertex $x$ with respect to $\bm{A}$, is defined as
	\begin{align*}
	\textbf{mb}_{\mathcal{M}^a}(x,\bm{A}):=\textbf{pa}_{\mathcal{M}^a_{\bm{A}}}(\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x))\cup (\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\}).
	\end{align*}
	The notation for the not-augmented graph $\mathcal{M}$ is analogous.

\textbf{Goal:}
	
	We need to show that for all vertices $x$ in $\mathcal{M}^a$ and for any ancestral set $\bm{A}\subseteq \textbf{pre}_{\mathcal{M}^a,\prec}(x)$  (which implies $\textbf{ch}_{\mathcal{M}^a}(x)\cap \bm{A}=\emptyset$) with $x\in \bm{A}$, 
	\begin{align*}
	x\ind \bm{A}\setminus (\textbf{mb}_{\mathcal{M}^a}(x, \bm{A})\cup\{x\})\mid \textbf{mb}_{\mathcal{M}^a}(x, \bm{A}).
	\end{align*}
	
	\textbf{Main proof:}
	
	Take any consistent ordering $\prec$ with respect to $\mathcal{M}^a$. Furthermore, take any ancestral set $\bm{A}$ such that $\bm{A}\subseteq \textbf{pre}_{\mathcal{M}^a,\prec}(x)$ with $x\in \bm{A}$.
	
	We now make the following case distinction. 
	
	\noindent\textbf{Case 1: $x$ is a proper non-noise vertex}
	
	\noindent  Because $x$ is a non-auxiliary vertex, we have $\textbf{dis}_{\mathcal{M}^a}(x)=\{x\}$. Thus, $\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\subseteq \{x\}$. Because $x\in \bm{A}$, the vertex $x$ occurs in the subgraph $\mathcal{M}^a_{\bm{A}}$ and thus, $\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)= \{x\}$.
	
	Moreover, $\textbf{pa}_{\mathcal{M}^a_{\bm{A}}}(x)\subseteq \textbf{pa}_{\mathcal{M}^a}(x).$ Because $\bm{A}$ is ancestral and $x\in \bm{A}$, we have that $\textbf{pa}_{\mathcal{M}^a}(x)\subseteq \bm{A}$, and hence all parents of $x$ occur as vertices in $\mathcal{M}^a_{\bm{A}}.$ Therefore, $\textbf{pa}_{\mathcal{M}^a_{\bm{A}}}(x)= \textbf{pa}_{\mathcal{M}^a}(x).$
	Hence, $\textbf{mb}_{\mathcal{M}^a}(x,\bm{A})=\textbf{pa}_{\mathcal{M}^a}(x)$.
	
	Furthermore,
	\begin{align}
	\label{subset_relation}
		 \bm{A}\setminus \textbf{pa}_{\mathcal{M}^a}(x)\subseteq \textbf{pre}_{\mathcal{M}^a,\prec}(x)\setminus \textbf{pa}_{\mathcal{M}^a}(x)\subseteq (\textbf{nd}_{\mathcal{M}^a}(x)\cup\{x\})\setminus \textbf{pa}_{\mathcal{M}^a}(x).
	\end{align}
	

	
	 We can write $x$ as a function of its parents (including the respective noise term). That is, for some measurable deterministic function $g$, we can write
	\begin{align*}
	x=g\bigr(\textbf{pa}_{\mathcal{M}^a}(x)\bigr).
	\end{align*}
	Here, measurability follows because $g$ equals one of the measurable $f_i$'s. Now, using a standard fact about conditional independence\footnote{This fact states that for random vectors $\bm{X}$ and $\bm{Y}$, it holds that $\bm{X}\ind \bm{Y}\mid \bm{X}$, see, e.g., Definition 1.5.2 in \citet{studeny2018conditional} and the comment below this definition.}, we get
	\begin{align*}
		\textbf{pa}_{\mathcal{M}^a}(x)\ind \textbf{nd}_{\mathcal{M}^a}(x)\setminus \textbf{pa}_{\mathcal{M}^a}(x)\mid \textbf{pa}_{\mathcal{M}^a}(x).
	\end{align*}
	 Therefore,
	 	\begin{align*}
	 g\bigr(\textbf{pa}_{\mathcal{M}^a}(x)\bigr)\ind \textbf{nd}_{\mathcal{M}^a}(x)\setminus \textbf{pa}_{\mathcal{M}^a}(x)\mid \textbf{pa}_{\mathcal{M}^a}(x),
	 \end{align*}
	 and hence,
	\begin{align*}
	x\ind \textbf{nd}_{\mathcal{M}^a}(x)\setminus \textbf{pa}_{\mathcal{M}^a}(x)\mid \textbf{pa}_{\mathcal{M}^a}(x).
	\end{align*}
	Thus, by \eqref{subset_relation}, the decomposition property of conditional independence\footnote{See, e.g., Section 1.5 in \citet{studeny2018conditional} for this and other properties of conditional independence.} and the fact that $\textbf{mb}_{\mathcal{M}^a}(x,\bm{A})=\textbf{pa}_{\mathcal{M}^a}(x)$,
	\begin{align*}
	x\ind \bm{A}\setminus (\textbf{mb}_{\mathcal{M}^a}(x, \bm{A})\cup\{x\})\mid \textbf{mb}_{\mathcal{M}^a}(x, \bm{A}).
	\end{align*}
	
	\noindent\textbf{Case 2: $x$ is a proper noise vertex}
	
	\noindent If $x$ is a noise vertex, then $x$ does not have a parent in $\mathcal{M}^a$, so $\textbf{pa}_{\mathcal{M}^a}(x)=\emptyset.$
	Moreover, $\textbf{dis}_{\mathcal{M}^a}(x)=\{x\}$.
	Thus, $\textbf{mb}_{\mathcal{M}^a}(x,\bm{A})=\emptyset$.
	
	One can write the nondescendants of $x$ in $\mathcal{M}^a$ as a deterministic function $g$ of the
	auxiliary variables $\bm{s}_{\text{aux}}$ and the remaining noise variables $\bm{s}_{\epsilon}$ excluding $x$, that is $\textbf{nd}_{\mathcal{M}^a}(x)=g(\bm{s}_{\text{aux}},\bm{s}_{\epsilon})$.
 Note that $g$ is measurable because it is a composition of the measurable $f_i$'s.
	 By Assumptions (A0) and (A3),
	 \begin{align*}
	     P_{x,\bm{s}_{\epsilon},\bm{s}_{\text{aux}}}&=P_{x,\bm{s}_{\epsilon}}\otimes P_{\bm{s}_{\text{aux}}}\\
	     &=P_{x}\otimes P_{\bm{s}_{\epsilon}}\otimes P_{\bm{s}_{\text{aux}}}\\
	     &=P_{x}\otimes P_{\bm{s}_{\epsilon},\bm{s}_{\text{aux}}}
	 \end{align*}
 and thus,
	 \begin{align*}
	 	  x\ind (\bm{s}_{\text{aux}},\bm{s}_{\epsilon}).
	 \end{align*}
	 Therefore,
	 	 \begin{align*}
	 x\ind g(\bm{s}_{\text{aux}},\bm{s}_{\epsilon}),
	 \end{align*}
	 and hence,
	\begin{align*}
	x\ind \textbf{nd}_{\mathcal{M}^a}(x).
	\end{align*}
	
	Because $\bm{A}\subseteq \textbf{pre}_{\mathcal{M}^a,\prec}(x)\subseteq \textbf{nd}_{\mathcal{M}^a}(x)\cup\{x\}$, the decomposition property of conditional independence together with $\textbf{mb}_{\mathcal{M}^a}(x,\bm{A})=\emptyset$ implies
	\begin{align*}
	x\ind \bm{A}\setminus (\textbf{mb}_{\mathcal{M}^a}(x, \bm{A})\cup\{x\})\mid \textbf{mb}_{\mathcal{M}^a}(x, \bm{A}).
	\end{align*}
	
	
	\noindent\textbf{Case 3: $x$ is an auxiliary vertex}
	
	\noindent Every vertex in the district of $x$ has no parents; so $\textbf{mb}_{\mathcal{M}^a}(x,\bm{A})=\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus\{x\}$. Moreover, $\textbf{pre}_{\mathcal{M}^a,\prec}(x)\subseteq \textbf{nd}_{\mathcal{M}^a}(x)\cup\{x\}$.
	
	One can write $\bm{A}\setminus\{x\}$ as a function $g$ of all auxiliary ancestors $\bm{s}_{\text{aux}}$ of $\bm{A}\setminus\{x\}$ in  $\mathcal{M}^a$ and of all noise ancestors $\bm{s}_{\epsilon}$ of $\bm{A}\setminus\{x\}$ in $\mathcal{M}^a$, that is $\bm{A}\setminus\{x\}=g(\bm{s}_{\text{aux}},\bm{s}_{\epsilon}).$ Note that $x\notin \bm{s}_{\text{aux}}$ because $\bm{A}\setminus\{x\}\subseteq \textbf{nd}_{\mathcal{M}^a}(x)$. Also note that $g$ is measurable because the $f_i$'s are measurable. Furthermore, because $\bm{A}$ is ancestral, $\bm{s}_{\text{aux}}$ and $\bm{s}_{\epsilon}$ are contained in $\bm{A}$. Also, by assumption, $x\in \bm{A}$.

Now, suppose that
\begin{align*}
\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\nind \bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x).
\end{align*}
Then, by the extended Reichenbach common cause principle (Lemma \ref{lemma_reichenbach}),
$\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$ and $\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$ have a common ancestor in the full time graph. By construction of the augmented $(t_0,n)$-cutoff graph $\mathcal{M}^a$, there thus is a bidirected edge between some vertex in 
 $\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$ and some vertex in $\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$ in $\mathcal{M}^a$. Because $\bm{A}$ contains both $\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$ and $\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$, there hence is a bidirected edge between the two sets in $\mathcal{M}_{\bm{A}}^a$.
This fact, however, is a contradiction to the definition of districts. Thus,
\begin{align}
\label{eq_ind_dis}
\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\ind \bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x).
\end{align}
Therefore and because of Assumption (A3),
\begin{align*}
P_{\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\;\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\;\bm{s}_{\epsilon}}&=P_{\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\;\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)}\otimes P_{\bm{s}_{\epsilon}}\\
&=P_{\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)}\otimes P_{\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)}\otimes P_{\bm{s}_{\epsilon}}\\
&=P_{\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)}\otimes P_{\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\;\bm{s}_{\epsilon}}.
\end{align*}
Thus, 
\begin{align*}
\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\ind (\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\bm{s}_{\epsilon}).
\end{align*}
By the weak union property of conditional independence and the fact that $x\in \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)$ by definition,
\begin{align*}
x\ind (\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\bm{s}_{\epsilon}) \mid (\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\}).
\end{align*}
Also, from a standard property of conditional independence\footnote{This fact states that for random vectors $\bm{X}$, $\bm{Y}$, $\bm{Z}$, it holds that $\bm{X}\ind \bm{Y}\mid (\bm{X},\bm{Z})$, see, e.g., Definition 1.5.2 in \citet{studeny2018conditional} and the comment below this definition.},
\begin{align*}
x\ind (\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\}) \mid (\bm{s}_{\text{aux}}\setminus \textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x),\bm{s}_{\epsilon},\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\}).
\end{align*}
Thus, by the contraction property of conditional independence,
\begin{align*}
  x\ind (\bm{s}_{\text{aux}},\bm{s}_{\epsilon}) \mid (\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\})
\end{align*}
and therefore,
\begin{align*}
x\ind g(\bm{s}_{\text{aux}},\bm{s}_{\epsilon}) \mid (\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\}).
\end{align*}
Thus,
\begin{align*}
x\ind \bm{A}\setminus\{x\} \mid (\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus \{x\}).
\end{align*}
Hence, due to the decomposition property of conditional independence and because $\textbf{mb}_{\mathcal{M}^a}(x,\bm{A})=\textbf{dis}_{\mathcal{M}^a_{\bm{A}}}(x)\setminus\{x\}$,
	\begin{align*}
	x\ind \bm{A}\setminus (\textbf{mb}_{\mathcal{M}^a}(x, \bm{A})\cup\{x\})\mid \textbf{mb}_{\mathcal{M}^a}(x, \bm{A}).
	\end{align*}
	
\noindent\textbf{Remaining proof:}	
	
	Thus, we have shown the ordered local $m$-separation Markov property for the augmented $(t_0,n)$-cutoff graph and corresponding distribution of $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$. 
	As mentioned above, Theorem $2$ in \citet{richardson2003markov} now yields the global $m$-separation Markov property for the augmented $(t_0,n)$-cutoff graph and corresponding distribution of $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$.
	
	Now, for any sets of non-noise vertices $\bm{S}_1$, $\bm{S}_2$ and $\bm{S}_3$, note that $\bm{S}_1$ and $\bm{S}_2$ are $m$-separated given $\bm{S}_3$ in $\mathcal{M}^a$ if and only if they are $m$-separated given $\bm{S}_3$ in $\mathcal{M}$, because each noise variable is adjacent to at most one non-noise variable and thus no new paths between $\bm{S}_1$ and $\bm{S}_2$ are created when considering $\mathcal{M}^a$. Therefore, the global $m$-separation Markov property for the augmented graph implies the global $m$-separation Markov property for the not-augmented graph and the corresponding distribution of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$.
\end{proof}






\section{Further lemmas and examples}
\begin{example}[Violation of Assumption (A2)]\label{ex_A2_violation}
Consider the stochastic difference equations in equation \eqref{eq_counter_a3} with $\phi = 1$ and the stochastic processes $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}=\{(\epsilon^1_t,\epsilon^2_t)\}_{t\in\mathbb{Z}}$ and $\{\bm{X}_t\}_{t\in\mathbb{Z}}=\{(X^1_t,X^2_t)\}_{t\in\mathbb{Z}}$ where each $\bm{\epsilon}_t$ is normally distributed with mean zero and identity covariance matrix, where $X^1_0$ is standard normally distributed and independent of  $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$, and where the remaining variables of $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ are generated by
\begin{align*}
   X^1_{t+1}&=X^1_t+X^2_t+\epsilon^1_{t+1}\;\;\;\text{ for $t\geq 0$,}\\
   X^1_{t}&=X^1_{t+1}-X^2_{t}-\epsilon^1_{t+1}\;\;\;\text{ for $t<0$, and}\\
   X^2_t&=\epsilon^2_t \text{ for all $t\in \mathbb{Z}$}.
\end{align*}
Note that these equations are not the stochastic difference equations, these are just the equations that define $\{\bm{X}_t\}_{t\in\mathbb{Z}}$.

The process $\{(\bm{X}_t,\bm{\epsilon}_t)\}_{t\in\mathbb{Z}}$ is a solution of the stochastic difference equations in \eqref{eq_counter_a3}: For $t>0$, this claim follows by definition; for $t\leq 0$, this claim follows from rearranging the defining equation $X^1_{t-1}=X^1_{t}-X^2_{t-1}-\epsilon^1_{t}$. One can also see that $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is not strictly stationary: The variance of $X^1_1$ equals $3$, whereas the variance of $X^1_0$ equals just $1$ by definition. Also note that Assumption (A3) and the $d$-separation global Markov property are violated: We have
\begin{align*}
     \text{Cov}(X^1_{-2},X^2_{-1})&=\text{Cov}(X^1_{-2},\epsilon^2_{-1})\\
    &=\text{Cov}(X^1_{-1}-X^2_{-2}-\epsilon^1_{-1},\epsilon^2_{-1})\\
    &=\text{Cov}(X^1_{-1}-\epsilon^2_{-2}-\epsilon^1_{-1},\epsilon^2_{-1})\\
        &=\text{Cov}(X^1_{-1},\epsilon^2_{-1})\\
        &=\text{Cov}(X^1_{0}-X^2_{-1}-\epsilon^1_0,\epsilon^2_{-1})\\
        &=\text{Cov}(X^1_{0}-\epsilon^2_{-1}-\epsilon^1_0,\epsilon^2_{-1})\\
        &=-\text{Var}(\epsilon^2_{-1})\\
    &=-1
\end{align*}
where we used that $\epsilon^2_{-2}$, $\epsilon^1_{-1}$, $\epsilon^1_0$ and $X^1_0$ are independent of $\epsilon^2_{-1}$ by definition. However, in the full time graph $X^1_{-2}$ and $X^2_{-1}$ are $d$-separated given the empty set.
The other assumptions are satisfied --- the explanations are similar as for the violation of Assumption (A3) in the main paper.

So we have an example that violates Assumption (A2), but the real reason why the $d$-separation global Markov property is violated is the violation of Assumption (A3) because $X^2_t=\epsilon^2_t$ for all $t\in\mathbb{Z}$ and thus a violation of Assumption (A3) directly carries over to a violation of the $d$-separation global Markov property.
\end{example}



\begin{lemma}
	\label{lemma_solv_sde}
Let $\phi\in \mathbb{R}$ with $|\phi|>1$. Let $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}:=\{\epsilon^1_t,\epsilon^2_t\}_{t\in\mathbb{Z}}$ be a stochastic process satisfying (A0) and having finite second moments.
	Define the stochastic process $\{\bm{X}_t\}_{t\in \mathbb{Z}}:=\{X^{1}_t,X^{2}_t\}_{t\in \mathbb{Z}}$ for each $t\in\mathbb{Z}$ by
	\begin{align*}
	X^{1}_t&=-\sum_{i=1}^\infty \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})\;\;\;\;\text{ and}\\
	X^{2}_t&=\epsilon^{2}_{t}.
	\end{align*}
	Then, $\{\bm{X}_t\}_{t\in \mathbb{Z}}$ and $\{\bm{\epsilon}_t\}_{t\in\mathbb{Z}}$ solve the VAR stochastic difference equations
	\begin{align*}
	X^{1}_{t}&=\phi \cdot X^{1}_{t-1}+X^{2}_{t-1}+\epsilon^{1}_{t}\;\;\;\;\text{ and}\\
	X^{2}_{t}&=\epsilon^{2}_{t}
	\end{align*}
	$P$-almost surely for all $t\in\mathbb{Z}$.
	Moreover, define the time reversed processes $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ and $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$ by
	$(\Tilde{X}^1_t,\Tilde{X}^2_t)=(X^1_{-t},-X^2_{-t-1}/\phi)$ and $(\Tilde{\epsilon}^1_t,\Tilde{\epsilon}^2_{t})=(-\epsilon^1_{-t+1}/\phi,-\epsilon^2_{-t-1}/\phi)$.
	Then, $\{\bm{\Tilde{X}}_t\}_{t\in \mathbb{Z}}$ and $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$ satisfy the VAR stochastic difference equations
	\begin{align*}
	\Tilde{X}^1_{t}&=\frac{1}{\phi} \Tilde{X}^1_{t-1}+\Tilde{X}^2_{t-1}+\Tilde{\epsilon}^1_t\;\;\;\;\text{ and}\\
	\Tilde{X}^2_t&=\Tilde{\epsilon}^2_{t}
	\end{align*}
	$P$-almost surely for all $t\in\mathbb{Z}$.
\end{lemma}
\begin{proof}
	
\noindent First of all, note that $\{\phi^{-i}\}_{i\in \mathbb{N}_{\geq 1}}$ is absolutely summable and hence, $\sum_{i=1}^\infty \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1}):=\lim_{N\rightarrow \infty}\sum_{i=1}^N \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})$, interpreted as a limit in mean-square, exists and is uniquely defined up to $P$-nullsets \citep[Section C.3]{lutkepohl2005new}. Therefore, the below algebraic manipulations of this infinite sum such as multiplication by a scalar, shifting the summation index, and adding other random variables correspond to the same operations on the finite sums $\sum_{i=1}^N \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})$ and then taking the limit.

	\textbf{First part of the lemma:}
	
	For an arbitrary but fixed $t\in\mathbb{Z}$, we $P$-almost surely have
	\begin{align*}
	\phi \cdot X^{1}_{t-1}+X^{2}_{t-1}+\epsilon^{1}_{t} &=\phi \cdot \biggr(-\sum_{i=1}^\infty \phi^{-i}(\epsilon^{1}_{t-1+i}+\epsilon^{2}_{t-1+i-1})\biggr)+\epsilon^{2}_{t-1}+\epsilon^{1}_{t}\\
	&=\biggr(-\sum_{i=1}^\infty \phi^{-(i-1)}(\epsilon^{1}_{t+i-1}+\epsilon^{2}_{t+(i-1)-1})\biggr)+\epsilon^{2}_{t-1}+\epsilon^{1}_{t}\\
	&=\biggr(-\sum_{i=0}^\infty \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})\biggr)+\epsilon^{2}_{t-1}+\epsilon^{1}_{t}\\
	&=\biggr(-\sum_{i=1}^\infty \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})\biggr)-\epsilon^{2}_{t-1}-\epsilon^{1}_{t}+\epsilon^{2}_{t-1}+\epsilon^{1}_{t}\\
	&=-\sum_{i=1}^\infty \phi^{-i}(\epsilon^{1}_{t+i}+\epsilon^{2}_{t+i-1})\\
	&=X^{1}_t.
	\end{align*}
Moreover, $X^2_t=\epsilon^2_t$ trivially holds.
Thus, we have finished the first part of the lemma.

\textbf{Second part of the lemma:}
From the first part of the lemma, for an arbitrary but fixed $t\in\mathbb{Z}$, we $P$-almost surely have
	\begin{align*}
X^{1}_{-t+1}&=\phi \cdot X^{1}_{-t}+X^{2}_{-t}+\epsilon^{1}_{-t+1}.
\end{align*}
Rearranging this equation yields
\begin{align*}
X^{1}_{-t}=\frac{1}{\phi}\bigr(X^{1}_{-t+1}-X^{2}_{-t}-\epsilon^{1}_{-t+1}\bigr).
\end{align*}
Plugging in the definition of $\{\bm{\Tilde{X}}_{t}\}_{t\in\mathbb{Z}}$ and $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$, we obtain
\begin{align*}
\Tilde{X}^{1}_{t}=\frac{1}{\phi}\Tilde{X}^{1}_{t-1}+\Tilde{X}^{2}_{t-1}+\Tilde{\epsilon}^{1}_{t}.
\end{align*}

Similarly, from the first part of the lemma, we have
\begin{align*}
X^2_{-t-1}=\epsilon^2_{-t-1}.
\end{align*}
Plugging in the definition of $\{\bm{\Tilde{X}}_{t}\}_{t\in\mathbb{Z}}$ and $\{\bm{\Tilde{\epsilon}}_t\}_{t\in \mathbb{Z}}$ yields
\begin{align*}
\Tilde{X}^2_t=\Tilde{\epsilon}^2_t.
\end{align*}
Thus, we have finished the second part of the lemma.
\end{proof}

\begin{lemma}
	\label{lemma_ref_point}
		Let $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ be a stochastic process such that each $\bm{X}_t$ is defined on the probability space $(\Omega,\mathcal{F},P)$ and takes values in the measurable space $(S_{\bm{X}},\Sigma_{\bm{X}})$. Furthermore, assume that $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is strictly stationary. Then for all $a\in\mathbb{Z}$ and $m\in\mathbb{N}_{\geq 0}$,
\begin{align}
\label{eq_reversing}
\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\leq a) \\ B\in \sigma(\bm{X}_{t}:\;t\geq a+m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|=\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\leq 0) \\ B\in \sigma(\bm{X}_{t}:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|.
\end{align}	
\end{lemma}
\begin{proof}
		Notational remark: In this proof, we always implicitly assume that time indices are in $\mathbb{Z}$.
		
	Write 
	\begin{align*}
	\alpha(m):=\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\leq 0) \\ B\in \sigma(\bm{X}_{t}:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|.
	\end{align*}
	Define
	\begin{align*}
	\mathcal{M}_{\leq a}:=\{\bm{X}^{-1}_t(E):\;E\in \Sigma_{\bm{X}},\;t\leq a\}.
	\end{align*}
	Recall that $\sigma(\bm{X}_{t}:\;t\leq a)=\sigma(\mathcal{M}_{\leq a})$. 
	Furthermore, define
	\begin{align*}
	\mathcal{C}_{\leq a}:=\biggr\{\bigcup_{i\in I}\bigcap_{j\in J}\bm{X}^{-1}_{t_{ij}}(E_{ij}):\;I,J\text{ are finite index sets, and }\forall i,j: E_{ij}\in \Sigma_{\bm{X}},\;t_{ij}\leq a\biggr\}.
	\end{align*}
	We now show that 
	\begin{align*}
	\sigma(\mathcal{M}_{\leq a})= \sigma(\mathcal{C}_{\leq a}).
	\end{align*}
	Note that 
	\begin{align*}
	\mathcal{M}_{\leq a}\subseteq	\mathcal{C}_{\leq a},
	\end{align*}
	so $\sigma(\mathcal{M}_{\leq a})\subseteq \sigma(\mathcal{C}_{\leq a})$. Also note that every $\sigma$-algebra containing $\mathcal{M}_{\leq a}$ contains $\mathcal{C}_{\leq a}$,
	so 
	\begin{align*}
	\mathcal{C}_{\leq a}\subseteq \sigma(\mathcal{M}_{\leq a}).
	\end{align*}
	Therefore,
	\begin{align*}
	\sigma(\mathcal{C}_{\leq a})\subseteq \sigma\bigr(\sigma(\mathcal{M}_{\leq a})\bigr)=\sigma(\mathcal{M}_{\leq a})
	\end{align*}
	where the last equality follows because $\sigma(\mathcal{M}_{\leq a})$ is a $\sigma$-algebra and thus the smallest one containing itself. Hence, we conclude that $\sigma(\mathcal{C}_{\leq a})=\sigma(\mathcal{M}_{\leq a}).$
	
	We next show that $\mathcal{C}_{\leq a}$
	is an algebra (see, e.g., \S4 in \citet{halmos1974measure} for the notion of an algebra):
	\begin{itemize}
		\item $\Omega \in \mathcal{C}_{\leq a}$ because $S_{\bm{X}} \in \Sigma_{\bm{X}}$ and for all $t\leq a$, it holds that $\bm{X}^{-1}_t(S_{\bm{X}}) = \Omega$.
		\item If $A\in \mathcal{C}_{\leq a}$, then $A^c\in\mathcal{C}_{\leq a}$: Write $A:=\bigcup_{i\in I}\bigcap_{j\in J}\bm{X}^{-1}_{t_{ij}}(E_{ij})$.  Then,
		\begin{align*}
		A^c&=\biggr(\bigcup_{i\in I}\bigcap_{j\in J}\bm{X}^{-1}_{t_{ij}}(E_{ij})\biggr)^c\\
		&=\bigcap_{i\in I}\biggr(\bigcap_{j\in J}\bm{X}^{-1}_{t_{ij}}(E_{ij})\biggr)^c\\
		&=\bigcap_{i\in I}\bigcup_{j\in J}\biggr(\bm{X}^{-1}_{t_{ij}}(E_{ij})\biggr)^c\\
		&=\bigcap_{i\in I}\bigcup_{j\in J}\bm{X}^{-1}_{t_{ij}}(E_{ij}^c).
		\end{align*}
		The last line is again a finite union of finite intersections of preimages, just apply the distributive law for unions and intersections (however, one cannot simply interchange $\bigcap_{i\in I}$ and $\bigcup_{j\in J}$). So
		because every $E_{ij}^c\in \Sigma_{\bm{X}}$ since $\Sigma_{\bm{X}}$ is a $\sigma$-algebra, $A^c\in \mathcal{C}_{\leq a}$. 
		\item If $A,B\in\mathcal{C}_{\leq a}$, then $A\cup B\in \mathcal{C}_{\leq a}:$
		This result follows immediately since the union of two finite unions of finite intersections of preimages is again a finite union of finite intersections of preimages, just some relabelling is necessary.
	\end{itemize}
	Therefore, $\mathcal{C}_{\leq a}$ is an algebra and we have that $\sigma(\bm{X}_{t}:\;t\leq a)=\sigma(\mathcal{C}_{\leq a})$.
	
	An analogous result holds for $\sigma(\bm{X}_t:\;t\geq a+m)$ and 
	\begin{align*}
	\mathcal{C}_{\geq a+m}:=\biggr\{\bigcup_{i\in I}\bigcap_{j\in J}\bm{X}^{-1}_{t_{ij}}(F_{ij}):\;I,J\text{ are finite index sets, and }\forall i,j: F_{ij}\in \Sigma_{\bm{X}},\;t_{ij}\geq a+m\biggr\},
	\end{align*}
	so $\sigma(\bm{X}_t:\;t\geq a+m)=\sigma(\mathcal{C}_{\geq a+m})$ and $\mathcal{C}_{\geq a+m}$ is an algebra.
	
	Now, for all $A=\bigcup_{i\in I_1}\bigcap_{j\in J_1}\bm{X}^{-1}_{t_{ij}}(E_{ij})\in \mathcal{C}_{\leq a}$ and for all $B=\bigcup_{i\in I_2}\bigcap_{j\in J_2}\bm{X}^{-1}_{t_{ij}}(F_{ij})\in \mathcal{C}_{\geq a+m}$, define  $A'=\bigcup_{i\in I_1}\bigcap_{j\in J_1}\bm{X}^{-1}_{t_{ij}-a}(E_{ij})$ and $B'=\bigcup_{i\in I_2}\bigcap_{j\in J_2}\bm{X}^{-1}_{t_{ij}-a}(F_{ij}).$ Clearly, $A'\in \sigma(\bm{X}_t:\;t\leq 0)$ and $B'\in \sigma(\bm{X}_t:\;t\geq m)$.
	
	By strict stationarity and the inclusion-exclusion formula,
	\begin{align*}
	P(A\cap B)-P(A)P(B) = P(A'\cap B')-P(A')P(B').
	\end{align*}
	Thus, 
	\begin{align*}
	|P(A\cap B)-P(A)P(B)| = |P(A'\cap B')-P(A')P(B')| \leq \alpha(m).
	\end{align*}
	
	For general sets $A\in\sigma(\bm{X}_t:\;t\leq a)$ and $B\in \sigma(\bm{X}_t:\;t\geq a+m)$, we use the following result from measure theory (see, e.g., Theorem D in \S13 in \citet{halmos1974measure}) that relies on the generating set of a $\sigma$-algebra being an algebra: For all $A\in \sigma(\bm{X}_{t}:\;t\leq a)$, for all $B\in \sigma(\bm{X}_t:\;t\geq a+m)$ and for all $ \varepsilon >0$, one can find $\hat{A}\in \mathcal{C}_{\leq a}$ and $\hat{B}\in \mathcal{C}_{\geq a+m}$ such that
	\begin{align*}
	\label{epsilon_bound}
	&P\biggr(\bigr(A\setminus \hat{A}\bigr)\;\cup\; \bigr(\hat{A}\setminus A\bigr)\biggr) \leq \Tilde{\varepsilon}\;\;\;\;\text{and}\\
	&P\biggr(\bigr(B\setminus \hat{B}\bigr)\;\cup\; \bigr(\hat{B}\setminus B\bigr)\biggr) \leq \Tilde{\varepsilon},\numberthis
	\end{align*} 
	where $\Tilde{\varepsilon}:=-2+\sqrt{4+\varepsilon}$; note that $\Tilde{\varepsilon}^2+4\Tilde{\varepsilon}=\varepsilon$.
	Also note that \eqref{epsilon_bound} implies
	\begin{align*}
	&P\biggr(\biggr[\bigr(A\cap B\bigr)\;\setminus\; \bigr(\hat{A}\cap \hat{B}\bigr)\biggr]\;\cup\; \biggr[\bigr(\hat{A}\cap \hat{B}\bigr)\;\setminus\; \bigr(A\cap B\bigr)\biggr]\biggr)\\
	& \leq P\biggr(\bigr(A\setminus \hat{A}\bigr)\;\cup\; \bigr(\hat{A}\setminus A\bigr)\;\cup \;\bigr(B\setminus \hat{B}\bigr)\;\cup\; \bigr(\hat{B}\setminus B\bigr)\biggr)\\
	&\leq P\biggr(\bigr(A\setminus \hat{A}\bigr)\;\cup\; (\hat{A}\setminus A)\biggr)+P\biggr(\bigr(B\setminus \hat{B}\bigr)\;\cup\; \bigr(\hat{B}\setminus B\bigr)\biggr)\\
	&\leq \Tilde{\varepsilon} + \Tilde{\varepsilon}\\
	& = 2\Tilde{\varepsilon}.
	\end{align*}
	Here, the first inequality follows from
	\begin{align}\label{eq_subset_rel}
	\biggr[\bigr(A\cap B\bigr)\;\setminus\; \bigr(\hat{A}\cap \hat{B}\bigr)\biggr]\;\cup\; \biggr[\bigr(\hat{A}\cap \hat{B}\bigr)\;\setminus\; \bigr(A\cap B\bigr)\biggr]\subseteq \bigr(A\setminus \hat{A}\bigr)\;\cup\; \bigr(\hat{A}\setminus A\bigr)\;\cup\; \bigr(B\setminus \hat{B}\bigr)\;\cup\; \bigr(\hat{B}\setminus B\bigr).
	\end{align}
	The subset relation in \eqref{eq_subset_rel} holds by the following argument:
	Let
	\begin{align*}
	   \omega \in \biggr[\bigr(A\cap B\bigr)\;\setminus\; \bigr(\hat{A}\cap \hat{B}\bigr)\biggr]\;\cup\; \biggr[\bigr(\hat{A}\cap \hat{B}\bigr)\;\setminus\; \bigr(A\cap B\bigr)\biggr].
	\end{align*}
	Without loss of generality, assume that
	\begin{align*}
	    \omega \in \bigr(A\cap B\bigr)\;\setminus\; \bigr(\hat{A}\cap \hat{B}\bigr).
	\end{align*}
	Then, 
	\begin{align*}
	    \omega \in A\;\setminus\; \bigr(\hat{A}\cap \hat{B}\bigr).
	\end{align*}
	Now, suppose that
	\begin{align*}
	    \omega \notin A\;\setminus\;\hat{A}.
	\end{align*}
	Then, $\omega \in A$ and $\omega \in \hat{A}$ but $\omega \notin \hat{B}$. Because $\omega \in \bigr(A\cap B\bigr)\;\setminus\; \bigr(\hat{A}\cap \hat{B}\bigr)$, it follows that $\omega \in B$ and hence, $\omega \in B\setminus \hat{B}$.
	Arguing analogously for the other cases, we conclude that the subset-relation in \eqref{eq_subset_rel} is true.
	
	Combining the previous approximations of the probability measures, we now have
	\begin{align*}
	&P(\hat{A})-\Tilde{\varepsilon} \\
	&= P(A\cap \hat{A}) + P(\hat{A}\setminus A) - \Tilde{\varepsilon}\\
	& \leq P(A)+ P(\hat{A}\setminus A) - \Tilde{\varepsilon}\\
	& \leq P(A)+ P\biggr(\bigr(A\setminus \hat{A}\bigr)\;\cup\; \bigr(\hat{A}\setminus A\bigr)\biggr)  - \Tilde{\varepsilon}\\
	&\leq P(A)\\
	&=P(A\cap \hat{A}) + P(A\setminus \hat{A})\\
	&\leq P(\hat{A}) + P(A\setminus \hat{A})\\
	&\leq P(\hat{A}) + P\biggr(\bigr(A\setminus \hat{A}\bigr)\;\cup\; \bigr(\hat{A}\setminus A\bigr)\biggr)\\
	&\leq P(\hat{A}) + \Tilde{\varepsilon}.
	\end{align*}
	Similarly, $P(\hat{B})-\Tilde{\varepsilon}\leq P(B)\leq P(\hat{B}) + \Tilde{\varepsilon}$ and $P(\hat{A}\cap \hat{B}) - 2\Tilde{\varepsilon} \leq P(A\cap B)\leq P(\hat{A}\cap \hat{B}) + 2\Tilde{\varepsilon}$.
	Thus,
	\begin{align*}
	&P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) - \varepsilon\\
	&=P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) - 4\Tilde{\varepsilon} -\Tilde{\varepsilon}^2\\
	&\leq P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) - 2\Tilde{\varepsilon} - \biggr(P(\hat{A})+P(\hat{B})\biggr)\cdot\Tilde{\varepsilon}-\Tilde{\varepsilon}^2\\
	&= \biggr(P(\hat{A}\cap \hat{B}) - 2\Tilde{\varepsilon}\biggr)-\biggr(P(\hat{A})+\Tilde{\varepsilon}\biggr)\cdot\biggr(P(\hat{B})+\Tilde{\varepsilon}\biggr)\\
	&\leq P(A\cap B) - P(A)P(B)\\
	&\leq \biggr(P(\hat{A}\cap \hat{B}) + 2\Tilde{\varepsilon}\biggr)-\biggr(P(\hat{A})-\Tilde{\varepsilon}\biggr)\cdot \biggr(P(\hat{B})-\Tilde{\varepsilon}\biggr)\\
	&= P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) + 2\Tilde{\varepsilon} + \biggr(P(\hat{A})+P(\hat{B})\biggr)\cdot\Tilde{\varepsilon}-\Tilde{\varepsilon}^2\\
	&\leq P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) + 4\Tilde{\varepsilon} -\Tilde{\varepsilon}^2\\
	&\leq P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) + 4\Tilde{\varepsilon} +\Tilde{\varepsilon}^2\\
	&=  P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) + \varepsilon.
	\end{align*}
	Therefore, 
	\begin{align*}
	|P(A\cap B) - P(A)P(B)|&\leq \max\biggr\{\bigr|P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) - \varepsilon\bigr|, \bigr|P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B}) + \varepsilon\bigr|\biggr\}\\
	&\leq \max\biggr\{\bigr|P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B})\bigr|+ \varepsilon, \bigr|P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B})\bigr| + \varepsilon\biggr\}\\
	&=\bigr|P(\hat{A}\cap \hat{B})-P(\hat{A})P(\hat{B})\bigr|+\varepsilon\\
	&\leq \alpha(m)+\varepsilon.
	\end{align*}
	Because $\varepsilon>0$ was arbitrary,
	$|P(A\cap B) - P(A)P(B)|\leq \alpha(m)$.
	Therefore,
	\begin{align*}
	\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\leq a) \\ B\in \sigma(\bm{X}_{t}:\;t\geq a+m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\leq \alpha(m).
	\end{align*}
	Arguing analogously the other way round, we conclude that equation \eqref{eq_reversing} is true.
\end{proof}



\begin{lemma}
	\label{lemma_reversing}
	Let $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ be a stochastic process such that each $\bm{X}_t$ is defined on the probability space $(\Omega,\mathcal{F},P)$ and takes values in the measurable space $(S_{\bm{X}},\Sigma_{\bm{X}})$. Furthermore, assume that $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is strictly stationary and $\alpha$-strongly mixing. Then, the time-reversed process $\{\bm{\Tilde{X}}_t\}_{t\in\mathbb{Z}}$ defined by $\bm{\Tilde{X}}_t:=\bm{X}_{-t}$ is also $\alpha$-strongly mixing.
\end{lemma}

\begin{proof}
	Notational remark: In this proof, we always implicitly assume that time indices are in $\mathbb{Z}$.
	
	For arbitrary but fixed $m\in\mathbb{N}_{\geq 0}$,
	\begin{align*}
	\alpha_{\bm{\Tilde{X}}}(m)&:=\sup_{\substack{A\in\sigma(\bm{\Tilde{X}}_t:\;t\leq 0) \\ B\in \sigma(\bm{\Tilde{X}}_t:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\\
	&=\sup_{\substack{A\in\sigma(\bm{X}_{-t}:\;t\leq 0) \\ B\in \sigma(\bm{X}_{-t}:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\\
	&=	\sup_{\substack{A\in\sigma(\bm{X}_{-t}:\;-t\geq 0) \\ B\in \sigma(\bm{X}_{-t}:\;-t\leq -m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\\
	&=			\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\geq 0) \\ B\in \sigma(\bm{X}_{t}:\;t\leq -m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\\
		&\hspace{-0.04cm}\overset{(*)}{=}			\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\geq m) \\ B\in \sigma(\bm{X}_{t}:\;t\leq 0)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\\
	&\hspace{-0.11cm}\overset{(**)}{=}	\sup_{\substack{A\in\sigma(\bm{X}_{t}:\;t\leq 0) \\ B\in \sigma(\bm{X}_{t}:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\\
	&=:\alpha_{\bm{X}}(m).
	\end{align*}
	Here, $(*)$ follows from Lemma \ref{lemma_ref_point} and $(**)$ follows from relabelling. Thus, we obtain the statement that we wanted to prove.
\end{proof}

\begin{lemma}
	\label{lemma_shifting}
	Let $\{\bm{X}_t\}_{t\in\mathbb{Z}}=\{(X^1_t,X^2_t)\}_{t\in\mathbb{Z}}$ be a stochastic process such that each $X^1_t$ and each $X^2_t$ is defined on the probability space $(\Omega,\mathcal{F},P)$ and takes values in the measurable space $(\mathcal{X}_1,\Sigma^X_1)$ and $(\mathcal{X}_2,\Sigma^X_2)$, respectively.
	Define a new stochastic process $\{\bm{\Tilde{X}}_t\}_{t\in\mathbb{Z}}$ by $\bm{\Tilde{X}}_t=(\Tilde{X}^1_{t},\Tilde{X}^2_{t}):=(X^1_{t-k}, X^2_{t})$ for some fixed integer $k\geq 0$. If $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is $\alpha$-strongly mixing, then $\{\bm{\Tilde{X}}_t\}_{t\in\mathbb{Z}}$ is $\alpha$-strongly mixing.
\end{lemma}
\begin{proof}
Remark: In this proof, we always implicitly assume that time indices are in $\mathbb{Z}$.
	
Let $m\in\mathbb{N}_{\geq k}$ be arbitrary but fixed. Write
\begin{align*}
	\alpha_{\bm{X}}(m)&:=\sup_{\substack{A\in\sigma(\bm{X}_t:\;t\leq 0) \\ B\in \sigma(\bm{X}_t:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|.
\end{align*}
	Define
	\begin{align*}
	\mathcal{S}_{\leq 0}:=\{\bm{\Tilde{X}}^{-1}_t(E):\;E\in \Sigma^X_1\otimes\Sigma^X_2,\;t\leq 0\}.
	\end{align*}
	Recall that $\sigma(\bm{\Tilde{X}}_{t}:\;t\leq 0)=\sigma(\mathcal{S}_{\leq 0})$. 
	Furthermore, define
		\begin{align*}
	\mathcal{M}_{\leq 0}:=\{\bm{\Tilde{X}}^{-1}_t(E^1\times E^2):\;E^1\in\Sigma^X_1,\;E^2\in \Sigma^X_2,\;t\leq 0\},
	\end{align*}
and	
	\begin{align*}
	\mathcal{C}_{\leq 0}:=\biggr\{\bigcup_{i\in I}\bigcap_{j\in J}(\Tilde{X}^{l_{ij}}_{t_{ij}})^{-1}(E_{ij}):\;I,J\text{ are finite index sets, and }\forall i,j: t_{ij}\leq 0,\;l_{ij}\in\{1,2\},\;E_{ij}\in \Sigma^X_{l_{ij}}\biggr\}.
	\end{align*}
	We first show that 
	\begin{align*}
	\sigma(\mathcal{M}_{\leq 0})= \sigma(\mathcal{C}_{\leq 0}).
	\end{align*}
	First, note that for all $E^1\in \Sigma^X_1$ and for all $E^2\in\Sigma^X_2$, 
	\begin{align*}
	\bm{\Tilde{X}}^{-1}_t(E^1\times E^2)=\bigr(\Tilde{X}^1_t\bigr)^{-1}(E^1)\;\cap\; \bigr(\Tilde{X}^2_t\bigr)^{-1}(E^2).	
	\end{align*}
	Thus, 
	\begin{align*}
	\mathcal{M}_{\leq 0}\subseteq \mathcal{C}_{\leq 0}
	\end{align*}
	and hence,
	\begin{align*}
	\sigma(\mathcal{M}_{\leq 0})\subseteq \sigma(\mathcal{C}_{\leq 0}).
	\end{align*}
	Vice versa, note that every $\sigma$-algebra containing $\mathcal{M}_{\leq 0}$ contains 
	\begin{align*}
		\bm{\Tilde{X}}^{-1}_t(E^1\times \mathcal{X}_2)=\bigr(\Tilde{X}^1_t\bigr)^{-1}(E^1)\;\cap\; \bigr(\Tilde{X}^2_t\bigr)^{-1}(\mathcal{X}_2)=\bigr(\Tilde{X}^1_t\bigr)^{-1}(E^1)\cap \Omega = \bigr(\Tilde{X}^1_t\bigr)^{-1}(E^1)
	\end{align*}
	 for every $E^1\in\Sigma^X_1$. Similarly, every such $\sigma$-algebra contains
	 \begin{align*}
	 \bigr(\Tilde{X}^2_t\bigr)^{-1}(E^2)
	 \end{align*}
	   for every $E^2\in \Sigma^X_2$. Because $\sigma$-algebras are closed under finite unions and finite intersections, every $\sigma$-algebra containing $\mathcal{M}_{\leq 0}$ therefore contains $\mathcal{C}_{\leq 0}$ and hence,
	\begin{align*}
	\sigma(\mathcal{C}_{\leq 0})\subseteq 	\sigma(\sigma(\mathcal{M}_{\leq 0})) = 	\sigma(\mathcal{M}_{\leq 0}),
	\end{align*}
	where the last equality follows because $\sigma(\mathcal{M}_{\leq 0})$ is a $\sigma$-algebra and thus the smallest one containing itself.
	Therefore, we have shown $\sigma(\mathcal{M}_{\leq 0})= \sigma(\mathcal{C}_{\leq 0})$.
	
	We next show that 
		\begin{align*}
\sigma(\mathcal{S}_{\leq 0}) = \sigma(\mathcal{M}_{\leq 0}).
	\end{align*} 
		 First, we (trivially) have that
	 \begin{align*}
	 \mathcal{M}_{\leq 0}\subseteq \mathcal{S}_{\leq 0},
	 \end{align*}
	 so 
	 \begin{align*}
	 \sigma\bigr(\mathcal{M}_{\leq 0}\bigr)\subseteq \sigma(\mathcal{S}_{\leq 0}).
	 \end{align*}
	Vice versa, we argue as follows:
	Note that for all $E:=E^1\times E^2$ with $E^1\in \Sigma^X_1$ and $E^2\in \Sigma^X_2$ and for all $t\leq 0$, it (trivially) holds that
	\begin{align*}
		 \bm{\Tilde{X}}_t^{-1}(E)\in \sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr).
	\end{align*}
Now, recall that by definition of $\Sigma^X_1\otimes \Sigma^X_2$,
		\begin{align*}
				\sigma\bigr(\{E'^1\times E'^2:\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr) = \Sigma^X_1\otimes \Sigma^X_2. 
		\end{align*}
	 From a standard result in measure theory (see, e.g., Theorem 1.3.1 in \citet{durrett2019probability}), it then follows that 
	 \begin{align*}
	 \bm{\Tilde{X}}_t^{-1}(E)\in \sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr)
	 \end{align*}
	 for all $E\in \Sigma^X_1\otimes \Sigma^X_2$ and for all $t\leq 0$. Thus,
	 	 \begin{align*}
	 \sigma(\bm{\Tilde{X}}_t)\subseteq \sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr).
	 \end{align*}
	  Vice versa, it is immediately clear by definition that
	  \begin{align*}
	  \sigma(\bm{\Tilde{X}}_t)\supseteq \sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr),
	  \end{align*}
	  so, we conclude that
	  \begin{align*}
 \sigma(\bm{\Tilde{X}}_t)= \sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr).
	  \end{align*}
	  Therefore, 
	 \begin{align*}
	 \sigma(\mathcal{S}_{\leq 0}) = \sigma\biggr(\bigcup_{t\leq 0}\sigma(\bm{\Tilde{X}}_t)\biggr)=\sigma\biggr(\bigcup_{t\leq 0} \sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr)\biggr).
	 \end{align*}

	 Now, by definition, for all $t\leq 0$,
	 \begin{align*}
	 \{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\subseteq \sigma(\mathcal{M}_{\leq 0}).
	 \end{align*}
	  So,  for all $t\leq 0$,
	 \begin{align*}
	 \sigma(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\})\subseteq \sigma(\sigma(\mathcal{M}_{\leq 0}))=\sigma(\mathcal{M}_{\leq 0}),
	 \end{align*}
	 where the last equality follows because $\sigma(\mathcal{M}_{\leq 0})$ is a $\sigma$-algebra and thus the smallest one containing itself.
	 Hence,
	 \begin{align*}
	 \bigcup_{t\leq 0}\sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr)\subseteq \sigma(\mathcal{M}_{\leq 0})
	 \end{align*}
	 and thus,
	 \begin{align*}
	 \sigma(\mathcal{S}_{\leq 0}) =\sigma\biggr(\bigcup_{t\leq 0}\sigma\bigr(\{\bm{\Tilde{X}}_t^{-1}(E'^1\times E'^2):\;E'^1\in \Sigma^X_1,\;E'^2\in \Sigma^X_2\}\bigr)\biggr)\subseteq \sigma(\sigma(\mathcal{M}_{\leq 0}))=\sigma(\mathcal{M}_{\leq 0}),
	 \end{align*}
	 where the last equality follows because $\sigma(\mathcal{M}_{\leq 0})$ is a $\sigma$-algebra and thus the smallest one containing itself.
	Therefore, $\sigma(\mathcal{C}_{\leq 0})=\sigma( \mathcal{M}_{\leq 0})=\sigma(\mathcal{S}_{\leq 0})=\sigma(\bm{\Tilde{X}}_t:\;t\leq 0)$.




	By a similar argument as in the proof of Lemma \ref{lemma_ref_point}, one can show that  $\mathcal{C}_{\leq 0}$ is an algebra.
	
	In an analogous fashion, one can also show that 	
	\begin{align*}
	\mathcal{C}_{\geq m}:=\biggr\{\bigcup_{i\in I}\bigcap_{j\in J}(\Tilde{X}^{l_{ij}}_{t_{ij}})^{-1}(F_{ij}):\;I,J\text{ are finite index sets, and }\forall i,j: t_{ij}\geq m,\;l_{ij}\in\{1,2\},\;F_{ij}\in \Sigma^{X}_{l_{ij}}\biggr\},
	\end{align*}
  is an algebra with $\sigma(\mathcal{C}_{\geq m})=\sigma(\bm{\Tilde{X}}_t:\;t\geq m)$.
	
	Now, for all sets $A=\bigcup_{i\in I_1}\bigcap_{j\in J_1}(\Tilde{X}^{l_{ij}}_{t_{ij}})^{-1}(E_{ij})\in \mathcal{C}_{\leq 0}$ and for all sets $B=\bigcup_{i\in I_2}\bigcap_{j\in J_2}(\Tilde{X}^{l_{ij}}_{t_{ij}})^{-1}(F_{ij})\in \mathcal{C}_{\geq m}$,
	we can by definition write $A=\bigcup_{i\in I_1}\bigcap_{j\in J_1}(X^{l_{ij}}_{t_{ij}-k\cdot1_{l_{ij}=1}})^{-1}(E_{ij})$ and $B= \bigcup_{i\in I_2}\bigcap_{j\in J_2}(X^{l_{ij}}_{t_{ij}-k\cdot1_{l_{ij}=1}})^{-1}(F_{ij})$. Thus, $A\in \sigma(\bm{X}_t:\;t\leq 0)$ and $B\in \sigma(\bm{X}_t:\;t\geq m-k)$.
	Therefore,
	\begin{align*}
	|P(A\cap B)-P(A)P(B)| \leq \alpha_{\bm{X}}(m-k).
		\end{align*}
For general sets $A\in \sigma(\bm{\Tilde{X}}_t:\;t\leq 0)$ and $B\in \sigma(\bm{\Tilde{X}}_t:\;t\geq m)$, one can do the exact same $\varepsilon$-approximation argument as in the proof of Lemma \ref{lemma_ref_point} because $\mathcal{C}_{\leq 0}$ and $\mathcal{C}_{\geq m}$ are algebras.

Therefore, we conclude that 
\begin{align*}
	\alpha_{\bm{\Tilde{X}}}(m)&:=\sup_{\substack{A\in\sigma(\bm{\Tilde{X}}_t:\;t\leq 0) \\ B\in \sigma(\bm{\Tilde{X}}_t:\;t\geq m)}}\bigr| P(A\cap B) - P(A)P(B)\bigr|\leq \alpha_{\bm{X}}(m-k).
\end{align*}
Because $\{\bm{X}_t\}_{t\in\mathbb{Z}}$ is $\alpha$-strongly mixing, we have $\alpha_{\bm{X}}(m-k)\rightarrow 0$ for $m\rightarrow \infty$, and hence $\alpha_{\bm{\Tilde{X}}}(m)\rightarrow 0$ for $m\rightarrow \infty$.
\end{proof}







\begin{lemma}\label{lemma_convergence} Suppose that a sequence of $\mathbb{R}^{d_1}$-valued random variables $\{\bm{Z}_n\}_{n\in\mathbb{N}_{\geq 1}}$ converges in probability to an $\mathbb{R}^{d_1}$-valued random variable $\bm{Z}$, in short, $\bm{Z}_n\overset{p}{\rightarrow} \bm{Z}$. Moreover, let $\bm{S}$ be an $\mathbb{R}^{d_2}$-valued random variable and suppose that for all $n\in\mathbb{N}_{\geq 1}$ it holds that $\bm{Z}_n\ind \bm{S}$. Then, $\bm{Z}\ind \bm{S}$.	
\end{lemma}
\begin{proof}
Notational remark: For vectors $\bm{a}\in \mathbb{R}^{d_1}$ and $\bm{b}\in \mathbb{R}^{d_2},$ we write $(\bm{a},\bm{b})\in \mathbb{R}^{d_1+d_2}$ to denote the vector whose first $d_1$ components are the components of $\bm{a}$ and whose last $d_2$ components are the components of $\bm{b}$. For two vectors $\bm{u}$ and $\bm{v}$ of the same dimension, we write $\bm{u}\cdot \bm{v}$ to denote their standard scalar product.

Note that $\bm{Z}_n\overset{p}{\rightarrow} \bm{Z}$ and $\bm{S}\overset{p}{\rightarrow} \bm{S}$ imply that $(\bm{Z}_n,\bm{S})\overset{p}{\rightarrow}(\bm{Z},\bm{S})$. Thus, $(\bm{Z}_n,\bm{S})\overset{d}{\rightarrow}(\bm{Z},\bm{S})$. By L\'evy's continuity theorem \citep[Theorem 3.3.17]{durrett2019probability}, 
\begin{align*}
\lim_{n\rightarrow \infty}\mathbb{E}[e^{i(\bm{t}_1,\bm{t}_2)\cdot(\bm{Z}_n,\bm{S})}]= \mathbb{E}[e^{i(\bm{t}_1,\bm{t}_2)\cdot(\bm{Z},\bm{S})}]
\end{align*}
for all $\bm{t}_1\in\mathbb{R}^{d_1}$ and for all $\bm{t}_2\in \mathbb{R}^{d_2}$.
Also, by L\'evy's continuity theorem, 
\begin{align*}
\lim_{n\rightarrow \infty}\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}_n}]= \mathbb{E}[e^{i\bm{t}_1 \cdot \bm{Z}}]
\end{align*}
for all $\bm{t}_1\in\mathbb{R}^{d_1}$.
Now, because $\bm{Z}_n\ind \bm{S}$ for all $n\in\mathbb{N}_{\geq 1}$, we have
\begin{align*}
\mathbb{E}[e^{i(\bm{t}_1,\bm{t}_2)\cdot(\bm{Z}_n,\bm{S})}]=\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}_n}]\mathbb{E}[e^{i\bm{t}_2\cdot \bm{S}}]
\end{align*}
for all $\bm{t}_1\in\mathbb{R}^{d_1}$ and for all $\bm{t}_2\in \mathbb{R}^{d_2}$.

Thus, for all $\bm{t}_1\in\mathbb{R}^{d_1}$ and for all $\bm{t}_2\in \mathbb{R}^{d_2}$,
\begin{align*}
\mathbb{E}[e^{i(\bm{t}_1,\bm{t}_2)\cdot (\bm{Z},\bm{S})}]&=\lim_{n\rightarrow \infty}\mathbb{E}[e^{i(\bm{t}_1,\bm{t}_2)\cdot(\bm{Z}_n,\bm{S})}]\\
&=\lim_{n\rightarrow \infty}\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}_n}]\mathbb{E}[e^{i\bm{t}_2\cdot \bm{S}}]\\
&=\mathbb{E}[e^{i\bm{t}_2\cdot \bm{S}}]\lim_{n\rightarrow \infty}\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}_n}]\\
&=\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}}]\mathbb{E}[e^{i\bm{t}_2\cdot \bm{S}}].
\end{align*}
So $(\bm{Z},\bm{S})\sim P_{\bm{Z},\bm{S}}$ has characteristic function $\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}}]\mathbb{E}[e^{i\bm{t}_2\cdot \bm{S}}]$. The random vector $(\bm{\Tilde{Z}},\bm{\Tilde{S}})\sim P_{\bm{Z}}\otimes P_{\bm{S}}$ also has characteristic function $\mathbb{E}[e^{i\bm{t}_1\cdot \bm{Z}}]\mathbb{E}[e^{i\bm{t}_2\cdot \bm{S}}]$. Because equality of characteristic functions implies equality of the corresponding distributions \citep[Theorem 3.3.11]{durrett2019probability}, we conclude that $P_{\bm{Z},\bm{S}}=P_{\bm{Z}}\otimes P_{\bm{S}}$. Hence, $\bm{Z}\ind \bm{S}$.
\end{proof}

\begin{lemma}\label{lemma_equal_dist} Suppose that a sequence of $\mathbb{R}^{d}$-valued random variables $\{\bm{X}_n\}_{n\in\mathbb{N}_{\geq 1}}$ converges in distribution to an $\mathbb{R}^{d}$-valued random variable $\bm{X}$, in short $\bm{X}_n\overset{d}{\rightarrow}\bm{X}$. Similarly, suppose that a sequence of $\mathbb{R}^{d}$-valued random variables $\{\bm{Y}_n\}_{n\in\mathbb{N}_{\geq 1}}$ converges in distribution to an $\mathbb{R}^{d}$-valued random variable $\bm{Y}$, in short $\bm{Y}_n\overset{d}{\rightarrow}\bm{Y}$.
If $\bm{X}_n$ and $\bm{Y}_n$ have the same distribution for all $n\in\mathbb{N}_{\geq 1}$, then $\bm{X}$ and $\bm{Y}$ have the same distribution.
\end{lemma}
\begin{proof}
By L\'evy's continuity theorem \citep[Theorem 3.3.17]{durrett2019probability}, 
\begin{align*}
\lim_{n\rightarrow \infty}\mathbb{E}[e^{i\bm{t}\cdot \bm{X}_n}]= \mathbb{E}[e^{i\bm{t} \cdot \bm{X}}]
\end{align*}
and 
\begin{align*}
\lim_{n\rightarrow \infty}\mathbb{E}[e^{i\bm{t}\cdot \bm{Y}_n}]= \mathbb{E}[e^{i\bm{t} \cdot \bm{Y}}]
\end{align*}
for all $\bm{t}\in \mathbb{R}^d$.
Because $\bm{X}_n$ and $\bm{Y}_n$ have the same distribution for all $n\in\mathbb{N}_{\geq 1}$,
\begin{align*}
\mathbb{E}[e^{i\bm{t}\cdot \bm{X}_n}]=
\mathbb{E}[e^{i\bm{t}\cdot \bm{Y}_n}]
\end{align*}
for all $\bm{t}\in \mathbb{R}^d$ and for all $n\in\mathbb{N}_{\geq 1}$.
Therefore,
\begin{align*}
  \mathbb{E}[e^{i\bm{t} \cdot \bm{X}}]=  \mathbb{E}[e^{i\bm{t} \cdot \bm{Y}}]
\end{align*}
for all $\bm{t}\in \mathbb{R}^d$.

Thus, $\bm{X}$ and $\bm{Y}$ have the same characteristic function and hence the same distribution  \citep[Theorem 3.3.11]{durrett2019probability}.
\end{proof}
\clearpage
\section{Numerical Illustration}
We now consider a small numerical illustration of the violation of Assumption (A3).
\begin{example}[Numerical Illustration of the violation of Assumption (A3)]
\label{ex_numerical_illus}
For different sample sizes and different $\phi$'s, we simulate $X^1_0$ from the given solution in equation \eqref{eq_nonstable_sol}: For that, we first generate the noise variables and then approximate the infinite sum with a finite sum consisting of the first $50$ summands. We then estimate the covariance between $X^1_0$ and $X^2_1=\epsilon^2_1$; the theoretical covariance between the two is $-\phi^{-2}$. For each combination of sample size and $\phi$, we do $100$ different runs and calculate the mean and standard error of the estimated covariances.

We have implemented this example in R (version 3.6.3) \citep{R} and provide the code in the attached supplementary material.

\begin{table}[h]
\centering
\begin{tabular}{rrrr}
  \hline
 $\phi$ & sample size & est\_cov\_mean & est\_cov\_se \\ 
  \hline
$2$ & $10^2$ & $-0.25$ & $0.01$ \\ 
$2$ & $10^3$ & $-0.25$ & $0.00$ \\ 
$2$ & $10^4$ & $-0.25$ & $0.00$ \\ 
$2$ & $10^5$ & $-0.25$ & $0.00$ \\ 
$3$ & $10^2$ & $-0.12$ & $0.01$ \\ 
$3$ & $10^3$ & $-0.11$ & $0.00$ \\ 
$3$ & $10^4$ & $-0.11$ & $0.00$ \\ 
$3$ & $10^5$ & $-0.11$ & $0.00$ \\ 
$5$ & $10^2$ & $-0.04$ & $0.00$ \\ 
$5$ & $10^3$ & $-0.04$ & $0.00$ \\ 
$5$ & $10^4$ & $-0.04$ & $0.00$ \\ 
$5$ & $10^5$ & $-0.04$ & $0.00$ \\ 
   \hline
\end{tabular}
\caption{Results for the numerical illustration of Example \ref{ex_numerical_illus}.}
\end{table}
The numerical results correspond well to our theoretical considerations.
\end{example}


\end{document}
