
\section{Background}

\label{sec:background}
\textbf{Notation.} We recall the notation and some key definitions here for the reader's convenience.
 For any stochastic process $\myCurls{x_t}_{t\in \mathbb{Z}} \in \mathbb{R}^d$, we use $\mathbf{x}^n_{t-\omega} = \myCurls{x_{t-\omega-n+1}, \cdots, x_{t-\omega - 1}, x_{t-\omega}} $ to denote the \textit{set} of $x_{t-\omega}$ and the $n-1$ variables in the past of $x_{t-\omega}$. We distinguish this from $y_t^n$ which denotes the \textit{vector} $\begin{pmatrix} x_{t}, x_{t-1} , \cdots, x_{t-n+1}\end{pmatrix}^T \in \R^{nd}$. When it is clear from context, to reduce cumbersome notation, we simply use $y_t$. For any random variable $x$, $\mathbb{E}[x]$ denotes its expectation. For any matrix $A$, we use $A_{i:}$ and $A_{:j}$ to denote the $i$th row and $j$th column of $A$ respectively. We use $A^j_{1k}$ to denote the $(1, k)$th element of $A^j$. For any vector $x_t$ at time $t$, we use $x_{t,i}$ to denote the $i$th element of $x_t$. We use $\lambda_{\max}(A), \lambda_{\min}(A), \kappa(A)$  to denote the maximum and minimum eigenvalues and the condition number of $A$ respectively, where $\kappa(A) = \lambda_{\max}(A) / \lambda_{\min}(A)$. $\mathbb{I}_{p}$ denotes the identity matrix of size $p$,  $\mathbb{N}, \mathbb{Z}$ denote the set of natural numbers and integers respectively and $[n]$ denotes the set $\myCurls{1, 2, \cdots n}$. 
%
%
%
\begin{definition}[\textbf{Vector Autoregressive Model}]
\label{def:VAR}
A vector autoregressive model (VAR(p)) of dimension $d$ and order $p$ is defined as 
 \begin{equation}
 \label{eq:var_1}
     x_t = A_1 x_{t-1} + A_2 x_{t-2} + \cdots + A_p x_{t-p} + \epsilon_t,
 \end{equation}
 %
 where $x_t \in \R^d$ is a vector-valued time-series, for all $i \in [p]$, $A_i \in \R^{d \times d}$ are the coefficients of the VAR model, and $\epsilon_t \in \R^{d}$ denotes the noise vector such that $ \E[\epsilon_t] = 0$ and $\E[\epsilon_t \epsilon_{t+h}^T] =  \Sigma_{\epsilon} \textrm{ if} \; h = 0$ and $0 \textrm{ otherwise}.$ For some $\sigma_{\epsilon}^2 > 0$, we simply set $\Sigma_{\epsilon} = \sigma^2_{\epsilon} \mathbbm{I}$ for enhanced readability. Our results can be easily generalized to arbitrary covariance matrices by means of the spectral properties ($\lambda_{\min}, \lambda_{\max}$) of $\Sigma_{\epsilon}$. 
% 
 %
% \begin{align*}
%     \E(\epsilon_t) &= 0 \\
%     \E(\epsilon_t \epsilon_{t+h}^T) &= \begin{cases} \Sigma_{\epsilon}  &\textrm{for} \; h = 0 \\
%     0 &\textrm{otherwise}
%     \end{cases}.
% \end{align*}
\end{definition}
%
%
\begin{definition}[\textbf{Weak Stationarity}]
	\label{def:weak_stationarity}
		A stochastic process $\{x_t\}_{t \in \mathbb{Z}}$ is weakly stationary if the mean and the covariance of the process do not change over time, that is, for all $t, \tau \in \mathbb{Z}$,
	\begin{equation} 
		\begin{aligned}
			\E[x_t] &= \E[x_{t+\tau}],
			\; \;  %\\
			\mathbb{C}_{x}(t, t+\tau) = \mathbb{C}_{x}(0, \tau),
		\end{aligned}
	\end{equation}
	where $ \mathbb{C}_{x}(t, t+\tau) = \E[(x_t - \E[x_t])(x_{t+\tau} - \E[x_{t+\tau}])^T] $ denotes the autocovariance function.
\end{definition}
%
The autocovariance matrix of $\myCurls{x_t}_{t \in \mathbb{Z}}$ plays a central role in our results and analysis. For any $n \in \N$, we use $\Sigma_{n}$ to denote the autocovariance matrix of size $n$ defined as $ \E [(y^n_{t} - \E[y^n_{t}]) (y^n_{t} - \E[y^n_{t}])^T]$.

% \textbf{Rewriting VAR(p) Model as a VAR(1) Model.}
It is often quite convenient to rewrite a VAR model of order $p$ in Equation (\ref{eq:var_1}) as a VAR(1) model, $ y_t = A y_{t-1} + e_t$, where $y_t \in \mathbb{R}^{dp}, e_t \in \mathbb{R}^{dp}$ are defined as $y_t = \begin{pmatrix} x_{t}, x_{t-1} , \cdots, x_{t-p+1}\end{pmatrix}^T$, $e_t = \begin{pmatrix} \epsilon_t, 0, \cdots, 0 \end{pmatrix}^T$, and $A \in \mathbb{R}^{dp \times dp}$ is a \textit{(multi) companion matrix} defined as: 
    \begin{align}
    \label{eq:var_1_def_supp}
    %
        A = \begin{pmatrix}
        A_1 & A_2 & \cdots & A_{p-1} & A_p \\
        I & 0 & \cdots & 0 & 0 \\
        0 & I & \cdots & 0 & 0 \\
        \vdots &\vdots &\cdots &\vdots &\vdots \\
        0 & 0 & \cdots & I & 0
        \end{pmatrix}.
    \end{align}
%
%Then, Equation (\ref{eq:VAR}) can be rewritten as $ Y_t = A Y_{t-1} + E_t$.
%
%
% \textbf{Conditions for Stability and Weak-Stationarity.}
The eigenvalues of the multi-companion matrix $A$ fully characterize the stability and stationarity of the VAR process. For a VAR(p) process to be weakly stationary, the eigenvalues of $A$, which satisfy 
\begin{equation}
\det \left( \mathbb{I}_d \lambda^p - A_1 \lambda^{p-1} - A_2 \lambda^{p-2} - \cdots - A_p \right) = 0,
\end{equation}
are constrained to not lie on the unit circle. If the eigenvalues satisfy $\abs{\lambda_i} < 1$ for all $i \in [dp]$, then the underlying process is stable, that is, its values do not diverge \parencite{lutkepohl2013vector}.

\begin{figure}[!htp]
    \centering
    \input{tikz_files/var_dag}
    \caption{Causal DAG of an AR(2) model}
    \label{fig:my_label1}
\end{figure}
%
\begin{figure}[!htp]
    \centering
    \input{tikz_files/ar2_dag_intervened}
    \caption{Graphical representation of the effect of an intervention $do(x_{t-4} = x^*_{t-4})$ on an AR(2) model. Incoming edges into $x_{t-4}$, which are shown in red, are removed in the new DAG.}
    \label{fig:my_label2}
\end{figure}

\begin{definition}[\textbf{Empirical Rademacher Complexity}]
Given a finite sample $X = \myCurls{x_1, x_2, \cdots, x_n} \in \mathbb{R}^d$, the empirical Rademacher complexity of a hypothesis class $\mathcal{F}$ of functions $f:\mathbb{R}^d \rightarrow \mathbb{R}$ is defined as:
\begin{equation*}
    \hat{\mathfrak{R}}(\mathcal{F}) = \frac{2}{n} \mathbb{E}_{\sigma} \left [ \sup \limits_{f \in \mathcal{F}} \vert \sum\limits_{i=1}^n \sigma_i f(x_i) \vert \right ],
\end{equation*}
\end{definition}
where $\sigma = (\sigma_1, \sigma_2, \cdots, \sigma_n)$ and for all $i\in[n]$, $\sigma_i$ are independent random variables drawn from the Rademacher distribution, that is, a uniform distribution over $\myCurls{-1, +1}.$ 


\section{Proofs of Main Results}
\begin{lemma}[\textbf{Expressing powers of a companion matrix using symmetric polynomials}]
\label{lemma:coef_as_schur_supp}
For a companion matrix $A$ with distinct eigenvalues and for any $k \in [p]$, the $(1, k)$th element of $A^{\omega}$ can be expressed as a Schur polynomial of the eigenvalues $\lambda = \myCurls{\lambda_1, \lambda_2, \cdots, \lambda_p}$ of $A$. In particular, $ \abs{A^{\omega}_{1, k}} = S_{\mu_{\omega, k}, \lambda}$, where $S_{\mu_{\omega, k}, \lambda}$ refers to the Schur polynomial over $\lambda$ indexed by $\{\omega, 1, \cdots {k-1 \textrm{ times}} \cdots, 1, 0, \cdots, 0  \}$.
\end{lemma}

\begin{proof}

For convenience, we use the notation $\lambda$ and $\lambda / \lambda_i$ to denote the sets $\myCurls{\lambda_1, \lambda_2, \cdots, \lambda_p}$ and $\myCurls{\lambda_1, \lambda_2, \cdots, \lambda_{i-1}, \lambda_{i+1}, \cdots, \lambda_p}$ respectively.

Assuming that the eigenvalues $\lambda = \myCurls{\lambda_i}_{i=1}^p$ of a companion matrix $A$ are distinct, it can be diagonalized as $A = V \Lambda V^{-1}$, where $\Lambda = \text{diag}(\lambda_1, \cdots, \lambda_p)$ is the diagonal matrix of eigenvalues of $A$ and $V = V_{\lambda}$ is a Vandermonde matrix \parencite{brand1964companion} given by 

\begin{equation}
\label{matrix:vandermonde}
V_{\lambda} = {\begin{pmatrix}
 \lambda_1^{p-1} & \lambda_2^{p-1}  & \cdots & \lambda_p^{p-1} \\
   \lambda_1^{p-2} & \lambda_2^{p-2}  & \cdots & \lambda_p^{p-2} \\
    \vdots & \vdots  & \vdots & \vdots \\
    \lambda_1 & \lambda_2  & \cdots & \lambda_p \\
    1 & 1  & \cdots & 1 \\
    \end{pmatrix}}.
\end{equation}

For any $i \in [p]$, let $e_k(\lambda / \lambda_i)$ denote the elementary symmetric polynomial of order $k$ with variables in $\lambda / \lambda_i$ and let 
\begin{equation}
    \alpha_i = \frac{1}{\prod \limits_{j \neq i} (\lambda_i - \lambda_j)}.
\end{equation}
The inverse of the Vandermonde matrix $V$ can then be explicitly computed \parencite{el2003explicit} to obtain
\begin{equation}
\label{matrix:vandermonde_inverse}
V^{-1} = {\begin{pmatrix}
 \alpha_1  & - \alpha_1 e_1(\lambda / \lambda_1)  & \cdots & (-1)^{p-1} \alpha_1 e_{p-1}(\lambda / \lambda_1) \\
   \alpha_2  & - \alpha_2 e_1(\lambda / \lambda_2)  & \cdots & (-1)^{p-1} \alpha_2 e_{p-1}(\lambda / \lambda_2) \\
    \vdots & \vdots  & \vdots & \vdots \\
    \alpha_p  & - \alpha_p e_1( \lambda / \lambda_p)  & \cdots & (-1)^{p-1} \alpha_p e_{p-1}( \lambda / \lambda_p)
    \end{pmatrix}}.
\end{equation}

Using the diagonalization of $A$, we can compute its power $A^{\omega}$ as
\begin{equation}
    A^{\omega} = V \Lambda^{\omega} V^{-1}
\end{equation}
and the coefficients $A^{\omega}_{1k}$ can be computed as 
\begin{equation*}
    (-1)^{k-1} \sum \limits_{i=1}^p \alpha_i \lambda_i^{p + \omega - 1} e_{k-1}(\lambda / \lambda_i)
\end{equation*}

\textbf{Claim. $\abs{A^{\omega}_{1k}}$ is the Schur polynomial} $S_{\myCurls{\omega, 1, 1, \cdots \; k-1 \textrm{times} \cdots \; 1, \; 0, 0, \cdots, 0}}$

For any $\mu = \myCurls{\mu_1, \mu_2, \cdots, \mu_p}$ such that $\mu_1 \geq \mu_2 \geq \cdots \geq \mu_p$ consider the generalized Vandermonde matrix $V_{\mu, \lambda}$ defined as
\begin{equation}
\label{matrix:vandermonde_gen}
V_{\mu, \lambda} = {\begin{pmatrix}
 \lambda_1^{p-1 + \mu_1} & \lambda_2^{p-1  + \mu_1}  & \cdots & \lambda_p^{p-1  + \mu_1} \\
   \lambda_1^{p-2  + \mu_2} & \lambda_2^{p-2 + \mu_2}  & \cdots & \lambda_p^{p-2 + \mu_2} \\
    \vdots & \vdots  & \vdots & \vdots \\
    \lambda_1^{1 + \mu_{p-1}} & \lambda_2^{1 + \mu_{p-1}}   & \cdots & \lambda_p^{1 + \mu_{p-1}}  \\
   \lambda_1^{\mu_{p}} & \lambda_2^{\mu_{p}}   & \cdots & \lambda_p^{\mu_{p}} \\
    \end{pmatrix}}.
\end{equation}

The bialternant formulation defines the Schur polynomial $S_{\mu, \lambda}$ as

\begin{equation}
    \label{eq:bialternant_schur}
    S_{\mu, \lambda} = \frac{\textrm{det} ({V_{\mu, \lambda}})}{\textrm{det} ({V_{\lambda}})}.
\end{equation}

It can be shown that the determinant of the Vandermonde matrix $V_{\lambda}$ is given by 
\begin{equation}
    \label{eq:vandermonde_determinant}
    \textrm{det}({V_{\lambda}}) = \prod_{1 \mathop \leq i \mathop < j \mathop \leq p}  ({\lambda_i - \lambda_j}).
\end{equation}

A proof of this statement can be found in most standard texts on Matrix analysis, for example, see \textcite{horn2012matrix}.

For any $i, k \in [p]$, consider the generalized Vandermonde matrix $V_{\mu_{k}, \lambda / \lambda_i}$, where $\mu_k = \myCurls{1, 1, \cdots \; k-1 \textrm{times} \cdots \; 1, \; 0, 0, \cdots, 0}$. That is, 


\begin{equation}
\label{matrix:vandermonde_gen_1}
V_{\mu_{k}, \lambda / \lambda_i} = {\begin{pmatrix}[1.5]
 \lambda_1^{p - 1} & \lambda_2^{p - 1}  & \cdots & \lambda_{i-1}^{p - 1} & \lambda_{i+1}^{p - 1} & \cdots & \lambda_p^{p - 1} \\
 \lambda_1^{p - 2} & \lambda_2^{p - 2}  & \cdots & \lambda_{i-1}^{p - 2} & \lambda_{i+1}^{p - 2} & \cdots & \lambda_p^{p - 2} \\
    \vdots & \vdots  & \cdots & \vdots & \vdots  & \cdots & \vdots \\
 \lambda_1^{p-(k-1)} & \lambda_2^{p-(k-1)}  & \cdots & \lambda_{i-1}^{p-(k-1)} & \lambda_{i+1}^{p-(k-1)} & \cdots & \lambda_p^{p-(k-1)} \\
 \lambda_1^{p-(k+1)} & \lambda_2^{p-(k+1)}  & \cdots & \lambda_{i-1}^{p-(k+1)} & \lambda_{i+1}^{p-(k+1)} & \cdots & \lambda_p^{p-(k+1)} \\
    \vdots & \vdots  & \cdots & \vdots & \vdots  & \cdots & \vdots \\
   1 & 1    & \cdots & 1 & 1 & \cdots & 1 \\
    \end{pmatrix}}.
\end{equation}

From \eqref{eq:bialternant_schur}, we know that 

$$
\textrm{det}(V_{\mu_{k}, \lambda / \lambda_i}) = \textrm{det}(V_{\lambda / \lambda_i}) S_{\mu_k, \lambda / \lambda_i},
$$

where $S_{\mu_k, \lambda / \lambda_i}$ is the Schur polynomial of variables $\lambda / \lambda_i$ indexed by $\mu_k = \myCurls{1, 1, \cdots \; k-1 \textrm{times} \cdots \; 1, \; 0, 0, \cdots, 0}$. Using the combinatorial definition of a Schur polynomial as a sum over semistandard Young tableaux (see \textcite{macdonald1998symmetric} for an exposition), it is easy to verify that 
\begin{equation}
    \label{eq:elementary_as_schur}
    S_{\mu_k, \lambda / \lambda_i} = e_{k-1}(\lambda / \lambda_i).
\end{equation}

Therefore, combining \eqref{eq:vandermonde_determinant} and \eqref{eq:elementary_as_schur} we can write 

$$
\textrm{det}(V_{\mu_{k}, \lambda / \lambda_i}) = \textrm{det}(V_{\lambda / \lambda_i}) e_{k-1}(\lambda / \lambda_i) = e_{k-1}(\lambda / \lambda_i) \prod \limits_{ \substack{ 1 \leq l < l' \leq p \\ l, l' \neq i}} (\lambda_l - \lambda_{l'}) 
$$

Now, observe that we can rewrite $A^{\omega}_{1k}$ as 
\begin{align*}
    A^{\omega}_{1k} &= (-1)^{k-1} \sum \limits_{i=1}^p \alpha_i \lambda_i^{p + \omega - 1} e_{k-1}(\lambda /\lambda_i), \\
    &= (-1)^{k-1} \sum \limits_{i=1}^p (-1)^{i+1} \lambda_i^{p + \omega - 1} e_{k-1}(\lambda /\lambda_i)  \prod \limits_{ \substack{ 1 \leq l < l' \leq p \\ l, l' \neq i}} (\lambda_l - \lambda_{l'}) /  \textrm{det}({V_{\lambda}}), \\
    &= (-1)^{k-1} \sum \limits_{i=1}^p (-1)^{i+1} \lambda_i^{p + \omega - 1} \textrm{det}(V_{\mu_{k}, \lambda / \lambda_i}) / \textrm{det}({V_{\lambda}}).
\end{align*}

Finally, letting $\mu_{\omega, k} = \myCurls{\omega, 1, 1, \cdots \; k-1 \textrm{times} \cdots \; 1, \; 0, 0, \cdots, 0}$, consider the generalized Vandermonde matrix $V_{\mu_{\omega, k}, \lambda}$ given by 



\begin{equation}
\label{matrix:vandermonde_gen_2}
V_{\mu_{\omega, k}, \lambda} = {\begin{pmatrix}[1.5]
 \lambda_1^{p -1 + \omega} & \lambda_2^{p -1 + \omega} & \cdots & \lambda_p^{p -1 + \omega} \\
\lambda_1^{p - 1} & \lambda_2^{p - 1} & \cdots & \lambda_p^{p - 1} \\
 \lambda_1^{p - 2} & \lambda_2^{p - 2} & \cdots & \lambda_p^{p - 2} \\
    \vdots & \vdots  & \cdots & \vdots \\
 \lambda_1^{p-(k-1)} & \lambda_2^{p-(k-1)} & \cdots & \lambda_p^{p-(k-1)} \\
 \lambda_1^{p-(k+ 1)} & \lambda_2^{p-(k+ 1)} & \cdots & \lambda_p^{p-(k+ 1)} \\
    \vdots & \vdots  & \cdots & \vdots \\
   1 & 1    & \cdots & 1 \\
    \end{pmatrix}}.
\end{equation}

 Using the Laplace expansion to compute the determinant along the first row of $V_{\mu_{\omega, k}, \lambda}$ and observing that for any $i \in [p]$, the minor of $V_{\mu_{\omega, k}, \lambda}(1, i)$ is given by $\textrm{det}(V_{\mu_{k}, \lambda / \lambda_i})$, we have

$$\sum \limits_{i=1}^p (-1)^{i+1} \lambda_i^{p + \omega - 1} e_{k-1}(\lambda / \lambda_i) \prod \limits_{ \substack{ 1 \leq l < l' \leq p \\ l, l' \neq i}} (\lambda_l - \lambda_{l'}) = \textrm{det}(V_{\mu_{\omega, k}, \lambda})$$ and once again by invoking the bialternant formulation for Schur polynomials, we have

\begin{equation*}
  \abs{ A^{\omega}_{1k}} = \sum \limits_{i=1}^p \alpha_i \lambda_i^{p + \omega - 1} e_{k-1}(\lambda / \lambda_i)  = \frac{\textrm{det}(V_{\mu_{\omega, k}, \lambda})}{\textrm{det}(V_{\lambda})} = S_{\mu_{\omega, k}, \lambda}.
\end{equation*}
%
\end{proof}
%

\begin{lemma}[\textbf{Form of Interventional Autocovariance matrix}]
\label{lemma:cov_intervene_form}
Consider a vector-valued time series $\myCurls{x_t}_{t \in \Z} \in \R^d$, following a VAR(q) process with autocovariance matrix of size $nd \times nd$ denoted by $\Sigma_n$. Consider simultaneous atomic interventions on components $\myCurls{l_1, l_2, \cdots, l_r} \subset [d]$ of $x_{t-\omega}$, that is, consider the intervention $do(x_{t-\omega, l_1 } = x^*_{t-\omega, l_1 }, \cdots, x_{t-\omega, l_r } = x^*_{t-\omega, l_r })$. Then, the autocovariance matrix of size $nd \times nd$ ($\Gamma'_n$) of the corresponding joint interventional distribution, denoted by $\prob_{do_{\omega}}(\mathbf{x}_{t-\omega}^n)$, is given by 
\begin{equation}
    \Gamma'_n({i,j}) = \begin{cases} 
    0 & \textrm{if } i \neq j \textrm{ and } (i = l_m \textrm{ or } j = l_m) \textrm{ for some } m \in [r] \\
    (x^*_{t - \omega, l_m})^2 & \textrm{if } i = j = l_m \textrm{ for some } m \in [r] \\
    \Sigma_n({i,j}) & \textrm{otherwise}
    \end{cases}.
\end{equation}

Moreover, let $$\Gamma_n = \mathbb{E}_{\myCurls{x^*_{t-\omega, l_m}}_{m \in [r]} \sim \prod \limits_{m \in [r]} \prob(x_{t-\omega, l_m}) } \Gamma'_n.$$

Then,

\begin{equation}
     \Gamma_n({i,j}) = \begin{cases} 
    0 & \textrm{if } i \neq j \textrm{ and } (i = l_m \textrm{ or } j = l_m) \textrm{ for some } m \in [r] \\
    \Sigma_n({i,j}) & \textrm{otherwise}
    \end{cases}.
\end{equation}
The autocovariance matrix of the interventional distribution under simultaneous interventions on consecutive time-steps can be analogously obtained.
\end{lemma}

\begin{proof}[\textbf{Proof of Lemma \ref{lemma:cov_intervene_form}.}]
Note that due to time ordering and since instantaneous effects are not modelled by a VAR model, there is no directed path from any of the variables $x_{t-\omega, l_1 }, x_{t-\omega, l_2 }, \cdots, x_{t-\omega, l_r }$ to $\mathbf{x}^n_{t-\omega - 1}$ or to the variables in $\myCurls{x_{t-\omega, 1}, x_{t-\omega, 2}, \cdots, x_{t-\omega, d}} / \myCurls{x_{t-\omega, l_1 }, x_{t-\omega, l_2 }, \cdots, x_{t-\omega, l_r }}.$ \textcite[Proposition 6.14]{peters2017elements} provides a graphical criterion for determining the existence of a total causal effect from a variable $x$ to a variable $y$ under interventions on $x$. The absence of a directed path from $x$ to $y$ implies that there is no total causal effect from $x$ to $y$ and, from Proposition 6.12 of \textcite{peters2017elements}, we know that $x \indep y$ under the corresponding interventional distribution. As a consequence of these propositions, we have our desired result.

% interventional   the DAG depicted in XXX Figure XXX that 
%  Since the interventions randomize the variables $X_{t-\omega, l_1 }, X_{t-\omega, l_2 }, \cdots, X_{t-\omega, l_r }$, thereby deleting all the incoming edges into the variables and by the assumption that there are no instantaneous effects, we have that there is no directed path 
 
\end{proof}

% \begin{lemma}[\textbf{Difference in Causal and Statistical errors (VAR)}]
% \label{lemma:diff_G_S_VAR}
% Consider a vector-valued time series $\myCurls{X_t}_{t \in \Z} \in \R^d$, following a VAR(q) process with model parameters $\myCurls{A_1, A_2, \cdots A_q}$. Assuming $n > \max \myCurls{p,q}$, for any VAR(p) model $f$ with parameters $\{\widehat{A}_1, \widehat{A}_2, \cdots \widehat{A}_p\}$, 
% \begin{equation}
% \label{eq:lemma1_VAR}
%     \abs{\CErrwi(f) - \SErrw(f)} = 2\bigg \vert {(A^{\omega}_{i,i} - \widehat{A}^{\omega}_{i,i}) \sum \limits_{k \neq i}^{d \cdot \max \left \{p,q \right \}} (A^{\omega}_{i,k} - \widehat{A}^{\omega}_{i,k}) \SCovn{\max{\myCurls{p,q}}}(i,k)} \bigg \vert,
% \end{equation}
% where $\SCovn{\max{\myCurls{p,q}}}$ is the autocovariance matrix of $X_t$. $A$ is a multi-companion matrix of the form described in (\ref{eq:var_1_def_supp}) with the first $d$ rows populated by $\{A'_1, A'_2, \cdots A'_{\max \myCurls{p,q}}\}$, where $A'_l$ is defined as $A_l$ for all $l \leq p$ and as $\ZeroVec_{d \times d}$ for all $l > p$. $\widehat{A}$ is analogously defined.
% \end{lemma}

\begin{lemma}[\textbf{Difference in Causal and Statistical error (VAR(p))}]
\label{lemma:diff_G_S_var}
Consider a vector-valued time series $\myCurls{x_t}_{t \in \Z} \in \R^d$, following a VAR(q) process with model parameters $\myCurls{A_1, A_2, \cdots A_q}$. Assuming $n > \max \myCurls{p,q}$, for any VAR(p) model $f$ with parameters $\{\widehat{A}_1, \widehat{A}_2, \cdots \widehat{A}_p\}$, 
\begin{equation}
    \abs{\mathcal{G}_{do_{\omega}}(f) - \mathcal{S}(f)} = \abs{\sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T (\Gamma - \Sigma) (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})}.
\end{equation}
% where, $$\Gamma(i,j) = \begin{cases} \Sigma(i,j) & \; \textrm{if } i \neq j \textrm{ and } (i = 1 \textrm{ or } j = 1) \\
% 0 & \; \textrm{otherwise} \end{cases}$$
\end{lemma}
\begin{proof}[\textbf{Proof of Lemma \ref{lemma:diff_G_S_var}}]

Let $A$ denote the multi-companion matrix corresponding to the true VAR(q) process with  model parameters $\myCurls{{A}_1, {A}_2, \cdots, {A}_q}$ of the form described in (\ref{eq:var_1_def_supp}) with the first $d$ rows populated by $\{A'_1, A'_2, \cdots A'_{\max \myCurls{p,q}}\}$, where $A'_l$ is defined as $A_l$ for all $l \leq q$ and as $\ZeroVec_{d \times d}$ for all $l > q$. Define $\widehat{A}^{(\max \myCurls{p,q})}$ analogously as the multi-companion matrix corresponding to parameters $\myCurls{\widehat{A}_1, \widehat{A}_2, \cdots, \widehat{A}_p}$ of the estimated VAR(p) model $f$ obtained independently from some statistical estimation procedure $\mathcal{E}$. 

%  For any autoregressive model of order $p$ with model parameters $\widehat{A}$, the expected statistical error can be computed as follows. 
% \todo{Write that when it is clear from context, we will remove the dependence on $p$}
 
 Using (\ref{eq:var_1}) recursively, we can write 
 \begin{equation}
     y_t^{(\max \myCurls{p,q})} = A^{\omega} y_{t-\omega}^{(\max \myCurls{p,q})} + A^{\omega - 1} e_{t-\omega + 1}^{(\max \myCurls{p,q})} + A^{\omega - 2} e_{t-{\omega}+ 2}^{(\max \myCurls{p,q})} + \cdots + A e_{t-1}^{(\max \myCurls{p,q})} + e_t^{(\max \myCurls{p,q})}.
 \end{equation}
 To reduce cumbersome notation, we let $\zeta_t = A^{\omega - 1} e_{t-\omega + 1} + A^{\omega - 2} e_{t-{\omega}+ 2} + \cdots + A e_{t-1} + e_t \in \mathbb{R}^{d \max \myCurls{p,q}}$ and write 
 \begin{equation}
 \label{eq:2}
     y_t = A^{\omega} y_{t-\omega} + \zeta_t.
 \end{equation}
%  Further, let $Q = vec(A^T)$, where $A$ is defined as in (\ref{eq:var_1_def_supp}) and $\widehat{Q}$ is analogously defined. 
 
 Let $\hat{x}_{t}$ denote the prediction of the target variable $x_t$ corresponding to the estimated model $f$. Then, the statistical error $\mathcal{O}_{\omega}$ defined with respect to the squared norm can be computed as follows:
 
 
 \begin{align*}
     \mathcal{O}_{\omega} &= \mathbb{E}_{\prob(\mathbf{x}_{t - \omega}^n, x_t)}[\norm{x_t - \hat{x}_t}^2] \\
     &= \sum \limits_{i = 1}^d \mathbb{E} \big[(x_{t,i} - \hat{x}_{t,i})^2\big]  && \textrm{(Subscript omitted for convenience)} \\
     &= \sum \limits_{i = 1}^d \mathbb{E} \big[(A^{\omega}_{i, :} y_{t - \omega} + \zeta_{t, \omega, i} - \hat{A}^{\omega}_{i, :} y_{t - \omega})^2\big] \\
     &= \sum \limits_{i = 1}^d {(A^{\omega}_{i:} - \hat{A}^{\omega}_{i:})^T \mathbb{E}[y_{t-{\omega}} y_{t - \omega}^T] (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E[\zeta_{t, \omega, i}^2]} && (\textrm{$\E[x_{t-i} \epsilon_t^T] = 0, \; \forall i \in \N$ }) \label{seq4}\\
     &= \sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \Sigma_{\max \myCurls{p,q}} (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E[\zeta_{t, \omega, i}^2]
 \end{align*}
 
%  \begin{align}
%       &= \mathbb{E}_{\prob(\mathbf{x}_{t - \omega}^n, x_t)}(\norm{x_t - \hat{x}_t}^2) \\
%       &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbf{x}_{t - \omega}^n)\prob(x_t \vert \mathbf{x}_{t - \omega}^n)}\big [x_{t,i} - \hat{x}_{t,i} \big ]^2 \label{seq1}\\
%       %
%       &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbf{x}_{t - \omega}^n)\prob(x_t \vert \mathbf{x}_{t - \omega}^n)} \big [x_{t,i}^2 + \hat{x}_{t,i}^2 - 2 x_{t,i} \hat{x}_{t,i}  \big ]^2 \\
%       %
%       &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbf{x}_{t - \omega}^n)} \big [\mathbb{E}_{\prob(x_t \vert \mathbf{x}_{t - \omega}^q)} [x_{t,i}^2] + (\widehat{A}^{\omega}_{i:})^T y_{t-\omega})^2 - 2 \mathbb{E}_{\prob(x_t \vert \mathbf{x}_{t - \omega}^q)}[x_{t,i}] (\widehat{A}^{\omega}_{i:})^T y_{t-\omega}  \big ]^2 \label{seq2}\\
%       %
%       &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbf{x}_{t - \omega}^n)} \big [((A^{\omega}_{i:})^T y_{t-\omega} + \zeta_t)^2 + (\widehat{A}^{\omega}_{i:})^T y_{t-\omega})^2 - 2 ((A^{\omega}_{i:})^T y_{t-\omega}) ((\widehat{A}^{\omega}_{i:})^T y_{t-\omega})  \big ]^2 \\
%       &= \sum \limits_{i = 1}^d \myBracs{(A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \mathbb{E}(y_{t-{\omega}} y_{t - \omega}^T ) (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2)} \hspace{2em} (\textrm{$\E(x_{t-i} \epsilon_t^T) = 0, \; \forall i \in \N$ }) \label{seq4}\\
%       &= \sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \Sigma_{\max \myCurls{p,q}} (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2) \label{seq5}
%  \end{align}

%  \begin{align}
%       \mathbb{E}_{\prob(\mathbb{X}_{t - \omega}^n, X_t)}(\norm{X_t - \widehat{X}_t}^2) &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbb{X}_{t - \omega}^n)\prob(X_t \vert \mathbb{X}_{t - \omega}^n)}\big [X_{t,i} - \widehat{X}_{t,i} \big ]^2 \label{seq1}\\
%       &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbb{X}_{t - \omega}^{n})\prob(X_t \vert \mathbb{X}_{t - \omega}^{q})}\big [ X_{t,i} - \widehat{X}_{t,i}\big ]^2  \label{seq2}\\
%      & = \sum \limits_{i = 1}^d \mathbb{E}_{\prob(\mathbb{X}_{t - \omega}^n)} \myBracs{(A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T Y_{t-\omega} + \zeta_{t, i}}^2 \label{seq3}\\
%      &= \sum \limits_{i = 1}^d \myBracs{(A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \E(Y_{t-{\omega}} Y_{t - \omega}^T ) (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2)} \hspace{2em} (\textrm{$\E(X_{t-i} \epsilon_t^T) = 0, \; \forall i \in \N$ }) \label{seq4}\\
%      &= \sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \Sigma (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2) \label{seq5}
%  \end{align}
 
%  where (\ref{seq2}) holds since the DAG corresponding to the VAR(q) process shows that $(x_{t,i}$ is d-separated from $x_{t-\omega - j, l}$ by $\myCurls{x_{t-\omega, \cdots , x_{t-\omega -q + 1}}}$ for all $j > q, l \in [d]$ and therefore $(x_{t,i} \indep x_{t-\omega - j, l} \vert x_{t-\omega, \cdots , x_{t-\omega -q + 1}} \forall j > q, l \in [d])$.
Similarly,
  \begin{align}
      \mathcal{G}_{do_{\omega}}&= \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n, x_t)}(\norm{x_t - \hat{x}_t}^2) \\
      %
      &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n)\prob_{do_{\omega}}(x_t \vert \mathbf{x}_{t - \omega}^n)}\big [(x_{t,i} - \hat{x}_{t,i})^2 \big ] \label{ceq1}\\
      %
      &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n)\prob_{do_{\omega}}(x_t \vert \mathbf{x}_{t - \omega}^n)} \big [x_{t,i}^2 + \hat{x}_{t,i}^2 - 2 x_{t,i} \hat{x}_{t,i}  \big ] \\
      %
      &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n)\prob_{do_{\omega}}(x_t \vert \mathbf{x}_{t - \omega}^q)} \big [x_{t,i}^2 + \hat{x}_{t,i}^2 - 2 x_{t,i} \hat{x}_{t,i}  \big ] \\
      %
      %
      &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n)\prob(x_t \vert \mathbf{x}_{t - \omega}^q)} \big [x_{t,i}^2 + \hat{x}_{t,i}^2 - 2 x_{t,i} \hat{x}_{t,i}  \big ] \label{ceq2}\\
      %
      &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n)} \big [\mathbb{E}_{\prob(x_t \vert \mathbf{x}_{t - \omega}^q)} [x_{t,i}^2] + ((\widehat{A}^{\omega}_{i:})^T y_{t-\omega})^2 - 2 \mathbb{E}_{\prob(x_t \vert \mathbf{x}_{t - \omega}^q)}[x_{t,i}] (\widehat{A}^{\omega}_{i:})^T y_{t-\omega}  \big ] \\
      %
      &= \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n)} \big [((A^{\omega}_{i:})^T y_{t-\omega})^2 + \E(\zeta_{t, i}^2) + ((\widehat{A}^{\omega}_{i:})^T y_{t-\omega})^2 - 2 ((A^{\omega}_{i:})^T y_{t-\omega}) ((\widehat{A}^{\omega}_{i:})^T y_{t-\omega})  \big ] \\
      %
      &= \sum \limits_{i = 1}^d \myBracs{(A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \mathbb{E}_{do_{\omega}}(y_{t-{\omega}} y_{t - \omega}^T ) (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2)} \hspace{2em} (\textrm{$\E(x_{t-i} \epsilon_t^T) = 0, \; \forall i \in \N$ }) \label{ceq4}\\
      &= \sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \Gamma'_{\max \myCurls{p,q}} (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2) \label{ceq5}
 \end{align}

To see why Equation (\ref{ceq2}) holds, note that the structural equations that specify the dependence of $x_t$ on $\mathbf{x}_{t-\omega}^q$ remain unchanged under interventions on $x_{t-\omega}$ and therefore the conditional distributions remain unchanged under these interventions.

Therefore, $$\mathbb{E}_{x^*_{t-\omega} \sim \prob(x_{t-\omega})}\mathbb{E}_{\prob_{do_{\omega}}(\mathbf{x}_{t - \omega}^n, x_t)}(\norm{x_t - \hat{x}_t}^2) = \sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \Gamma_{\max \myCurls{p,q}} (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2),$$

where $\Gamma$ can be obtained using Lemma \ref{lemma:cov_intervene_form}.


 
%   \begin{align}
%      \mathbb{E}_{\prob_{do_{\omega}}(\mathbb{X}^n_{t - \omega}, X_t)}(\norm{X_t - \widehat{X}_t}^2) &= \sum \limits_{i = 1}^d  \mathbb{E}_{\prob_{do_{\omega}}(\mathbb{X}^n_{t - \omega})\prob_{do_{\omega}}(X_t \vert \mathbb{X}_{t - \omega})}(X_{t,i} - \widehat{X}_{t,i})^2  \nonumber \\
%      &= \sum \limits_{i = 1}^d  \mathbb{E}_{\prob_{do_{\omega}}(\mathbb{X}_{t - \omega})\prob(X_t \vert \mathbb{X}_{t - \omega})}(X_{t,i} - \widehat{X}_{t,i})^2 \label{eq1}\\
%      & = \sum \limits_{i = 1}^d \mathbb{E}_{\prob_{do_{\omega}}(\mathbb{X}_{t - \omega})\prob(X_t \vert \mathbb{X}_{t - \omega})} \myBracs{(A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T Y_{t-\omega} + \zeta_{t, i}}^2  \nonumber \\
%      &= \sum \limits_{i = 1}^d \myBracs{(A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T   \mathbb{E}_{\prob_{do_{\omega}}(\mathbb{X}_{t - \omega}) }(Y_{t-{\omega}} Y_{t - \omega}^T ) (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2)}  \nonumber \\
%      &= \sum \limits_{i = 1}^d (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:})^T \Gamma'_n (A^{\omega}_{i:} - \widehat{A}^{\omega}_{i:}) + \E(\zeta_{t, i}^2) \quad \textrm{where,} \label{eq2}
%  \end{align}
    %  \Sigma_{do_{\omega}}(i,j) &= \begin{cases} 0 & \; \textrm{if } i \neq j \textrm{ and } (i = 1 \textrm{ or } j = 1) \\ \Sigma(i,j) & \; \textrm{otherwise} \end{cases} \label{eq3}


% consider the Structural Causal Model $\mathfrak{C}$ of the VAR process given by (\ref{eq:2}). Pearl's backdoor criterion \parencite{pearl2009causality} provides a mechanism to obtain the effect of interventions on the distributions of some target variables. To compute $\prob_{do_{\omega}}(X_t \vert \mathbb{X}_{t - \omega})$, we note that the empty set $\varnothing$ is a \textit{valid adjustment set} (see \textcite[Definition 6.38]{peters2017elements}) according to the backdoor criterion and hence we have
% \begin{equation}
%     \prob_{do_{\omega}}(X_t \vert \mathbb{X}_{t - \omega}) = \prob(X_t \vert \mathbb{X}_{t - \omega}).
% \end{equation}

% XXX JUSTIFY/SHOW EQUATION \ref{eq3} XXXX

\end{proof}


\begin{corollary}[\textbf{\textbf{Difference in Causal and Statistical errors (AR)}}]
\label{corr:diff_c_s_ar_supp}
Let $\myCurls{x_t}$ follow an AR(q) process. Then, for any AR(p) model $f$ with parameters $\myCurls{\widehat{a}_1, \widehat{a}_2, \cdots, \widehat{a}_p}$,
\begin{equation}
    \abs{\CErrw(f) - \SErrw(f)} = 2 \bigg| (A^{\omega}_{1,1} - \widehat{A}^{\omega}_{1,1}) \sum \limits_{k=2}^{\max \myCurls{p,q}} (A^{\omega}_{1,k} - \widehat{A}^{\omega}_{1,k}) \gamma_{k-1} \bigg|,
\end{equation}

where, for any $k \in \mathbb{N}$, $\gamma_{k}$ denotes the autocovariance of $\myCurls{x_t}$ with lag $k$. $A$ and $\widehat{A}$ are the corresponding companion matrices of the model and estimated parameters as defined in Lemma \ref{lemma:diff_G_S_var}.
\end{corollary}

\begin{proof}[\textbf{Proof of Corollary \ref{corr:diff_c_s_ar_supp}}]

Corollary \ref{corr:diff_c_s_ar_supp} follows directly from Lemmas \ref{lemma:cov_intervene_form} and \ref{lemma:diff_G_S_var}. 
\end{proof}



\begin{proposition}[\textbf{Stability Controls Causal Generalization (VAR)}]
\label{prop:stability_control_supp}
 Consider a VAR(q) process. Assuming $n > \max \myCurls{p,q}$, for any VAR(p) model $f$,
 \begin{equation}
     \abs{\CErrwi(f) - \SErrw(f)} \leq  2 \kappa(\Sigma_{\max \myCurls{p,q}}) (\SErrw(f) - \sigma^2_{\epsilon}),
\end{equation}
where $\kappa(\Sigma_{\max \myCurls{p,q}})$ denotes the condition number of the autocovariance matrix $\Sigma_{\max \myCurls{p,q}}$.
\end{proposition}

% \begin{proposition}
% \begin{equation}
%     \abs{\mathcal{G}_{do_{\omega, i}}(f) - \mathcal{S}_{\omega}(f)} = \abs{\sum \limits_{j = 1}^d (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})},
% \end{equation}
  
% \end{proposition}
\begin{proof}
From Lemma \ref{lemma:diff_G_S_var}, it remains to prove that $$\abs{\sum \limits_{j = 1}^d (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})} \leq (2 \kappa(\Sigma) - 1) (\SErrw(f) - \sigma^2_{\epsilon}).$$

% First, we show that for all $j \in [d]$, $\abs{(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})} \leq 2 \lambda_{\max}(\Sigma)\norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2$
First, we show that
\begin{align}
    \abs{(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})} \leq (2 \lambda_{\max}(\Sigma) )\norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2.
\end{align}

\textbf{Case 1.} $(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}) \geq 0$.
%
\begin{align}
    \abs{(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})} &= (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}), \\
    % &\leq (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T  \Gamma (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}), \label{leq1}\\
    &\leq (\lambda_{\max}(\Gamma) - \lambda_{\min}(\Sigma))\norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2 \label{leq2}.
\end{align}
where (\ref{leq2}) holds by an application of Rayleigh's principle. We still need to show that $\lambda_{\max}(\Gamma) \leq 2 \lambda_{\max}(\Sigma).$

Without loss of generality, assume that $i=1$, that is, the component of $x_{t-\omega}$ that is intervened upon is indexed by $1$. Note that this merely simplifies notation and the following steps also hold for simultaneous interventions on multiple components and consecutive time instances without any additional steps.

Representing $\Sigma$ and $\Gamma$ in block matrix form, we have
\begin{equation}
     \Sigma = \begin{pmatrix}
    \Sigma_{11} & \Sigma_{12} \\ \Sigma_{21} & \Sigma_{22}
    \end{pmatrix}, \qquad \Gamma = \begin{pmatrix}
    \Gamma_{11} & \Gamma_{12} \\ \Gamma_{21} & \Gamma_{22}
    \end{pmatrix}.
\end{equation}

From Lemma \ref{lemma:cov_intervene_form}, we have $$\Gamma_{11} \in \mathbb{R}^{1 \times 1} = \sigma^2 = \mathbb{E}(X_t^2), \;  \Gamma_{12}^T = \Gamma_{21} \in \mathbb{R}^{1 \times d \max \myCurls{p,q} - 1} = 0, \; \textrm{and } \Gamma_{22} = \Sigma_{22}.$$

We can write $\Gamma$ as follows:
\begin{align}
    \Gamma = \Gamma'_1 + \Gamma'_2,
\end{align} 
where 
\begin{equation}
    \Gamma'_1 = \begin{pmatrix} \sigma^2 & \mathbf{0}_{1 \times d \max \myCurls{p,q} - 1}  \\
    \mathbf{0}_{d \max \myCurls{p,q} - 1 \times 1} &  \mathbf{0}_{d \max \myCurls{p,q} - 1 \times d \max \myCurls{p,q} - 1}
    \end{pmatrix}, 
\end{equation}
and 
\begin{equation}
    \Gamma'_2 = \begin{pmatrix} \mathbf{0}_{1 \times 1} & \mathbf{0}_{1 \times d \max \myCurls{p,q} - 1}  \\
    \mathbf{0}_{d \max \myCurls{p,q} - 1 \times 1} &  \Sigma_{22}
    \end{pmatrix}. 
\end{equation}

Since $\Gamma'_1$ and $\Gamma'_2$ are Hermitian matrices, $\lambda_{\max}(\Gamma) \leq \lambda_{\max}(\Gamma'_1) + \lambda_{\max}(\Gamma'_2).$ 

Observe that the non-zero block $\Sigma_{22}$ of $\Gamma'_2$ is a principal sub-matrix of $\Sigma$ obtained by deleting the first row and column. Hence, by Cauchy's interlacing theorem \parencite{fisk2005very}, we have
\begin{equation}
\label{eq:seq1}
    \lambda_{\max}(\Gamma'_2) \leq \lambda_{\max}(\Sigma).
\end{equation}

Note that, when we intervene simultaneously on multiple components and time instances, instead of setting the first row to $0$, the covariance matrix of the corresponding interventional distribution $\Gamma$ can be obtained by deleting the off-diagonal elements of the corresponding rows and columns.
It remains to show that $\sigma^2 \leq \lambda_{\max} (\Sigma).$ Note that
\begin{equation}
\label{eq:seq2}
    \lambda_{\max} (\Gamma'_1) = \sigma^2 = \Sigma_{11} = e_1^T \Sigma e_1 \leq \lambda_{\max}(\Sigma),
\end{equation}
where $e_i$ denotes the $i$th standard basis vector. Combining (\ref{eq:seq1}) and (\ref{eq:seq2}) we have
\begin{equation}
    \lambda_{\max}(\Gamma) \leq 2 \lambda_{\max}(\Sigma)
\end{equation}
and 
\begin{equation}
     \abs{(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})} \leq (2 \lambda_{\max}(\Sigma) - \lambda_{\min}(\Sigma)) \norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2.
\end{equation}

\textbf{Case 2.} $(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}) \leq 0$.
%
\begin{align}
    \abs{(A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T (\Gamma - \Sigma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})} &= (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T ( \Sigma - \Gamma) (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}), \\
    % &\leq (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T  \Sigma (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}), \\
    &\leq (\lambda_{\max}(\Sigma) - \lambda_{\min}(\Gamma))\norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2.
\end{align}
Using the same arguments used in deriving upper bounds for $\lambda_{\max}(\Gamma)$, we can show that $\lambda_{\min}(\Gamma) \geq \lambda_{\min}(\Sigma)$. Therefore, we have
\begin{align}
    \abs{\mathcal{G}_{do_{\omega, i}}(f) - \mathcal{S}_{\omega}(f)} & \leq \sum \limits_{j \in [d]} (2 \lambda_{\max}(\Sigma) - \lambda_{\min}(\Sigma)) \norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2 \\
    & \leq (2 \lambda_{\max}(\Sigma) - \lambda_{\min}(\Sigma)) \sum \limits_{j \in [d]} \norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2 \\
    & \leq (2 \kappa(\Sigma) - 1) (\mathcal{S}_{\omega}(f) - \sigma^2_{\epsilon}) \label{eq:seq4}.
\end{align}
To see why (\ref{eq:seq4}) holds, observe that
\begin{align}
    \mathcal{S}_{\omega}(f) - \sigma^2_{\epsilon}&= \sum \limits_{j = 1}^d (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T \Sigma (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})  \\
    & \geq  \sum \limits_{j = 1}^d (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:})^T \Sigma (A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}) \\
    & \geq \sum \limits_{j = 1}^d \lambda_{\min}(\Sigma) \norm{A^{\omega}_{j:} - \widehat{A}^{\omega}_{j:}}^2 \label{eq:seq3}.
\end{align}

We now show that we can construct AR(2) processes such that the bound in Proposition~\ref{prop:stability_control_supp} is tight up to a small constant factor. Consider an AR(2) process with true model parameters $a_1$ and $a_2$. The autocorrelation matrix $\Sigma_2$ of this process is given by $\Sigma_2 = \begin{pmatrix} 1 & \gamma \\ \gamma & 1 \end{pmatrix}$ where $\gamma = \frac{a_1}{1 - a_2}$. The eigenvalues of $\Sigma_2$ are given by $\lambda_1 = 1 + \gamma$ and $\lambda_2 = 1 - \gamma$ corresponding to eigenvectors $u_1$ and $u_2$ respectively. Without loss of generality assume $\gamma > 0$ which yields $\lambda_1 \geq \lambda_2$. Denote vectors $a = (a_1, a_2)$ and $\hat{a} = (\hat{a}_1, \hat{a}_2)$. Consider an AR(2) process with parameters $\hat{a}_1, \hat{a}_2$ such that $(a - \hat{a}) = u_2$. Then assuming $\omega = 1$, we have that 
\begin{align*}
    \frac{\mathcal{G}_{do_1} - \mathcal{S}_1}{\mathcal{S}_1 - \sigma^2_{\epsilon}} &= \frac{\norm{a - \hat{a}}^2 - (a - \hat{a})^T \Sigma (a - \hat{a})}{(a - \hat{a})^T \Sigma (a - \hat{a})} = \frac{\gamma }{1 - \gamma} = (\kappa(\Sigma) - 1)/2.
\end{align*}
As $a$ approaches the boundary of the stability domain, the process gets more strongly correlated and $\lambda_{\min}$ approaches $0$ and the relative difference in causal and statistical errors diverges.
\end{proof}

\begin{lemma}[\textbf{Bounds on $a_k$}]
\label{lemma:bound_on_ak}
For any AR(p) model such that the non-zero eigenvalues of the companion matrix are distinct and satisfy $\abs{\lambda} \leq \delta < 1,$
\begin{equation}
    \abs{a_k} \leq {{p} \choose {k}} \delta^k.
\end{equation}

\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemma:bound_on_ak}]
 
 From Lemma \ref{lemma:coef_as_schur_supp}, we know that 
%  \begin{equation}
%      \abs{a_{k}} = \abs{S_{\myCurls{1, 1, \cdots k \; times \; 1, 0, \cdots, 0}}(\myCurls{\lambda_1, \lambda_2, \cdots, \lambda_p})}
%  \end{equation}
 \begin{align*}
   \abs{a_{k}} &= \abs{S_{\myCurls{1, 1, \cdots k \; times \; 1, 0, \cdots, 0}}(\myCurls{\lambda_1, \lambda_2, \cdots, \lambda_p})} \\
    %
    &= \abs{\sum \limits_{\myCurls{i_1 < i_2 < \cdots < i_k} \in [p]} \lambda_{i_1} \lambda_{i_2} \cdots \lambda_{i_k}} \\
    %
    & \leq  \sum \limits_{\myCurls{i_1 < i_2 < \cdots < i_k} \in [p]} \abs{\lambda_{i_1} \lambda_{i_2} \cdots \lambda_{i_k}} && (\abs{x+y} \leq \abs{x} + \abs{y})\\
    %
    & \leq \sum \limits_{\myCurls{i_1 < i_2 < \cdots < i_k} \in [p]} \delta^k && (\abs{\lambda_i} \leq \delta)\\
    %
    &= {p \choose k} \delta^k.  
\end{align*}
 
\end{proof}

\begin{lemma}[\textbf{Bounds on $\gamma_k$}]
\label{lemme:bound_on_gamma}
For any stochastic process $\myCurls{x_t}_{t \in \Z}$ following an AR(p) model such that the non-zero eigenvalues of the companion matrix are distinct and satisfy $\abs{\lambda} \leq \delta < 1$,
 $$\abs{\gamma_k} \leq \frac{K_p \sigma_{\epsilon}^2 \delta^k}{1 - \delta^2},$$
 where $K_p$ is a finite constant that depends on $p$.
\end{lemma}
\begin{proof}[\textbf{Proof of Lemma \ref{lemme:bound_on_gamma}}]
 Using the infinite-moving average representation of $x_t$ (see \textcite{brockwell1991time}), we have
 \begin{align}
     x_t &= \sum \limits_{i = 0}^{\infty} A^i_{11} \epsilon_{t-i} \\
     \abs{\mathbb{E}[x_l, x_{r}]} &= \abs{\mathbb{E}[(\sum \limits_{i_1 = 0}^{\infty} A^{i_1}_{11} \epsilon_{l-i_1})(\sum \limits_{i_2 = 0}^{\infty} A^{i_2}_{11} \epsilon_{r-i_2})]} \\
     %
     &= \abs{\sum \limits_{i=0}^{\infty} A^{i}_{11} A_{11}^{i + \abs{l - r}}\mathbb{E}[\epsilon_t \epsilon_t^T]} \\
      &= \abs{\sigma^2_{\epsilon} \sum \limits_{i=0}^{\infty} A^{i}_{11} A_{11}^{i + \abs{l - r}}} \\
      & \leq K_p \delta^{\abs{l-r}} \sigma^2_{\epsilon} \sum \limits_{i=0}^{\infty}  \delta^{2i} \label{neq11}\\
      & \leq K_p  \sigma^2_{\epsilon}   \frac{\delta^{\abs{l-r}}}{1 - \delta^{2}}
 \end{align}
 
To see why (\ref{neq11}) holds observe that, from Lemma \ref{lemma:coef_as_schur_supp}, $$A^i_{11} = S_{\myCurls{i, 0, \cdots, 0}} \leq  \sum \limits_{\myCurls{i_1 \leq  i_2 \leq  \cdots \leq i_k} \in [p]} \abs{\lambda_{i_1} \lambda_{i_2} \cdots \lambda_{i_k}} \leq p^p \delta^i$$ 
\end{proof}

\begin{lemma}[\textbf{Lower Bounds on $\lambda_{\min}(\Sigma)$}] \label{lemma:lowe_bound_eig} For any stochastic process $\myCurls{x_t}_{t \in \Z}$ following an AR(p) model such that the non-zero eigenvalues of the companion matrix are distinct and satisfy $\abs{\lambda} \leq \delta < 1$,
$$ \lambda_{\min}(\Sigma) \geq \frac{\sigma^2_{\epsilon}}{  (1 + \delta)^{2p}}.$$
\end{lemma}

\begin{proof}
First, note that $$ (1 + \sum \limits_{k=1}^p \abs{a_k}) \leq \sum \limits_{k=0}^p {p \choose k} \delta^k = (1 + \delta)^p \quad \text{(Binomial Theorem)}.$$

 Combining this with the results from Lemma \ref{lemma:min_eig_lower} and Proposition \ref{prop:basu}, we have 
\begin{align*}
 \lambda_{\min}(\Sigma) &\geq 2 \pi \inf \limits_{\omega} f(\omega) \geq \frac{\sigma^2_{\epsilon}}{ \mu_{\max}(\mathcal{A})} \geq \frac{\sigma^2_{\epsilon}}{  (1 + \sum \limits_{k=1}^p \abs{a_k})^2} \\
    %
    %
    \lambda_{\min}(\Sigma) &\geq \frac{\sigma^2_{\epsilon}}{  (1 + \sum \limits_{k=1}^p \abs{a_k})^2} \geq \frac{\sigma^2_{\epsilon}}{  (1 + \delta)^{2p}}.
\end{align*}
\end{proof}

\begin{lemma}[\textbf{Upper Bounds on $\lambda_{\max}(\Sigma)$}] \label{lemma:upper_bound_eig}
For any stochastic process $\myCurls{x_t}_{t \in \Z}$ following an AR(p) model such that the non-zero eigenvalues of the companion matrix are distinct and satisfy $\abs{\lambda} \leq \delta < 1$,
$$ \lambda_{\max}(\Sigma_n) \leq 2K_{p} \sigma^2_{\epsilon} n \frac{1}{1 - \delta^{2}}.$$
\end{lemma}

\begin{proof}
 By Gershgorin's theorem \parencite{varga2010gervsgorin}, we can derive an upper bound on the maximum eigenvalue of $\Sigma_n$ as follows:  $$\lambda_{\max}(\Sigma_n) \leq \max_{i \in [n]} ( \Sigma_{ii} + \sum \limits_{j \neq i} \abs{\Sigma_{ij}}).$$
 Note that the autocovariance matrix of an AR process which is defined as $\Sigma_{i,j} = \gamma_{\abs{i-j}}$ (the autocovariance of lag $\abs{i-j}$) has a Toeplitz structure. Due to this Toeplitz structure of the autocovariance matrix, we can see that $$\lambda_{\max}(\Sigma_n) < 2\sum \limits_{i=1}^{n}\abs{\gamma_{i-1}} < 2K_p \sigma^2_{\epsilon} \sum \limits_{i=1}^{n} \frac{\delta^{i-1}}{1 - \delta^{2}} \leq 2K_{p} n \sigma^2_{\epsilon} \frac{1}{1 - \delta^{2}} $$
\end{proof}

\begin{corollary}[\textbf{Stability Controls Causal Generalization (AR(p))}]
\label{corr:Population_diff_ar_supp}
Consider an AR(q) process such that the eigenvalues of its companion matrix satisfy $\abs{\lambda} < \delta < 1$. For any AR(p) model $f$,
 \begin{align}
 \label{eq:stability_control_supp_ar}
     \abs{\CErrwi(f)  - \SErrw(f) } & \leq K_p \SErrw(f) \frac{\max \myCurls{p,q}(1 + \delta)^{2 \max \myCurls{p,q}}}{(1 - \delta^2)},
\end{align}
where $K_p$ is some finite constant that depends on the order $p$ of the underlying process.
\end{corollary}

\begin{proof}[\textbf{Proof of Corollary \ref{corr:Population_diff_ar_supp}}]
From Proposition \ref{prop:stability_control_supp}, we already know that 
 \begin{equation}
     \abs{\CErrwi(f) - \SErrw(f)} \leq  2 \kappa(\Sigma_{\max \myCurls{p,q}}) (\SErrw(f) - \sigma^2_{\epsilon}),
\end{equation}
From Lemma \ref{lemma:lowe_bound_eig} and Lemma \ref{lemma:upper_bound_eig}, we have that $$\lambda_{\min}(\Sigma_{\max \myCurls{p,q}}) \geq \frac{\sigma^2_{\epsilon}}{  (1 + \delta)^{2p}}$$ and $$\lambda_{\max}(\Sigma_{\max \myCurls{p,q}}) \leq 2K_{p} \max \myCurls{p,q} \sigma^2_{\epsilon} \frac{1}{1 - \delta^{2}}.$$

Combining these results, we have the desired result. 
\end{proof}

\begin{theorem}[\textbf{Finite sample bounds for VAR(p) models}]
\label{thm:main_supp}
  Let $\mathcal{F}$ denote the family of all VAR models of dimension $d$ and order $p$. For any $n > \max \left \{p,q \right \}\in \N$, let $\mu, m > 0$ be integers such that $2 \mu m = n$ and $ \delta > 2 (\mu - 1) \rho^m$ for a fixed constant $0 < \rho < 1$ determined by the underlying process. Let $\myCurls{x_1, x_2, \cdots x_n} \in \mathbb{R}^d$ be a finite sample drawn from a VAR(q) process. Then, simultaneously for every $f \in \mathcal{F}$, under the square loss truncated at $M$, with probability at least $1-\delta$,
    \begin{equation}
  \label{eq:thm_main_supp}
     \CErrwi   \leq  \zeta \hat{\mathcal{S}}_{\omega} + \zeta \widehat{\mathfrak{R}}_{\mu}(\mathcal{F}) + 3\zeta M \sqrt{\frac{\log \frac{4}{\delta'}}{2 \mu}}
  \end{equation}
 where $\zeta = 2 \kappa (\Sigma^{\nu})$, $\delta' = \delta - 2 (\mu - 1) \rho^m$, and $\widehat{\mathfrak{R}}_{\mu}(\mathcal{F})$ denotes the empirical Rademacher complexity of $\mathcal{F}$.
%   \begin{equation}
%   \label{eq:thm_main}
%      \CErrwi  \leq  \zeta \widehat{\mathcal{S}_{\omega}}(f) + M\zeta\sqrt{ \frac{(1 + pd)}{\sqrt{\mu}}\myBracs{\log{ \frac{2 \mu}{(1 + pd)}} + 1}  + \frac{1}{\sqrt{\mu}} \log \frac{\rho^{m-p}(\mu - 1)}{\eta}},
%   \end{equation}
%  The result is currently stated for a single step prediction and intervention. The results can be easily extended to multi-step prediction albeit with cumbersome notation. 
%   \begin{equation}
%       \prob \myBracs{\sup \limits_{f \in \mathcal{F}} \abs{\mathcal{S}(f) - \widehat{\mathcal{S}}(f)} > \epsilon} \leq C_1 \exp \myBracs{(1 + pd)\myBracs{\log{ \frac{2 \mu}{(1 + pd)}} + 1} - \frac{\mu 
%       \epsilon^2}{M^2} } + 2 (\mu - 1) C_2 \rho^{m-p}
%   \end{equation}
%   For any $m \in \mathbb{N}$, let $\beta(m)$ denote the $\beta$ mixing coefficient of a underlying stable auto-regressive process such that the eigenvalues of the corresponding companion matrix $A$ satisfy $\abs{\lambda_i} \leq \delta $ for some $0 < \delta < 1$.  For any $\mu, m > 0$ with $2 \mu m + p = n$ and $\eta > 4 (\mu - 1) \beta(m)$, under the square loss truncated at $M$, for any stable AR$(p)$ model $\widehat{f}$ with $p > 1$,
\end{theorem}
\begin{proof}[\textbf{Proof of Theorem \ref{thm:main_supp}}]

From the proof of Proposition~\ref{prop:stability_control_supp}, we already have that 
 \begin{equation}
     \abs{\CErrwi(f) - \SErrw(f)} \leq  (2 \kappa(\Sigma_{\max \myCurls{p,q}}) - 1) (\SErrw(f) - \sigma^2_{\epsilon}).
\end{equation}
 Additionally, processes that follow VAR models are known to be $\beta$ mixing and in particular, they are geometrically completely regular, that is, there exists some $0 < \rho < 1$ such that $\beta(k) = C \rho^{k}$ for some constant $C$, where $\beta(k)$ denotes the $\beta$ mixing coefficient of the process \parencite{mokkadem1988mixing}. Theorem \ref{thm:main_supp} then follows by applying Rademacher bounds \parencite[Theorem 1]{mohriRademacher} for generalization in time-series under mixing conditions.
\end{proof}

% \todo{Write that the minimum eigenvalue characterizes the stability of the process.}
% \begin{corollary}[\textbf{Difference in Causal and Statistical error}]
% \label{corr:Population_diff}
% Let $\myCurls{X_t}$ follow a second-order stationary, Autoregressive process of order $p$, such that the eigenvalues of the companion matrix $A$ satisfy $\abs{\lambda} < \delta $. The difference in Statistical error and Causal error as defined in \todo{Refer to the definition } can be upper bounded as follows:
% \begin{equation*}
%     \abs{\mathcal{G}_{do_{\omega}} - \mathcal{S}} \leq (1 + \delta)^{2p} \mathcal{S}
% \end{equation*}
% \end{corollary}

% \begin{proof}[\textbf{Proof of Proposition \ref{corr:Population_diff}}]
%  Define the vectors $B = \begin{pmatrix} A^{\omega}_{11}, A^{\omega}_{12}, \cdots \end{pmatrix}^{T}$ and $\Gamma = \begin{pmatrix} \gamma_1, \gamma_2, \cdots  \end{pmatrix}^{T}$, where $\gamma_k$ denotes the autocovariance of order $k$. $\widehat{B} $ is analogously defined.  
% From Lemma 1, we have 
 
%     \begin{align*}
%      \abs{\mathcal{G}_{do_{\omega}} - \mathcal{S}} &= (A^{\omega}_ {11} - \widehat{A}^{\omega}_{11}) \sum \limits_{k = 2}^{\infty} (A^{\omega}_{1k} - \widehat{A}^{\omega}_{1k}) \gamma_{k - 1} \\
%      &\leq  \abs{A^{\omega}_ {11} - \widehat{A}^{\omega}_{11}} \abs{\sum \limits_{k = 2}^{\infty} (A^{\omega}_{1k} - \widehat{A}^{\omega}_{1k}) \gamma_{k - 1}} \\
%       & \stackrel{(2)}{\leq} \norm{B - \widehat{B}}_2^2 \cdot \norm{\Gamma}_1 &&  \textrm{(Cauchy Schwarz)}
%      \end{align*}
%     Observe that $\mathcal{S} = \sum \limits_{i,j =1}^p (A^{\omega}_{1i} - \widehat{A}^{\omega}_{1i})(A^{\omega}_{1j} - \widehat{A}^{\omega}_{1j})\gamma_{\abs{i-j}} + \sigma_{\epsilon}^2 \sum \limits_{i =1}^{\omega} (A^{\omega - i}_{11})^2 = (B - \widehat{B})^T \Sigma_p (B - \widehat{B}) + Q_{\epsilon}$, where $Q_{\epsilon} = \sigma_{\epsilon}^2 \sum \limits_{i =1}^{\omega} (A^{\omega - i}_{11})^2 \geq 0$. Therefore,
%      \begin{align*}
%     \abs{\mathcal{G}_{do_{\omega}} - \mathcal{S}} &\leq \frac{S}{\lambda_{\min}(\Sigma)} \cdot \norm{\Gamma}_1.   
%     \end{align*}
% Combining the results from Lemma \ref{lemma:min_eig_lower} and Proposition \ref{prop:basu}, we have that 
% \begin{equation*}
%     \lambda_{\min}(\Sigma) \geq 2 \pi \inf \limits_{\omega} f(\omega) \geq \frac{\sigma^2_{\epsilon}}{ \nu_{\max}(\mathcal{A})} \geq \frac{\sigma^2_{\epsilon}}{  (1 + \sum \limits_{k=1}^p \abs{a_k})^2} 
% \end{equation*}
% From Lemma XXX \todo{Add the reference to our lemma of Schur polynomials}, we know that
% \begin{align*}
%     a_k &= \sum \limits_{\myCurls{i_1, i_2, \cdots i_k} \in [p]} \lambda_{i_1} \lambda_{i_2} \cdots \lambda_{i_k} && \textrm{(Lemma XXX)}\\
%     %
%     & \leq  \sum \limits_{\myCurls{i_1, i_2, \cdots i_k} \in [p]} \abs{\lambda_{i_1} \lambda_{i_2} \cdots \lambda_{i_k}} && (\abs{x+y} \leq \abs{x} + \abs{y})\\
%     %
%     & \leq \sum \limits_{\myCurls{i_1, i_2, \cdots i_k} \in [p]} \delta^k && (\abs{\lambda_i} \leq \delta)\\
%     %
%     &= {p \choose k} \delta^k.  
% \end{align*}
% Therefore, \todo{There is a mistake here. Its not p choose k. Its p to the power of k}
% \begin{align*}
%     %
%     (1 + \sum \limits_{k=1}^p \abs{a_k}) &\leq \sum \limits_{k=0}^p {p \choose k} \delta^k = (1 + \delta)^p && \text{(Binomial Theorem)}\\
%     %
%     \lambda_{\min}(\Sigma) &\geq \frac{\sigma^2_{\epsilon}}{  (1 + \sum \limits_{k=1}^p \abs{a_k})^2} \geq \frac{\sigma^2_{\epsilon}}{  (1 + \delta)^{2p}} \\
%     %
%     \abs{\mathcal{G}_{do_{\omega}} - \mathcal{S}} &\leq \frac{S(1 + \delta)^{2p}}{\sigma^2_{\epsilon}} \cdot \norm{\Gamma}_1.
% \end{align*}
% We know that XXX \todo{Derive the bound} \todo{This bound only strictly holds for causal AR processes} $$ \abs{\gamma_k} \leq \frac{C \sigma_{\epsilon}^2 \delta^k}{1 - \delta^2}$$
% \begin{align*}
%     \norm{\Gamma}_1 \leq \sum \limits_{k = 1}^p \frac{C \sigma_{\epsilon}^2 \delta^k}{1 - \delta^2}.
% \end{align*}
% Putting everything together, we have 
% \begin{align*}
%      \abs{\mathcal{G}_{do_{\omega}} - \mathcal{S}} & \leq  \mathcal{S} \frac{C p(1 + \delta)^{2p}}{(1 - \delta^2)}
% \end{align*}
% \todo{We can simplify this a bit more.}
% % \begin{align*}
% %      & \leq (1 + \delta)^{2p} \mathcal{S} \cdot \norm{\Gamma}_1  && \textrm{(
% %      From Lemma \ref{lemma:min_eig_lower})} \\
% %      & \leq \frac{C \mathcal{S} (1+ \delta)^{2p}(1-\delta^{p-1})}{(1-\delta)^2},  && \textrm{(
% %      From Lemma \ref{lemma:min_eig_lower})}
% % \end{align*}
% where $C$ is a finite constant.
% To obtain (3), observe that $(B - \widehat{B})^T \Sigma (B - \widehat{B}) \leq \mathcal{S}$ 
% \end{proof}

\section{Relative Interventions}

\begin{figure}[htp]
    \centering
    \input{tikz_files/var_dag}
    \caption{Causal DAG of an AR(2) model}
    \label{fig:my_label3}
\end{figure}
%
\begin{figure}[htp]
    \centering
    \input{tikz_files/relative}
    \caption{Graphical representation of the effect of an intervention $do(x_{t-4} = x_{t-4} + \alpha)$ on an AR(2) model. Dependencies are retained.}
    \label{fig:my_label4}
\end{figure} 

Assume for simplicity $p=q$ and $d=1$. Let $A$ and $\widehat{A}$ denote the companion matrices corresponding to the true and estimated parameters respectively. Then, rewriting the VAR(p) model as a VAR(1) model, we have
\begin{equation}
    x_t = A^{\omega}_{11} x_{t - \omega} + A^{\omega}_{12} x_{t - \omega- 1} + \cdots + A^{\omega}_{1p} x_{t - \omega-p + 1} + A^{\omega - 1}_{11} \epsilon_{t - \omega + 1} + \cdots + A_{11} \epsilon_{t - 1} + \epsilon_t.
\end{equation}
 Let $\zeta_t = A^{\omega - 1}_{11} \epsilon_{t - \omega + 1} + \cdots + A_{11} \epsilon_{t - 1} + \epsilon_t.$ Then, Statistical error $S_{\omega}$ can be computed as 
 %
 \begin{align}
     \mathbb{E}[(x_t - \hat{x}_t)^2] & = \mathbb{E}\Big[\Big(\sum \limits_{i=1}^p (A^{\omega}_{1i} - \widehat{A}^{\omega}_{1i})x_{t-\omega - i + 1} + \zeta_t\Big)^2\Big] \\
     &= \sum \limits_{i,j=1}^p (A^{\omega}_{1i} - \widehat{A}^{\omega}_{1i})(A^{\omega}_{1j} - \widehat{A}^{\omega}_{1j})\Sigma_{ij} + \mathbb{E}[\zeta_t^2] 
 \end{align}
 %
 The causal error $\mathcal{G}_{do_{\omega}}$ due to the effect of an intervention $do(x_{t - \omega} = x_{t - \omega} + \alpha)$ can be computed as 
  \begin{align}
     \mathbb{E}_{do_{\omega}}[(x_t - \hat{x}_t)^2] & = \mathbb{E}\Big[\Big(\sum \limits_{i=1}^p (A^{\omega}_{1i} - \widehat{A}^{\omega}_{1i})x_{t-\omega - i + 1} + (A^{\omega}_{11} - \widehat{A}^{\omega}_{11}) \alpha +  \zeta_t\Big)^2\Big] \\
     &= \sum \limits_{i,j=1}^p (A^{\omega}_{1i} - \widehat{A}^{\omega}_{1i})(A^{\omega}_{1j} - \widehat{A}^{\omega}_{1j})\Sigma_{ij} + (A^{\omega}_{11} - \widehat{A}^{\omega}_{11})^2 \alpha^2 + \mathbb{E}[\zeta_t^2] \label{neq1}
 \end{align}
 To see why (\ref{neq1}) holds, recall that $\mathbb{E}[x_t] = 0, \mathbb{E}[\epsilon_t] = 0, \mathbb{E}[x_{t-i} \epsilon_t] = 0 \; \forall i \in \mathbb{N}.$
 \begin{lemma}[\textbf{\textbf{Difference in Causal and Statistical errors (AR) under Relative Interventions}}]
\label{corr:diff_c_s_ar_relative}
Let $\myCurls{X_t}$ follow an AR(q) process. Then, for any AR(p) model $f$ with parameters $\myCurls{\widehat{a}_1, \widehat{a}_2, \cdots, \widehat{a}_p}$,
\begin{equation}
    \CErrw(f) - \SErrw(f) =  (A^{\omega}_{1,1} - \widehat{A}^{\omega}_{1,1})^2 \alpha^2 ,
\end{equation}

where, $A$ and $\widehat{A}$ are the corresponding companion matrices of the model and estimated parameters.
\end{lemma}


\section{Other results}
\begin{proposition}\parencite[Proposition 2.2]{basu2015regularized}
\label{prop:basu}
Consider a (matrix-valued) polynomial $\mathcal{A}(z) = I_d - \sum \limits_{k=1}^p A_k z^k, z \in \C, \; p \in \N$, satisfying $\det(\mathcal{A}(z)) \neq 0$ for all  $\abs{z} < 1$. Then $\mu_{\max}(\mathcal{A}) \leq \myBracs{1 + (\nu_{row} + \nu_{col})/2}^2$, where

    \begin{equation*}
        \nu_{row} = \sum \limits_{k=1}^p \max \limits_{1 \leq i \leq d} \sum \limits_{j=1}^d \abs{A_k(i,j)}, \quad \nu_{col} = \sum \limits_{k=1}^p \max \limits_{1 \leq j \leq d} \sum \limits_{i=1}^d \abs{A_k(i,j)}.
    \end{equation*}
    
\end{proposition}

\begin{lemma}[\textbf{Bounds on spectrum of $\Sigma$}]
\label{lemma:min_eig_lower}
Let $\myCurls{X_t}$ be a second-order stationary time series with spectral density $f(\omega)$ and let $\Sigma_{n}$ denote the autocorrelation matrix of size $n \times n$ given by $\Sigma_n(i,j) = \gamma_{\abs{i-j}} = \E(x_{t + i}, x_{t + j})$ for any $i, j \in [n]$. Then the extremal eigenvalues of $\Sigma_n$ are bounded as follows.
\begin{equation*}
    \label{eq:bounds_eigs}
    \lambda_{\min}(\Sigma_n) \geq 2 \pi \inf \limits_{\omega} f(\omega) \quad \textrm{and} \quad \lambda_{\max}(\Sigma_n) \leq 2 \pi \sup \limits_{\omega} f(\omega) \; \forall n \in \N
\end{equation*}
Furthermore, the bound holds uniformly for all $n \in \N$. See \textcite[Proposition 4.5.3]{brockwell1991time} for a proof of the Lemma.
\end{lemma}

\section{Additional Experimental Results}
\label{sec:additional_simulations}
In section 5 we described experiments with simulated autoregressive processes.
Here, we provide additional plots from these experiments.

\subsection{Statistical and Causal Errors}
In the main paper we have seen that even in very simple AR models the causal error of an OLS regression estimator can be several times larger than its statistical error.
In Figures \ref{fig:errors}, \ref{fig:errors_hist} and \ref{fig:corr_vs_err_supp} we can see that this is also the case for OLS, Lasso and ElasticNet regression and different process orders.
All methods can be seen as the solution to an optimization problem, minimizing the empirical statistical error plus some penalty term $\Omega(\hat{a})$, that is, $\sum_{y_i, \hat{y}_i} (y_i-\hat{y}_i)^2 + \lambda \Omega(\hat{a}),$ where $\hat{y}_i$ denotes the model prediction with estimated parameters $\hat{a}$ and $\lambda>0$ the strength of the regularization.
For OLS, the penalty term is zero. 
For Ridge and Lasso the penalty is the squared $l_2$ norm and the $l_1$ norm respectively, i.e. $\Omega(\hat{a}) = \|\hat{a}\|_2^2$ for Ridge and $\Omega(\hat{a}) = \|\hat{a}\|_1$ for Lasso.
For ElasticNet we have $\Omega(\hat{a}) = \mu\cdot\|\hat{a}\|_1 + (1-\mu)\cdot\|\hat{a}\|_2^2$, where $\mu$ is a parameter balancing the $l_1$ and $l_2$ penalty.

We used standard grid-search and 5-fold cross-validation to find the optimal regularization strength.
For ElasticNet, we additionally optimized $\mu$ with the grid search.
Except for Figure \ref{fig:corr_vs_err_sample}, we use 100 training and 1000 test samples.
For all experiments, we simulate our processes with noise variance $\sigma^2 = 1$.
\begin{figure}[htp]
    \centering
    \includegraphics[width=\textwidth]{img/uai/supp/error_stat_vs_causal_10000_100_1_0_all.png}
    \caption{The causal error $\mathcal{G}$ plotted against the statistical error $\mathcal{S}$ for process orders $p=3, 5, 7$ (from left to right) and estimators OLS, Lasso and ElasticNet (from top to bottom).}
    \label{fig:errors}
\end{figure}
\begin{figure}[htp]
    \centering
    \includegraphics[width=\textwidth]{img/uai/supp/error_hist_10000_100_1_0_all.png}
    \caption{Histogram of the difference $|\mathcal{G} - \mathcal{S}|$ for orders $p=3, 5, 7$.}
    \label{fig:errors_hist}
\end{figure}
\begin{figure}[htp]
    \centering
    \includegraphics[width=\textwidth]{img/uai/supp/correlation_vs_error_10000_100_1_0_all_new.png}
    \caption{The maximal difference of statistical and causal error  $|\mathcal{G} - \mathcal{S}|$ plotted against the condition number of the autocorrelation matrix $\kappa$ for process orders $p=3, 5, 7$ (from left to right) and estimators OLS, Lasso and ElasticNet (from top to bottom).}
    \label{fig:corr_vs_err_supp}
\end{figure}

\textbf{Increasing sample size.}
As one would expect, in Figure \ref{fig:corr_vs_err_sample}, we can see that the absolute difference of the errors decreases for larger training samples. We show this result for the ridge regression estimator. Results for other estimators are similar.
The respective means are 13.28, 0.48 and 0.18 from left to right and the standard deviations are 264.54, 4.35 and 0.27, which is hard to read from the plot due to the scale of the outliers.
\begin{figure}[htp]
	\centering
	\includegraphics[width=.7\textwidth]{img/order5/sample_size_vs_error_10000_1_0.png}
	\caption{The absolute difference $\abs{\mathcal{G} - \mathcal{S}}$ of causal and statistical error plotted against the sample size for process order $p=5$ and sample sizes 10, 100 and 1000 using Ridge regression. The blue bars mark the 0, 0.5 and 1 quantile and the black block goes from the 0.25 to the 0.75 quantile.}
	
	\label{fig:corr_vs_err_sample}
\end{figure}

\begin{figure}[htp]
	\centering
	\includegraphics[width=\textwidth]{img/uai/supp/correlation_vs_error_omega_10000_100_7_0_all.png}
	\caption{The maximal difference of errors $|\mathcal{G} - \mathcal{S}|$ as well as the generalization bound from Theorem \ref{thm:main_supp} plotted against condition number of the autocorrelation matrix for process order $p=5$, steps predicted ahead $\omega = 1, 5, 7$ (from left to right). The top row shows interventions only on the most recent timestep $x_{t-1}$ whereas the bottom row shows interventions on all previous timesteps before the prediction.}
	
	\label{fig:corr_vs_err_omega}
\end{figure}

\textbf{Violations of causal sufficiency.}
In Figure \ref{fig:corr_vs_err_misspec} we violated the causal sufficiency assumption by introducing a hidden confounder. To this end we draw a two-dimensional AR(1) process by drawing each entry of the parameter matrix $A$ independently and uniformly from $[-2, 2]$ and reject matrices that yield non-stationary processes.
We then only use one of the two dimensions as training and test sample.
The other one acts as hidden confounder.
We also use only the sample of the observed dimension to estimate the autocorrelation of the process, which is the x-axis of the plots in Figure \ref{fig:corr_vs_err_misspec}. 
\begin{figure}[htp]
	\centering
	\includegraphics[width=\textwidth]{img/uai/supp/misspec/correlation_vs_error_omega_10000_100_7_0_all.png}
	\caption{The maximal difference of errors $|\mathcal{G} - \mathcal{S}|$ as well as the generalization bound from Theorem \ref{thm:main_supp} plotted against condition number of the autocorrelation matrix for process order $p=5$, steps predicted ahead $\omega = 1, 5, 7$ (from left to right). }
	
	\label{fig:corr_vs_err_misspec}
\end{figure}



\section*{References}
\newpage
\printbibliography

