\section{Causal Generalization for VAR}
\label{sec:results}
In this section, we present causal generalization bounds for the family of VAR models under atomic interventions. We first provide an overview of our results in the more general case of VAR(p) models and later provide a thorough interpretation of the results, often by deriving simplified versions of the results for AR(p) models. We begin by providing an exact characterization of the difference in statistical and causal errors in terms of the model and estimated parameters and the autocovariance matrix of the underlying process. 

\begin{lemma}[\textbf{Difference in Causal and Statistical errors (VAR)}]
\label{lemma:diff_G_S_VAR}
Consider a vector-valued time series $\myCurls{x_t}_{t \in \Z} \in \R^d$, following a VAR(q) process parameterized by $\myCurls{A_1, A_2, \cdots A_q}$. Let $ \nu = \max \myCurls{p,q}$. For any VAR(p) model $f$ with parameters $\{\hat{A}_1, \hat{A}_2, \cdots \hat{A}_p\}$,

\begin{multline*}
\abs{\CErrwi - \SErrw} = 2\bigg \vert {(A^{\omega}_{ii} - \hat{A}^{\omega}_{ii}) \sum \limits_{k \neq i}^{d \nu} (A^{\omega}_{ik} - \hat{A}^{\omega}_{ik}) \Sigma_{ik}^{\nu}} \bigg \vert,
\end{multline*}

where $\Sigma^{\nu}$ denotes the autocovariance matrix of $x_t$ of size $\nu$, $A$ is a multi-companion matrix of the form described in (\ref{eq:var_1_def}) with the first $d$ rows populated by $\{A'_1, A'_2, \cdots A'_{\nu}\}$, with $A'_l$ defined as $A_l$ for all $l \leq q$ and as $\ZeroVec_{d \times d}$ for all $l > q$. $\hat{A}$ is analogously defined from $\{\hat{A}_1, \hat{A}_2, \cdots \hat{A}_p\}$, with zero blocks for all $l > p$.
\end{lemma}
%

 Building on Lemma~\ref{lemma:diff_G_S_VAR}, we establish that the condition number of the autocovariance matrix of the underlying process controls causal generalizability from the \textit{observational} to \textit{interventional distributions}.

\begin{proposition}[\textbf{Stability Controls Causal Generalization (VAR)}]
\label{prop:stability_control} Let $\myCurls{x_t}_{t \in \mathbb{Z}}$ follow a VAR(q) process for some $q \in \mathbb{N}$. For any VAR(p) model,
 \begin{equation}
     \abs{\CErrwi - \SErrw} \leq  (2\kappa(\Sigma^{\nu }) - 1) (\SErrw - \sigma^2_{\epsilon}),
\end{equation}
where $\kappa(\Sigma^{\nu})$ denotes the condition number of the autocovariance matrix $\Sigma^{\nu}$. Further, one can construct processes where equality holds up to a small constant factor.
\end{proposition}

{\looseness=-1 The result states that the difference in expected causal and statistical errors is controlled by the \textit{condition number} of the autocovariance matrix of size $\max \myCurls{p,q}$. It also states that, without incorporating additional information, one cannot obtain a much tighter bound, which is also verified by our experiments in Section \ref{sec:experiments}. The condition number of the autocovariance matrix can get arbitrarily large as the process gets closer to the boundary of the stability domain. This result therefore shows that even for very simple classes of forecasting models, causal interpretations can get challenging. We later provide a detailed interpretation of this result and provide an explicit bound on $\kappa(\Sigma^{\nu})$ in terms of the stability parameter for AR(p) models (Corollary \ref{corr:Population_diff_ar}).}


Proposition \ref{prop:stability_control} allows us to employ generalization bounds for time-series \parencite{yu1994rates, meir2000nonparametric, mohriRademacher, mcdonald2017nonparametric} to derive finite-sample \textit{causal generalization bounds} for VAR models. In particular, we utilize Rademacher complexity bounds for generalization in time-series under mixing conditions \parencite{mohriRademacher} to derive Theorem \ref{thm:main}.
%

\begin{theorem}[\textbf{Finite sample bounds for VAR(p) models}]
\label{thm:main}
  Let $\mathcal{F}$ denote the family of all VAR models of dimension $d$ and order $p$. For any $n > \max \left \{p,q \right \}\in \N$, let $\mu, m > 0$ be integers such that $2 \mu m = n$ and $ \delta > 2 (\mu - 1) \rho^m$ for a fixed constant $0 < \rho < 1$ determined by the underlying process. Let $\myCurls{x_1, x_2, \cdots x_n} \in \mathbb{R}^d$ be a finite sample drawn from a VAR(q) process. Then, simultaneously for every $f \in \mathcal{F}$, under the square loss truncated at $M$, with probability at least $1-\delta$,
    \begin{equation}
  \label{eq:thm_main}
     \CErrwi   \leq  \zeta \hat{\mathcal{S}}_{\omega} + \zeta \widehat{\mathfrak{R}}_{\mu}(\mathcal{F}) + 3\zeta M \sqrt{\frac{\log \frac{4}{\delta'}}{2 \mu}},
  \end{equation}
 where $\zeta = 2\kappa (\Sigma^{\nu})$, $\delta' = \delta - 2 (\mu - 1) \rho^m$, and $\widehat{\mathfrak{R}}_{\mu}(\mathcal{F})$ denotes the empirical Rademacher complexity of $\mathcal{F}$.
%   \begin{equation}
%   \label{eq:thm_main}
%      \CErrwi  \leq  \zeta \widehat{\mathcal{S}_{\omega}}(f) + M\zeta\sqrt{ \frac{(1 + pd)}{\sqrt{\mu}}\myBracs{\log{ \frac{2 \mu}{(1 + pd)}} + 1}  + \frac{1}{\sqrt{\mu}} \log \frac{\rho^{m-p}(\mu - 1)}{\eta}},
%   \end{equation}
%  The result is currently stated for a single step prediction and intervention. The results can be easily extended to multi-step prediction albeit with cumbersome notation. 
%   \begin{equation}
%       \prob \myBracs{\sup \limits_{f \in \mathcal{F}} \abs{\mathcal{S}(f) - \widehat{\mathcal{S}}(f)} > \epsilon} \leq C_1 \exp \myBracs{(1 + pd)\myBracs{\log{ \frac{2 \mu}{(1 + pd)}} + 1} - \frac{\mu 
%       \epsilon^2}{M^2} } + 2 (\mu - 1) C_2 \rho^{m-p}
%   \end{equation}
%   For any $m \in \mathbb{N}$, let $\beta(m)$ denote the $\beta$ mixing coefficient of a underlying stable auto-regressive process such that the eigenvalues of the corresponding companion matrix $A$ satisfy $\abs{\lambda_i} \leq \delta $ for some $0 < \delta < 1$.  For any $\mu, m > 0$ with $2 \mu m + p = n$ and $\eta > 4 (\mu - 1) \beta(m)$, under the square loss truncated at $M$, for any stable AR$(p)$ model $\widehat{f}$ with $p > 1$,
\end{theorem}
Our causal generalization bound in Theorem \ref{thm:main} suggests that, given sufficiently many samples, the true causal error can be guaranteed to be close to the empirical statistical error if our VAR models come from a class with a small Rademacher complexity, particularly when the process is associated with a small stability parameter.
% In Proposition \ref{prop:error_poly}, which immediately follows from this result, we will see how this characterization yields more interpretable and useful insights into all the factors that play a role in determining the difference between the Causal and Statistical errors. 
%1
% This result provides some useful insights. It states that if the underlying process has a weak correlation structure, then the causal error, with high probability, can be closer to the statistical error, which also supports our intuition. If the underlying process is non-stationarity or if it is violated by the estimated model, the difference in errors can be unbounded for interventions far into the past of the target variable. If the true order of the AR process (reflected by the Schur polynomials of the eigenvalues) is incorrectly estimated, then the difference in errors is also higher.
%
%

We now focus on providing a detailed interpretation of our results. First, we take a minor detour to present a technical result (Lemma \ref{lemma:coef_as_schur}) which is useful both in deriving some of our main results as well as in interpreting them. 
%
\begin{lemma}[\textbf{Expressing powers of a companion matrix using symmetric polynomials}]
\label{lemma:coef_as_schur}
For a companion matrix $A$ with distinct eigenvalues, for any $k \in [p]$, the $(1, k)$th element of $A^j$ can be expressed using Schur polynomials of the eigenvalues $\lambda = \myCurls{\lambda_1, \lambda_2, \cdots \lambda_p}$ of $A$, that is, $ A^j_{1, k} = S_{j,k}(\lambda)$, where $S_{j,k}(\lambda)$ refers to the Schur polynomial indexed by $K =  \{j, 1, \cdots {k-1 \textrm{ times}} \cdots, 1, 0, \cdots, 0  \}$.
\end{lemma}
Lemma \ref{lemma:coef_as_schur} shows that the coefficients of the powers of a companion matrix can be fully characterized using symmetric Schur polynomials of its eigenvalues. 
%Such a characterization using Schur polynomials has not been noted in literature. Companion matrices are ubiquitous in stochastic processes and in Linear-Time-Invariant dynamical systems \parencite{davison1976robust, melnyk2016estimating}. This result could potentially be of independent interest in theoretical endeavours that build upon companion matrices.
%   \todo{Differences in the powers of the matrices can be interpretable but the bounds rely on specific coefficients of the powers of the matrix which can be quite hard to interpret.}
A good overview of these polynomials can be found in \textcite{chaugule2019schur}. An advantage of expressing the coefficients using symmetric Schur polynomials is that these polynomials have been a subject of extensive research in combinatorics and an equivalence between several alternate definitions has been established. To name a few, Cauchy's bialternant expression \parencite{cauchy1815memoire,jacobi1841functionibus}, the combinatorial formula \parencite{macdonald1998symmetric}, and the Jacobi--Trudi identity \parencite{jacobi1841functionibus} are all equivalent ways to define Schur polynomials. It is therefore possible and often beneficial to choose the definition that yields the most useful notion for the context. We utilize this connection to interpret our results. First, for easier interpretation, we simplify Lemma \ref{lemma:diff_G_S_VAR} to the following result for scalar AR models.
%
\begin{corollary}[\textbf{Difference in Causal and Statistical errors (AR)}]
\label{corr:diff_c_s_ar}
Let $\myCurls{x_t}_{t\in \mathbb{Z}}$ follow an AR(q) process. Then, for any AR(p) model with parameters $\hat{A}$,
\begin{equation}
\label{eq:diff_caus_stat_ar}
    \abs{\CErrw - \SErrw} = 2 \bigg| (A^{\omega}_{11} - \hat{A}^{\omega}_{11}) \sum \limits_{k=2}^{\nu} (A^{\omega}_{1k} - \hat{A}^{\omega}_{1k}) \gamma_{k-1} \bigg|,
\end{equation}

where, for any $k \in \mathbb{N}$, $\gamma_{k}$ denotes the autocovariance of $\myCurls{x_t}_{t \in \mathbb{Z}}$ with lag $k$. $A$ and $\widehat{A}$ are the corresponding companion matrices of the model and estimated parameters as defined in Lemma \ref{lemma:diff_G_S_VAR}.
\end{corollary}
%
Lemma \ref{lemma:diff_G_S_VAR} identifies factors that control causal generalizability. We now describe them.


\textbf{Correlations control causal generalizability.} Recall our motivating example of the two highly correlated time-series where the causal and statistical errors diverge. Intuitively, one would therefore expect that large correlations among time series potentially induce large differences between observational and interventional distributions. The quantitative dependence of causal generalizability on the correlation structure of the process is, however, less obvious. Lemma \ref{lemma:diff_G_S_VAR} confirms the intuition and shows that correlations between the intervened time-series $x_{t-\omega, i}$ and the remaining entries of $\mathbf{x}_{t-\omega}$, across both components and time instances, control generalizability from the observational to the interventional distributions. 


\textbf{High-dimensional and higher-order processes can hurt generalization.} For high-dimensional processes it is not unlikely to have strong correlations across components, which may obscure causal relations in the same way as strong correlations across time do for univariate processes. Lemma \ref{lemma:diff_G_S_VAR} also supports this intuition and shows that strong correlations across components as well as time instances play a role. With increasing order or dimension of the processes, larger orders of covariances across time and dimensions could entail poor causal generalizability.


%     %
{\looseness=-1     
\textbf{Dependence on $\omega$.} The dependence of the error on $\omega$ arises through the elements of the matrix power $A^{\omega}$. A simple computation shows that, even for an AR(2) model, the dependence of these coefficients on the model parameters is asymmetric and highly intricate. However, using Cauchy's bialternant formulation of Schur polynomials, 
we have that for any AR(p) model, the coefficients $A^{\omega}_{1k}$ can be expressed as
%\begin{equation}
    $\displaystyle A^{\omega}_{1k} = (-1)^{k+1} \frac{\sum_{i=1}^p \lambda_i^{p + \omega - 1} e_k(\lambda_i)}{\det \big \vert {\big\{\lambda_k^{ p - k' }\big\}_{k,k' \in [p]} \big \vert}}$\,,
%\end{equation}
where $e_k(\lambda_i)$ refers to the elementary symmetric polynomial of order $k$ in the variables $\myCurls{\lambda_1, \cdots \lambda_{i-1}, \lambda_{i+1}, \cdots, \lambda_p}$. While this is not the most interpretable definition per se, it makes the dependence of the coefficients on $\omega$ easy to understand: if the underlying process and the estimated model are both stable ($|\lambda|< 1$), the coefficients, and hence the difference in errors, decay exponentially as the intervention moves further into the past of the target variable, whereas if either of them is not stable ($|\lambda|> 1$), the difference can indeed diverge.}


Proposition \ref{prop:stability_control} allows us to obtain a high-level perspective on causal generalizability. It states that the condition number of the autocovariance matrix controls causal generalizability. Both the maximum and the minimum eigenvalue of the autocovariance matrix (and hence the condition number) can be used as a measure of stability and hence determine the strength of correlation of the underlying process \parencite{basu2015regularized, melnyk2016estimating}. As the process gets closer to the boundary of the stability domain, the autocovariance matrix becomes singular and hence the condition number of the autocovariance matrix can get arbitrarily large. Proposition \ref{prop:stability_control} can therefore be interpreted as follows: as the underlying process gets closer to the boundary of the stability domain, the causal and statistical errors can diverge. 

For intuition, let us revisit our motivating example from the introduction with strongly correlated observations in an AR($p$) process.
Without loss of generality, let $p=q$.
We introduce the vectors $a := (a_1,a_2, \dots,a_p)$ and  $\hat{a} := (\hat{a}_1,\hat{a}_2, \dots,\hat{a}_p)$ and the covariance matrix $\Sigma_p = \Sigma_{\max\{p,q\}}$. Then the quotient between the causal and statistical errors for
predicting one time step ahead (i.e., $\omega=1$) reads:
\begin{equation}\label{eq:quotient}  
  \frac{ \CErrw}{\SErrw} = \frac{(\hat{a} - a )^T (\hat{a} - a ) + \sigma^2_{\epsilon} }{ (\hat{a} - a )^T \Sigma_p (\hat{a} - a ) + \sigma^2_{\epsilon}},
\end{equation}    
where we have assumed $X_t$ to have unit variance without loss of generality.
The quotient is maximized if $(\hat{a}-a)$ is a multiple of the eigenvector to the smallest eigenvalue of  $\Sigma_p$. This aligns with the intuition that causal loss diverges when the auto-covariance matrix gets singular. Moreover, we see that the vector $(\hat{a}-a)$ can be large with little observable effect when it mainly consists of eigenvectors with small eigenvalues of $\Sigma_p$. In the extreme case, if the minimum eigenvalue of the autocovariance matrix is $0$, it is possible to arbitrarily deviate from the true model parameters along the direction of the corresponding eigenvector which can significantly affect the causal error without affecting the statistical error at all.
For an AR(2) process, for instance, we obtain
$\Sigma_p = \left(\begin{array}{cc} 1 & a_1/(1-a_2) \\ a_1/(1-a_2) & 1 \end{array}\right)$,
which becomes singular for $a_1= \pm (1-a_2)$, which is indeed the boundary of the stability domain (see, for example, \textcite{lutkepohl2009econometric}). This is the limit in Section \ref{sec:intro} where
$X_t =\pm X_{t-1}$. The eigenvector for eigenvalue $0$ reads
$(1,\mp 1)$. Accordingly, the quotient \eqref{eq:quotient} diverges
when $\hat{a}$ differs from $a$ by $(1,\mp 1)$. 

This further highlights that even for simple classes of forecasting models and with simplifying assumptions such as causal sufficiency, causal risks may even diverge from statistical risks. To show this formally, by means of Lemma \ref{lemma:coef_as_schur}, we can derive an explicit upper bound on the condition number of the autocovariance matrix $\kappa(\Sigma_{ \max \myCurls{p,q}})$ for AR(p) models and arrive at Corollary~\ref{corr:Population_diff_ar}.

%
\begin{corollary}[\textbf{Stability Controls Causal Generalization (AR)}]
\label{corr:Population_diff_ar}
Consider an AR(q) process such that the eigenvalues of its companion matrix satisfy $\abs{\lambda} < \delta < 1$. For any AR(p) model $f$,
 \begin{align}
 \label{eq:stability_control_ar}
     \abs{\CErrwi  - \SErrw } & \leq K_p \SErrw(f) {\nu(1 + \delta)^{2 \nu}}/{(1 - \delta^2)},
\end{align}
where $K_p$ is some finite constant that depends on the order $p$ of the underlying process.
\end{corollary}
%

{\looseness=-1 The bound in Corollary \ref{corr:Population_diff_ar} is elegant due to its simplicity and generality. However, the cost of generality of the bound that relies only on the stability parameter is clearly that it cannot explain the variations in behavior exhibited by individual processes with the same stability parameter. For instance, consider an AR(2) model with parameters $a_1$ and $a_2$ with $a_2 \approx 0$ so that it is essentially an AR(1) model. Then, it is easy to verify that $\lambda_2 \approx 0$. The combinatorial definition of the Schur polynomials \parencite{macdonald1998symmetric} allows us to express the coefficients as follows: $ A^{\omega}_{11} = \sum_{i=0}^{\omega} \lambda_1^{\omega - i} \lambda_2^{i}, \quad A^{\omega}_{12} = \sum_{i=1}^{\omega - 1} \lambda_1^{\omega - i} \lambda_2^{i}.$ { Combining this with Corollary \ref{corr:diff_c_s_ar}, it is easy to see that if the estimated model is also close to AR(1), then the coefficients $A^{\omega}_{12}$ and $\widehat{A}^{\omega}_{12}$, and hence the difference in statistical and causal errors, are close to $0$. The bound in (\ref{eq:stability_control_ar}), which relies only on the stability parameter, does not capture this. For tighter bounds that utilize additional information about the spectrum of the companion matrix, we can exploit the connection to Schur polynomials to arrive at the following bound.}
%
\begin{equation*}
    \label{eq:tight_ar_polynomial_bounds}
    \abs{\CErrwi  - \SErrw } \leq K_{p,q} \max \myCurls{\delta, \widehat{\delta}}^{\omega} \, \bigg\vert \sum \limits_{k=2}^{\nu} \big(S_{\omega,k}(\lambda) - S_{\omega,k}(\widehat{\lambda})\big) \gamma_{k-1} \bigg\vert,
\end{equation*}
where $K_{p,q}$ is a constant that depends on $p$ and $q$; $\delta$ and $\widehat{\delta}$ are the stability parameters of the true and estimated processes, respectively; and $\lambda$ and $ \widehat{\lambda}$ denote the sets of eigenvalues of $A$ and $\widehat{A}$, respectively.}
