\documentclass[accepted]{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

% My packages
\usepackage[round]{natbib}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
%\usepackage[colorlinks,allcolors=blue]{hyperref}
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{subcaption}


\usepackage{adjustbox}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
% \usepackage{etex,etoolbox}
\usepackage{amsthm}

\usepackage{microtype}
\usepackage{graphicx}
%\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{tikz}
 \usetikzlibrary {arrows.meta}
\usepackage{multirow}
\usepackage{hhline}
\usepackage{siunitx}
\usepackage[linesnumbered,ruled,vlined]{algorithm2e} % Import the algorithm2e package



% Theorem-like environments (amsthm).
% `theorem' owns the counter and is numbered within sections ([section]);
% every other environment below reuses that counter via [theorem], so all
% statements share a single running number per section.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definitions and assumptions use amsthm's `definition' style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remarks and examples use amsthm's `remark' style.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{example}[theorem]{Example}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{amsthm}


% Link colours and PDF metadata.
% NOTE(review): hyperref is not loaded explicitly in this preamble (the
% \usepackage line above is commented out); it is presumably provided by the
% uai2025 class -- confirm.
% `pdfpagemode=FullScreen' was removed: it makes PDF viewers open the paper in
% full-screen presentation mode, which is not wanted for an article.
% `pdftitle' now matches the \title of the paper instead of a placeholder.
\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    filecolor=magenta,
    citecolor=blue,
    urlcolor=cyan,
    pdftitle={Learning Causal Response Representations through Direct Effect Analysis},
}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
% \theoremstyle{plain}
% \newtheorem{theorem}{Theorem}[section]
% \newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{lemma}[theorem]{Lemma}
% \newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
% \newtheorem{definition}[theorem]{Definition}
% \newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
% \newtheorem{remark}[theorem]{Remark}
% \newtheorem{example}[theorem]{Example}
%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example



% ---------------------------------------------------------------------------
% Notation macros
% ---------------------------------------------------------------------------
% Fixed symbols wrapped in \operatorname (kept unchanged so that existing uses
% render exactly as before).
\newcommand{\Cgam}{\operatorname{C_{\gamma}}}
\newcommand{\Etrain}{\operatorname{\mathbb{E}_{\text{train}}}}
\newcommand{\Etest}{\operatorname{\mathbb{E}_{\text{test}}}}
\newcommand{\Enu}{\operatorname{\mathbb{E}_\nu}}
\newcommand{\Pnu}{\operatorname{\mathbb{P}_\nu}}
\newcommand{\Loss}{\operatorname{\mathcal{L}(X, Y, H; \Theta)}}
% arg min / arg max: the starred \operatorname* places subscripts *below* the
% operator in display style (e.g. \argmax_{\mathbf{w}} inside align), exactly
% like the built-in \max and \min; in inline math they stay to the side.
% The names are typeset in the upright operator font rather than via \text,
% so they do not turn italic inside theorem bodies.
\newcommand{\argmin}{\operatorname*{arg\,min}}
% Trace of a matrix, upright operator font.
\newcommand{\tr}{\operatorname{tr}}
\newcommand{\argmax}{\operatorname*{arg\,max}}
% Small vertically centred bullet as a binary operator; the optional argument
% is the scale factor passed to \scalebox (default .5).
\newcommand\sbullet[1][.5]{\mathbin{\vcenter{\hbox{\scalebox{#1}{$\bullet$}}}}}
\newcommand{\Bmax}{\operatorname{\mathbf{b}^{\diamond}}}
% Orange "X" placeholder (the macro name suggests a work-in-progress marker).
\newcommand{\WIP}{\operatorname{\textcolor{orange}{X}}}
% (Conditional) independence symbol: two overlapping \perp signs.
\newcommand{\indep}{\perp \!\!\! \perp}
% Check mark / cross mark glyphs from the Zapf Dingbats font.
\usepackage{pifont}% http://ctan.org/pkg/pifont
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\title{Learning Causal Response Representations through Direct Effect Analysis}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
\begin{document}
\maketitle

\begin{abstract}
 We propose a novel approach for learning causal response representations. Our method aims to extract directions in which a multidimensional outcome is most directly caused by a treatment variable. By bridging conditional independence testing with causal representation learning, we formulate an optimisation problem that maximises the evidence against conditional independence between the treatment and outcome, given a conditioning set. This formulation employs flexible regression models tailored to specific applications, creating a versatile framework. The problem is addressed through a generalised eigenvalue decomposition. We show that, under mild assumptions, the distribution of the largest eigenvalue can be bounded by a known $F$-distribution, enabling testable conditional independence. We also provide theoretical guarantees for the optimality of the learned representation in terms of signal-to-noise ratio and Fisher information maximisation. Finally, we demonstrate the empirical effectiveness of our approach in simulation and real-world experiments. Our results underscore the utility of this framework in uncovering direct causal effects within complex, multivariate settings.
%We present a novel framework for learning causal response representations by directly analysing how treatment variables influence a multivariate outcome. Understanding these direct effects is crucial for isolating causal mechanisms in high-dimensional settings, enabling better decision-making in scientific and policy applications. Traditional approaches often rely on indirect modelling assumptions or fail to explicitly capture the response subspace most affected by the treatment. Our method bridges conditional independence testing and causal representation learning, formulating an optimisation problem that maximises the evidence against conditional independence between treatment and outcome, given a conditioning set. The key idea is leveraging a generalised eigenvalue decomposition, which under mild conditions, enables testable conditional independence through an F-distribution characterisation of the largest eigenvalue. This approach provides theoretical guarantees for latent structure recovery, optimality of the learned representation, and practical effectiveness in simulations and real-world experiments. Ultimately, our framework offers a powerful tool for identifying direct causal effects in complex, high-dimensional environments.
 
\end{abstract}

\section{Introduction}\label{sec:intro}


    Representation learning has been a foundational tool in modern machine learning, enabling models to automatically extract features from high-dimensional data~\citep{bengio2013representation,lecun2015deep}. However, traditional approaches often fail to capture the causal mechanisms that underlie data generation, leading to poor generalisation under data distribution shifts. To address these shortcomings, \emph{causal representation learning} (CRL) has emerged as a crucial approach to integrate causality into representation learning~\citep{scholkopf2021toward}. By learning representations that reflect the causal structure of the data, models can become more robust to distribution shifts and provide better causal insights for downstream tasks. This enables the modelling of intervention effects and the construction of counterfactuals, allowing for the analysis of questions that classical statistical models may struggle with, such as estimating the effects of policies. 
    % By aligning representations with the causal structure of the underlying system, CRL enables interventions on causal variables, allowing the modeler to estimate the effects of model manipulations.  
  
    A key focus of causal inference literature is understanding how variables influence one another along different pathways~\citep{pearl2014interpretation}. Of particular interest to this work is the direct effect of a cause on an outcome variable while controlling for confounders and mediators~\citep{pearl2022direct}. Mediators transmit the effect of the cause to the outcome, while confounders influence both the cause and the outcome. Studying direct effects rather than total effects is essential for several reasons. For instance, it allows isolating specific mechanisms in science, such as assessing the effect of greenhouse gas emissions on local temperature while controlling for natural climate variations (that emissions may also influence). Finally, it helps disentangle immediate effects from delayed downstream effects, which may have a longer-term impact on the outcome. 
    % This focus on direct effects establishes a connection between our work and the GrangerPCA~\citep{varando2022learning} method.

    When the outcome is multi-dimensional, identifying a subspace where the causes maximally influence it can benefit various tasks. In these cases, it is often impractical to observe how each dimension's distribution is shifted by the intervention. Therefore, it is of interest to examine this shift in a lower-dimensional space (e.g., 1-D or 2-D). This approach can also help discover simple, low-dimensional representations that capture relevant information about the intervention's effect. Additionally, it can help disentangle the direction in which the outcome is affected by the intervention from the direction where the distribution remains unchanged. We will demonstrate that this has important implications in different application domains with a focus on climate change attribution. % interesting applications. %, such as attributing climate change by separating the effects of external forcing factors from internal variability.

    While considerable work has focused on learning representations for confounder adjustment in causal effect estimation~\citep{louizos2017causal}, modelling representations of causes~\citep{Arjovsky2019, Peters2016}, or uncovering latent causal graphs~\citep{locatello2019challenging}, the representation of effects remains largely underexplored. Here, we aim to bridge this gap by learning a mapping of the response variable through the maximisation of a conditional independence statistic. Under certain structural assumptions, the method identifies the direction in which the effect of interventions is most observable. By using conditional expectation estimators, it adapts to different data types through various regression models.
    
    % To the best of our knowledge, this approach is new and paves the way for improved data representation in multivariate settings.

   % We know briefly describe the main concept necessary to better understand the learning framework proposed in Sec. \ref{sec:LF}.

\section{Preliminary}

% \begin{figure}
%     \centering
%         \begin{tikzpicture}[node distance=2cm, thick, ->, >=stealth, line width=1.2pt]
%             % Nodes with larger size and thicker text
%             \node[circle, draw, thick, minimum size=1.2cm, font=\bfseries\Large] (X) at (0, 0) {$X$};
%             \node[circle, draw, thick, minimum size=1.2cm, font=\bfseries\Large] (Y) at (3, 0) {$Y$};
%             \node[circle, draw, thick, minimum size=1.2cm, font=\bfseries\Large] (Z) at (1.5, 2) {$Z$};
            
%             % Thicker and larger arrows
%             \draw[->] (X) -- (Y); % X causes Y
%             \draw[->] (Z) -- (Y); % Z causes Y
%             \draw[-] (X) -- (Z);  % X associated with Z
%         \end{tikzpicture}
%     \caption{Direct Acyclic Graph considered in this graph. The relation between $X$ and $Z$ is left unspecified such that $Z$ can be a mediator, a confounder or a combination of both. \textcolor{red}{Necessary???}}
%     \label{fig:DAG}
% \end{figure}


Let $X \in \mathbb{R}^p$, $Y \in \mathbb{R}^d$, and $Z \in \mathbb{R}^r$ be three random vectors whose joint distribution is absolutely continuous with respect to the Lebesgue measure, with density function $p(x, y, z)$. We also assume that $X$ and $Z$ are known causes of $Y$, but the relation between $X$ and $Z$ is left unspecified, allowing $Z$ to be a confounder, a mediator, or both.
% 
We aim to identify the component of $Y$ that is most directly caused by $X$, by finding $\mathbf{w}$ that maximises the causal relationship between $X$ and $\mathbf{w}^\top Y$. In the following, we clarify key terms related to the concepts of direct effect and conditional independence.
% \subsection{Direct effect}
We begin by considering James Woodward's \textit{manipulationist} definition~\citep{woodward2005making} of a direct cause:
\begin{quote}
    \textit{A necessary and sufficient condition for $X$ to be a direct cause of $Y$ with respect to some variable set [$Z$] is that there be a possible intervention on $X$ that will change $Y$ (or the probability distribution of $Y$) when all other variables in [$Z$] besides $X$ and $Y$ are held fixed at some value by interventions.}
\end{quote}
% X is a direct cause of Y with respect to a variable set [Z] if and only if an intervention on X can alter Y (or its probability distribution) while keeping all other variables in [Z], except X and Y , fixed by interventions.
The distribution of $Y$ under intervention is called the direct effect (DE) of $X$ on $Y$. For simplicity, we assume that the effects of $X$ and $Z$ on $Y$ are additive, as formalised in the model assumption in Sec. \ref{sec:Theory}. Thus, DE can be written as:
\begin{align}\label{eq:DE}
    DE(x) &= p(Y|do(X=x), do(Z=z)).
\end{align}
The variable $Y$ under the intervention $do(X = x)$ is denoted as $Y^x$. 
We note that under the assumption of additivity, the direct effect is equivalent to the natural direct effect~\citep[see][section 4.5]{pearl2009causality}. In some contexts, it is described in terms of conditional expectation---referred to as the expected direct effect (EDE)~\citep[see][section 4.5.4]{pearl2009causality}. However, we avoid this reduction, as the (conditional) expectation masks valuable information needed to identify the direction in which $Y$ is most caused by $X$, namely $Y$'s noise structure. Additionally, the term \textit{Gradient DE} (GDE) will be used to denote the vector of partial derivatives of Eq.~\eqref{eq:DE} with respect to $x$, capturing how small variations in the intervention affect $Y$.
% 
In some cases, the Gradient Direct Effect (GDE) lies in a subspace $\mathbb{R}^q \subset \mathbb{R}^d$, meaning that the distribution $P(Y|do(X=x))$ is affected by the intervention only in this subspace, while the remaining dimensions of the space are unaffected. We refer to this as the direct effect subspace (DES). Our work focuses on recovering this reduced space, with its basis ordered by the variance in $Y$ explained by $X$, while controlling for $Z$, analogous to how Principal Component Analysis (PCA) identifies directions of maximum variance in a random vector.

We now summarise conditional independence testing, which plays an important role in our work. 
%
% \subsection{Conditional independence testing}
%
% \textcolor{red}{In appendix? Feel out of its place and not necessary}
%
We say that $X$ is conditionally independent of $Y$ given $Z$, denoted $X \indep Y \mid Z$, if for all $x \in \mathbb{R}^p$, $y \in \mathbb{R}^d$, and $z \in \mathbb{R}^r$, $p(y \mid x, z) = p(y \mid z)$, or equivalently, $p(x, y \mid z) = p(x \mid z)p(y \mid z)$. This means that, given $Z$, $X$ adds no additional information about $Y$. Let $P$ denote the joint distribution of $(X, Y, Z)$ such that $P \in \mathcal{P}$ if $X \indep Y \mid Z$ holds (the null hypothesis), and $P \in \mathcal{Q}$ if $X \not \indep Y \mid Z$ (the alternative hypothesis). A Conditional Independence Test (CIT) is formulated as $H_0:   P \in \mathcal{P} \quad \text{vs.} \quad  H_1:   P \in \mathcal{Q}$.
Given i.i.d. observations $\mathbf{X} \in \mathbb{R}^{n \times p}$, $\mathbf{Y} \in \mathbb{R}^{n \times d}$, and $\mathbf{Z} \in \mathbb{R}^{n \times r}$, we use a statistic $\mathbf{T}_n(\mathbf{X}, \mathbf{Y}, \mathbf{Z})$, and reject $H_0$ when $\mathbf{T}_n$ deviates sufficiently from its expected distribution under $P \in \mathcal{P}$. 
% Conditional independence testing is a challenging problem, as shown by an impossibility result from~\citep{Shah2018TheHO}, which proves that no valid CI test \textit{has power against all alternatives}. This highlights the need for domain-specific assumptions to effectively narrow the null hypothesis space in CI testing.



% The test has a valid level at sample size $n$ if $\sup_{P \in \mathcal{P}} \mathbb{P}_P(\phi_n(\alpha) = 1)  \leq \alpha$, and has power against alternative $P \in \mathcal{Q}$ if $\mathbb{P}_P(\phi_n(\alpha) = 1)  \geq \alpha$.


\subsection{Introductory example}\label{sec:example}

\begin{figure*}
    \centering
    \includegraphics[width=1\linewidth]{uai2025-template/figures/Distrib_final_resized.png}
    \caption{Illustration of the linear model from Sec.~\ref{sec:example} with $\mathbf{b} = (1,1)^\top$ and $\boldsymbol{\Sigma} = \operatorname{diag}(4, 1/2)$, showing the one-sigma ellipsoid for $Y^0$ and $Y^1$. For one-dimensional $X$, $Y^x$ shifts along $\mathbf{b}$, but projection along $\mathbf{b}$ is suboptimal. In contrast, projection along $\boldsymbol{\Sigma}^{-1} \mathbf{b}$ is optimal, with $(\boldsymbol{\Sigma}^{-1} \mathbf{b}, \mathbf{b}^\perp)$ forming a natural basis for the intervention space, where the first axis captures the intervention effect and the second contains no information.}
    \label{fig:optimal_proj}
\end{figure*}

% \textcolor{red}{Make this section better written}

Through a simple example, we demonstrate that EDE is generally suboptimal for distinguishing $Y$ distributions under different interventions and that strategically maximising a CIT statistic may be more effective.
%
Let us consider the simple linear model $Y = \mathbf{b} X + \mathbf{c} Z + N$
with $Y \in \mathbb{R}^d$, $X \in \mathbb{R}$, $Z \in \mathbb{R}^r$, and $N \in \mathbb{R}^d$. Further, let $\boldsymbol{\Sigma}$ denote the covariance matrix of $Y^x$. The relationship between $X$ and $Z$ is not relevant in this context, as we focus on the intervention distribution $Y^x$, and such an intervention breaks the statistical association between $X$ and $Z$.

In a linear model, the Gradient EDE is given by the weight vector $\mathbf{b}$ for interventions on $X$~\citep[][ex.~6.42]{peters2017elements}, often called the direct effect. This means that the distribution of $Y$ shifts only along the $\mathbf{b}$ axis when intervening on $X$ (see Fig.~\ref{fig:optimal_proj}). The most common approach to finding $\mathbf{b}$ is to analyse the weights of $X$ in the conditional expectation $\mathbb{E}[Y | X, Z]$. Alternatively, $\mathbf{b}$ can be obtained by maximising the partial correlation between $X$ and $\mathbf{w}^\top Y$ given $Z$. When $N$ is isotropic, the vector $\mathbf{w}$ that maximises this partial correlation is indeed $\mathbf{b}$. Since partial correlation is used in CITs, through Fisher's $z$-transformation~\citep{fisher1915frequency}, which applies the \textit{arctanh} function to the partial correlation, the Gradient EDE can be recovered by finding the direction that maximises a CIT statistic for $\mathbf{w}^\top Y$.


However, when the noise $N$ is non-isotropic, $\mathbf{b}$ may not be optimal for isolating the causal effect of $X$ on $Y$. In this case, the direction $\mathbf{b}$ may align too closely with the noise structure of $Y$, making the intervention's effect less discernible. While regression-based approaches fail to account for the noise structure, CIT statistics balance signal detection (the effect of the intervention) and noise reduction to obtain optimal power. For non-isotropic noise, it can be shown that the most discriminative direction for the intervention is $\boldsymbol{\Sigma}^{-1} \mathbf{b}$. While EDE generally fails, maximising the partial correlation recovers this optimal direction. This illustrates how identifying the direction in which $Y$ maximises a conditional independence statistic can effectively uncover the subspace of $Y$ most caused by $X$.

This example is illustrated in Fig.~\ref{fig:optimal_proj}, where we observe that projection along $\boldsymbol{\Sigma}^{-1} \mathbf{b}$ improves the separability of distributions under different interventions (here $X=0$ and $X=1$). A natural basis for representing interventions on $Y$ is then $(\boldsymbol{\Sigma}^{-1} \mathbf{b}, \mathbf{b}^\perp)$, where the first vector captures all information about the intervention, and the second contains no information. These axes need not be orthogonal. Under favourable conditions, such as a rapid decay in the eigenvalues of the covariance matrix $\boldsymbol{\Sigma}$, the noise in the distribution of $Y$ along the optimal direction $\boldsymbol{\Sigma}^{-1} \mathbf{b}$ diminishes as the dimensionality increases, concentrating the distribution's mass at a single point and achieving optimal separability of the intervention distributions.


% A key idea in our approach is that maximising a CITing statistic aligns with recovering the reduced space of the DE. To test the presence of a direct effect, we use the partial correlation $\rho_{XY.Z}$, assuming Gaussian noise. Fisher's Z-transformation of the partial correlation, $T_{\text{fisher}} = \text{artanh}(\rho_{XY.Z})$, follows a normal distribution under the null hypothesis of no direct effect. Since $\text{artanh}$ is monotonic, maximising $T_{\text{fisher}}$ is equivalent to maximising $\rho_{XY.Z}$. Therefore, we seek $\mathbf{w}$ that maximises $\rho_{X (\mathbf{w}^\top Y).Z}$, which quantifies the relationship between $\mathbf{w}^\top Y$ and $X$ conditioned on $Z$. For isotropic noise, this partial correlation is maximised when $\mathbf{w} = \mathbf{b}$, leading to:
% \begin{align*}
%     \mathbf{w}^\top Y &= \mathbf{b}^\top \mathbf{b} X + \mathbf{b}^\top \mathbf{c} Z + \mathbf{b}^\top N  = X + \mathbf{b}^\top N.
% \end{align*}
% Thus, $\mathbf{w}^\top Y$ recovers the 1-dimensional subspace spanned by the direct effect, up to a noise term $\mathbf{b}^\top N$, with $\mathbf{b}$ representing the direction of the direct effect. If the noise is assumed to be isotropic (i.e., it has the same magnitude in all directions), this representation is optimal as it maximises the information that $\mathbf{w}^\top Y$ contains about $X$. However, for a general covariance structure $\boldsymbol{\Sigma}$, this representation may be suboptimal, as the noise might obscure part of the intervention’s effect. In such cases, one should project the data along $\boldsymbol{\Sigma}^{-1} \mathbf{b}$, as this would strike the best balance between preserving the effect of the intervention on $X$ and minimising the noise level. 


\subsection{Related work}
% 
Although the idea of learning representations of effects of causes is, to our knowledge, novel, there are important connections between our work and other fields. %In the following, we briefly review the most relevant literature in this context.
It intersects two key areas of statistical learning: conditional independence testing and causal representation learning. Below, we summarise the most relevant results in these and other relevant fields.

\textbf{Conditional Independence Testing:} A variety of methods address this problem, broadly classified into nonparametric and parametric approaches. Nonparametric methods, like kernel-based tests~\citep{zhang2011kernel}, nearest-neighbour methods~\citep{runge2018conditional}, and mutual information-based tests~\citep{fukumizu2008kernel}, offer flexibility but are computationally expensive. Regression-based approaches~\citep{Shah2018TheHO} test residual dependencies or whether $X$ improves prediction of $Y$ given $Z$~\citep{chow1960tests}. Parametric methods, such as partial canonical correlation analysis (CCA)~\citep{Rao1969}, assume linearity and Gaussianity, providing computational efficiency at the cost of strong assumptions. While these methods balance complexity, power, and robustness, they do not explicitly recover an optimal subspace for testing, though they may indirectly solve an optimisation problem that achieves this, as we will demonstrate.


\textbf{Causal Representation Learning (CRL):}  CRL~\citep{scholkopf2021toward} aims to learn representations that capture causal mechanisms, enhancing generalisation, interpretability, and robustness. Leveraging invariance across environments~\citep{Arjovsky2019}, recent methods focus on learning representations for confounders or predictors to estimate causal effects~\citep{yao2018representation, Yang2021, locatello2019challenging}, with some extending to temporal data~\citep{lachapelle2022disentanglement, lippe2022causal}. While prior work targets confounder or predictor representations, our method focuses on causal effect representation of the outcome, filling a gap in previous approaches.




\textbf{Connections to Signal Detection:} Our framework relates to signal detection~\citep{macmillan2002signal, Kay1998Detection, kay1993fundamentals}, aiming to identify a deterministic signal \( X \) in noisy observations \( Y = X + N \). In climate science, this is addressed by the ``optimal fingerprint''~\citep{hasselmann1993optimal}, which maximises the signal-to-noise ratio of a linear projection of observations. This enables a direct test for the detection of climate change while recovering a useful climate pattern.

\textbf{Sufficient Dimensionality Reduction (SDR):} There are also similarities with the SDR framework~\citep{globerson2003sufficient, fukumizu2009kernel}, which aims to find a sufficient statistic $\mathbf{w}^\top X$ such that $p(Y|X) = p(Y|\mathbf{w}^\top X)$. The reduced space therefore contains all the relevant information in $X$ to predict $Y$. Our work focuses on finding a sufficient statistic specifically for the DE, namely, a subspace that retains all relevant information about the DE.


\section{Learning Framework}\label{sec:LF}

Our goal is to identify the components of $Y$ that are most caused by $X$, conditional on $Z$, assuming all confounders $C \subseteq Z$ are observed and the causal relationship $X \to Y$ is known. Specifically, we aim to find a subspace of $Y$ that encapsulates all information about interventions on $X$. To achieve this, we represent the subspace as a linear transformation, $\tilde{Y} = \mathbf{W}^\top Y \in \mathbb{R}^q$, where $\mathbf{W} \in \mathbb{R}^{d \times q}$. For simplicity, we focus on the case where $q = 1$, and identify a vector $\mathbf{w} \in \mathbb{R}^d$ such that $\mathbf{w}^\top Y \in \mathbb{R}$ captures the maximum amount of information that a one-dimensional representation of $Y$ can convey about the intervention on $X$. The case for $q > 1$ is discussed in Section \ref{sec:more_components}.



\subsection{Maximisation of a CIT statistic}

We propose a class of learning algorithms that find $\mathbf{w}$ by maximising a CIT statistic, that is, by solving the following optimisation problem:
\begin{align}\label{eq:optimisation_problem}
    \mathbf{w}^\star = \argmax_{\mathbf{w}} T(X, \mathbf{w}^\top Y, Z).
\end{align}
Here, $X$, $\mathbf{w}^\top Y$, and $Z$ are treated as random variables, as we consider a \textit{population} version of the test statistic. This formulation provides theoretical guarantees for recovering the latent structure (see Sec.~\ref{sec:latent_structure_recovery}) and the optimality of the learned representation in terms of Fisher information. We denote by $T$ the population loss and by $\mathbf{T}_n$ its empirical counterpart.

% \paragraph{Nested models test} 
Building on this idea, we propose a flexible framework based on nested predictive models of $Y$.
% , leveraging the definition of conditional independence: $p(y | x, z) = p(y | z)$. 
This approach assesses conditional independence by analysing the residuals from two regression models. The restricted model regresses $Y$ on $Z$ alone, while the full model includes both $X$ and $Z$. Conditional independence is evaluated by comparing the residuals of these models, without assuming a specific functional form between $X$ and $Y$. This flexibility makes the framework broadly applicable across various settings, accommodating complex, nonlinear relationships between variables.
% 
Let us define $ R_{\text{full}}^2(\mathbf{w}) = \mathbb{E}\left[(\mathbf{w}^\top Y - \mathbb{E}[\mathbf{w}^\top Y | X, Z])^2\right] $ and $ R_{\text{res}}^2(\mathbf{w}) = \mathbb{E}\left[(\mathbf{w}^\top Y - \mathbb{E}[\mathbf{w}^\top Y | Z])^2\right] $ as the population mean squared error when predicting $ \mathbf{w}^\top Y $ from the full model (including both $X$ and $Z$) and the restricted model (including only $Z$), respectively. A straightforward way of enforcing conditional dependence---maximising the \textit{distance} between $p(y | x, z)$ and $p(y | z)$---is to maximise the distance between the residuals of the full regression model and the restricted one. This leads to the simple loss function:
\begin{align}\label{eq:simple_population_loss}
    T_S(X, Y, Z; \mathbf{w}) = R_{\text{res}}^2(\mathbf{w}) - R_{\text{full}}^2(\mathbf{w}).
\end{align}
Under the null hypothesis, both regression models have equal predictive power, but the full model, with more degrees of freedom, yields smaller residuals. This can also be viewed through an information theory perspective, detailed further in Sec. \ref{sec:IT_and_SNR}.
% 
% \textcolor{red}{Necessary???}
% This formulation can also be interpreted through the lens of information theory~\citep{thomas2006elements}. In this context, the residuals can be viewed as measures of uncertainty, or entropy, concerning $Y$ given $X$ and $Z$. Thus, this statistic aims to maximise the conditional mutual information $I(X; \mathbf{w}^\top Y \mid Z) = H(\mathbf{w}^\top Y \mid Z) - H(\mathbf{w}^\top Y \mid Z, X)$, where $H(\mathbf{w}^\top Y \mid Z)$ and $H(\mathbf{w}^\top Y \mid Z, X)$ denote the conditional entropies. The connection between conditional mutual information, causality, and conditional independence (CI) being well established~\citep{jansing2013quantifying}. 
% 
However, this loss function is unbounded with respect to $\mathbf{w}$; thus, it is necessary to impose additional constraints on $\mathbf{w}$ to avoid trivial solutions. The most straightforward way to constrain the loss is to limit $\mathbf{w}$ to be a unit-norm vector, i.e., $\|\mathbf{w}\| = 1$. We show in Lemma~\ref{lemma:EGV_sol} in the supplementary material that this approach recovers the EDE and is thus suboptimal for non-isotropic noise.

Another approach is to constrain the full residuals to be fixed, leading to the following loss function:
\begin{align}\label{eq:population_loss_F}
    T_{F}(X, Y, Z; \mathbf{w}) = \frac{ R_{\text{res}}^2(\mathbf{w}) - R_{\text{full}}^2(\mathbf{w})}{R_{\text{full}}^2(\mathbf{w})}.
\end{align}
In the context of a linear Gaussian SCM, this statistic can be interpreted as an F-test between nested models (aka Chow test~\citep{chow1960tests}), which is commonly used for variable selection~\citep{hocking1976biometrics} or causal discovery~\citep{nogueira2022methods}. When the conditioning set $Z$ consists of the past values of $Y$, the empirical version of $T_F$ corresponds to the statistic of the well-known Granger causality test~\citep{granger1969investigating}. In this context, the maximisation of $T_F$ with respect to $\mathbf{w}$ leads to a causal representation method known as Granger PCA~\citep{varando2022learning}. This further emphasises how maximising a conditional independence testing statistic can be leveraged to uncover the direction in which $Y$ is most strongly caused by $X$.
% 
Another possible constraint is grounded in detection theory~\citep{macmillan2002signal, kay1993fundamentals}. Considering that $Y$ can be decomposed into a signal term $S$ (variance related to $X$) and a noise term $N$ (variance related to $Z$ and $Y$'s intrinsic noise), we constrain the variance of $\mathbf{w}^\top N$. Assuming that the signal and noise are additive in $Y$, this constraint relates to constraining $R^2_{\text{noise}} = \mathbb{E}[(\mathbf{w}^\top Y - \mathbb{E}[\mathbf{w}^\top Y \mid X, Z=0])^2]$. 
% This computational trick is equivalent to computing the variance of $Y$ when projected along $\mathbf{b}^\perp$. 
We thus propose the  loss function:
\begin{align}\label{eq:population_loss_detect}
    T_{D} = \frac{ R^2_{\text{res}}(\mathbf{w}) - R^2_{\text{full}}(\mathbf{w})}{R^2_{\text{noise}}(\mathbf{w})}.
\end{align}
It will be shown in Sec. \ref{sec:latent_structure_recovery} that this formulation is optimal under certain structural assumptions.

Canonical Correlation Analysis (CCA)~\citep{hotelling1992relations} and its partial variant~\citep{Rao1969} also seek a subspace that captures reduced information between $X$ and $Y$ (conditioning on $Z$ in partial CCA), enabling (conditional) independence testing. In Sec.~\ref{supp:pCCA}, we demonstrate that partial CCA aligns with our framework by interpreting it as the maximisation of a conditional independence statistic.

\subsection{Empirical estimators}\label{sec:emp_est}

We now present the practical optimisation procedure to estimate $\mathbf{w}^\star$.
% , with the asymptotic properties of these estimators discussed in Sec. \ref{prop:cv}.
Given observation (or design) matrices $ \mathbf{X} \in \mathbb{R}^{n \times p} $, $ \mathbf{Y} \in \mathbb{R}^{n \times d} $, and $ \mathbf{Z} \in \mathbb{R}^{n \times r} $, we now present empirical estimators for $\mathbf{w}_S$, $\mathbf{w}_F$ and $\mathbf{w}_D$.

% \paragraph{Nested models Based} 
Similarly, we assume that we have two estimators $ \hat{g}_{\text{full}}(X, Z) $ and $ \hat{g}_{\text{res}}(Z) $ for the conditional expectations $ \mathbb{E}[Y \mid X, Z] $ and $ \mathbb{E}[Y \mid Z] $, respectively. The learning algorithms employed to estimate these conditional expectations are not restricted, allowing users to tailor them based on their assumptions about the relationships within the data and their prior knowledge. We denote by $ \hat{\boldsymbol{\Sigma}}_{\text{full}} $, $ \hat{\boldsymbol{\Sigma}}_{\text{res}} $, and $ \hat{\boldsymbol{\Sigma}}_{\text{noise}} $ the sample covariance matrices of the residuals from the full and restricted models, as well as the noise covariance.
% , defined as:
% \begin{align*}
%     \hat{\boldsymbol{\Sigma}}_{\text{full}} &= \frac{1}{n-1}\sum_{i=1}^n(\mathbf{Y}_i - \hat{g}_{\text{full}}(\mathbf{X}_i \mathbf{Z}_i))^\top 
%     (\mathbf{Y}_i - \hat{g}_{\text{full}}(\mathbf{X}_i, \mathbf{Z}_i)) 
%     % \\
%     % \hat{\boldsymbol{\Sigma}}_{\text{res}} &= \frac{1}{n-1}\sum_{i=1}^n(\mathbf{Y}_i - \hat{g}_{\text{res}}(\mathbf{Z}_i))^\top (\mathbf{Y}_i - \hat{g}_{\text{res}}(\mathbf{Z}_i)) \\
%     % \hat{\boldsymbol{\Sigma}}_{\text{noise}} &= \frac{1}{n-1}\sum_{i=1}^n(\mathbf{Y}_i - \hat{g}_{\text{full}}(\mathbf{X}_i, \mathbf{0}))^\top
%     %  (\mathbf{Y}_i - \hat{g}_{\text{full}}(\mathbf{X}_i, \mathbf{0})).
% \end{align*}
% and similarly for $\hat{\boldsymbol{\Sigma}}_{\text{res}} $, and $\hat{\boldsymbol{\Sigma}}_{\text{noise}}$.
The three population losses can be maximised by solving the generalised eigenvalue (GEV) problem $\hat{\mathbf{M}} \mathbf{w} = \lambda \hat{\mathbf{N}} \mathbf{w}$, where $\hat{\mathbf{M}} = \hat{\boldsymbol{\Sigma}}_{\text{res}} - \hat{\boldsymbol{\Sigma}}_{\text{full}}$ and $\hat{\mathbf{N}}$ corresponds to the constraints on $\mathbf{w}$: $\hat{\mathbf{N}} = \mathbf{I}$ for $T_S$, $\hat{\mathbf{N}} = \hat{\boldsymbol{\Sigma}}_{\text{full}}$ for $T_F$, and $\hat{\mathbf{N}} = \hat{\boldsymbol{\Sigma}}_{\text{noise}}$ for $T_D$.
Given random realisations of $(X, Y, Z)$, the population matrices $\mathbf{M}$ and $\mathbf{N}$ are random, typically following a Wishart distribution. Under this condition, the first eigenvalue of the GEV problem, denoted by $\Lambda_1$, is also random. Upon observing data $(\mathbf{X}, \mathbf{Y}, \mathbf{Z})$, the empirical covariances $\hat{\mathbf{M}}$ and $\hat{\mathbf{N}}$ are fixed, and we obtain a realisation $\lambda_1 \sim \Lambda_1$ with corresponding eigenvector $\mathbf{w}_1$. We denote the eigen-pairs $(\lambda_S, \mathbf{w}_S)$, $(\lambda_F, \mathbf{w}_F)$, and $(\lambda_D, \mathbf{w}_D)$ as those corresponding to the first eigenvalues for the losses $T_S$, $T_F$, and $T_D$, respectively.


\begin{algorithm}
\caption{Direct Effect Analysis Algorithm}
\label{algo:compact_GEV_power_iteration}
\KwIn{Data matrices $\mathbf{X}$, $\mathbf{Y}$, $\mathbf{Z}$, components $K$, $solver \in \{T_S, T_F, T_D\}$, learning algorithms $g_{res}$, $g_{full}$.}
\KwOut{Matrix $\mathbf{W} = [\mathbf{w}_1, \dots, \mathbf{w}_K]$}
Initialize $\mathbf{W} \gets [~~]$, $\mathbf{Y}^{(1)} \gets \mathbf{Y}$\\
\For{$k = 1$ \KwTo $K$}{
    Train $\hat{g}_{res}$, $\hat{g}_{full}$ using $(\mathbf{Y}^{(k)}, \mathbf{X}, \mathbf{Z})$\\  
    $\mathbf{N}_{\text{res}} \gets \mathbf{Y}^{(k)} - \hat{g}_{res}(\mathbf{Z})$\\  
    $\mathbf{N}_{\text{full}} \gets \mathbf{Y}^{(k)} - \hat{g}_{full}(\mathbf{X}, \mathbf{Z})$\\
    \If{solver == $T_D$}{
        $\mathbf{N}_{\text{noise}} \gets \mathbf{Y}^{(k)} - \hat{g}_{full}(\mathbf{X}, \mathbf{0})$
    }

    Compute $\mathbf{\Sigma}_{\text{res}}, \mathbf{\Sigma}_{\text{full}}$, and (if $T_D$) $\mathbf{\Sigma}_{\text{noise}}$ from residuals.\\
    $\hat{\mathbf{M}} \gets \mathbf{\Sigma}_{\text{res}} -  \mathbf{\Sigma}_{\text{full}}$\\
    $\hat{\mathbf{N}} \gets \mathbf{I}$ (if $T_S$), $\hat{\mathbf{N}} \gets \mathbf{\Sigma}_{\text{full}}$ (if $T_F$), $\hat{\mathbf{N}} \gets\mathbf{\Sigma}_{\text{noise}}$ (if $T_D$).\\
    
    Solve GEV: $\hat{\mathbf{M}}\mathbf{w} = \lambda \hat{\mathbf{N}}\mathbf{w}$, normalize and append $\mathbf{w}_k$ to $\mathbf{W}$.\\
    $\mathbf{Y}^{(k+1)} \gets \mathbf{Y}^{(k)} - \sum_{i=1}^{k} \mathbf{Y}^{(k)} \mathbf{w}_i \mathbf{w}_i^\top$
}
\Return $\mathbf{W}$
\end{algorithm}



The convergence properties of these estimators are presented in Th. \ref{prop:cv} in the supplementary materials. Additional details on the estimation of the conditional expectations, as well as the estimation of other components and the stability of the solution, can be found in Section \ref{sec:Estim_details}.


\section{Theoretical guarantees}\label{sec:Theory}

In this section, we discuss the theoretical properties of the maximisation of the statistics introduced earlier. We consider the distribution of $ (X, Y, Z) \sim P $ entailed within the following Structural Causal Model (SCM):
\begin{align}\label{eq:scm}
    Y &:= \mathbf{b} \phi(X) + \psi(Z) + N_y,
\end{align}
where $ \phi(x): \mathbb{R}^p \to \mathbb{R} $, $ \psi(z): \mathbb{R}^r \to \mathbb{R}^d $, $\mathbf{b} \in \mathbb{R}^{d}$ and with $N_y \sim \mathcal{N}(0, \mathbf{\Sigma})$. Again, the relationship between $X$ and $Z$ is left undefined as applying $do(X)$ breaks any statistical dependencies that existed in the observational setting. We denote by $\mathbf{\Sigma}_{\psi(z)}$ the covariance of $\psi(Z)$. For the remainder of this section, we assume that the intervention $\phi(x)$ is bounded. 
% 
Concretely, we assume that $X$ goes through an information bottleneck of dimension one. The vector $\mathbf{b}$ thus gives the direction of the causal effect, as an intervention on $X$ shifts $Y$ along the axis $\mathbf{b}$. Note that if $\phi(x)$ is linear, it corresponds to the Gradient EDE.
% However $\mathbf{b}$ might not be an optimal axis to project $Y^x$ as this would not take into account the the noise structure of $Y^x$. 
% 
All the proofs are given in Sec. \ref{sec:Proofs} in supplementary materials.

The notion of the ``direction most caused by'' an intervention is ambiguous when considering the full distribution rather than just its mean, as in EDE. We seek a direction along which interventions on $X$ induce the most significant changes in the distribution of $Y$, capturing the overall distribution shift rather than relying solely on mean effects. To quantify this, we analyze the signal-to-noise ratio (SNR) and Fisher information, which serve as natural measures of intervention-induced variation. In the Gaussian case, where distributions are fully characterized by their first two moments, these measures provide a principled way to identify the most affected direction. Given our generative model, we show that our algorithm optimally identifies directions that maximize SNR, effectively separating interventional distributions. Furthermore, under the assumption that $\phi(x)$ is linear, we prove that these directions also maximize the Fisher information of $\mathbf{w}^\top Y$ with respect to the intervention, ensuring the most effective disentanglement of $Y$ under infinitesimal intervention shifts.

\subsection{Causal effect representation}\label{sec:latent_structure_recovery}

To better understand the properties of the different learning algorithms, it is useful to decompose the intervention distribution \( Y^x \) into a signal term and a noise term $Y^x = S(x) + N$ where \( S(x) = \mathbf{b} \phi(x) \) represents the EDE, a non-random component of \( Y^x \), and the noise term is given by \( N = \psi(Z) + N_y \), which remains random. We define the SNR of the transformed variable \( \mathbf{w}^\top Y^x = \mathbf{w}^\top S(x) + \mathbf{w}^\top N \) as  
\begin{align}\label{eq:SNR}
    \gamma^2(\mathbf{w}) = \frac{(\mathbf{w}^\top S(x))^2}{\mathbf{w}^\top \mathbf{\Sigma}_N \mathbf{w}},
\end{align}
where \( \mathbf{\Sigma}_N \) is the covariance matrix of the noise term. Notably, when the conditioning set \( Z \) is accounted for, the noise covariance \( \mathbf{\Sigma}_N \) simplifies to \( \mathbf{\Sigma} \). In this case, the optimality results that will be established for \( \mathbf{w}_D \) also apply to \( \mathbf{w}_F \).  
% 
We now present some optimality results related to the SNR. This metric is tied to an optimal representation because, as the SNR increases, the distribution becomes more concentrated around the signal $S(x)$. Thus, the direction that maximises the SNR is the one for which small perturbations of the intervention are most observable. We thus say that a weight vector $\mathbf{w}$ is optimal if it maximises $\gamma^2(\mathbf{w})$. For general noise structures, $\mathbf{w}_D$ is shown to be optimal.

% 
\begin{proposition}[General optimality]\label{prop:snr_max}
    Assuming $P$ is entailed in the SCM in \eqref{eq:scm}, we have that $\mathbf{w}_D$ is optimal.
\end{proposition}
% 
% Note that it can be similarly shown that $\mathbf{w}_F$ is optimal with regard to $Y^x|Z$, thus all theoretical guarantees for $\mathbf{w}_D$ apply to $\mathbf{w}_F$ where the distribution $Y^x$ is adjusted for $Z$.
Under stronger assumptions -- isotropy of the noises -- both $\mathbf{w}_S$ and $\mathbf{w}_F$ are shown to be optimal.
% 
\begin{proposition}[Optimality under isotropic noise]\label{prop:snr_max_others}
    Assuming that $P$ is entailed in the SCM in \eqref{eq:scm} and that $\mathbf{\Sigma}_N$ is isotropic, we have that both $\mathbf{w}_S$ and $\mathbf{w}_D$ are optimal. Moreover, if $\mathbf{\Sigma}$ is also isotropic, then $\mathbf{w}_F$ is also optimal.
\end{proposition}
This proposition implies that when the effects of $X$ and $Z$ are assumed to be separable, $\mathbf{w}_D$ is optimal in the sense that it maximises the SNR. 
% 
% We now provide different guarantees different conditions for each estimator to converge in the sense that their SNR grows to infinity as $d$ goes to infinity, meaning that the signal completely dominates the noise.

We now present different guarantees for the learned representation, demonstrating that in the large-dimensional regime, and under specific conditions on the characteristics of \( \mathbf{b} \), \( \boldsymbol{\Sigma} \), and \( \boldsymbol{\Sigma}_{\psi(z)} \), the signal-to-noise ratio improves as the dimensionality of \( Y \) increases, such that the signal of \( \mathbf{w}^\top Y \) completely dominates its noise.

\begin{proposition}[Noise term behavior]\label{prop:ntb}
    Let $\|\mathbf{b}\|^2 = o\left(\nu_1(d)\right)$, $\mathbf{b}^\top (\boldsymbol{\Sigma} + \boldsymbol{\Sigma}_{\psi(z)}) \mathbf{b} = o\left(\nu_2(d)\right)$, $\mathbf{b}^\top \boldsymbol{\Sigma}^{-1} \mathbf{b} = o\left(\nu_3(d)\right)$, $\mathbf{b}^\top (\boldsymbol{\Sigma}^{-1} + \boldsymbol{\Sigma}^{-1} \boldsymbol{\Sigma}_{\psi(z)} \boldsymbol{\Sigma}^{-1}) \mathbf{b} = o\left(\nu_4(d)\right)$, and $\mathbf{b}^\top (\boldsymbol{\Sigma} + \boldsymbol{\Sigma}_{\psi(z)})^{-1} \mathbf{b} = o\left(\nu_5(d)\right)$. Here, each $\nu_i$ denotes a rate of growth with respect to $d$.

    Assume the distribution $P$ follows the structural causal model in Eq.~\eqref{eq:scm}, and consider the following conditions: \textbf{1.} $\lim_{d \to \infty} \frac{\nu_1(d)}{\nu_2(d)} = \infty$, \textbf{2.} $\lim_{d \to \infty} \frac{\nu_3^2(d)}{\nu_4(d)} = \infty$ and \textbf{3.} $\lim_{d \to \infty} \nu_5(d) = \infty$.

    The following convergence properties hold: $\gamma^2(\mathbf{w}_S) \to \infty$ if condition \textbf{1} holds, $\gamma^2(\mathbf{w}_F) \to \infty$ if condition \textbf{2} holds, $\gamma^2(\mathbf{w}_D) \to \infty$ if condition \textbf{1}, \textbf{2} or \textbf{3} holds.
    % \begin{enumerate}
    %     \item 

    %     \item 

    %     \item 
    % \end{enumerate}
\end{proposition}

In general, the above conditions reflect the fact that $\mathbf{b}$ is unaligned with $\boldsymbol{\Sigma}$, meaning that large values of $\mathbf{b}$ correspond to small values of $\boldsymbol{\Sigma}$ and $\boldsymbol{\Sigma}_{\psi(Z)}$. This relationship can also be interpreted in terms of the growth of the largest eigenvalue of $\boldsymbol{\Sigma}$ or of $\|\mathbf{b}\|^2$, independently. All of these conditions are related to the observation that as the dimensionality increases, $Y^x$'s distribution contains `more signal' relative to its noise level. This phenomenon occurs, for example, when the sources of noise are limited and the resolution of the observations is increased. We provide further details and insights on these assumptions in Sec.~\ref{sec:noise_term_behavior}.
% 
As discussed above, a strong SNR indicates that the recovered signal is closer to the information bottleneck $\phi(x)$. More importantly, it also implies better separability of the distributions of $Y^x$ along the projected axis. This can be formalised by considering the Fisher information of $\mathbf{w}^\top Y^x$ with respect to $x$, given by:
% 
\begin{align*}
I_{\mathbf{w}}(x) = \mathbb{E} \left[U(x) U(x)^\top \right],
\end{align*}
with $U(x) =  \nabla_x \log P(\mathbf{w}^\top Y \mid  do(X= x))$ denoting the score function.
% 
We now show that for linear models, the Fisher information and the SNR of $\mathbf{w}^\top Y^x$ are equivalent up to a positive scaling factor.
% 
\begin{proposition}[Equivalence between Fisher information and SNR]\label{prop:SNR_FI_equiv}
    Consider an SCM as described in \eqref{eq:scm}, and let the intervention function be $\phi(x) = \mathbf{v}^\top x$, where $\mathbf{v} \in \mathbb{R}^p$. Then, the SNR is proportional to the Fisher information of the intervention, i.e., $I_{\mathbf{w}}(x) = \alpha \gamma^2(\mathbf{w})$ with $\alpha \in \mathbb{R}^+$.
\end{proposition}
% 
Applying this result to $T_D$, we obtain an optimality guarantee in terms of Fisher information.
% 
\begin{corollary}
    Under the assumptions of Prop. \ref{prop:SNR_FI_equiv}, the optimal solution $\mathbf{w}_D$ maximises the Fisher information $I_{\mathbf{w}}(x)$.
\end{corollary}
% 
A similar result to Prop. \ref{prop:ntb} can also be derived for Fisher information under the assumption of a linear effect of $X$ on $Y$. Thus, the optimality conditions for recovering the bottleneck structure $\phi(x)$ translate into conditions for the discriminative power of the learned representation. 
% 
In this setting, maximising the SNR is equivalent to maximising the Fisher information, which quantifies the sensitivity of the projected distribution to changes in the intervention parameter. This can be better understood by examining the relationship between Fisher information and the Kullback-Leibler divergence (see Sec.~\ref{sec:IT_and_SNR}). Specifically, it measures the distance between parametric distributions, where in this case, the parameter corresponds to the intervention value. For linear models, $T_D$ is optimal as it maximises the distributional divergence induced by infinitesimal perturbations of the intervention. This enhances the discriminative power of the learned representation across different interventions. Moreover, higher Fisher information indicates that the learned representation retains more information about the intervention.
% A more detailed exploration of this idea is left for future work.
% 
\subsection{Testing the presence of a direct effect}\label{sec:lantent}
% 
We now explore a direct implication of our problem formulation. Since we are maximising a test statistic for conditional independence testing, we can derive the distribution of the loss function under the null hypothesis. Consequently, one can reject the hypothesis of conditional independence at level $ \alpha $ if the value of the loss function, specifically the largest eigenvalue $ \lambda_{1} $, exceeds a critical threshold. 
% An intuitive, albeit less mathematically rigorous, interpretation of the loss function is that a higher value indicates a stronger effect of $ X $ on $ Y $ in the extracted component.
% 
\begin{proposition}[Distribution of $\lambda_F$ under conditional independence] \label{prop:lambda_F_distrib} 
    Let the distribution $P$ be induced by the SCM in \eqref{eq:scm} with linear assignments and Gaussian noise, and assume $p = q = 1$. Under the null hypothesis $H_0: X \indep Y \mid Z$, the largest root $\Lambda_F$ is $F$-distributed such that $(dfd/dfn)\Lambda_F \sim F(dfn, dfd)$, where $dfn = d$ and $dfd = n-p-r-1$.
\end{proposition}  

Finding the distribution of $\Lambda_D$ is more challenging. Instead, we establish an upper bound on $\Lambda_D$'s distribution, allowing the distribution of $\lambda_F$ to serve as a proxy for computing upper bounds on the p-values of $\Lambda_D$.  

\begin{proposition}[Upper bound on $\Lambda_D$ under conditional independence]  
    Under similar assumptions as in Prop.~\ref{prop:lambda_F_distrib}, we have under the null hypothesis $H_0: X \indep Y \mid Z$ that $P(\Lambda_D \geq \lambda_D \mid H_0) \leq P(\Lambda_F \geq \lambda_D \mid H_0)$.
\end{proposition}  

% 
Testing is straightforward: the null hypothesis is rejected if $(dfd/dfn) \hat{\lambda}_1$ exceeds the critical value of the $F(dfn, dfd)$ distribution at the chosen level. This property is useful for testing whether the learned representation (Sec.~\ref{sec:latent_structure_recovery}) captures a meaningful effect of $X$ on $Y$.


% \subsection{Asymptotic properties of the empirical estimators}


% We now show that under common assumptions, specifically that there are two unbiased estimators $ \hat{g}_{\text{full}} $ and $ \hat{g}_{\text{res}} $ with convergence rates $ \kappa_1(n) $ and $ \kappa_2(n) $, the estimators proposed in \eqref{eq:emp_loss_T_C} are consistent with their population counterparts. Furthermore, we demonstrate that their convergence rate depends linearly on the convergence rates $ \kappa_1(n) $ and $ \kappa_2(n) $.

% \begin{proposition}[Convergence Rate of F-Test Based Losses]
%     Assuming the following conditions hold:
%     \begin{enumerate}
%         \item $\mathbb{E} \| \hat{g}_{\text{full}}(\mathbf{X}_i, \mathbf{Z}_i) - \mathbb{E}[\mathbf{Y}_i | \mathbf{X}_i, \mathbf{Z}_i] \|^2 = o_P(\kappa_1(n)) $ 
%         \item $ \mathbb{E}  \| \hat{g}_{\text{res}}(\mathbf{Z}_i) - \mathbb{E}[\mathbf{Y}_i | \mathbf{Z}_i] \|^2 = o_P(\kappa_2(n)) $ 
%         \item $ \lambda_M^{(1)} - \lambda_M^{(2)} = \delta_M > 0 $ where $ \lambda_M^{(1)} > \lambda_M^{(2)} \geq \dots \geq \lambda_M^{(d)} $ are the eigenvalues of $ \mathbf{M} $,
%         \item $ \lambda_N^{(1)} - \lambda_N^{(2)} = \delta_N > 0 $ where $ \lambda_N^{(1)} > \lambda_N^{(2)} \geq \dots \geq \lambda_N^{(d)} $ are the eigenvalues of $ \mathbf{N} $,
%         \item $ \mathbb{E}  \|Y - \mathbb{E}[Y|X, Z]\|^2 \leq N_{\text{full}} $ and $ \mathbb{E} \|Y - \mathbb{E}[Y|Z]\|^2 \leq N_{\text{res}} $
%     \end{enumerate}

%     Let $ \mathbf{w}_1 $ be the optimal solution of \eqref{eq:simple_population_loss}, \eqref{eq:population_loss_F}, or \eqref{eq:population_loss_detect}, and let $ \hat{\mathbf{w}} $ be the empirical solution of their respective empirical estimator. We have that 
%     \begin{align}
%         \mathbb{E}[\|\mathbf{w}_1 -\mathbf{\hat{w}}_0\|^2_2] = o(\sqrt{\kappa_1(n)} + \sqrt{\kappa_2(n)}).
%     \end{align}

% \end{proposition}

% We refer the reader to the multivariate analysis literature~\citep{Rao1969, Bilodeau1999TheoryOM, Borchani2015} for the asymptotic properties of $ \mathbf{T}_C $, which have been studied more extensively.




\section{Experiments}\label{sec:Experiments}

In this section, we present the results of our extensive simulation experiments designed to support our theoretical findings. Additionally, we provide a straightforward use case from climate science detection and attribution to illustrate the practical relevance of our approach. The code for all experiments is available at this \href{https://anonymous.4open.science/r/DEA_UAI2025-EBBF/}{github repository}.

\subsection{Simulation experiments}\label{sec:simulations}


\begin{figure}
    \centering
    \includegraphics[width=1\linewidth]{uai2025-template/figures/DR_noise_behavior.png}
    \caption{Correlation between $\mathbf{w}^\top Y$ and $\phi(X)$ as $d$ increases. $T_D$ consistently outperforms all methods, recovering $\phi(X)$ as $d$ grows, provided that $\mathbf{b}$ grows faster than $\mathbf{\Sigma}$. When $T_F$, $T_S$ or PCA are not visible, they are overlapped by $T_D$. See Fig.~\ref{fig:DR_noise_behavior_Noise} for the (5, 95) percentiles.}
    \label{fig:DR_noise_behavior_noNoise}
\end{figure}


\begin{figure*}[ht]
    \centering
    \includegraphics[width=1\linewidth]{uai2025-template/figures/Power_tests_a05.png}
    \caption{Power of the test for $\alpha = 0.05$. Detailed experiments with different values of $\alpha$ are available in Fig.~\ref{fig:power_all}.}
    \label{fig:test_results_linear}
\end{figure*}



We simulate data from a linear SCM with Gaussian noise, where $Z$ acts as a confounder for both $X$ and $Y$ \eqref{eq:scm}. The noise terms $N_x$ and $N_z$ are independent. For the nonlinear case, we define $f_a(z) := \exp(-z^2/2)\sin(az)$, where $a$ controls nonlinearity. In the linear case, $f_a$ is set to the identity. The coefficients $\Gamma, \mathbf{b}, \mathbf{C}, \mathbf{D}$ are uniformly sampled from $[0, 1]$. We run 20 repetitions for each sample size $n$ and dimension $d$, reporting median values and quartiles.

\paragraph{Causal Effect Representation}
% 
We assess the performance of our algorithm in recovering the direct effect of $X$ on $Y$, modeled as $f_a(\Gamma^\top X)$. The recovery is tested as $d$ increases and with varying noise structures. We set $p = r = 10$, and use $n = 4000$ samples for robust evaluation. Performance is evaluated by the absolute correlation between $\mathbf{w}^\top Y$ and $f_a(\Gamma^\top X)$, comparing nested models ($T_S$, $T_F$, $T_D$) against PCA and partial CCA (pCCA) as baselines.
% 
To understand the contexts where learning algorithms may fail to fully recover $\phi(X)$, we consider various configurations of $\mathbf{\Sigma}$ and $\mathbf{b}$. We set $\mathbf{\Sigma}$ to be diagonal and explore four sets of entries for $Diag(\mathbf{\Sigma})$ and $\mathbf{b}$: $(1, \dots, i, \dots, d)$, $(1, \dots, 1)$, $(1, \dots, 1/i, \dots, 1/d)$, and $(1, \dots, 1/i^2, \dots, 1/d^2)$. Our main observation is that when $\mathbf{b}$ grows slowly relative to $\mathbf{\Sigma}$, none of the methods fully recover the signal. Specifically, pCCA tends to converge to a correlation of approximately 0.75, as it only recovers the part of $\phi(X)$ independent of $Z$---the signal correlated with $\psi(Z)$ is regressed out from both residuals before regression. This behavior is clearer in Appendix figure~\ref{fig:DR_noise_behavior_indep}, where $X$ and $Z$ are simulated as independent variables, and pCCA can recover $\phi(X)$. Additionally, we observe that $T_F$ and $T_D$ outperform $T_S$ when $\mathbf{b}$ grows too slowly relative to the noise. Both $T_F$ and $T_D$ effectively control the variance contributions from $\mathbf{\Sigma}$, resulting in better performance in these challenging contexts (Figure~\ref{fig:DR_noise_behavior_noNoise}).
% 
We analyse recovery across various noise configurations for $\mathbf{\Sigma}$: Diagonal, Full-rank, and Low-rank (rank = 10). We also test three weighting schemes: \textit{equal}, \textit{strong\_N\_Y}, and \textit{strong\_Z}, setting $(u, v, w)$ in \eqref{eq:simulation_SCM} by $(1/3, 1/3, 1/3)$, $(0.1, 0.1, 0.8)$, and $(0.1, 0.8, 0.1)$, respectively. As shown in Appendix figure \ref{fig:DR}, $T_D$ consistently outperforms other methods, with correlation approaching 1 as $d$ increases. Similar trends are observed in nonlinear and high-dimensional cases (Appendix figures \ref{fig:DR_nonlinear}, \ref{fig:DR_high_dimensional}).

\paragraph{Level and Power of the Test}
% 
We assume that the data are generated from a linear SCM with Gaussian noise, where $f_a(Z) = Z$ and set $p = r = q = 1$. Our analysis compares tests based on the optimisation of $T_F$ and $T_D$ against four common conditional independence (CI) tests: partial CCA ~\citep{Rao1969}, the Generalised Covariance Measure (GCM)~\citep{Shah2018TheHO}, Fisher's Z test~\citep{kalisch2007estimating}, and the Kernel Conditional Independence (KCI) test~\citep{zhang2012}. The primary focus is on test performance with respect to sample size and $Y$'s dimensionality.
% 
All tests maintain valid control of false positives when $d < n$ (see figure \ref{fig:type_I_control}), ensuring effective Type I error control. However, for test power (see figure \ref{fig:test_results_linear}), Fisher's Z and KCI show lower performance, especially for small samples and large $d$, due to their broader hypothesis set $\mathcal{P}$, which includes potentially nonlinear relationships. 
% 
Tests based on $T_F$, $T_D$ and pCCA leverage $Y$'s dimensionality, showing better performance with higher dimensions for fixed sample sizes. This contrasts with Fisher's $Z$, whose performance does not increase with $d$.

\subsection{Real-world experiments}

We present two real-world climate detection and attribution experiments: the first leverages the algorithm's ability to learn disentangled representations, and the second applies $T_D$ to test causal effects.

\paragraph{Separating internal climate variability from the externally forced response.}

% In the field of climate change detection and attribution, a major challenge lies in disentangling the portion of climate change driven by external forcings (e.g., greenhouse gas emissions) from internal climate variability (i.e., natural climate fluctuations)~\citep{Sippel2019}. A key question remains whether internal variability itself is influenced by external forcings. Let $ \mathbf{b}\phi(x) $ represent the forced response signal to greenhouse gas (GHG) emissions, $\mathbf{C}^\top Z $ denote the effect of a proxy $ Z $ for internal variability (e.g., Sea Level Pressure - SLP), and $ N_y $ the measurement noise. The observed climate field (e.g., temperature) can then be modeled as $Y = \mathbf{b}\phi(X) + \mathbf{C} Z + N_y$.
% We use $ n = 2000 $ observations from CMIP6 climate simulations~\citep{Eyring2016}, where the spatial resolution for both $ X $ and $ Y $ is $ p = d = 648 $. Using these simulations, we estimate the response pattern $ \mathbf{w} $ of $ X $ on $ Y $ through the learning algorithm $ T_D $ by projecting $Y$ in the null space $\mathbf{b}^\perp$ to get an estimate of internal variability and take it's. 
% % 
% By removing the direct effect $ \mathbf{b}\phi(X) $ of internal variability from $ Y $, we achieve a significant reduction in the noise associated with the response to external forcings (see \ref{fig:climate_experiment} in Appendix). This reduction enhances the reliability of detecting the climate signal attributed to greenhouse gas emissions, thereby enabling more robust assessments of climate change impacts.


We evaluate the ability of our method to disentangle internal climate variability from the externally forced response using temperature fields from CESM2 historical climate simulations~\citep{danabasoglu2020community}. Use of the optimal projection $\mathbf{w}_D$ is compared against two commonly used baselines in climate science: \textit{Detrending} and \textit{Dynamical Adjustment}~\citep{Sippel2019}. 
% 
To achieve this, we model internal variability using Sea Level Pressure (SLP) as a proxy and estimate the externally forced response using a smoothed version of the Global Mean Temperature (GMT). $T_D$ learns a projection that isolates the internal component of temperature fluctuations while preserving their dynamical structure. Once trained, the model allows us to separate the forced and internal components of temperature fields.
% 
Figure~\ref{fig:mse_boxplot} presents the mean squared error (MSE) for trend estimation across different algorithms. $T_D$ performs comparably to Detrending for reconstructing forced trends but performs better in recovering internal variability trends, providing better worst-case control. The spatial distribution of estimated internal trends (Figures~\ref{fig:trends_maps_DEA} and \ref{fig:trends_maps_Detrending}) further highlights that both methods capture large-scale patterns but tend to underestimate trends in polar regions. Additionally, Figure~\ref{fig:climate_experiment_forced_response_trends_TS} illustrates that  $T_D$ effectively reconstructs the forced response across different locations, although both $T_D$ and Detrending struggle in highly variable regions.
% 
Overall, our approach provides a principled framework for disentangling forced and internal climate variability.
\paragraph{Climate change attribution.}

\begin{table}
    \centering
    \caption{Performance comparison of different approaches for detecting various effects. Bold values indicate the lowest Type II Error and Type I Error at level $5\%$.}
    \label{tab:effects}
    \begin{tabular}{llcc}
        \toprule
        \bfseries Effect & \bfseries Approach & \bfseries Type II Err. & \bfseries Type I Err. \\
        \midrule
        $\mathrm{CO}_2$ & DEA & $\mathbf{0.00}$ & $\mathbf{0.00}$ \\
                        & GMT Reg & $0.06$ & $0.30$ \\
                        & EOF & $0.06$ & $0.30$ \\
        \midrule
        $\mathrm{CH}_4$ & DEA & $\mathbf{0.52}$ & $\mathbf{0.00}$ \\
                        & GMT Reg & $0.70$ & $0.30$ \\
                        & EOF & $0.74$ & $0.26$ \\
        \midrule
        Aerosol & DEA & $\mathbf{0.00}$ & $\mathbf{0.04}$ \\
                & GMT Reg & $0.76$ & $0.24$ \\
                & EOF & $0.76$ & $0.24$ \\
        \midrule
        Land Use & DEA & $\mathbf{0.00}$ & $\mathbf{0.14}$ \\
                 & GMT Reg & $0.36$ & $0.64$ \\
                 & EOF & $0.74$ & $0.26$ \\
        \bottomrule
    \end{tabular}
\end{table}

In this experiment, we examine the direct effects of external forcing and investigate whether external forcing factors---such as aerosols, $\mathrm{CO}_2$, $\mathrm{CH}_4$, and land use---have a direct effect on the annual mean temperature field ($Y_{\text{factual}}$). Using 50 historical climate simulations from CESM2, we compute counterfactual temperature fields ($Y_{\text{counterfactual}}$), following the methodology described in Eq.~\eqref{eq:intern_forced} in the supplementary materials. We apply the algorithm $T_D$ to test for the significance of each forcing ($X$) while controlling for the effects of the others ($Z$). Our results are compared to two common approaches in climate attribution~\citep{lean2008natural}: regression-based tests where forcings are assessed for their significance in predicting climate patterns, specifically Global Mean Temperature (spatial average) and the first Empirical Orthogonal Function (EOF) of the climate field. The results in Table~\ref{tab:effects} demonstrate that our method effectively controls Type~I error (when applied to $Y_{\text{factual}}$) and Type~II error (when applied to $Y_{\text{counterfactual}}$) and outperforms the other approaches. These results highlight the potential of our method in attributing causal effects of external forcing, with implications for its use in analysing observational data, such as the ERA5 or HadCRUT datasets.



\section{Conclusion}

This paper proposes a novel framework for recovering the direct effect of low-rank interventions in multivariate response variables. Our approach combines conditional independence testing and causal representation learning, enabling robust estimation of direct causal effects in multivariate settings.  
% 
We showed that the choice of test statistic \(T\) significantly influences algorithm performance, with different choices yielding varying effectiveness. Notably, the learning algorithm that controls noise variance exhibits stronger theoretical guarantees and improved performance in simulations, even in nonlinear settings.  
% 
Our results highlight that performance depends on noise matrix assumptions, particularly as dimensionality \(d\) increases, leading to better discriminative power of intervention distributions. Furthermore, the loss function serves as a statistic in CI tests, allowing us to assess whether \(X\) significantly affects \(Y\) while enhancing interpretability. Our approach ensures robustness in multivariate settings and enables extensions to other CI test statistics and regression models, fostering broader applicability.  
% 
Future work will derive the distribution of the optimal learning loss under null and alternative hypotheses to enhance test power. We will also explore nonlinear representations via projection into a reproducing kernel Hilbert space and assess cases where the effects of \(X\) and \(Z\) on \(Y\) are not linearly separable. Additionally, we aim to further investigate this problem from an information-geometric perspective.  



\newpage































\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    Gherardo Varando and Homer Durand conceived the original idea of Direct Effect Analysis, generalising an earlier concept introduced by Gustau Camps-Valls and Gherardo Varando. Durand and Varando developed the methodology and algorithmic framework, with Durand deriving the theoretical guarantees. Durand implemented the code and conducted the experiments. Gustau Camps-Valls supervised the project and provided critical guidance. All authors collaboratively wrote and edited the manuscript.

\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    The authors acknowledge funding from the Horizon projects AI4PEX (grant agreement 101137682) and ThinkingEarth (grant agreement 101130544), and from the European Research Council (ERC) under the ERC Synergy Grant USMILE (grant agreement 855187).
\end{acknowledgements}

% References
\bibliography{uai2025-template/bib}

\newpage

\onecolumn


\appendix

\input{uai2025-template/supp_2}


\end{document}
