\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[american]{babel}
\usepackage[ruled,linesnumbered]{algorithm2e}

%Package for subfigures ---------------------------
\usepackage{caption}
\usepackage{subcaption}
%--------------------------------------------------
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
% %--------------------------------------------------
\usepackage{bm}
\usepackage{color}
\usepackage{soul}           % For strikethrough text.
\usepackage{booktabs}       % professional-quality tables
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
% \usepackage{enumerate}
\usepackage{graphicx} % DO NOT CHANGE THIS
\DeclareGraphicsExtensions{.pdf,.png,.jpg}
\graphicspath{{img/}}
\usepackage{comment}
% \usepackage[colorlinks, citecolor=blue]{hyperref}
% \hypersetup{colorlinks, citecolor=blue}
\usepackage{centernot}    
\usepackage{float}
\usepackage[titletoc,title]{appendix}

\input{macros}
\input{la_macros}
\input{mymacros}
\usepackage{tikz}
\usepackage{tkz-graph}
\usetikzlibrary{patterns}
\usetikzlibrary{shapes,decorations}

\overfullrule=1mm
\allowdisplaybreaks

%% Some suggested packages, as needed:
\usepackage[]{natbib} % has a nice set of citation styles and commands
\bibliographystyle{agsm}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{siunitx} % for proper typesetting of numbers and units


\newcommand{\papertitle}{Identifying Causal Changes Between Linear Structural Equation Models}
\title{\papertitle}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<malik83@purdue.edu>}{Vineet Malik}{}}
\author[2,3]{Kevin Bello}
\author[4]{Asish Ghoshal}
\author[5]{Jean Honorio}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Department\\
    Purdue University\\
    West Lafayette, Indiana, USA
}
\affil[2]{%
    Machine Learning Department\\
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[3]{%
    Booth School of Business\\
    University of Chicago\\
    Chicago, Illinois, USA
  }
\affil[4]{%
    Meta AI\\
    Seattle, Washington, USA
  }
\affil[5]{%
    School of Computing and Information Systems\\
    The University of Melbourne\\
    Melbourne, Australia
  }
  
\begin{document}
\maketitle

\begin{abstract}
Learning the structures of structural equation models (SEMs) as directed acyclic graphs (DAGs) from data is crucial for representing causal relationships in various scientific domains. Instead of estimating individual DAG structures, it is often preferable to directly estimate changes in causal relations between conditions, such as changes in genetic expression between healthy and diseased subjects.
This work studies the problem of directly estimating the difference between two linear SEMs, i.e.  \textit{without estimating the individual DAG structures}, given two sets of samples drawn from the individual SEMs. We consider general classes of linear SEMs where the noise distributions are allowed to be Gaussian or non-Gaussian and have different noise variances across the variables in the individual SEMs. We rigorously characterize novel conditions related to the topological layering of the structural difference that lead to the \textit{identifiability} of the difference DAG (DDAG). Moreover, we propose an \textit{efficient} algorithm to identify the DDAG via sequential re-estimation of the difference of precision matrices. A surprising implication of our results is that causal changes can be identifiable even between \textit{non-identifiable} models such as Gaussian SEMs with unequal noise variances. Synthetic experiments are presented to validate our theoretical results and to show the scalability of our method.
\end{abstract}


\section{Introduction}
\label{sec:intro}
    Structural equation models (SEMs) are effective models to express causal relationships among variables in a system~\citep{pearl2009causality,peters2017elements}.
    However, both the parameters and the graphical structure representing causal relations, typically assumed to be a directed acyclic graph (DAG), are \textit{unknown}. 
    In various fields, including computational biology \citep{sachs2005causal,hu2018application, friedman2000using}, epidemiology \citep{robins2000marginal}, medicine \citep{plis2010meg,plis2011effective}, and econometrics \citep{imbens2020potential,hoover2009empirical,demiralp2003searching}, developing methods to estimate the underlying DAG structure from available data is of utmost importance. This task is commonly known as causal discovery or structure learning, and numerous algorithms have been proposed for this purpose in the past few decades.
    
    In this work, we assume causal sufficiency, which means that there are no unobserved confounders. However, even under this assumption, it is generally not possible to identify the underlying DAG structure, and the problem remains NP-complete in general \citep{chickering1996learning,chickering2004}. 
    Popular methods like PC \citep{spirtes2000causation} and GES \citep{chickering2002} require an additional assumption known as faithfulness \citep{uhler2013geometry} to consistently estimate the Markov equivalent class of the true DAG in large samples. However, these methods are not consistent in high-dimensional settings \citep{ghoshal2017information,ghoshal2018learning} unless there is an assumption of sparsity or small maximum-degree of the true DAG \citep{kalisch2007estimating,nandy2018high,van2013ell_}. As a result, the presence of hub nodes, which are commonly observed in many networks \citep{barabasi1999emergence,barabasi2011network,barabasi2004network}, adds significant complexity to the problem of learning the DAG.

    However, in many cases, the main objective is to identify changes in the causal mechanisms between two or more related SEMs, rather than to estimate the full underlying DAG structure of each SEM. For instance, in root cause analysis, an operator may be interested in identifying the sources that explain the differences between the working and failure states of a microservices system \citep{ikram2022root,paleyes2023dataflow,li2022causal}. Recent work by \citet{assaad2023root} proposes an approach to estimate the difference in causal changes between normal and anomalous regimes based on a causal graph of the normal regime to detect root causes. Our work complements such methods by providing a framework to directly estimate the differences between two SEMs without requiring the individual DAG structures.
    In the context of biological pathways, genes have the ability to control different groups of target genes based on the cellular environment or the presence of specific disease conditions \citep{hudson2009differential,pimanda2007gata2}.
    Although the individual DAGs may be \textit{dense}, the number of causal changes could be \textit{sparse} \citep{scholkopf2021toward,tanay2005conservation, perry2022causal}. An additional practical scenario where our problem setting is applicable includes time-varying models, as discussed by \citet{giannakis2018topology}, where data samples are divided into possibly overlapping windows. For linear SEMs, \citet{natali2021online} describe the model as $X_t = B_t X_t + e_t$. Considering scenarios where the strength of causal relationships diminishes over time, such as the waning efficacy of a vaccine against disease resistance, demonstrates the relevance of our problem setting in practical environments.
    
    In more detail, we focus on the problem of identifiability of \textit{causal} structural changes given samples from two related linear SEMs.
    We consider linear SEMs where the noise variances at each individual SEM are allowed to vary, and, moreover, these noises can have arbitrary distributions with finite mean and second moment.
    Crucially, we do not impose additional structural assumptions such as sparsity, small maximum degree, or bounded tree width on the individual DAGs.
	
    \textbf{Our contributions.}
    Our work introduces two key innovations to this problem. First, we prove that the difference DAG (i.e., the set of causal changes) is identifiable for general linear SEMs, including non-identifiable models such as Gaussian SEMs with unequal noise variances.
    Second, motivated by our identifiability conditions, we propose an \textit{efficient} algorithm that scales to thousands of variables. More specifically:
    \begin{enumerate}
        \item We present novel sufficient conditions (Assumptions \ref{ass:level_sets} and \ref{ass:dn_levels}) for identifiability of the difference DAG between two linear SEMs. 
        \item We develop a polynomial-time algorithm for directly estimating the DDAG between two linear SEMs (Algorithm \ref{alg:exact_alg}) and show that our two conditions, Assumptions \ref{ass:level_sets} and \ref{ass:dn_levels} are necessary for Algorithm \ref{alg:exact_alg} to identify the DDAG.
        \item Since our algorithm is agnostic to the type of estimator for the difference of precision matrices, we leverage recent progress in this area and implement an efficient method that scales to thousands of nodes.
    \end{enumerate}
Proofs for all theoretical results are provided in the Supplementary Material.
\section{Related work}

    \paragraph{Learning individual DAGs.}
    One way to identify causal changes (albeit inefficient) would be to estimate individual DAGs for each environment and then to test for structural differences between the two DAGs. 
    Some classical and recent methods for learning DAGs from a single dataset include:
    Constraint-based algorithms such as PC and FCI \citep{spirtes2000causation}; 
    in score-based methods, we have greedy approaches such as GES \citep{chickering2002}, likelihood-based methods~\citep{peters2014identifiability,loh2014,aragam2015concave,aragam2019globally,hoyer2008nonlinear}, and continuous-constrained learning \citep{zheng2018dags, bello2022dagma, deng2023optimizing,deng2023global}.
    Order-based methods~\citep{teyssier2005ordering,larranaga1996learning,ghoshal2018learning,rolland2022score,montagna2023causal}, methods that test for asymmetries \citep{shimizu2006linear,buhlmann2014cam}, and hybrid methods \citep{nandy2018high,tsamardinos2006}. Additionally, recent recursive algorithms have been developed for causal structure learning, such as the method by \citet{mokhtarian2021recursive}.
    Finally, note that in order to use these methods, the individual DAGs must be identifiable, which is not the case for Gaussian SEMs with unequal noise variances~\citep{pearl2009causality}. The identifiability of Gaussian SEMs with equal noise variances was proven in \citet{peters2014identifiability} and \citet{loh2014}, while the identifiability of linear non-Gaussian models is given in \citet{shimizu2006linear}.
    In fact, a key implication of our results is that we can identify causal changes even when individual DAGs are unidentifiable.
    

    \paragraph{Differences in undirected graphs.}
    The problem of learning the difference between undirected graphs (or Markov random fields) has received much more attention than the directed case. 
    For instance, \citet{zhao_direct_2014, liu2017support, yuan_differential_2017, fazayeli_generalized_2016} develop algorithms for estimating the difference between Markov random fields and Ising models with finite sample guarantees. See \citet{zhao2022fudge, varici2021learning} for recent developments in this direction. Another closely related problem is estimating invariances between causal structure across multiple environments \citep{peters2016causal}. 
    However, this is desirable when the \emph{common structure} is expected to be sparse across environments, as opposed to our setting where the \textit{difference} is expected to be sparse.

    \paragraph{Differences in directed graphs.}
    The problem of estimating the difference between DAGs has been previously studied by \citet{wang_direct_2018,varici2022intervention,chen2023iscan,yang2024learning}.
    Under the same setting as ours, \citet{wang_direct_2018} developed a PC-style algorithm \citep{spirtes2000causation}, which they call \emph{DCI}, for learning the difference between the two DAGs by testing for invariances between regression coefficients and noise variances between the two models. However, sample complexity guarantees are hard to obtain for their method due to the use of many approximate asymptotic distributions of test statistics. Since the primary motivation behind directly estimating the difference between two DAGs is sample-efficiency, a lack of finite sample guarantees is a significant shortcoming. 
    In contrast, our algorithm works by repeatedly eliminating vertices and re-estimating the difference of precision matrices over the remaining vertices. 
    Thereby, we are able to leverage existing algorithms for computing the difference of precision matrices to obtain finite sample guarantees for our method. Furthermore, the DCI algorithm estimates regression coefficients (and noise variances) in the individual DAGs, while our method never estimates weights or noise variances of individual SEMs. Consider the example given in Figure \ref{fig:example} where the difference DAG contains only one edge $X_4 \rightarrow X_2$. In order to prune the edges $X_1 - X_4$ and $X_3 - X_4$ which are present in the difference undirected graph but not in the difference DAG, DCI would compute regression coefficients $\theta^{1}_{4 | S}$ and $\theta^{2}_{4 | S}$ for all $S \subseteq \Set{1, 2, 3}$, where $\theta^{1}_{j | S}$ (resp. $\theta^{2}_{j | S}$) denotes regression coefficients obtained by regressing $X_j$ against $X_S$ in the first (resp. second) SEM. 
	For linear SEMs, estimating regression coefficients is equivalent to estimating the precision matrix (Lemma 1 from \citet{ghoshal2017learning}).
	Furthermore, \citet{danaher2014joint} have shown that directly estimating the difference between precision matrices is more sample efficient than estimating individual precision matrices and computing the difference. 
	\begin{figure}[!t]
	    \centering
	    \includegraphics[width=1\columnwidth]{img/sem_diff_figure.pdf}
	    \caption{From left to right: the two SEMs, the difference undirected graph (or difference of moral graphs), and the difference of precision matrices between the two SEMs with non-zero entries shown in black. \label{fig:example}}
	\end{figure}




\section{Preliminaries}

We use $[p]$ to denote the set of integers $\{1, \ldots, p\}$.
For a matrix $A$, we will denote its $i$-th row (resp. $i$-th column) by $A_{i,*}$ (resp. $A_{*,i}$). Furthermore, we define the support of the matrix $A$, denoted as $\text{supp}(A)$, as the set of indices $(i, j)$ for which the entries of $A$ are non-zero, i.e., $\text{supp}(A) = \{(i, j) \mid A_{i,j} \neq 0, \text{ for } i,j \in [p]\}$.

Let $X = (X_1, \ldots, X_p)$ be a $p$-dimensional random vector. 
We will denote a structural equation model (SEM) by the tuple $(B, D)$ where $B$
is an autoregression matrix and $D = \Diag(\Set{\var_i})$ is a diagonal matrix of noise variances.
Then, the SEM $(B, D)$ defines the following generative model over $X$:
\begin{align*}
X_i = B_{i,*} X  + \varepsilon_i, \quad \forall i \in [p],
\end{align*}
where the noises are mutually independent with $\Exp{\varepsilon_i} = 0$ and $\Var{\varepsilon_i} = D_{i,i} = \var_i < \infty$. 
In this work, the autoregression matrix $B$ encodes a directed acyclic graph (DAG) $G = ([p], \supp(B))$ over $[p]$, where the edge $(i,j)$ denotes the directed edge $i \leftarrow j$.

\begin{remark}
It is worth noting that distributions with bounded second moment include a large set of distributions such as Gaussian, uniform, Gumbel, exponential, Laplace, etc. This does not include distributions like the Cauchy distribution, which have infinite variance. Our class of SEMs covers a significant part of the classical LiNGAM models while also allowing for Gaussian distributions. Therefore, the classical linear non-Gaussian acyclic model (LiNGAM) \citep{shimizu2006linear} is a special case of our class of SEMs, with the added restriction of bounded variance.
\end{remark}

Given two SEMs, $(\B{1}, D)$ and $(\B{2}, D)$, our goal is to recover the structure of the difference between the two DAGs, that is, $\supp(\B{1} - \B{2})$. Going forward, we use $\Delta_{B}$ to denote $\B{1} - \B{2}$ and $\Delta$ to denote the difference graph, $([p], \supp(\Delta_{B}))$. We assume that the two DAGs, $G^{(1)}$ and $G^{(2)}$, share a topological ordering, thereby resulting in no edge reversals between them, which is a reasonable assumption in several practical problems \citep{zhao_direct_2014,belyaeva2021dci}. Formally, we are interested in the following problem:
\begin{problem}
Given two sets of observations $\X{1} \in \R^{n_1 \times p}$ and $\X{2} \in \R^{n_2 \times p}$,
drawn from the \emph{unknown} SEMs $(\B{1}, D)$ and $(\B{2}, D)$ respectively, estimate $\Delta$.
\end{problem}
We will often index the two SEMs by $\kappa \in \Set{1,2}$. 
% We will denote the set of parents of the $i$-th node in the SEM indexed by $\kappa$ by $\Par{\kappa}(i)$,
% while the set of children are denoted by $\Chi{\kappa}(i)$. 
We will denote the difference between the precision matrices\footnote{We use the standard definition of a precision matrix, i.e., the inverse covariance matrix.} of the two SEMs by $\D_{\Omega}$, and the difference between the precision matrices
over any subset of variables $S \subseteq [p]$ by $\D_{\Omega}^S$. Similarly, $\Om{\kappa, S}$ denotes the precision
matrix over the subset $S$ in the SEM indexed by $\kappa$. We will denote the set of topological orderings
induced by a DAG $G = ([p], E)$ by $\mathcal{T}(G) = \Set{(\tau_1, \ldots, \tau_p) \in \Pi ([p]) \mid \forall i,j \in [p]\text{ if } i > j, (\tau_j, \tau_i) \notin E}$, where $\Pi([p])$ is the set of permutations of $[p]$.
The notation $i \preceq_{\tau} j$ denotes that the vertex $i$ comes before $j$ (or $i = j$) in the topological order $\tau$.
For any $\tau \in \mathcal{T}(G)$, we will consider the sequence of graphs $G_{[m, \tau ]} = (V_{[m, \tau ]}, E_{[m, \tau ]})$, indexed by $(m, \tau )$, where $G_{[m, \tau ]}$ is the induced subgraph of $G$ over the first $m$ vertices in the topological ordering $\tau$, i.e., $V_{[m, \tau ]} = \{\tau_{i} \mid i \leq m\}$ and $E_{[m, \tau]} = \{(i, j) \in E \mid i,j \in V_{[m, \tau ]}\}$. We use the term ``terminal vertices'' to denote the vertices in a DAG that have no outgoing edges.

Finally, we will always index precision matrices by vertex labels, i.e., $\Omega_{i,j}$ denotes the precision
matrix entry corresponding to the $i$-th and $j$-th node of the graph.

\section{Main Results: Novel Identifiability Condition and Poly-Time Algorithm}

In our analysis, we discuss different strategies for removing terminal vertices and their implications on the edge requirements for the difference graph $\Delta$. We explore two extreme cases: (i) removing terminals one-by-one, and (ii) removing terminals all-at-once. We establish the significance of the minimal topological layering of the difference DAG in this context and provide an example of how the edge requirements can be relaxed by considering the all-at-once removal strategy. Finally we establish the identifiability conditions of difference DAGs and present Algorithm \ref{alg:exact_alg}, a poly-time algorithm to identify those DAGs.

\subsection{Terminal Vertices}
Let $(B^{(1)}, D)$ , $(B^{(2)}, D)$ be two Structural Equation Models (SEMs) such that they share at least one topological ordering. The difference precision matrix is defined as 
\begin{equation*}
    \Delta_\Omega = \Omega^{(1)} - \Omega^{(2)}.
\end{equation*}
Using Proposition 2 from \citet{ghoshal2018learning}, the diagonal entries of difference precision matrix are given as
\begin{equation}
    \label{eq:diagonal_diff_precision}
    \Delta_{\Omega_{i,i}} = \sum_{l \in [p]}\frac{(B_{l,i}^{(1)}+B_{l,i}^{(2)})(B_{l,i}^{(1)}-B_{l,i}^{(2)})}{\sigma_{l}^{2}}.
\end{equation}
From Equation \ref{eq:diagonal_diff_precision},  we derive the following proposition:
\begin{proposition}
    \label{prop:delta_omega_ii}
    For any $i \in [p]$, if $i$ is a terminal vertex in $\Delta$, then $\Delta_{\Omega_{i,i}} = 0$.
\end{proposition}


Recall that the two SEMs share a topological ordering, which is equivalent to saying that the union of their DAGs is also a DAG, i.e., $G^{\cup} = G^{(1)} \cup G^{(2)}$ is a DAG. The set of terminals of $G^{\cup}$ is given by the intersection of the sets of terminals of the DAGs of the two SEMs. Since $\Delta$ is a subgraph of $G^{\cup}$, we have the following proposition:
\begin{proposition}
    \label{prop:terminal_delta}
    For any $i \in [p]$, if $i$ is a terminal vertex in $G^{\cup}$, then $i$ is a terminal vertex in $\Delta$.
\end{proposition}
% \textbf{}\\
It is important to note that the converse of both Proposition \ref{prop:delta_omega_ii} and Proposition \ref{prop:terminal_delta} is not true in general. 

The validity of converse of Proposition \ref{prop:delta_omega_ii} is contingent upon certain weight conditions. 
In contrast, the converse of Proposition \ref{prop:terminal_delta} hinges on structural constraints of the difference graph $\Delta$. For the converse of Proposition \ref{prop:terminal_delta} to hold, for every non-terminal vertex in $G^{\cup}$, at least one of its outgoing edges should also be in $\Delta$. So $\Delta$ must have at least $p-t$ edges, where $t$ is the number of terminal vertices of $G^{\cup}$. Hence, this proposition's converse does not universally hold but requires specific structural alignment within the graph. Finally, the combined assumption for the converses of Proposition \ref{prop:delta_omega_ii} and Proposition \ref{prop:terminal_delta} to hold can be stated as follows:
\begin{assumption}
    \label{ass:converse_propositions}
    For any $i \in [p]$, if $\Delta_{\Omega_{i,i}} = 0$, then $i$ is a terminal vertex in $G^{\cup}$.
\end{assumption}

To find the incoming edges for these terminal vertices, we can examine the difference precision matrix $\Delta_{\Omega}$. By looking at the non-zero entries in the corresponding row or column of $\Delta_{\Omega}$ for each terminal vertex, we can identify the incoming edges for these vertices in the graph $\Delta$. This is given by the Lemma \ref{lem:edges_to_terminals}.
\begin{lemma}
    \label{lem:edges_to_terminals}
    Under Assumption \ref{ass:converse_propositions}, for any $i \in [p]$, if $\Delta_{\Omega_{i,i}} = 0$, then $\forall j \in [p], \Delta_{\Omega_{i,j}} = -\frac{\Delta_{B_{i,j}}}{\sigma_{i}^{2}}$.
\end{lemma}
This lemma allows us to identify the terminal vertices of $\Delta$ and the incoming edges to those terminals through the difference precision matrix. By iteratively removing these terminal vertices and repeating the process, we can recover the entire structure of $\Delta$, under the condition that Assumption \ref{ass:converse_propositions} holds for the linear SEMs obtained from removing the terminal vertices. There are multiple ways to remove these terminal vertices, and in the subsequent sub-sections, we provide the analysis for two extreme cases: removing terminals one-by-one and removing them all-at-once. Assumption \ref{ass:converse_propositions} is essentially the unification of the converses of both Proposition \ref{prop:delta_omega_ii} and Proposition \ref{prop:terminal_delta}. Each of these propositions imposes distinct types of constraints on the SEMs. First, we will elucidate the structural implications of the converse of Proposition \ref{prop:terminal_delta}, particularly highlighting the significance of the minimal topological layering of $\Delta$. Subsequently, we will integrate this with the converse of Proposition \ref{prop:delta_omega_ii} in Section \ref{sec:converse_prop_delta_omega_ii}.

\subsection{On the Failure of Removing Terminals One-by-One}
In this approach, we sequentially remove a single terminal vertex and its incoming edges from the difference graph $\Delta$ and both the SEMs. At each step, we re-estimate the difference precision matrix and re-apply Lemma \ref{lem:edges_to_terminals} to identify the next terminal vertex and its incoming edges. This process is repeated until all vertices have been removed, and the entire structure of $\Delta$ is recovered.
For this we need the converse of Proposition \ref{prop:terminal_delta} to hold after removal of every terminal vertex. 

Removing terminal vertices one-by-one is essentially removing vertices in reverse topological ordering. We can formally state this as follows: for every topological ordering $\tau$ of $\Delta$ and every $m \in [p]$, $\tau_{m}$ is a terminal in $G^{(1)}_{[m,\tau]}$ and $G^{(2)}_{[m,\tau]}$.
% \begin{assumption}
%     \label{ass:removal_1b1}
%     For all topological ordering $\tau$ of $\Delta$, for all $m \in [p]$, $\tau(m)$ is a terminal in $G^{(1)}_{[m,\tau]}$ and $G^{(2)}_{[m,\tau]}$.
% \end{assumption}

This is equivalent to every topological ordering of $\Delta$ being also a valid topological ordering of $G^{\cup}$. 
% Hence we can restate this in terms of topological orderings as follows:
% \begin{assumption}
%     \label{ass:removal_topo}
%     Every topological ordering of $\Delta$ is a topological ordering of $G^{\cup}$.
% \end{assumption}
Since $\Delta$ is a subgraph of $G^{\cup}$, every topological ordering of $G^{\cup}$ is a topological ordering of $\Delta$. However, the converse is not true in general. This highlights the importance of understanding the relationship between the topological orderings of $\Delta$ and $G^{\cup}$ when analyzing the structure of the difference graph. We further examine the conditions under which this is valid to gain further insight into the structural constraints necessary for the recovery of the entire structure of $\Delta$ through the iterative one-by-one removal of terminal vertices. 

We introduce the concept of transitive edges in a DAG and explore their impact on topological orderings and the structure of the difference graph $\Delta$.
\begin{definition}[Transitive Edge\footnote{Transitive edges have also been employed in \citep{bello2018computationally} for learning DAGs.}]
    \label{def:transtive_edge}
    Let $G$ be a DAG. An edge $u\rightarrow v$ is called a transitive edge of $G$ if there exist multiple directed paths from $u$ to $v$ in $G$.
    %there exists a directed path from $u$ to $v$ even after removing the #edge $u\rightarrow v$ from $G$.
\end{definition}
Removing an edge from a DAG cannot decrease the set of topological orderings compatible with it; it can either remain the same or increase. The key property here is that it remains the same if and only if the removed edge is a transitive edge of the DAG. 
\begin{proposition}
\label{prop:removal_1b1}
    The converse of Proposition \ref{prop:terminal_delta} holds while removing terminals one-by-one if and only if all the edges of $G^{\cup}$ missing from $\Delta$ are transitive edges of $G^{\cup}$.
\end{proposition}
% Hence, Assumption \ref{ass:removal_topo} is equivalent to stating that all the edges of $G^{\cup}$ missing from $\Delta$ are transitive edges of $G^{\cup}$.

Unfortunately, in the worst case, this can require $\Delta$ to be dense. For instance, consider $G^{\cup}$ to be the complete bipartite graph $K_{n,n}$ with the direction of all edges from one partition of $n$ vertices to the other partition of $n$ vertices. This graph has no transitive edges, i.e., all $n^2$ edges are non-transitive. Therefore, to identify $\Delta$ through the one-by-one removal process of the terminals, $\Delta$ must contain all the $n^2$ edges. In the next section, we present that removing all terminals at once will reduce this requirement from $n^2$ edges to just $n$ edges.

\subsection{Removing Terminals All-at-Once}
In this alternative approach, we simultaneously remove all terminal vertices and their incoming edges from the difference graph $\Delta$ in one step. The difference precision matrix is then re-estimated and Lemma \ref{lem:edges_to_terminals} is applied to identify all new terminal vertices and their incoming edges. This process is repeated until all the vertices are removed and the entire structure of $\Delta$ is recovered.

The converse of Proposition \ref{prop:terminal_delta} states that the set of terminals of $\Delta$ is the same as the set of terminals of $G^{\cup}$. This should hold even after all these common terminals are removed simultaneously. We first introduce the concept of topological layering of a DAG, which is a generalization of the well-known concept of topological ordering. Subsequently, we establish that the iterative necessity of the converse of Proposition \ref{prop:terminal_delta} while removing terminals all-at-once is equivalent to the minimal topological layering of $\Delta$ being a valid topological layering of $G^{\cup}$.
\begin{definition}[Topological Layering]
    Let $G(V, E)$ be a DAG. A topological layering of $G$ is a partitioning of the vertex set $V$ into a sequence of sets $(L_{0}, L_{1}, \ldots, L_{r})$, such that if $(u, v) \in E$, $u \in L_{i}$, and $v \in L_{j}$, then $i > j$.
\end{definition}
Each set of the partition corresponds to a layer. Essentially, a topological layering of $G$ is a function $L\colon V \to \{0, 1, 2, \ldots, r\}$, where $r<p$, such that for every edge $(u, v) \in E$, $L(u) > L(v)$, that is, edges are allowed only to go from the vertices of a higher layer to the vertices of a lower layer. This concept generalizes topological ordering, since a topological ordering can be considered a special case where $r=p-1$ and the function operates as a bijection. We then introduce the notion of minimal topological layering in which every vertex $v$ is assigned to the lowest possible layer $L(v)$ such that the condition of the topological layering still holds.
\begin{definition}[Minimal Topological Layering]
    Let $G(V, E)$ be a DAG. The minimal topological layering of $G$ is a topological layering $L$ of $G$ such that there does not exist a topological layering $L'$ of $G$ with $L'(v)<L(v)$ for some $v \in V$.
\end{definition}
Note that the minimal topological layering of a DAG is unique. This is the topological layering that one obtains by recursively removing terminals of a DAG all-at-once as layers.  Similarly, one can obtain a layering by recursively removing the roots (vertices with no incoming edges) of a DAG in an all-at-once fashion. This layering obtained by removing roots is used in many recent works on causal structure learning \citep{gao2020polynomial,zhou2022efficient,park2023computationally}. We observe that this layering of roots is the maximal topological layering of a DAG, where each vertex is assigned to the highest possible layer while maintaining the constraints of a topological layering.
We now establish the link between the converse of Proposition \ref{prop:terminal_delta} and the topological layering of $\Delta$.
\begin{lemma}
\label{lem:removal_all}
    The converse of Proposition \ref{prop:terminal_delta} holds while removing terminals all-at-once if and only if the minimal topological layering of $\Delta$ is a valid topological layering of $G^{\cup}$.
\end{lemma}
Therefore, the iterative requirement of the converse of Proposition \ref{prop:terminal_delta} can be stated as the following assumption:
\begin{assumption}
    \label{ass:level_sets}
    The minimal topological layering of $\Delta$ is a valid topological layering of $G^{\cup}$.
\end{assumption}
Assumption \ref{ass:level_sets} is a significantly less stringent condition compared to 
% Assumption \ref{ass:removal_topo}. While Assumption \ref{ass:removal_topo} 
requiring all topological orderings of $\Delta$ to be topological orderings of $G^{\cup}$, which is equivalent to requiring all topological layerings of $\Delta$ to be topological layerings of $G^{\cup}$. In contrast, Assumption \ref{ass:level_sets} asks for only one special topological layering of $\Delta$ to be compatible with $G^{\cup}$. Hence, under Assumption \ref{ass:level_sets}, $\Delta$ may have topological orderings that are not compatible with $G^{\cup}$.

From the unique minimal topological layering of a DAG, we define the notion of topological level of a vertex in a DAG. This concept of topological level will play a pivotal role in the following section.
% \begin{proposition}
%     Let $G$ be a DAG. $G$ has a unique minimal topological layering.
% \end{proposition}
\begin{definition}[Topological Level]
    Let $G$ be a DAG and $L$ be its unique minimal topological layering. The topological level of a vertex $v$ in $G$ is defined as the value $L(v)$ assigned to that vertex by $L$.
\end{definition}
Going back to the example of the complete bipartite graph $K_{n,n}$ with the direction of all edges from one partition of $n$ vertices to the other partition of $n$ vertices, the topological level of all non-terminal vertices is one. For the minimal topological layering of $\Delta$ to be compatible with $G^{\cup}$, at least one edge from each non-terminal vertex of $G^{\cup}$ should be in $\Delta$. Hence, we only require $n$ of these edges to be part of $\Delta$. This is a significant relaxation compared to the $n^2$ requirement in case of removing terminals one-by-one.

In fact, for every DAG $G$, the minimal subgraph (in terms of the number of edges), in which topological levels of all vertices remain the same as in $G$, has $p - t$ edges, where $p$ is the number of vertices and $t$ is the number of terminal vertices, as for every non-terminal vertex, at least one of their outgoing edges to the immediate lower topological level should be part of the minimal subgraph. It is important to note that this minimal subgraph is not unique, and, in fact, there can be exponentially many such minimal subgraphs for a given DAG.

\subsection{Identifiability of Difference DAGs}
\label{sec:converse_prop_delta_omega_ii}
In this section, we investigate the intricacies of the converse of Proposition \ref{prop:delta_omega_ii}, combining it with Assumption \ref{ass:level_sets} to produce the final identifiability condition.

Proposition \ref{prop:delta_omega_ii} states that if $i$ is a terminal in $\Delta$, then $\Delta_{\Omega_{i,i}}$ is 0. The converse being true necessitates that $\Delta_{\Omega_{i,i}} =0$ only if $i$ is a terminal in $\Delta$. This condition should hold level-wise as the condition in Assumption \ref{ass:level_sets}. First we introduce the concept of Diagonal Null levels of the difference precision matrix $\Delta_{\Omega}$, then we establish the relationship to the levels of the DAG structure. The diagonal entries $\Delta_{\Omega_{i,i}}$ of the difference precision matrix represent the difference in the variances of the variable $i$ in the system. We define the concept of Diagonal Null (DN) levels as follows:
\begin{enumerate}
    \item DN Level-0 variables: These are the variables that correspond to the indices $i$ where $\Delta_{\Omega_{i,i}} = 0$ in the original $\Delta_{\Omega}$, i.e., the diagonal entry of the difference precision matrix is zero.
    \item DN Level-$k$ variables: For $k > 0$, these are the variables that become DN level-0 only after eliminating all DN level-0, DN level-1, \dots, DN level-($k-1$) variables from the system and recalculating the difference precision matrix.
\end{enumerate}
Hence, the iterative constraint of the converse of Proposition \ref{prop:delta_omega_ii} becomes the following:
\begin{assumption}
\label{ass:dn_levels}
    For every vertex, its DN level in $\Delta_{\Omega}$ is greater than or equal to its topological level in $\Delta$.
\end{assumption}

Next, we demonstrate the necessity of the identifiability assumptions, Assumption \ref{ass:level_sets} and Assumption \ref{ass:dn_levels}. In other words, if either Assumption \ref{ass:level_sets} or Assumption \ref{ass:dn_levels} is violated, it is possible to find an exponential number of pairs of SEMs that exhibit distinct DDAG structures while inducing the same difference precision matrix.
\begin{theorem}
\label{thm:necessity_of_ass4}
    There exists an exponentially large set, $S(\alpha, \beta, \sigma)$, of pairs of SEMs, parameterized by $\alpha, \beta, \sigma \in \mathbb{R}_{+}$, s.t. every pair of SEMs, $(B^{(1)}(\alpha, \beta),$ $D(\sigma))$ and $(B^{(2)}(\alpha, \beta),$ $D(\sigma))$, in $S(\alpha, \beta, \sigma)$ violates the identifiability Assumption \ref{ass:level_sets}, and the pairs in $S(\alpha, \beta, \sigma)$ produce the same difference precision matrix but distinct difference DAGs.
\end{theorem}
\begin{theorem}
\label{thm:necessity_of_ass5}
    There exists an exponentially large set, $S(\alpha, \sigma)$, of pairs of SEMs, parameterized by $\alpha, \sigma \in \mathbb{R}_{+}$, s.t. every pair of SEMs, $(B^{(1)}(\alpha),$ $D(\sigma))$ and $(B^{(2)}(\alpha),$ $D(\sigma))$, in $S(\alpha, \sigma)$ violates the identifiability Assumption \ref{ass:dn_levels}, and the pairs in $S(\alpha, \sigma)$ produce the same difference precision matrix but distinct difference DAGs.
\end{theorem}

We note that Assumption \ref{ass:dn_levels} is not only theoretically significant but also holds in many practical scenarios. For instance, consider a situation where SEM 2 represents the interventional distribution of SEM 1, obtained through hard node interventions on SEM 1. In such cases, Assumption \ref{ass:dn_levels} is naturally satisfied. Similarly, this assumption is valid in scenarios where an agent external to the system performs stochastic do-interventions on any collection of variables. These instances are common in experimental designs and causal inference studies.




\subsection{Poly-Time Algorithm}
	With the above results in place, we are now ready to state our algorithm for directly learning the difference DAG. 

\begin{algorithm}[!ht]
\caption{Learning causal changes in linear SEMs\label{alg:exact_alg}}
\KwIn{$\Sm{1}$ and $\Sm{2}$}
\KwResult{$\D$}
$V \gets [p]$\;
\While{$\Abs{V} > 1$}{
    Estimate $\DO$ over $V$\;
    $S \gets \{i \mid (\DO)_{i,i} = 0\}$\;
    \For{$i \in S$}{
        $N_i \gets \{j \mid (\DO)_{i,j} \neq 0\}$\;
        \For{$j \in N_i$}{
            \If{$(j,i) \notin \D$ and $j \notin S$}{
                Add $(i,j)$ in $\D$\;
            }
        }
    }
    $V \gets V-S$\;
}
\Return $\D$\;
\end{algorithm}
	
	We prove the correctness of Algorithm \ref{alg:exact_alg} in the population setting, i.e., when
	$\Sm{\kappa}$ is the true covariance matrix of the SEM $(\B{\kappa}, \Di{\kappa})$ for $\kappa \in \Set{1,2}$.
	In this case $\DO$ can be computed efficiently by solving the linear system: 
	$\Sm{1}(\DO)\Sm{2} = \Sm{2} - \Sm{1}$ \citep{zhao_direct_2014}. Since $\Sm{\kappa}$ is positive definite,
	the above system has a unique solution. 
 
\begin{theorem}
\label{thm:population}
Let $(B^{(1)}, D)$ and $(B^{(2)}, D)$ be two SEMs. Let $\Delta_{B} = \B{1} - \B{2}$ denote the difference between the two SEMs. Given the true covariance matrices $\Sm{1}$ and $\Sm{2}$, under Assumption \ref{ass:level_sets} and Assumption \ref{ass:dn_levels}, Algorithm \ref{alg:exact_alg} returns $\D$ such that $\D = ([p], \supp(\Delta_{B}))$.
\end{theorem}

\begin{remark}
    Gaussian SEMs with unequal noise variances are known to be unidentifiable~\citep{pearl2009causality}. Here we emphasize the surprising fact that we can identify causal changes even in non-identifiable models such as Gaussian SEMs with unequal noise variances.
\end{remark}

\begin{remark}[Computational Complexity]
The computational complexity of Algorithm \ref{alg:exact_alg} is primarily influenced by the repeated estimation of the difference precision matrix, denoted as $\Delta_{\Omega}$, over the set $V$. Each estimation step is considered as an oracle call with a time complexity of $T(m)$, where $m$ represents the current size of the set $V$. Initially, the set $V$ contains $p$ elements, and in each iteration of the while loop, at least one vertex is removed from $V$, ensuring a maximum of $p$ iterations. The for-loops over the sets $S$ and $N_i$ contribute an additional term to the complexity, but this term is bounded by $O(p^2)$ in total over all iterations, since it involves checking pairwise relationships in the worst case and each vertex enters $S$ at most once. Therefore, since $m \leq p$ and assuming $T$ is non-decreasing, the overall computational complexity of the algorithm is $O(p \cdot T(p) + p^2)$.
We invite the reader to see Figures \ref{fig:low_dim_performance_vs_sample_size}, \ref{fig:baselines}, and \ref{fig:high_dim_performance_vs_sample_size} for empirical runtimes of our algorithm.
\end{remark}
 
	
\begin{figure}[!t]
    \centering
    \includegraphics[width=\columnwidth]{img/main_sample_comp.png}
    \caption{Performance vs sample size in low dimensions (up to 100 nodes). See Section \ref{sec:data_generation} for details on the data generation process. We note that our method is capable of learning the DDAG in seconds and perfectly recovering the causal changes as the number of samples increases.}
    \label{fig:low_dim_performance_vs_sample_size}
\end{figure}

\subsection{On Difference of Precision Matrices}
The performance of our method depends on the accuracy with which the difference between the precision matrices is estimated. The problem of directly estimating the difference between the precision matrices of two Gaussian SEMs (or more generally Markov random fields), given samples drawn from the two individual models, has received significant attention over the past few years \citep{zhao_direct_2014, belilovsky_testing_2016, yuan_differential_2017, liu2017support,jiang2018direct}. Among these, the ADMM method of \citet{jiang2018direct}, the KLIEP algorithm of \citet{liu2017support}, and the algorithm of \citet{zhao_direct_2014} come with provable finite sample guarantees. We use the algorithm of \citet{jiang2018direct}
in our study, which is particularly effective when dealing with the sparse differences between two linear SEMs. This sparsity is also reflected in the difference between their precision matrices, denoted as $\Delta_{\Omega} = \Omega^{(1)} - \Omega^{(2)}$. As in \citet{jiang2018direct}, we initially calculate the sample covariance matrices, $\hat{\Sigma}^{(1)}$ and $\hat{\Sigma}^{(2)}$. Subsequently, a convex optimization problem is solved using ADMM, formulated as:
\begin{align*}
\hat{\Delta}_{\Omega} = \arg\min_{\Delta_{\Omega}} \bigg\{ &\frac{1}{2} \text{Tr}\left(\Delta_{\Omega}^\top \hat{\Sigma}^{(1)} \Delta_{\Omega} \hat{\Sigma}^{(2)} \right) \\
&+ \text{Tr}\left(\Delta_{\Omega}(\hat{\Sigma}^{(1)} - \hat{\Sigma}^{(2)})\right) + \lambda \|\Delta_{\Omega}\|_1 \bigg\},
\end{align*}
where $\lambda$ is a regularization parameter. The approach of \citet{jiang2018direct} is preferred in our work due to its computational efficiency, offering a complexity of $O(p^3)$. We also tested with other estimators in the literature such as \citet{zhao_direct_2014} and the results were very similar, but much slower to obtain. We emphasize that our main contributions are related to the identifiability of the DDAG, and not to propose a new estimator of the difference of precision matrices. For our theory, the $\Delta_{\Omega}$ estimator is a black box. Thus, by using any estimator with guarantees, we implicitly borrow its conditions for correctness. The sample complexity of our algorithm follows straightforwardly from the sample complexity of the $\Delta_{\Omega}$ estimator; since we use the estimator by \citet{jiang2018direct}, we can make use of their finite-sample rates, see, for instance, Theorem 1 in \citet{jiang2018direct}.

In Figure \ref{fig:low_dim_performance_vs_sample_size}, we explore the performance of Algorithm \ref{alg:exact_alg} by using the estimator of \citet{jiang2018direct}.


\section{Experiments}
\label{sec:exps}

In this section, we describe the empirical results from the execution of our Alg. \ref{alg:exact_alg} on finite samples with the goal of verifying our theoretical results and showing the efficiency of our method on graphs of size up to $p=1000$ nodes.

\subsection{Synthetic Data Generation}
\label{sec:data_generation}
For the generation of random SEM pairs, our approach starts with the construction of two random DAGs over $p$ nodes, adhering to the structural identifiability criteria outlined in Assumption \ref{ass:level_sets}. 
These random DAGs are designed to be individually dense, with an expected edge count of $O(p^{1.75})$, while ensuring that the difference DAG remains sparse, featuring an expected $O(p)$ edges. 
Our evaluation encompasses two prevalent models of random graphs: \Erdos--\Renyi{} graphs and Scale-Free graphs, where the latter are likely to generate hub-nodes, a known challenge for causal structure learning \citep{kalisch2007estimating}. 
Importantly, our algorithm does not presuppose specific exogenous noise distributions. To this end, we assess performance across Gaussian, Gumbel, and Uniform noise distributions, with noise variance values selected uniformly at random from the interval $[0.25, 0.5]$. Additionally, edge weights are selected randomly from the combined intervals $[-0.5, -0.25] \cup [0.25, 0.5]$. If these sampled edge weights fail to satisfy Assumption \ref{ass:dn_levels}, then we simply sample them again.
After generating the pair of SEMs, we generate $\floor{e^{c} \log p}$ samples from each SEM for $c \in \{3, 4, \dots, 13\}$.



\begin{figure}[!tb]
    \centering
    \includegraphics[width=\columnwidth]{img/main_all_methods.png}
    \caption{Performance vs number of variables (up to 100 nodes). See Section \ref{sec:data_generation} for details on the data generation process. We note that our method outperforms direct learning methods such as DCI and indirect methods such as PC and GES in both recovery and time execution.}
    \label{fig:baselines}
\end{figure}


\subsection{Comparison Against Baselines}
For experiments with a finite number of samples, we follow our generative process above.
We generate $30$ pairs of SEMs with $p \in \{10, 20, 50, 100\}$.
We then generate $\floor{e^{c} \log p}$ number of samples from each SEM for $c \in \{9, 10\}$.
In Figure \ref{fig:baselines}, we compare against the algorithms PC \citep{spirtes2000causation} and GES \citep{meek1997graphical}, both of which first learn each SEM \emph{separately} and then output the difference of adjacency matrices as the difference DAG. For GES and PC, the undirected edges are oriented according to the true graphs; this way, we provide a slight advantage to these methods for a fair comparison. For PC, we used Fisher tests and kernel-based tests for Gaussian and non-Gaussian noises, respectively.
Finally, we also compare against the DCI-C method of \citet{wang_direct_2018}, which, as in our setting, also estimates the difference of SEMs.
We note how traditional state-of-the-art methods (PC and GES) struggle to learn the difference DAG since each DAG independently is \emph{dense}.
The closest to our results is the DCI-C method, although, as seen in Figure \ref{fig:baselines}, our algorithm performs better in both the F1-score and the runtime. We also compare the performance of our method on different sample sizes for graphs up to 100 nodes.
As shown in Figure \ref{fig:low_dim_performance_vs_sample_size}, our method learns the perfect 
difference DAG in all noise distributions for $c \geq 12$, within a few seconds. 



\subsection{Experiments in High Dimensions}

We next present experiments in high dimensions (where the number of variables $p$ is large, up to 1000 nodes) to evaluate the performance of our algorithm.
We do not compare against baselines in this setting as the baselines are not scalable to high dimensions, as can be seen from
Figure \ref{fig:baselines}, where the baselines take thousands of seconds to run on graphs with 100 nodes, while
our algorithm takes only a few seconds. Similar to the low-dimensional setting, the metrics are averaged over 30 runs. 
Here $p \in \{200, 400, 600, 800, 1000\}$. Our algorithm is able to perfectly recover the difference DAG,
as shown in Figure \ref{fig:high_dim_performance_vs_sample_size}, for $c \geq 12$.


\begin{figure}[!tb]
    \centering
    \includegraphics[width=\columnwidth]{img/main_sample_comp_high.png}
    \caption{Performance vs sample size in high dimensions (up to 1000 nodes). See Section \ref{sec:data_generation} for details on the data generation process. We note that our method is capable of learning the DDAG in around 15 minutes for $p=1000$, and perfectly recovering the causal changes as the number of samples increases.}
    \label{fig:high_dim_performance_vs_sample_size}
\end{figure}



\section{Conclusion}
We studied the problem of directly estimating the difference DAG of two linear SEMs.
We presented novel conditions for the identifiability of causal shifts between linear SEMs leveraging the information encoded in the difference of precision matrices.
By analyzing the strategy for removing terminal vertices, we showed the importance of minimal topological layering and its implications on the edge requirements for the difference DAG $\Delta$. 
Our findings not only provide a deeper understanding of the structural constraints necessary to recover the structure of $\Delta$, but also pave the way for the development of more efficient and accurate algorithms to learn the difference DAG between SEMs, even in high-dimensional settings.

\begin{acknowledgements} 
K. B. was supported by NSF under Grant \#2127309 to the Computing Research Association for the CIFellows 2021 Project.
We express our gratitude to our colleagues and reviewers for their valuable feedback and insightful comments, which greatly contributed to the improvement of this work. Additionally, we are  grateful for the support of the University of Chicago Research Computing Center for assistance with the calculations carried out in this work.
\end{acknowledgements}

% References
% \bibliographystyle{agsm}
\bibliography{paper}


\input{appendix}


\end{document}
