% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{microtype}
%\usepackage[export]{adjustbox} % top align images
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{float}

\usepackage{pdfpages}
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}


% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}




\DeclareMathOperator{\doop}{\textit{do}}


% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

\newcommand\mw[1]{\textcolor{red}{[MW: #1]}}
\newcommand\hp[1]{\textcolor{red}{[HP: #1]}}


\title{$\chi$SPN: Characteristic Interventional Sum-Product Networks for \\Causal Inference in Hybrid Domains}

\author[1]{\href{mailto:<harshpoonia@cse.iitb.ac.in>?Subject=Your UAI 2024 paper}{Harsh Poonia}{}}
% Harsh Poonia, Moritz Willig, Zhongjie Yu, Matej Zečević, Kristian Kersting, Devendra Singh Dhami
\author[2]{Moritz Willig}
\author[2]{Zhongjie Yu}
\author[2]{Matej Zečević}
\author[2,3,4]{Kristian Kersting}
\author[5]{Devendra Singh Dhami}
% Add affiliations after the authors
\affil[1]{%
    Indian Institute of Technology Bombay\\
    India
}
\affil[2]{%
    Technical University of Darmstadt\\
    Darmstadt, Germany\\
}
\affil[3]{%
    Hessian Center for Artificial Intelligence (hessian.AI)\\
    Darmstadt, Germany\\
}
\affil[4]{%
    German Research Center for Artificial Intelligence (DFKI)\\
    Darmstadt, Germany\\
}
\affil[5]{%
    Eindhoven University of Technology\\
    Eindhoven, Netherlands\\
  }

\begin{document}

\maketitle

\begin{abstract}
Causal inference in hybrid domains, characterized by a mixture of discrete and continuous variables, presents a formidable challenge. We take a step in this direction and propose the \textbf{Ch}aracteristic \textbf{I}nterventional Sum-Product Network ($\chi$SPN), which is capable of estimating interventional distributions in the presence of random variables drawn from mixed distributions. $\chi$SPN uses characteristic functions in the leaves of an interventional SPN (iSPN), thereby providing a unified view for discrete and continuous random variables through the Fourier–Stieltjes transform of the probability measures. A neural network is used to estimate the parameters of the learned iSPN using the intervened data. Our experiments on 3 synthetic heterogeneous datasets suggest that $\chi$SPN can effectively capture the interventional distributions for both discrete and continuous variables while being expressive and causally adequate. We also show that $\chi$SPN generalizes to multiple interventions while being trained only on single intervention data.
\end{abstract}

\section{Introduction}
\label{intro}

Most real-world data, irrespective of the underlying domain, consists of variables originating from multiple distributions such as continuous, discrete and/or categorical. In the realm of statistical modeling, understanding and accurately characterizing such data poses formidable challenges. Mixed distributions, arising from the amalgamation of distinct subpopulations within a dataset, exhibit a complexity that traditional statistical methodologies often struggle to capture. This can lead to machine learning models becoming either inapplicable or producing incorrect results during inference. This applies not only to correlation-based methods but can also have adverse effects on causality-based methods.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.7\linewidth]{figures/mixedVsChi_v2.pdf}
    \caption{\textbf{Correct Mixing of Distributions via $\chi$SPN.} Classical mixedSPN na\"ively multiply discrete probabilities and continuous densities, leading to an ill-defined probability measure. For practical applications, large density values can possibly outweigh normalized discrete probabilities, biasing parameter estimation. $\chi$SPN overcome this problem by transforming discrete and continuous variables into a shared spectral domain. Sum and product operations on the spectral representations are well defined. (Best viewed in color.)}
    \label{fig:mixedChiCompare}
\end{figure}

Causal inference~\citep{pearl2009causality,spirtes2010introduction}, the study of cause-and-effect relationships, is a fundamental pursuit in statistics, yet its application to mixed distributions introduces unique challenges. Whether the data stems from diverse demographic groups, heterogeneous environments, or multifaceted systems, the ability to infer causal relationships in presence of mixed distributions is crucial for advancing our understanding of causal phenomena in various real-world domains.

Several probabilistic methods, such as hybrid Bayesian networks~\citep{monti1998learning,murphy1998inference}, Gaussian-Ising mixed model~\citep{lauritzen1989mixed,cheng2017high} and variants of Markov random fields~\citep{fahrmeir2001bayesian,fridman2003mixed}, have been proposed to handle hybrid domains. A major drawback of these methods is the difficulty of inference~\citep{lerner2001inference} as it can quickly become intractable. This becomes a glaring issue in widespread adoption of these methods for causal inference which in itself is a challenging problem~\citep{peters2017elements}. To overcome the problem of intractable inference, a lot of work has been done on probabilistic circuits (PCs)~\citep{jaeger2004probabilistic,lowd2012learning}, specifically sum-product networks~\citep{domingos2012sum,gens2013learning}, which guarantee inference in linear time under some specific conditions. There have been efforts within the PCs to handle hybrid data with development of methods such as mixed SPN~\citep{molina2018mixed} and Bayesian SPN~\citep{trapp2019bayesian}. These methods are restrictive as they fail to model leaves with distributions that do not have closed-form density expressions and rely on a histogram or density function representation of the probability measures. A recent approach~\citep{yu2023characteristic}, proposes the use of characteristic functions to provide a unified formalization of distributions over heterogeneous data in the spectral domain.

SPNs have also been successfully applied to different rungs of the causal ladder \citep{zevcevic2021interventional, busch4578551structural}. We specifically consider interventional SPN (iSPN)~\citep{zevcevic2021interventional} that learns interventional distributions using SPNs over-parameterized by neural networks. In this work, we propose $\chi$SPN (\textbf{Ch}aracteristic \textbf{I}nterventional Sum-Product Networks), the first causal models that are capable of efficiently inferring causal quantities, i.e., interventional distributions, in the presence of mixed data. Methods such as mixedSPN na\"ively multiply discrete probabilities and continuous densities, leading to an ill-defined probability measure as large density values can possibly outweigh normalized discrete probabilities, biasing parameter estimation. $\chi$SPN overcome this problem by transforming discrete and continuous variables into a shared spectral domain using characteristic functions in the leaves of iSPN (see Fig.~\ref{fig:mixedChiCompare}). Overall, we make the following important contributions:
\begin{enumerate}
    \item We present the first causal model capable of performing inference on hybrid domains in a tractable fashion.
    \item We demonstrate the effectiveness of combining characteristic functions with iSPNs to naturally handle mixed data, i.e., data containing random variables with discrete and continuous distributions.
    \item We show that $\chi$SPN can generalize to multiple interventions without any retraining.
\end{enumerate}
% (1) (2) (3) 

We make our code publicly available at: \url{https://github.com/harpoonix/chi-SPN}. We will proceed as follows: we first present the required preliminaries and discuss the related work and then define $\chi$SPN. We then present extensive experiments on mixed domains before concluding. 

\section{Preliminaries \& Related Work}
Before diving into the proposed $\chi$SPN model, we present some necessary background on SPNs, causal models and characteristic functions.

\subsection{Sum-Product Networks}
Sum-Product Networks~\citep{domingos2012sum} are a class of deep tractable models, which belong to the family of probabilistic circuits. SPNs facilitate a wide range of exact and efficient inference routines. In particular, marginalisation and conditioning can be done in time which is linear in the size of the network \citep{zhao2015relationship, PeharzSPNProp}.  Formally, an SPN is a rooted directed acyclic graph, comprising of sum, product and leaf (or distribution) nodes to encode joint probability distributions $p(\mathbf{X})$. Given an SPN $\mathcal{S} = (G, \mathbf{w})$ with positive parameters $\mathbf{w}$ and a DAG $G = (V, E)$, the values at sum ($\mathrm{S}$) and product ($\mathrm{P}$) nodes can be computed by 
\begin{equation}
    \mathrm{S}(\mathbf{x}) = \sum_{C \in \mathrm{ch}(\mathrm{S})} \mathbf{w}_C C(\mathbf{x}) \quad \mathrm{P}(\mathbf{x}) = \prod_{C \in \mathrm{ch} (\mathrm{P})} C(\mathbf{x})
\end{equation} 
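where $\mathrm{ch}(\mathrm{P})$ denotes the children of node $\mathrm{P}$ (and analogously $\mathrm{ch}(\mathrm{S})$ for a sum node $\mathrm{S}$).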
The SPN outputs are computed at the root node, $\mathrm{S}_R(\mathbf{x})$. The scope of a leaf node is the random variable $X$ that it models. The scope of an internal node is the union of scopes of all its children. SPNs satisfy the properties of \textit{completeness} and \textit{decomposability}. An SPN $\mathcal{S} $ is complete if for every sum node $u$ in $\mathcal{S}$ the 
scopes of its children are all the same. An SPN $\mathcal{S}$ is decomposable if for every product node $u$ in $\mathcal{S}$ the
scopes of its children are pairwise disjoint.  

Gated or Conditional SPNs are deep tractable models for estimating multivariate conditional densities $p(Y | x)$ \citep{shaocspn}, by conditioning the parameters of vanilla SPNs on the input using DNNs as gate functions. They introduce gating nodes where the weights $g_i(X)$ are parameterized by the provided evidence $X$ to encode functional dependencies on the input.

% \noindent \paragraph{Interventional SPNs}\citep{zevcevic2021interventional} 

\subsection{Causal Models}

%In general we write sets of variables in bold upper-case ($\mathbf{X}$) and their values in lower-case ($\mathbf{x}$). Single variables and their values are written in normal style ($X$, $x$). Specific elements of a set are indicated by a subscript index ($X_i$). Probability distributions over a variable $X$ or a set of variables $\mathbf{X}$ are denoted by $\CProb_X$ and $\CProb_{\mathbf{X}}$ respectively.

Structural Causal Models provide a framework to formalize a notion of causality via graphical models \citep{pearl2009causality}.

\begin{definition}[SCM]\label{def:scm}
\textit{A structural causal model is a tuple $\mathcal{M} \coloneqq \langle \mathbf{V}, \mathbf{U}, \mathbf{F}, P_\mathbf{U} \rangle $ over a set of variables $\mathbf{X} = \{X_1, \dots, X_K\}$ taking values in $\pmb{\mathcal{X}}=\prod_{k\in\{1 \dots K\}}\mathcal{X}_k$ subject to a strict partial order $<_\mathbf{X}$, where 
\begin{itemize}
    \item $\mathbf{V}  =\{X_1, \dots, X_N\} \subseteq \mathbf{X}, N \leq K$ is the set of endogenous variables,
    \item $\mathbf{U} = \mathbf{X} \setminus \mathbf{V} = \{X_{N+1}, \dots, X_K\}$ is the set of exogenous variables,
    \item $\mathbf{F} = \{f_1, \dots, f_N\}$ is the set of deterministic structural equations, i.e., $V_i \coloneqq f_i(\mathbf{X}')$ for $V_i \in \mathbf{V}$ and $\mathbf{X}' \subseteq \{X_j \in \mathbf{X} \mid X_j <_\mathbf{X} V_i\}$,
    \item $P_\mathbf{U}$ is the probability distribution over the exogenous variables $\mathbf{U}$.
\end{itemize}}
\end{definition}

The relationships between the variables as described by $\mathbf{F}$ induce the directed graph $G(\mathcal{M})$ which by definition is acyclic due to $<_\mathbf{X}$.
The exogenous variables $\mathbf{U}$ are usually unobserved.
We say that an SCM $\mathcal{M}$ entails the probability distribution $P_\mathbf{V}^\mathcal{M}$ over the set of endogenous variables $\mathbf{V}$. 

Interventions $\doop(X)$ change the way variables are determined by replacing their respective structural equation $f_i$. In particular, perfect interventions $\doop(X_i = v)$ replace the unintervened $f_i$ by the constant assignment $X_i \coloneqq v$. Every intervention induces a new intervened graph $G(\mathcal{M}_{\doop(V_i=v_i)})$, which we will refer to as $\hat{G}$ for notational brevity. Likewise, every intervened causal model $\mathcal{M}_{\doop(V_i=v_i)}$ entails a new probability distribution $P_\mathbf{V}^{\mathcal{M}_{\doop(V_i=v_i)}}$.

Oftentimes, only a subset of all possible interventions is considered. If not silently omitted, this restriction can be made explicit by modeling the SCM with a set of \emph{allowed interventions} $\mathcal{I}$ \citep{halpern2000axiomatizing, beckers2019abstracting, rubenstein2017causal}. In this paper, we will usually evaluate our models over the set of single perfect interventions:
\begin{equation}
    %\mathcal{I} \subseteq \{\{\doop(X_i=v_i)\}\}_{i \subseteq \{1\dots N\}, \mathbf{v} \in {\pmb{\mathcal{X}}}}
    \mathcal{I} = \{\{\doop(X_i=v_i)\}\}_{i \in \{1, \dots, N\}}.
\end{equation}
Note that for further practical application of our model, training is not restricted to any particular choice of $\mathcal{I}$. We provide additional evaluations inspecting multi-intervention generalization of the model.

%That is, the conditional distributions of unintervened $V_j$ remain unchanged. This then allows for the truncated factorization of the SCM\citep{pearl2009causality}
%\begin{equation}
%    P(\mathbf{V}) = \prod_{j \neq i} P(V_j | \textbf{PA}(V_j))
%\end{equation}

%Until recently all types of probabilistic circuits struggled with mixed discrete and continuous variables. Existing mixed graphical models and circuits \citep{sedgewick2017mixed, molina2018mixed} simply merged probabilities stemming from discrete distributions and densities of continuous variables, leading to an ill-defined measure over the distribution. Only recently \citet{yu2023characteristic} proposed a novel class of characteristic circuits, able to represent both types of variables in a unified fashion.

\begin{figure*}
    \centering
    %\adjustbox{valign=t}{
      %\includegraphics[width=0.55\textwidth]{figures/causalChiSPN_v2.pdf}
      %\hspace{3mm}
      
      %\vrule
      
      %\hspace{3mm}
      %\begin{minipage}[t]{0.40\linewidth}
      %\begin{minipage}[t][5cm][t]{0.40\textwidth}
      %\includegraphics[width=0.40\linewidth]{figures/training.pdf}
      %\vspace{5mm}
      %\end{minipage}
    %}
    \includegraphics[width=0.95\linewidth]{figures/figure2.pdf}
    \caption{\textbf{$\chi$SPN parameters are provided by intervention information (Left).} $\chi$SPN accounts for interventions that change the graph structure and --in consequence-- the intervened probability distribution. The parameterization of the SPN leaves and weights ($\theta$) is predicted by a neural network conditioned on intervention information. \textbf{Training Setup (Right).} Parameters $\theta$ of the $\chi$SPN are trained by matching the predicted $\chi$ distribution at the root node against the $\chi$ distribution computed from interventional data. (Best viewed in color.)}
    \label{fig:mixedChiLearning}
\end{figure*}

%\begin{figure}
%    \centering
%    \includegraphics[width=\linewidth]{figures/causalChiSPN_v2.pdf}
%    \caption{\textbf{$\chi$SPN parameters are provided by intervention information (Left).} $\chi$SPN accounts for interventions that change the graph structure and --in consequence-- the intervened probability distribution. The parameterization of the SPN leaves and weights ($\theta$) is predicted by a neural network conditioned on intervention information.}
%    \label{fig:mixedChiLearning}
%\end{figure}


\textbf{Probabilistic Circuits and Causality.} Several types of probabilistic models exist as of today that allow for varying degrees of tractable inference. Classical SCMs as extensions of Bayesian Networks \citep{pearl1985bayesian} as well as their neural realizations \citep{xia2021causal} suffer from \#P-hard time complexity for exact (and NP-hard complexity for approximate) inference~\citep{eiter2002complexity}. To alleviate parts of this problem, other model choices such as normalizing flows \citep{papamakarios2021normalizing} are picked to approximate the causal distributions \citep{khemakhem2021causal,melnychuk2023normalizing,javaloy2023causal}. These models, however, are not able to perform tractable marginal inference. When required to perform such queries, Sum-Product Networks pose a suitable model class.


%\begin{figure}
%    \centering
%    \includegraphics[width=\linewidth]{figures/training.pdf}
%    \caption{\textbf{Training Setup (Right).} Parameters $\theta$ of the $\chi$SPN are trained by matching the predicted $\chi$ distribution at the root node against the $\chi$ distribution computed from interventional data (Best viewed in color.)}
%    \label{fig:training}
%\end{figure}

\subsection{Characteristic Functions} 
% \footnote{verbatim from cc for now, will edit if needed}
Characteristic functions (CF) provide a unified view for
discrete and continuous RVs through the Fourier–Stieltjes
transform of their probability measures. Let $\boldsymbol{X} \in \mathbb{R}^d$ be a random
vector, the CF of $\boldsymbol{X}$ for $\boldsymbol{t} \in \mathbb{R}^d$ is given as:
%\vspace{-2mm} XXX
\begin{equation} \label{cf:def}
    \varphi_{\boldsymbol{X}}(\boldsymbol{t})=\mathbb{E}\left[\exp \left(\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{X}\right)\right]=\int_{\boldsymbol{x} \in \mathbb{R}^d} \exp \left(\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \mu_{\boldsymbol{X}}(\mathrm{d} \boldsymbol{x}),
\end{equation}
where $\mu_{\boldsymbol{X}}$ is the distribution/probability measure of $\boldsymbol{X}$. 
The following properties of CFs are relevant for the remaining discussion: 
%\vspace{-3mm} XXX
\begin{enumerate}
\itemsep-0.15em
    \item $\varphi_X(0)=1$ and $\left|\varphi_X(t)\right| \leq 1$
    \item any two RVs $X_1$ and $X_2$ have the same distribution iff $\varphi_{X_1}=\varphi_{X_2}$
    % \item if $X$ has $k$ moments, then $\varphi_X$ is $k$-times differentiable
    \item two RVs $X_1, X_2$ are independent iff $\varphi_{X_1, X_2}(s, t)=\varphi_{X_1}(s) \varphi_{X_2}(t)$
\end{enumerate}
% \vspace{-2mm} XXX
% (i) $\varphi_X(0)=1$ and $\left|\varphi_X(t)\right| \leq 1$; (ii) for any two RVs $X_1, X_2$, both have the same distribution iff $\varphi_{X_1}=\varphi_{X_2}$; (iii) if $X$ has $k$ moments, then $\varphi_X$ is $k$-times differentiable; and (iv) two RVs $X_1, X_2$ are independent iff $\varphi_{X_1, X_2}(s, t)=\varphi_{X_1}(s) \varphi_{X_2}(t)$.
We refer to~\citet{Sasvári+2013} for more details of CFs. 

\begin{theorem}[Lévy's inversion theorem~\citep{Sasvári+2013}] 
\label{levy}
Let $X$ be a real-valued random variable, $\mu_X$ its probability measure, and $\varphi_X: \mathbb{R} \rightarrow \mathbb{C}$ its characteristic function. Then for any $a, b \in$ $\mathbb{R}, a<b$, we have that
\begin{equation}
\begin{array}{r}
\lim _{T \rightarrow \infty} \displaystyle \frac{1}{2 \pi} \int_{-T}^T \frac{\exp (-\mathrm{i} t a)-\exp (-\mathrm{i} t b)}{\mathrm{i} t} \varphi_X(t) \mathrm{d} t \\
=\mu_X[(a, b)]+\displaystyle \frac{1}{2}\left(\mu_X(\{a\})+\mu_X(\{b\})\right),
\end{array}
\end{equation}
and, hence, $\varphi_X$ uniquely determines $\mu_X$.
    
\end{theorem} 
\begin{corollary} \label{cor:inv}
If $\int_{\mathbb{R}}\left|\varphi_X(t)\right| \mathrm{d} t<\infty$, then $X$ has a continuous probability density function $f_X$ given by
\begin{equation}
  f_X(x)=\frac{1}{2 \pi} \int_{\mathbb{R}} \exp (-\mathrm{i} t x) \varphi_X(t) \mathrm{d} t .  
  \label{eq:pdf}
\end{equation}

    
\end{corollary}
    

Note that not every probability measure admits an analytical solution to Eq.~\ref{eq:pdf}, e.g., only special cases of $\alpha$-stable distributions have a closed-form density function~\citep{Nolan2013}, and numerical integration might be needed.


\section{$\chi$SPN}

We build upon the construction of interventional sum-product networks (iSPN) by \citet{zevcevic2021interventional}.
We estimate $p\left(V_i \mid \doop\left(\mathbf{U}_j=\mathbf{u}_j\right)\right)$ by learning a function approximator $f(\mathbf{G} ; \boldsymbol{\theta})$ (e.g., a neural network), which takes as input the (mutilated) causal graph $\mathbf{G} \in\{0,1\}^{N \times N}$ encoded as an adjacency matrix, to predict the parameters $\boldsymbol{\psi}$ of an SPN $g(\mathbf{D} ; \boldsymbol{\psi})$ that estimates the density of the given data matrix $\left\{\mathbf{V}_i\right\}_{i = 1}^K=\mathbf{D} \in \mathbb{R}^{K \times N}$.

When the iSPN is trained end to end on the log likelihood of the training data, the log densities computed at leaves modeling both discrete and continuous variables are propagated up the network to the root of the SPN. When a common class of leaves is used, say one parameterized by a normal distribution, it acts as a suboptimal way to model discrete variables that do not quite fit this class of normals. Even different distributions at the leaves may not fully be able to capture the joint distribution of a heterogeneous group of variables, since a sum-product combination of different kinds of discrete and continuous densities is likely to result in some variables overshadowing the others in the value computed at the root of the SPN. Moreover, we are restricted to using only those parametric distributions at the leaves that have a closed form density function. This is true only in the case of special $\alpha$-stable distributions \citep{Nolan2013}. We aim to address these problems with our proposed $\chi$SPN that can be defined as follows:
\begin{definition}[$\chi$ Sum-Product Network]
    \textit{A $\chi$SPN $\mathcal{C}$ is the joint model $\mathcal{C}(\mathbf{G}, \mathbf{D})=(g_{\varphi}, g_\mu)(\mathbf{D} ; \boldsymbol{\psi}=f(\mathbf{G} ; \boldsymbol{\theta}))$, where $g(\cdot)$ is an SPN that learns the population characteristic function $\varphi$ during training and estimates the interventional density $\mu$ during inference. The function $f(\cdot)$ is a function approximator and $\boldsymbol{\psi}=f(\mathbf{G} ; \boldsymbol{\theta})$ are shared parameters.}
\end{definition}

A $\chi$SPN is capable of answering interventional queries and, most importantly, allows working with mixed data, i.e., data where variables of both discrete and continuous distributions are present. Fig.~\ref{fig:mixedChiLearning} (left) shows the overall process of the underlying probability prediction by $\chi$SPN. The parameterization of the $\chi$SPN leaves and weights is predicted by a neural network conditioned on intervention information.

% \vspace{-4mm}

\subsection{$\chi$SPN Structure}

Inspired by \citet{yu2023characteristic}, we modify our iSPN to learn the characteristic function $\varphi_{\boldsymbol{X}}(\boldsymbol{t})$ of the joint density. To this end, we make the leaves of the network learn the CF of a univariate distribution, to model a particular random variable. We modify the calculations at the product and sum nodes as follows.
%\vspace{-2mm} XXX
\noindent \paragraph{Product Nodes.} Decomposability of $\chi$SPN implies that a product node encodes the independence of its children. Let $X$ and $Y$ be two RVs. Following property (3) of CFs, the CF of $X, Y$ is given as $\varphi_{X, Y}(t, s)=\varphi_X(t) \varphi_Y(s)$, if and only if $X$ and $Y$ are independent. Therefore, since the children of a product node all have different scopes, with $\boldsymbol{t}=\bigcup_{\mathrm{N} \in \operatorname{ch}(\mathrm{P})} \boldsymbol{t}_{\mathrm{sc}(\mathrm{N})}$, the characteristic function of product nodes is defined as:
\begin{equation}
\varphi_{\mathrm{P}}(\boldsymbol{t})=\prod_{\mathrm{N} \in \operatorname{ch}(\mathrm{P})} \varphi_{\mathrm{N}}\left(\boldsymbol{t}_{\mathrm{sc}(\mathrm{N})}\right), 
\end{equation}
where $\mathrm{sc}$ denotes the scope of a node.

\noindent \paragraph{Sum Nodes.} Completeness of $\chi$SPN implies that a sum node encodes the mixture of its children. Let the weights of a sum node $\mathrm{S}$ satisfy $\sum_{\mathrm{N} \in \operatorname{ch}(\mathrm{S})} w_{\mathrm{S}, \mathrm{N}}=1$ and $w_{\mathrm{S}, \mathrm{N}} \geq 0, \forall \mathrm{S}, \mathrm{N}$, and let $p_{\mathrm{S}}=|\mathrm{sc}(\mathrm{S})|$ denote the number of variables in its scope. Since all the children of a sum node $\mathrm{S}$ have the same scope, the CF at a sum node is:
\begin{equation}
\begin{aligned}
\varphi_{\mathrm{S}}(\boldsymbol{t})=& \int_{\boldsymbol{x} \in \mathbb{R}^{p_{\mathrm{S}}}} \exp \left(\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right)\left[\sum_{\mathrm{N} \in \operatorname{ch}(\mathrm{S})} w_{\mathrm{S}, \mathrm{N}} \mu_{\mathrm{N}}(\mathrm{d} \boldsymbol{x})\right] \\
= & \sum_{\mathrm{N} \in \operatorname{ch}(\mathrm{S})} w_{\mathrm{S}, \mathrm{N}} \underbrace{\int_{\boldsymbol{x} \in \mathbb{R}^{p_{\mathrm{S}}}} \exp \left(\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \mu_{\mathrm{N}}(\mathrm{d} \boldsymbol{x})}_{=\varphi_{\mathrm{N}}(\boldsymbol{t})} .
\end{aligned}
\end{equation}
\noindent \paragraph{Leaf Nodes.}
For discrete RVs, we utilize categorical distributions and for continuous RVs, we use $\alpha$-stable distributions. A more detailed discussion on the leaf types can be found in Appendix \ref{suppl:leaf}.
% Besides, the empirical CF leaf $\varphi_{\mathrm{L}_{\mathrm{ECF}}}(t)=\frac{1}{n} \sum_{j=1}^n \exp \left(\mathrm{i} t x_j\right)$ is also employed as a non-parametric leaf.
\subsection{Expressivity}
The shared parameters $\psi$ of the $\chi$SPN allow learning of the joint distribution for any dataset $\mathbf{D}_{\hat{G}}$ conditioned on the mutilated causal graph $\hat{G}$, that contains information about the interventions. Neural networks have been shown to act as causal sub-modules e.g.\ \citet{ke2019learning} used a cohort of neural nets to represent a set of structural equations which in turn represent an SCM, providing grounding to the idea of having parameters being estimated from $f$. 

The $\chi$SPN can also model any interventional distribution $p_G\left(\mathbf{V} \mid \doop\left(\mathbf{U}\right)\right)$, permitted by an SCM through interventions to construct the mutilated causal graph $\hat{G}$ by modelling the conditional distribution $p_{\hat{G}}\left(\mathbf{V} \mid \mathbf{U}\right)$. This follows from \citet{pearl2009causality} since $p_G\left(\mathbf{V}_i=\mathbf{v}_i \mid \doop\left(\mathbf{U}_j=\mathbf{u}_j\right)\right) = p_{\hat{G}}\left(\mathbf{V}_i=\mathbf{v}_i \mid \mathbf{U}_j=\mathbf{u}_j\right)$. The SPN can learn the joint probability $p({X}_1 \ldots {X}_n)$ on the $\mathbf{D}_{\hat{G}}$ generated post-intervention and is thus causally adequate. The question of availability of experimental data is an orthogonal one. While in many applications we do not have access to sets of experiments e.g., because of monetary or ethical reasons, many other applications in science can in fact provide said sets of experiments e.g., high-throughput biology.

\subsection{Learning}

%\begin{figure}[t]
%    \centering
%    \includegraphics[width=0.9\linewidth]{figures/training.pdf}
%    \caption{\textbf{Training Setup.} Parameters $\theta$ of the $\chi$SPN are trained by matching the predicted $\chi$ distribution at the root node against the $\chi$ distribution computed from interventional data. (Best viewed in color.)
%    }
%    \label{fig:training}
%\end{figure}

The $\chi$SPN is learned from a set of mixed distribution heterogeneous samples generated from simulating interventions on the underlying SCM. Instead of maximising the log-likelihood, which is not guaranteed to be tractable, we aim to learn the CF for the distribution corresponding to a given intervention. We use the Empirical Characteristic Function (ECF) \citep{ecf77} which has been proven to be an unbiased and consistent estimator of the population characteristic function. Given data $\left\{\boldsymbol{x}_j\right\}_{j=1}^n$ the ECF is given as
\begin{equation}
    \hat{\varphi}_{\mathbb{P}}(\boldsymbol{t})=\frac{1}{n} \sum_{j=1}^n \exp \left(\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}_j\right).
\end{equation}

% XXX fig:training
The overall goal of learning, as shown in Fig.~\ref{fig:mixedChiLearning} (right), is to approximate, as closely as possible, the underlying characteristic function of the intervened data (which we call the $\chi$ distribution\footnote{Not to be confused with the Chi distribution, which is the distribution of the positive square root of a sum of squared independent standard normal random variables.}).
 % using the interventional SPN with characteristic functions at its leaves.

\noindent \paragraph{Evaluation Metric.} A measure of the closeness of two distributions represented by their characteristic functions is the squared characteristic function distance (CFD). The squared CFD
between two distributions $\mathbb{P}$ and $\mathbb{Q}$ is defined as
\begin{equation} \label{eq:cfd1}
\mathrm{CFD}_\omega^2(\mathbb{P}, \mathbb{Q})=\int_{\mathbb{R}^d}\left|\varphi_{\mathbb{P}}(\boldsymbol{t})-\varphi_{\mathbb{Q}}(\boldsymbol{t})\right|^2 \omega(\boldsymbol{t} ; \eta) \mathrm{d} \boldsymbol{t}    ,
\end{equation}
where $\omega(\boldsymbol{t} ; \eta) > 0$ is a weighting function parameterized by $\eta$ that guarantees the integral in Eq. \ref{eq:cfd1} converges. When $\omega(\boldsymbol{t} ; \eta)$ is a probability density function, Eq. \ref{eq:cfd1} can be rewritten as an expectation over $\boldsymbol{t}$ sampled from $\omega$:

\begin{equation} \label{eq:cfd2}
\mathrm{CFD}_\omega^2(\mathbb{P}, \mathbb{Q})=\mathbb{E}_{\boldsymbol{t} \sim \omega(\boldsymbol{t} ; \eta)}\left[\left|\varphi_{\mathbb{P}}(\boldsymbol{t})-\varphi_{\mathbb{Q}}(\boldsymbol{t})\right|^2\right]. 
\end{equation}

\citet{sriperumbudur2010relation} showed, using the uniqueness theorem of CFs, that $\mathrm{CFD}_\omega(\mathbb{P}, \mathbb{Q})=0$ iff $\mathbb{P}=\mathbb{Q}$, which motivates our choice of this distance metric. We refer to \citet{ansari2020characteristic} for a detailed discussion on CFD.

Our learning objective is then to minimise the squared characteristic function distance between the characteristic function estimated at the root of $\chi$SPN and the ECF of the intervened dataset:

\begin{equation} \label{cfd:ecf}
\begin{split} 
    &\mathrm{CFD}^2(\mathcal{C}, \hat{\mathbb{P}}_{I}) = \displaystyle \mathbb{E}_{\boldsymbol{t}}\left[\left|\varphi_{\mathcal{C}}(\boldsymbol{t}) - \mathbb{E}_{\mathbf{x}_I} \exp{(\mathrm{i} \boldsymbol{t}^{\top}\boldsymbol{x})}\right|^2\right] \\
    &\approx \frac{1}{k} \sum_{j=1}^k\left|\varphi_{\mathcal{C}}\left(\boldsymbol{t}_j\right)-\frac{1}{n} \sum_{i=1}^n \exp \left(\mathrm{i} \boldsymbol{t}_j^{\top} \boldsymbol{x}_i\right)\right|^2,
\end{split}
\end{equation}

where $n$ is the number of data points, $k$ is the number of Monte Carlo samples used to estimate the expectation in Eq.~\ref{eq:cfd2}, and $\boldsymbol{t}_j$ are samples from $\omega(\boldsymbol{t} ; \eta)$, for which we use $\mathcal{N}\left(\mathbf{0}, \operatorname{diag}\left(\eta^2\right)\right)$ throughout this paper.
%where $k$ is the number of MCMC samples we evaluate the function on, $n$ is the number of data points and $\boldsymbol{t}_j \sim \mathcal{N}\left(\mathbf{0}, \operatorname{diag}\left(\eta^2\right)\right)$ here. 
Applying Sedrakyan's inequality to Eq.~\ref{cfd:ecf}, the parameter learning can be performed batch-wise~\citep{yu2023characteristic}. A parameter update step backpropagates through the CFD evaluated on a batch corresponding to a single intervention $I$.

It is important to note here that our contribution, in the form of structure and algorithm, for $\chi$SPN isn't a straightforward combination of interventional SPNs with Characteristic Circuits (CCs). The training procedures of CCs and iSPNs are very different: an iSPN accepts conditional input about interventions, whereas the parameters of a CC are not conditioned on any input. In order to adapt iSPNs to the spectral domain, we need the model to learn the joint interventional density, and for that we make the root of the $\chi$SPN learn the characteristic function of the interventional distribution. We cannot simply introduce characteristic leaves in an iSPN and later transform it into a density (through inversion, Section~\ref{sec:inv}) for the purpose of learning with the log-likelihood of the observed interventional data.

\subsection{Tractability of Inference}\label{sec:inv}
Through their recursive nature, $\chi$SPN allow efficient computation of densities in a high dimensional setting even if closed form densities don't exist. To get the joint probability density over all random variables in the SCM, we perform inversion of the characteristic function at the root of the network, for which we use an extension of Corollary \ref{cor:inv}. 

\begin{lemma}[Inversion] \label{lem:inv}
Let $\mathcal{C}$ be a $\chi$SPN modeling the distribution of RVs $\mathbf{X} = \{X_j\}_{j = 1}^d$ and employing univariate leaf nodes. If $\int_{\mathbb{R}}\left|\varphi_{\mathrm{L}}(t)\right| \mathrm{d} t<\infty$ for every leaf $\mathrm{L}$, then $\boldsymbol{X}$ has a continuous probability density function $f_{\boldsymbol{X}}$ given by the integral on the $d$-dimensional space $\mathbb{R}^d$, \textit{i.e.},
\begin{equation} \label{eq:inv}
f_{\boldsymbol{X}}(\boldsymbol{x})=\frac{1}{(2 \pi)^d} \underbrace{\int_{\boldsymbol{t} \in \mathbb{R}^d} \exp \left(-\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \varphi_{\mathcal{C}}(\boldsymbol{t}) \lambda_d(\mathrm{d} \boldsymbol{t})}_{=\hat{f}_{\mathcal{C}}(\boldsymbol{x})},
\end{equation}
where $\varphi_{\mathcal{C}}(\boldsymbol{t})$ denotes the CF defined by the root of the $\chi$SPN and $\lambda_d$ is the Lebesgue measure on $\left(\mathbb{R}^d, \mathcal{B}\left(\mathbb{R}^d\right)\right)$.
\end{lemma}

\begin{figure*}
    \centering
    %\includegraphics[width=0.75\linewidth]{figures/datasets_full.pdf}
    \includegraphics[width=0.9\linewidth]{figures/datasets_graphs_wide2.pdf}
    \caption{\textbf{Evaluated Mixed Datasets.} All $\chi$SPN are trained and evaluated on three mixed type data sets. Hiring and Student data sets contain a mix of continuous (indicated via black circles) and discrete (indicated via green squares) variables within an exemplary causal process. Causal Health Classification features the important special case of a categorization process resulting in three discrete diagnosis variables which are derived from all-continuous observations. (Best viewed in color.)}
    \label{fig:datasetFigure}
\end{figure*}

We can recursively compute Eq. \ref{eq:inv} for every node. Thus, inversion at every inner node reduces to inversion at its children.
We can invoke Corollary \ref{cor:inv} to obtain density measures at leaves, since they model a univariate distribution:
%\vspace{-3mm}
\begin{equation}
    \hat{f}_{\mathrm{L}}(x)=2 \pi f_{\mathrm{L}}(x)=\int_{\mathbb{R}} \exp (-\mathrm{i} t x) \varphi_X(t) \lambda(\mathrm{d} t).
\end{equation}

Let $p_\mathrm{N}$ be the number of variables in the scope of node $\mathrm{N}$.

\textbf{Sum Nodes.} Using the completeness property of SPNs, for a sum node $\mathrm{S}$:
\begin{equation}
\begin{split}
&\hat{f}_{\mathrm{S}}(\boldsymbol{x})=\int_{\boldsymbol{t} \in \mathbb{R}^{p_{\mathrm{S}}}} \exp \left(-\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \varphi_{\mathrm{S}}(\boldsymbol{t}) \lambda_{p_{\mathrm{S}}}(\mathrm{d} \boldsymbol{t}) \\
&=\sum_{\mathrm{N} \in \operatorname{ch}(\mathrm{S})} w_{\mathrm{S}, \mathrm{N}} \underbrace{\int_{\boldsymbol{t} \in \mathbb{R}^{p_{\mathrm{S}}}} \exp\left(-\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \varphi_{\mathrm{N}}(\boldsymbol{t}) \lambda_{p_{\mathrm{S}}}(\mathrm{d} \boldsymbol{t})}_{=\hat{f}_{\mathrm{N}}(\boldsymbol{x})},
\end{split}
\end{equation}
which is the weighted sum of inversions at its children.

\textbf{Product Nodes.} Using the decomposability property of SPNs, and the fact that a product node $\mathrm{P}$ models a product distribution assuming independence among its children,
%\vspace{-1mm}
\begin{equation}
\begin{split}
&\hat{f}_{\mathrm{P}}(\boldsymbol{x})=\int_{\boldsymbol{t} \in \mathbb{R}^{p_{\mathrm{P}}}} \exp \left(-\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \varphi_{\mathrm{P}}(\boldsymbol{t}) \lambda_{p_{\mathrm{P}}}(\mathrm{d} \boldsymbol{t}) \\
&=\prod_{\mathrm{N} \in \operatorname{ch}(\mathrm{P})} \underbrace{\int_{\boldsymbol{s} \in \mathbb{R}^{p_{\mathrm{N}}}}\exp\left(-\mathrm{i} \boldsymbol{s}^{\top} \boldsymbol{x}_{[\mathrm{sc}(\mathrm{N})]}\right) \varphi_{\mathrm{N}}(\boldsymbol{s}) \lambda_{p_{\mathrm{N}}}(\mathrm{d} \boldsymbol{s})}_{=\hat{f}_{\mathrm{N}}\left(\boldsymbol{x}_{[\mathrm{sc}(\mathrm{N})]}\right)},
\end{split}
\end{equation}
where $\boldsymbol{x}_{[\mathrm{sc}(\mathrm{N})]}$ is the subset of dimensions in $\boldsymbol{x}$ that belong to the scope of its child $\mathrm{N}$. The product appears as an application of Fubini's theorem~\citep{fubini1907sugli} for product measures.

Numerical integration may be needed for such measures when there is no closed form density at the leaves. 
A good approximation technique for the inversion at $\alpha$-stable leaves is the Gauss-Hermite quadrature~\citep{hildebrand1987introduction}, since the integral over the entire domain $(-\infty, \infty)$ is intractable. We approximate the integral $\int_{\mathbb{R}} \exp (-\mathrm{i} t x) \varphi_X(t) \mathrm{d} t$ with a weighted sum (with weights $w_i$) of function values at certain sample points ($t_i$) as 
\begin{equation}
    \int_{-\infty}^{+\infty} e^{-t^2} \left( \underbrace{e^{t^2} \exp (-\mathrm{i} t x) \varphi_X(t)}_{h(t)} \right) \mathrm{d} t \approx \sum_{i=1}^n w_i h\left(t_i\right),
\end{equation}
where $n$ is the number of sample points used (typically $<100$). This is tractable since the closed form of $\varphi_X(t)$ and by extension $h(t)$ for the $\alpha$-stable leaf is known.

%\vspace{-4mm}
% \noindent \paragraph{Marginals.} $\chi$SPNs allow efficient computation of marginal densities. Given a CF on $\mathrm{RVs}\  \boldsymbol{Z}=\boldsymbol{X} \cup \boldsymbol{Y}$, we can obtain the marginal CF over $\boldsymbol{X}$ as follows. Let $n=|\boldsymbol{X}|, m=|\boldsymbol{Y}|, \boldsymbol{t}=\boldsymbol{t}_{\boldsymbol{X}} \cup \boldsymbol{t}_{\boldsymbol{Y}} \in \mathbb{R}^{n+m}$ and let the characteristic function of the joint be given by $\varphi_{\mathcal{C}}\left(\boldsymbol{t}_{\boldsymbol{X}}, \boldsymbol{t}_{\boldsymbol{Y}}\right)=\int_{\boldsymbol{x} \in \mathbb{R}^{n+m}} \exp \left(\mathrm{i} \boldsymbol{t}^{\top} \boldsymbol{x}\right) \mu_{\mathrm{S}}(\mathrm{d} \boldsymbol{x})$, where $\mu_{\mathrm{S}}$ denotes the distribution of the root. Then following property (i) of CFs, the marginal CC over $\boldsymbol{X}$ is given by setting $\boldsymbol{t}_{\boldsymbol{Y}}=\mathbf{0}$, since $[t_X\ 0]^{\top}[x\ y] = t_X^{\top}x$.

% <Analyse the time complexity etc and linear size of network thing for tractability of inference>

\subsection{$\chi$SPN is a Universal Function Approximator}
The weights of the $\chi$SPN are parameterised by gating functions and distribution parameters and this allows them to induce universal approximators. By using threshold functions, $\theta^{+}\mathbb{I}(x_i \geq c) + \theta^{-}\cdot \neg \mathbb{I}(x_i \geq c), c \in \mathbb{R}$, one can encode testing arithmetic circuits~\citep{choi-ac}, which are proven universal approximators. This renders $\chi$SPN universal approximators by design. Moreover, the use of characteristic functions allows the leaves of the network to theoretically model all probability distributions, including those that do not have a density function.

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.93\linewidth]{figures/hiring-paper.pdf}
    \vspace{2mm}
    
    \includegraphics[width=0.93\linewidth]{figures/student-paper.pdf}
    \vspace{2mm}
    
    \includegraphics[width=0.93\linewidth]{figures/chc-paper.pdf}
    \caption{\textbf{Approximation of Interventional Densities.} Plots feature the approximated densities of continuous variables for different interventional distributions. Marginalized ground truth distributions (plotted as bar diagrams) and $\chi$SPN approximations (red line) are shown. Modes of the distributions are generally well matched across most plots. Deviations from ground truth show at distribution boundaries as artifacts of the $\chi$ function discretization. (Best viewed in color.)}
    \label{fig:interventionFigure}
\end{figure*}

\section{Experimental Evaluation}
$\chi$SPN are tailored towards handling causal graphs with hybrid data, i.e., graphs consisting of random variables drawn from both discrete and continuous distributions. Our experiments thus focus on capturing the interventional distributions within such exemplary causal processes. We aim to answer the following questions:
\begin{enumerate}
    \item[\textbf{(Q1)}] Can $\chi$SPN effectively estimate the joint probability of the heterogeneous variables conditioned on arbitrary interventions?
    \item[\textbf{(Q2)}] How well does $\chi$SPN capture individual interventional distributions?
    \item[\textbf{(Q3)}] Does $\chi$SPN generalize to multiple interventions?
\end{enumerate}

Before presenting our results and answering these questions we briefly describe the data generating process.
% To answer these questions we construct 3 causal graphs. The extended causal health dataset extends the causal health dataset from 

\textbf{Data Generating Process.}
To evaluate $\chi$SPN's ability to model arbitrary interventional distributions on heterogeneous data, we curate 3 synthetic datasets, comprising an extension to the causal health dataset from~\citet{zevcevic2021interventional} and two new causal datasets themed around hiring practices and student performance. The SCMs for the 3 datasets are outlined with the variables and their corresponding domains in Fig.~\ref{fig:datasetFigure}. We generate 100,000 data points for the extended causal health dataset and 120,000 for the hiring and student datasets. We also use different types of distributions for the noise, such as Gaussian and Pareto noise, across all datasets. The train/test split is 80\%/20\%. A detailed description of the datasets with the underlying distributions can be found in Appendix~\ref{suppl:data}.

% \begin{figure}
%     \centering
%     \includegraphics[width=0.95\linewidth]{figures/i2StudA.pdf}
    
%     \caption{\textbf{Generalization to multi-intervention setting.} Although only trained on single intervention data, $\chi$SPN can generalize to multi intervention estimation. The plots show two exemplary density predictions for the Student data set under two simultaneously applied interventions. As with in the single intervention case, distributions shapes and modes are roughly matched. A slight increase of errors over single intervention setting can be observed. Additional combinations of interventions as well as plots for the Hiring data set found in Figures~\ref{fig:mulinterventionFigureStudent} and \ref{fig:mulinterventionFigureHiring} in the appendix. (Best viewed in color.)}
%     \label{fig:mulinterventionFigure}
% \end{figure*}
\begin{figure*}
\centering
%\begin{minipage}[c]{\linewidth}

\begin{minipage}[c]{0.4\linewidth}
\centering
\hspace{3mm}
\textbf{Causal Health Classification}
\begin{table}[H]
\hspace{9mm}
%\begin{tabular}{|c|c|c|c|}
\begin{tabular}{c|c c c}
%\hline
Interv. & D1 & D2 & D3 \\ \hline
Observ.                              &  76.8\%  &  69.4\%  &  53.8\%  \\ \hline
do(A)                                &  76.6\%  &  69.6\%  &  53.8\%  \\ %\hline
do(F)                                &  83.3\%  &  63.0\%  &  53.7\%  \\ %\hline
do(H)                                &  78.9\%  &  64.0\%  &  57.0\%  \\ %\hline
do(M)                                &  84.7\%  &  46.0\%  & 69.3\%  \\ %\hline
\end{tabular}
\end{table}
\end{minipage}
\begin{minipage}[c]{0.3\linewidth}
\centering
\textbf{Hiring}
\begin{table}[H]
\hspace{5mm}
%\begin{tabular}{|c|c|c|}
\begin{tabular}{c|c c}
%\hline
Interv. & E   & D \\ \hline
Observ.                              &  64.2\%   &  85.9\% \\ \hline
do(I)                                &  64.1\%   &  65.2\% \\ %\hline
do(E)                                & N/A &  84.0\% \\ %\hline
do(Sk)                               &  64.5\%   &  61.1\% \\ %\hline
do(B)                                &   64.6\%  &  86.7\% \\ %\hline
\end{tabular}
\end{table}
\end{minipage}
\begin{minipage}[c]{0.19\linewidth}
\centering
\textbf{Student}

\begin{table}[H]
\hspace{3mm}
%\begin{tabular}{|c|c|}
\begin{tabular}{c|c}
%\hline
Interv. & S \\ \hline
Observ.                              & 58.6\%  \\ \hline
do(Q)                                & 56.5\%  \\ %\hline
do(M)                                & 54.6\%  \\ %\hline
do(C)                                & 59.5\%  \\ %\hline
do(T)                                & 56.4\%  \\ %\hline
\end{tabular}
\end{table}
\end{minipage}

\caption{\textbf{Accuracies of Discrete Variable Prediction.} Tables contain the prediction accuracies over all discrete variables of the data sets. Results for observational and interventions on the remaining (unintervened) continuous variables are presented.}
\label{fig:discreteResults}

%\end{minipage}
\end{figure*}

\textbf{Underlying model.} 
We use the RAT-SPN~\citep{peharz2020random} as a structural base upon which we build our own model. We do not perform structure learning of the $\chi$SPN and instead adopt a randomised splitting strategy at the nodes. We chose RAT-SPN as it follows a simple randomized procedure for structuring an SPN, thus providing a significant computational advantage over explicit structure learning. The shared parameters $\boldsymbol{\psi}$ of the $\chi$SPN are predicted from a fully connected neural network with 2 hidden layers, with different final layer for gate and leaf parameters. We choose $n = 50$ as the number of sample points used in the Gauss-Hermite quadrature.

\textbf{Capability of $\chi$SPN for handling hybrid domains (Q1).} We test our $\chi$SPN on the 3 synthetic datasets described above with interventions on both discrete and continuous variables. We compare $\chi$SPN with the baseline model of iSPN~\citep{zevcevic2021interventional} for the continuous case. Fig.~\ref{fig:interventionFigure} shows the captured interventional distributions for the continuous variables. Note that, due to lack of space, we show only 2 variables per dataset here with the complete results shown in Appendix~\ref{suppl:datasetInt} in Figs.~\ref{fig:chc_complete}, \ref{fig:student_complete} and \ref{fig:hiring_complete}. It can be seen that our $\chi$SPN captures the modes of the distributions well across the datasets. Deviations from ground truth at the distribution boundaries can be attributed to the artifacts of the $\chi$ function discretization. The baseline iSPN generally over- or underestimates the underlying distributions. This is expected since iSPNs cannot handle mixed distributions. 

Furthermore, Fig.~\ref{fig:discreteResults} shows the accuracies of discrete variable prediction after intervention on the continuous variables. Since we consider the discrete variables to be the target variables we calculate the accuracies of the discrete value being correctly predicted. Please note that the discrete variables are not always binary. For instance, in the hiring dataset the variable E (education) can take 7 values and in the student dataset the variable S (selection) can take 3 values. For education (E) prediction in the hiring dataset, the top-3 accuracy is reported. The results show that $\chi$SPN can effectively capture the discrete variable values after intervening on the continuous variables. We can thus answer (Q1) affirmatively: $\chi$SPN can handle interventions on mixed datasets, thereby making them applicable to hybrid domains.

% \vspace{-0.1in}

\begin{figure*}[!htb]
    \centering
    
    \includegraphics[width=0.9\linewidth]{figures/i2StudA.pdf}
    
    % \vspace{2mm}
    \includegraphics[width=0.9\linewidth]{figures/i2StudB.pdf}
    
    % \vspace{2mm}
    \includegraphics[width=0.9\linewidth]{figures/i2StudC.pdf}
    
    \caption{\textbf{Generalization to multi-intervention.} Although only trained on single intervention data, $\chi$SPN can generalize to multi intervention estimation. The plots show six combinations of density predictions under two simultaneously applied interventions for the Student data set. As with the single intervention case, modes are generally matched well or offset slightly. Distribution shapes generally match, but show increased errors as distributions flatten out. (Best viewed in color.)}
    \label{fig:mulinterventionFigure}
\end{figure*}

\begin{figure*}[!htb]
    \centering
    
    \includegraphics[width=0.9\linewidth]{figures/i2HireA.pdf}
    
    \vspace{2mm}
    
    \includegraphics[width=0.9\linewidth]{figures/i2HireB.pdf}
    
    \caption{\textbf{Multi-Intervention Density Estimation (Hiring Data Set).} The plots show four combinations of density predictions under two simultaneously applied interventions for the Hiring data set. As with the single intervention case, modes are generally matched well with occasional slight offsets. However, in contrast to the Student dataset some intervention combinations (B,E and Sk,B) feature an increased mismatch to ground-truth for `\textit{W}ork Experience'. (Best viewed in color.)}
    \label{fig:mulinterventionFigureHiring}
\end{figure*}

\textbf{Observational vs Interventional (Q2).} In the following we inspect the ability of our $\chi$SPN to truthfully capture individual interventional distributions. Depending on the number of variables in a graph, every individual interventional distribution is only seen in a small fraction of the samples. For the Hiring and Causal Health Classification, which both contain 7 variables, each intervention is only visible in $\sim14\%$ of the training data. Even within these samples all causal mechanisms --except the intervened one-- are computed in the standard `observational' behaviour, increasing bias towards observational behaviour. While being powerful density approximators, there exists a chance that $\chi$SPN overfit to the observational distribution in practice for the stated reasons.

Comparing observational (Fig.~\ref{fig:observationalDensities} in the appendix) and interventional (Fig.~\ref{fig:interventionFigure}) density estimates, we find that no strong degradation in performance is observed when switching from observational to interventional estimation. In particular, the modes of all distributions seem to be matched well. For the Hiring and Causal Health data sets, slight biases in mode prediction (Hiring variables W, Sk, I; Causal Health variables F, H) are learned for observational data. Qualitatively, we find that the discretization artifacts---present at boundaries of the interventional distributions---are strongly reduced for observational data. As before, this can be a consequence of the higher supply of observational data.

For discrete variables we compare observational results in the first rows of Fig.~\ref{fig:discreteResults} against the accuracies of intervention graphs. Recall that intervening on any variable in the graph does change the actual underlying distribution. Depending on the intervened variable, the discrete variables might, therefore, become easier or harder to predict. While accuracies vary across different interventions, we observe no severe performance degradations. Overall, we can now answer (Q2): $\chi$SPN do not suffer from sampling bias towards observational data and predict interventional distributions equally well.

\textbf{Multiple Interventions (Q3).} To test the generalization capabilities of $\chi$SPN, we test the model trained on a single intervention to estimate the distributions captured from multiple interventions. Fig.~\ref{fig:mulinterventionFigure} shows the results for the Student dataset with the application of intervention on two variables simultaneously. For example, the top left 2 graphs capture the distributions after intervention on the cultural activities and test scores variables~(C,T) while the top right 2 graphs capture the distributions after intervention on motivation and cultural activities (M,C). As can be seen, $\chi$SPN is able to faithfully capture the interventional distributions, thereby showing the generalization capability of our approach. Additional results, found in Fig.~\ref{fig:mulinterventionFigureHiring}, for the Hiring dataset confirm this finding. As before, there are slight biases in the captured distributions, but these can be attributed to the discretization of the characteristic function. This answers (Q3) affirmatively: $\chi$SPN can efficiently generalize to multiple interventions.

A question might arise here: how does $\chi$SPN generalize to unseen interventions? As the SCM structure is not enforced within the NN, relations have to be learned from a suitable data presentation. Clearly, if a variable is never intervened on, the NN cannot correlate the input of the intervention signal to an appropriate weight vector for the SPN. Assuming that interventions on a variable are trained, but novel values are observed, general assumptions about neural networks extrapolating/generalizing to novel out-of-distribution values apply~\citep{zhang2021deep,liu2021towards}. 

As far as SPNs are concerned, they can be made robust to out-of-distribution data by modeling uncertainty quantification~\citep{ventola2023probabilistic}. This is achieved by a tractable dropout inference (TDI) procedure to estimate uncertainty by deriving an analytical solution to Monte Carlo dropout through variance propagation. Thus, $\chi$SPN can be extended to be more robust towards unseen interventions.

% \begin{figure*}
%     \centering
    
%     \includegraphics[width=0.9\linewidth]{figures/i2StudA.pdf}
    
%     \vspace{2mm}
%     \includegraphics[width=0.9\linewidth]{figures/i2StudB.pdf}
    
%     \vspace{2mm}
%     \includegraphics[width=0.9\linewidth]{figures/i2StudC.pdf}
    
%     \caption{\textbf{Multi-Intervention Density Estimation (Student Data Set).} The plots show six combinations of density predictions under two simultaneously applied interventions for the Student data set. As with the single intervention case, modes are generally matched well or offset slightly. Distribution shapes are generally matches, but show increased errors as distributions flatten out. (Best viewed in color.)}
%     \label{fig:mulinterventionFigureStudent}
% \end{figure*}

\section{Conclusion}
We presented $\chi$SPN, the first causal models that are capable of efficiently inferring causal quantities (i.e., interventional distributions) in presence of mixed data. $\chi$SPN transforms discrete and continuous variables into
a shared spectral domain using characteristic functions in the leaves of the interventional SPN. This enables $\chi$SPN to capture the interventional distributions effectively. In addition, we show that $\chi$SPN are able to generalize to multiple interventions while being trained only on a single intervention data thereby showing the generality of our proposed approach.

As most real-world data is mixed by nature, immediate future work includes testing the $\chi$SPN on such real world data sets. Incorporating rich expert domain knowledge, alongside observational data, proves crucial for achieving robust causal inference. Extending our method to integrate such expertise becomes imperative for enhancing the accuracy and reliability of causal models. Also, scaling $\chi$SPN to very large data sets is essential for their adaptation to complex real world scenarios. 


% \section*{Software and Data}

% If a paper is accepted, we strongly encourage the publication of software and data with the
% camera-ready version of the paper whenever appropriate. This can be
% done by including a URL in the camera-ready copy. However, \textbf{do not}
% include URLs that reveal your institution or identity in your
% submission for review. Instead, provide an anonymous URL or upload
% the material as ``Supplementary Material'' into the OpenReview reviewing
% system. Note that reviewers are not required to look at this material
% when writing their review.

% Acknowledgements should only appear in the accepted version.
\section*{Acknowledgements}
The TU Darmstadt authors acknowledge the support of the German Science Foundation (DFG) project “Causality, Argumentation, and Machine Learning” (CAML2, KE 1686/3-2) of the SPP 1999 “Robust Argumentation Machines” (RATIO). It benefited from the Hessian Ministry of Higher Education, Research, Science and the Arts (HMWK; projects “The Third Wave of AI” and “The Adaptive Mind”),
the Collaboration Lab ``AI in Construction'' (AICO) of the TU Darmstadt and HOCHTIEF,
and the Hessian research priority program LOEWE within the project “WhiteBox”. This work was partly funded by the ICT-48 Network of AI Research Excellence Center ``TAILOR'' (EU Horizon 2020, GA No 952215). This work was supported by the Federal Ministry of Education and Research (BMBF) Competence Center for AI and Labour (“KompAKI”, FKZ 02L19C150).
The Eindhoven University of Technology authors received support from their Department of Mathematics and Computer Science and the Eindhoven Artificial Intelligence Systems Institute.
% \textbf{Do not} include acknowledgements in the initial version of
% the paper submitted for blind review.

% If a paper is accepted, the final camera-ready version can (and
% probably should) include acknowledgements. In this case, please
% place such acknowledgements in an unnumbered section at the
% end of the paper. Typically, this will include thanks to reviewers
% who gave useful comments, to colleagues who contributed to the ideas,
% and to funding agencies and corporate sponsors that provided financial
% support.


% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite
% \nocite{langley00}

\bibliography{main}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\appendix
\onecolumn
% \section{You \emph{can} have an appendix here.}

% You can have as much text here as you want. The main body must be at most $8$ pages long.
% For the final version, one more page can be added.
% If you want, you can use an appendix like this one.  

% The $\mathtt{\backslash onecolumn}$ command above can be kept in place if you prefer a one-column appendix, or can be removed if you prefer a two-column appendix.  Apart from this possible change, the style (font size, spacing, margins, page numbering, etc.) should be kept the same as the main body.

\section{Description of Synthetic Heterogeneous Causal Datasets} \label{suppl:data}

This section lists the structural equations corresponding to the graphs shown in Fig.~\ref{fig:datasetFigure}, which are used for the experimental evaluation in this paper.

\subsection{Causal Health Classification}

\begin{equation*}
    \begin{split}
        A &=U(0,100) \\
        F &=\frac{1}{2} A+\mathcal{N}(10,10) \\
        H &=\frac{1}{100}\left(100-A^2\right)+\frac{1}{2} F+\mathcal{N}(40,30) \\
        M &=\frac{1}{2} H+\mathcal{N}(20,10)\\
        \\
             &\text{\color{gray}// helper variables (used for brevity of notation)}\\
        D'_1 &:=
          \begin{cases}
            0.00108 A^3 - 0.08862 A^2 + 1.337 A + \mathcal{N}(25, 10), & \text{if } A\leq4.09837\\
            \mathcal{N}(5, 10), & \text{otherwise}
          \end{cases}\\
        D'_2 &:= 0.0175F + 0.525M + \mathcal{N}(0, 5)\\
        D'_3 &:= 0.00013857 A^3 - 0.0135 A^2 + 0.2025 A + 0.2025 H + \mathcal{N}(17.1714, 0.2A)\\
            &\text{\color{gray}// actual diagnose variables}\\
        D_{i \in \{1..3\}} &:= 
          \begin{cases}
            \mathit{true} & \text{if } \operatorname{argmax}(\{D'_j\}_{j \in \{1..3\}}) = i \\
            \mathit{false} & \text{otherwise}
          \end{cases}
    \end{split}
\end{equation*}
%$\begin{aligned}  \\ & \end{aligned}$


\subsection{Hiring Dataset}

\begin{equation*}
    \begin{split}
        Sc &= U[0, 9], \mathrm{Discrete} \\
        W &= \frac{1}{2} \mathrm{ChiSquared}(df = 4) \\
        E &= U[0, 6], \mathrm{Discrete} \\
        Sk &= 0.8*E + 1.2*W + \mathrm{Pareto}(a = 2.75) \\
        B &= Sc + \mathcal{N}(0, 1.5) \\
        I &= 3*Sk - \frac{1}{2} B + \mathcal{N}(0, 4) \\
        D &= \mathbb{I}[3*I + Sk \geq \mathrm{Cutoff}], \mathrm{Binary}
    \end{split}
\end{equation*}

\subsection{Student Dataset}

\begin{equation*}
    \begin{split}
        Sc &= U[0, 4], \mathrm{Discrete} \\
        Q &= \mathcal{N}(2.5, 3) - Sc \\
        M &= \mathcal{N}(10, 3) \\
        C &= 0.8*Q + 0.2*M + \mathrm{Pareto}(a = 3) \\
        T &= 0.4*Q + 0.6*M + \mathcal{N}(0, 1) \\
        D &= \mathbb{I}[T + C \geq \text{Regional Cutoff}] + \mathbb{I}[T + C \geq \text{National Cutoff}],\ \mathrm{3\ categories}
    \end{split}
\end{equation*}

\section{Leaf Types} \label{suppl:leaf}
Here we describe the leaf types that are used in the $\chi$SPN, by following their definitions in~\citet{yu2023characteristic}.


% \textbf{ECF leaf.} In many cases, a parametric form of the data distribution is not available and one needs to use a non-parametric estimator. The ECF is an unbiased and consistent non-parametric estimator of the population characteristic function. Thus, the most straightforward way for modelling the leaf node is to directly employ the empirical characteristic function for the local data at each leaf, defined as $\varphi_{\mathrm{L}_{\mathrm{ECF}}}(t)=\frac{1}{n} \sum_{j=1}^n \exp \left(\mathrm{i} t x_j\right)$, where $n$ is the number of instances at leaf $\mathrm{L}$, and $x_j$ is the $j^{\text {th }}$ instance. The ECF leaf is non-parametric and is determined by the $n$ instances $x_j$ at $\mathrm{L}$.

% Note that the computation of a probability measure at the ECF leaf by inversion is not guaranteed to be bounded, and so we do not use ECF leaves for our purpose of modeling densities. They can however serve in the structure learning for the $\chi$ SPN.


\textbf{Parametric leaf for continuous RVs.} We can assume that the RV at a leaf node follows a parametric continuous distribution, e.g., a normal distribution. In this case, the leaf node is equipped with the CF of the normal distribution, $\varphi_{\mathrm{L}_{\text{Normal}}}(t)=\exp \left(\mathrm{i} t \mu-\frac{1}{2} \sigma^2 t^2\right)$, where the parameters $\mu$ and $\sigma^2$ are the mean and variance.


\textbf{Categorical leaf.} If a discrete RV is assumed to follow a categorical distribution over $k$ categories $\left(P(X=j)=p_j\right)$, then the CF at the leaf node can be defined as $\varphi_{\mathrm{L}_{\text{Categorical}}}(t)=\mathbb{E}[\exp (\mathrm{i} t x)]=\sum_{j=1}^k p_j \exp (\mathrm{i} t j)$.


\textbf{$\alpha$-stable leaf.} In the case of financial data or data distributed with heavy tails, the $\alpha$-stable distribution is frequently employed. $\alpha$-stable distributions are more flexible in modelling, e.g., data with skewed centered distributions. The characteristic function of an $\alpha$-stable distribution is $\varphi_{\mathrm{L}_{\alpha\text{-stable}}}(t)=\exp \left(\mathrm{i} t \mu-|c t|^\alpha(1-\mathrm{i} \beta \operatorname{sgn}(t) \Phi)\right)$, where $\operatorname{sgn}(t)$ takes the sign of $t$ and $\Phi=\begin{cases}\tan (\pi \alpha / 2) & \alpha \neq 1 \\ -\frac{2}{\pi} \log |t| & \alpha=1\end{cases}$. The parameters of the $\alpha$-stable distribution are the stability parameter $\alpha$, the skewness parameter $\beta$, the scale parameter $c$, and the location parameter $\mu$.


% \section{Gauss Hermite Quadrature for Approximation} \label{suppl:hermgauss}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Observational Distributions} \label{suppl:datasetsObserv}

\begin{figure}[H]
    \centering
    \includegraphics[width=\linewidth]{figures/observational-paper.pdf}
    \caption{\textbf{Approximation of Observational Densities.} Plots feature the approximated densities of continuous variables for the observational distributions of all data sets. Marginalized ground truth distributions (plotted as bar diagrams) and $\chi$SPN approximations (red lines) are shown. Modes of the distributions are generally matched. Within the Hiring data set (variables W, Sk and I) as well as the Causal Health data set (variables F, H), predicted modes are offset from the ground truth. Discretization artifacts as observed at the boundaries of interventional distributions are strongly reduced. (Best viewed in color.)}
    \label{fig:observationalDensities}
\end{figure}

\newpage
\section{\texorpdfstring{$\chi$SPN}{chi-SPN} Captures Interventional Distributions: Extended Results}
\label{suppl:datasetInt}

\begin{figure*}[!htb]
    \centering
     \includegraphics[width=0.6\linewidth]{figures/compare_chc-1.pdf}
    % \vspace{2mm}
    \includegraphics[width=0.6\linewidth]{figures/compare_chc-2.pdf}
     % \vspace{2mm}
    \includegraphics[width=0.6\linewidth]{figures/compare_chc-3.pdf}
     % \vspace{2mm}
    \includegraphics[width=0.6\linewidth]{figures/compare_chc-4.pdf}
     % \vspace{2mm}
    \includegraphics[width=0.6\linewidth]{figures/compare_chc-5.pdf}
    \caption{\textbf{Approximation of Interventional Densities (Causal Health Data Set).}}
    \label{fig:chc_complete}
\end{figure*}

\begin{figure*}[!htb]
    \centering
     \includegraphics[width=0.65\linewidth]{figures/compare_student-1.pdf}
    % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_student-2.pdf}\\
     % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_student-3.pdf}\\
     % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_student-4.pdf}\\
     % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_student-5.pdf}
    \caption{\textbf{Approximation of Interventional Densities (Student Data Set).}}
    \label{fig:student_complete}
\end{figure*}

\begin{figure*}[!htb]
    \centering
     \includegraphics[width=0.65\linewidth]{figures/compare_job-1.pdf}
    % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_job-2.pdf}\\
     % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_job-3.pdf}\\
     % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_job-4.pdf}\\
     % \vspace{2mm}
    \includegraphics[width=0.65\linewidth]{figures/compare_job-5.pdf}
    \caption{\textbf{Approximation of Interventional Densities (Hiring Data Set).}}
    \label{fig:hiring_complete}
\end{figure*}

\end{document}
