% \documentclass{uai2025} % for initial submission


\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage{algorithm}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage[numbers]{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example



% Added from outside of template

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
% \usepackage{subfigure}
\usepackage{booktabs} % for professional tables
 \usepackage{caption}
 \usepackage{subcaption}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2025} with \usepackage[nohyperref]{icml2025} above.
\usepackage{hyperref}



% Attempt to make hyperref and algorithmic work together better:
% \newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage[noend]{algorithmic}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
% \usepackage[textsize=tiny]{todonotes}
\usepackage{longtable} % For multi-page tables, if needed


\newcommand{\indep}{\perp \!\!\! \perp}
\newcommand{\blue}{\textcolor{blue}}
\newcommand{\red}{\textcolor{red}}
\newcommand{\orng}{\textcolor{orange}}
\newcommand{\mk}[1]{{\color{red} MK: \{#1\}}}
\newcommand{\mr}{\textcolor{orange}}
\newcommand{\gray}{\textcolor{gray}}
\newcommand{\lgray}{\textcolor{lightgray}}

\newcommand{\h}{\mathcal{H}}
% \newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}

\newcommand{\Oc}{\mathcal{O}}
\newcommand{\Xc}{\mathcal{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\V}{\mathcal{V}}

\newcommand{\vb}{\mathbf{v}}

\newcommand{\mbf}{\mathbf}
\newcommand{\data}{\mathcal{D}}

\newcommand{\Vc}{\mathcal{V}'}
\newcommand{\X}{\mathcal{I}}
\newcommand{\x}{{i}}
\newcommand{\T}{\mathcal{T}}

% models
\newcommand{\G}{\mathbb{G}}
\newcommand{\M}{\mathcal{M}}
\newcommand{\glob}{\mathbf{M}}
\newcommand{\F}{\mathcal{S}_{FL}}
\newcommand{\cpc}{\mathcal{S}_{user}}
\newcommand{\fw}{w}

\newcommand{\Do}{\text{do}}
\newcommand{\fedcm}{\texttt{FeDCM} }
\newcommand{\ydox}{P_{\mathbf{x}}(\mathbf{y})\xspace}
\newcommand{\mP}{{P}_{\theta}}
\newcommand{\iP}{\hat{P}}
\newcommand{\Cl}{C}
\newcommand{\cl}{c}
\newcommand{\notindep}{\not\!\perp\!\!\!\perp}

% \newcommand{\fedcm}{\texttt{FeDCM}}



\usepackage{tikz}
\usetikzlibrary{positioning, calc, shapes.geometric, shapes, shapes.multipart, arrows.meta, arrows, decorations.markings, external, trees}
\usetikzlibrary{backgrounds,automata}
\usetikzlibrary{backgrounds}
\usepackage{scalefnt}
\usetikzlibrary{shapes.misc}
\usetikzlibrary{positioning, calc, shapes.geometric, shapes, shapes.multipart, arrows.meta, arrows, decorations.markings, external, trees, fit}
\tikzset{
	-Latex,auto,node distance =1 cm and 1 cm,semithick,
	state/.style ={ellipse, draw, minimum width = 0.7 cm},
	point/.style = {circle, draw, inner sep=0.04cm,fill,node contents={}},
  	nnh/.style={
		 rectangle, draw,thick,minimum width=1.5cm,minimum height=1.0cm
	},
	 nnv/.style={
  % circle,
    rectangle, draw, very thick, fill=gray!28, inner sep=0.04cm, minimum width=1.2cm, minimum height=1.2cm, rounded corners=0.05cm
  },
   nnvsm/.style={
    rectangle, draw, very thick, fill=gray!28, inner sep=0.0cm, minimum width=1.0cm, minimum height=1.0cm, rounded corners=0.05cm
  },
   outer/.style={ inner sep=3pt, fill=blue!15
  },
  outer1/.style={ inner sep=3pt, fill=green!15
  },
  outer1t/.style={ inner sep=0pt, fill=green!15
  },
  louter/.style={ inner sep=5pt, fill=blue!15
  },
	XOR/.style={draw,circle,append after command={
			[shorten >=\pgflinewidth, shorten <=\pgflinewidth,]
			(\tikzlastnode.north) edge (\tikzlastnode.south)
			(\tikzlastnode.east) edge (\tikzlastnode.west)
		}
	},
	bidirected/.style={Latex-Latex,dashed},
	el/.style = {inner sep=2pt, align=left, sloped},
	cross/.style={cross out, draw=black, minimum size=2*(#1-\pgflinewidth), inner sep=0pt, outer sep=0pt},
	%default radius will be 1pt. 
	cross/.default={1pt}
}





\title{FeDCM: Federated Learning of Deep Causal Generative Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<rahman89@purdue.edu>?Subject=FeDCM: Federated Learning of Deep Causal Generative Models}{Md Musfiqur Rahman}{}}
\author[1]{Murat Kocaoglu}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    School of Electrical and Computer Engineering\\
    Purdue University\\
    % Pittsburgh, Pennsylvania, USA
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  \begin{document}
\maketitle


% Existing ml-based approaches focus on learning a particular model from the conditional distributions available in observational training data.
%  causal inference, causal graphs, deep generative models, federated learning
\begin{abstract}
In many real-world settings, such as medicine and finance,
the causal effect is a valuable metric for decision making. 
For many predictive tasks, causal mechanisms provide  robust estimators while existing ML-driven predictors might be vulnerable to spurious correlations.
In such settings, when data is decentralized and privacy must be preserved, federated learning plays an important role. However,  causal inference in a federated learning setup is a largely unexplored research area. 
In this paper, we learn a proxy of the underlying structural causal model (SCM) with deep generative models from decentralized observational data sources possibly containing high-dimensional variables. 
Based on client preference or high dimensionality of variables, we modularize the SCM mechanisms and find the minimal subset appropriate for federated learning while
having the rest of the mechanisms trained on each client's local data. 
When all connected together, the proxy SCM, named the federated deep causal generative model (\texttt{FeDCM}), offers estimation of any identifiable causal effect.
% for any treatment-outcome pairs. 
 We perform extensive experiments to illustrate the utility and performance of our approach. 

\end{abstract}


% To specify the heterogeneous mechanisms across clients, we represent the non-iid data setup in FL as the selection bias problem in causal inference. Next, we define a neighborhood around it based on the causal graph and train the neighborhood mechanisms globally in a federated fashion.

% For the rest of the SCM mechanisms, we train them in individual clients using only local data.

\section{Introduction}



% \blue{There is pain:}

% What happens if we dont have causality 

% What happens if we dont have deep causal generative model.

% What happens if we dont have efficient training of deep-scm.

% \blue{Why do we need causal model in federated learning?:}
% \red{Do we need FL for DCM training or need DCM for federated training? Or why do we need DCM in federated learning.}
\par 
% Confounding and data scarcity are two major problems in data analysis and create failure cases for current machine learning algorithms.
% Due to confounding bias, conditional distributions $P(y|x)$ do not give us a correct prediction, and we need interventional distribution $P(y|do(x))$ as prediction.

There is growing appeal for causal inference in machine learning as causality can improve robustness, fairness, explainability, and data efficiency in machine learning systems~\cite{gultchin2023casual,wu2019pc,xu2019achieving,parafita2021deep}.
Predictive models $f: \mathcal{X}\rightarrow \mathcal{Y}$ 
are susceptible to spurious correlations due to unobserved confounding among variables %in a system 
and may produce biased and unfair predictions. Also, due to the dependence on domain-specific conditional distributions $P_{1}(y|x)$ (suppose domain 1), their predictions are not invariant and experience low test performance in a new domain (domain 2).
Estimation of causal effects $P(y|do(x))$ or sampling of the corresponding interventional distribution alleviates these issues by removing any such confounding bias between $X$ and $Y$~\cite{subbaswamy2019preventing, lee2023finding}.
%
% These datasets can be either from the observational distributions or combined of observational and interventional distributions.
% we can not obtain the correct causal effect $P(y|do(x))$ except specific identifiable cases~\cite{bareinboim2015recovering}.
% \citet{tikka2019causal} proposes an approach to alleviate the 
%  selection bias problem, one type of data scarcity by learning from multiple datasets. 
 % \red{\cite{??} propose estimating causal effects from multiple observational and interventional datasets or their distributions.}  
% \mk{only mention assumptions we don't need. we also need positivity. focus on sample issues} such as no positivity violations (i.e., all possible variable combinations must appear in the data) 
 \par 
 Many causal inference algorithms only work~\cite{hwang2024positivity,markoulidakis2021balance} when the positivity (also known as overlap) assumption is satisfied, i.e., every joint combination of the variables' values has positive probability. 
% (ex: P(age=90, sex=male, diseaseHistory=none)>0). 
When we have a small number of samples, it is unlikely that we will see all possible combinations in the data. As a result, some conditional distributions might be undefined (e.g., $P(\text{recovery} \mid \text{age}=90, \text{sex}=\text{male}, \text{diseaseHistory}=\text{none})$).
% as their denominator have zero probability when expressed as joint.
Also, when we have a low sample size, estimates of some conditional distributions might not be accurate (sample mean
$\neq$ population mean). If we plug such biased conditional distributions into our causal effect estimators,
the estimates will not be correct.
Thus, the algorithms perform poorly in low sample size settings.
% the performance of the algorithms with such dependence is highly dependent on the amount of data they have and will either fail or perform badly in low sample size settings.
%
% These methods require a large number of samples to satisfy assumptions and to learn the estimators used for computing the causal effects.
% Due to the low sample size, estimations in causal inference algorithms become biased.
% assumptions in causal inference algorithms are violated in practice 
% \mk{this conflates two things, positivity violation is about SCM has nothing to do with no. of samples. rewrite}. 
%
This problem is more prevalent when high-dimensional covariates such as images  are present in the system. 
% It becomes challenging to learn unbiased conditional distributions, making the estimated causal effects possibly incorrect.
Some existing algorithms deal with such issues by learning from multiple datasets~\cite{tikka2019causal,bareinboim2013general,gresele2022causal}. 
 Even though combining multiple observational datasets resolves the data scarcity problem, in most real-world scenarios, such data are collected by different authorities (e.g., hospitals) and kept confidential at different locations. 
 % Thus, without access to other datasets, estimating correct causal effects is a 
 This creates a significant challenge. 
 % for causal effect estimation.  

 Data scarcity is also a major problem in data analysis and current machine learning. 
Federated learning (FL) is an effective approach to train powerful ML models without different clients/parties having to share sensitive data. 
Client $c$ trains the global model $f_{\theta}$ on its local dataset $D^c= \{x_i,y_i\}_{i=1}^{n_c}$ and shares the model weights with the server so that, when aggregated from multiple clients, $f_{\theta}$ learns to sample from the original distribution $P(y|x)$. 
% \mk{f doesn't learn a distribution. f is trained to sample from Bayes optimal argmax p(y|x) is at most what you can say here. or you can say it differently: It learns the ERM solution one would obtain if one had access to the consolidated dataset. rewrite.}
% Instead, clients share model weights trained on local data.
%
Note that FL shares global model weights across local clients, whereas the existing causal inference algorithms require explicit probability tables for effect estimation. Even if we plan to share probability tables instead of model weights, such data statistics are infeasible to compute for high-dimensional variables. For example, hospitals might share the probability distribution of getting pneumonia ($n$) given patient age ($a$), i.e., $P(n|a=1)$. Such an approach would not work if we aim to use X-ray images ($x$) for better prediction, i.e., $P(n|a,x)$. This invalidates the use of existing causal inference algorithms in the federated learning setup.

% \mk{Start with this instead:
 % A variety of causal inference algorithms have been developed in the literature to estimate causal effects using neural networks when data is centralized
A recent promising idea for estimating causal effects in the presence of high-dimensional variables (such as images) is to utilize neural networks~\citep{jerzak2022image, qin2021causal, shalit2017estimating, louizos2017causal}.
However, it is non-trivial to apply these approaches in the federated learning setup. A few works~\citep{han2021federated, xiong2021federated, xiong2023federated, qiao2023collaborative, vo2022adaptive} have proposed to estimate specific treatment effects when data is decentralized across clients. 
These methods address only specific causal queries and are not suitable for arbitrary causal structures. Besides, for a new query, they have to redesign the training process to learn the conditional distributions and initiate the costly federated training from scratch. 

In this paper, we assume that variables influencing each other in a system can be specified by causal mechanisms and modeled as a structural causal model (SCM).
Mimicking the SCM of the environment allows us to measure the causal effect of \textit{any} variable on others without being affected by spurious correlations, as long as the effect is structurally identifiable.
Notably, learning all SCM mechanisms, rather than estimating a specific causal effect, better utilizes client connections during the federated communication phase.
As a first step toward learning SCMs from decentralized data, we focus on existing approaches~\citep{kocaoglu2018causalgan, pawlowski2020deep,xia2021causal, zhang2021treatment, rahman2024modular} that use deep generative models to learn the structural causal model from training data.
A set of neural networks, called deep causal generative models (DCMs), are typically arranged according to the causal structure and trained on observational data.
After convergence, they match the observational distribution and can be used to sample from identifiable interventional distributions or estimate causal effects.
% Next, we explore how to collaboratively train deep causal generative models (DCM) in a distributed fashion for each client while addressing data-sharing concerns.
% In this paper, we assume that the variables influencing each other in a system, can be specified by causal mechanisms. and can be modeled as a structural causal model (SCM).
% Mimicking the SCM mechanisms of the environment allows us to measure the causal effect of \textit{any} variable on others, without being affected by spurious correlations, as long as the effect is structurally identifiable. 
% Notably, learning all SCM mechanisms, rather than estimating a specific causal effect, is a better use of the client connections during the federated communication phase.
% As a first step to learn SCM from decentralized data, we focus on existing approaches~\citep{kocaoglu2018causalgan, pawlowski2020deep,xia2021causal, zhang2021treatment, rahman2024modular} that employ deep generative models to learn the structural causal model from training data.
% % and adopt  them into the federated learning setting.
% Generally, a set of neural networks is arranged according to the causal structure and trained on observational (centralized) training data. After convergence, they match the observational distribution and can be utilized to sample from the identifiable interventional distributions or estimate any causal effects.
% Next, we explore how we can collaboratively train deep causal generative models (DCM) in a distributed fashion for each client and resolve the concerns of data sharing.  
% Thus, we can best utilize the client connections from a federated communication phase by learning all SCM mechanisms instead of estimating any specific causal effect.
% \blue{Introducing DCMs}
% is known as deep causal generative models (DCM). The core idea is to arrange neural network architectures mimicking the causal structure and perform  adversarial training to match the joint distribution implied by DCM with the real distribution of the system. 
% \mk{this sounds incremental. can we say we adopt them into the federated learning setting instead? }


% --------------
% \mk{instead of giving away our solution immediately like this next sentence, set it up as a research question first: }
% \blue{Why full dcm is not feasible and modular-dcm required?}
New challenges appear when we aim to train a DCM in a federated fashion, as it requires training $|V|$ number of generative mechanisms of all $V$ variables in the causal graph, i.e., $\{f_{V_i}\}_{V_i\in V}$ to match the joint distribution. 
A trivial implementation of a federated learning algorithm such as FedAVG~\citep{mcmahan2017communication} would be to consider $|V|$ global models which we share across clients and aggregate in the server. 
Such communication overhead is a major concern and infeasible for clients such as edge devices with limited compute and memory.  
A fundamental question is by how much we can reduce this complexity of distribution matching. Bayesian networks provide a way to modularly represent the joint distribution. Causal Bayesian Networks\footnote{A BN considers dependencies among a set of observed variables while a CBN considers causal dependencies among them.
} 
enjoy the same compact representation (see Definition~\ref{def:cbn}). 
In the presence of latent variables, a property we call ``modularity'' allows the factorization of the joint distribution into products called c-factors.
However, it is not clear how compactly we can represent the joint distribution and how much of it we can achieve in a federated learning setup.
We do not want to transfer all causal mechanisms as global models between the server and the clients. 
% To efficiently learn the deep causal generative model (DCM) in a FL setup, we consider a common real-world assumption where we have a single high-dimensional variable (denoted as $\X$) such as images in the dataset and all other variables are low-dimensional.
%%%%
% Example
Rather, we consider a set of models, $\cpc$,
that clients prefer to share or that correspond to high-dimensional variables requiring federated learning. Our method \fedcm either accepts such a set or offers the minimal set of mechanisms $\mathcal{S}_{FL}$ containing $\cpc$ for federated training.
We execute FL for the set of models $\mathcal{S}_{FL}$ and utilize
only local client data to learn the remaining mechanisms in the SCM.

% \mk{again i don't think you have to give concrete examples like this at the beginning. just pose the high level research questions to build anticipation. it's not a good idea to give so much spoiler. } For example, tconsider the graph: $Z \rightarrow X \rightarrow Y; Z \leftrightarrow Y$. Suppose, clients wish to share only mechanism $X\rightarrow Y$, i.e, $\cpc=\{f_Y\}$. However, since there is an unobserved confounder between $Z$ and $Y$, using FL to only learn the prediction $P(y|x)$ would be biased. Thus, 
% we offer $\mathcal{S}_{FL}= \{f_Z,f_Y\}$ and perform federated learning to learn both $f_{Z}$ and $f_{Y}$ simultaneously  to learn the correct joint distributions.
% Meanwhile, we utilize local data to train $f_X$ and learn the distribution $P(x|z)$. 
% For $f_{\X}$ and $f_{Y}$ to learn the correct joint distributions, we need to train all $\{f_{Z},f_{Y}\}$ as global models.
% Thus, $f_{\X}$ and $f_{Y}$ has to join federated training as global models. Now, suppose, there exists an unobserved confounder that affects both $Y$ and a neighbor $Y'$ ($X\rightarrow Y \leftrightarrow Y'$). For $f_{\X}$ and $f_{Y}$ to learn the correct joint distributions, we need to train all $f_{\X}, f_{Y},f_{Y'}$ as global models.
% However, we need to perform federated learning for $f_{\X}$ and and $f_{Y}$, as with only small number of samples in local dataset, $f_{\X}$ cannot learn high-dimensional $\X\sim P(x)$ and $f_Y$ cannot predict $Y$ for any $\X$. 

% ---- Our contribution
% C1: how to determine the set of mechanisms $\M$ that depend on the high-dimensional variable $\X$. We train $\M$ collaboratively as global models in FL manner.
\par 
To generalize this process, we solve two challenges:
C1: How can we determine the minimal set of models $\mathcal{S}_{FL}$ that need to be trained collaboratively as FL global models based on $\cpc$, a client-proposed set or any high-dimensional variable set?
C2: During and after FL training, how can we utilize the global models $\mathcal{S}_{FL}$ to train the SCM mechanisms $\{f_{V_i}\}_{V_i\in V}$ consistently to match the joint distribution $P(v)$?
Our proposed algorithm solves C1 by achieving what we call {maximal modularity} for training a causal model: we can learn mechanisms in each c-component (maximal subsets of nodes that are connected by unobserved confounding) independently. %, which can learn learning based on a graphical structure called c-components: groups of variables affected by unobserved confounders. 
%
% [REWRITE THIS NEXT SENTENCE BASED ON MY EDIT OF PREV] Such modularization informs us of the c-components, $\X$ and its children, belong to and about the mechanisms we have to train as global models, collaboratively in the FL manner.
%
%
As part of solving C2, we note that distributions corresponding to c-components (i.e., c-factors) are identifiable interventional distributions. Thus, instead of making $\cpc$, we make a proxy set of models $\glob$ participate in FL and later use them to train $\mathcal{S}_{FL}$.
Therefore, we train all SCM mechanisms on local observational data and simultaneously train $\mathcal{S}_{FL}$ on the interventional data generated by the proxy global models $\glob$.
 % output
After convergence, the DCM trained according to this approach represents the underlying true causal model. We can fix a specific value for a variable $X=x$ and perform ancestral sampling to generate interventional samples from $P(Z,Y|do(x))$ or estimate the causal effect. To the best of our knowledge, \fedcm is the first approach to efficiently learn \emph{a neural SCM to answer any identifiable causal effect in a federated learning setup}.
Precisely, our contributions are as follows:
\begin{itemize} 
 % the structural causal model. we are not learnign THE true one. pay attnention to using the right language. otherwise they will think we don't know what we are talking about} 
    \item We propose a novel approach to learn a proxy structural causal model from multiple decentralized data sources containing both high and low-dimensional variables.
    \item We introduce the concept of maximal modularity 
    that allows us to find the minimal set of SCM mechanisms for global training containing any client-preferred set of models and keep the communication cost low.
    % provide a theoretically sound criterion to find the minimal set of SCM mechanisms for global training reducing communication cost.
    \item We provide extensive empirical analysis on synthetic data showing the utility of our approach for learning an SCM in a federated learning setup.
\end{itemize}

% \red{Add positivity violation as motivation and non-identifiability fact. How incorrect are the local learned mechanisms compared to global mechanisms.}





% \red{discussion on confounders and neural causal models is absent}



% No cure       

% \blue{What is neural/deep causal models? What is modular-dcm? How does that become useful here?}


% \blue{What problem are you trying to solve? What is your proposed solution?}

\section{Related works}
% follow exisiting causal fed algos's related works.
%%%%%%% FL approaches that does not use causal inference.
% Spurious correlation removing FL methods

In the presence of data scarcity and privacy concerns, many existing works 
% in federated learning literature 
utilize deep learning-based approaches to deal with data heterogeneity, distribution shift~\citep{huang2023rethinking, bao2024adaptive,chen2024classifier,yu2023turning,liao2024foogd,tan2024heterogeneity,yang2024dual}, and spurious correlations~\cite{singh2023augmenting, wangpersonalized,ma2024reducing}.
% for federated domain generalization
Even though these approaches are effective in their targeted problem instances, they might not generalize to arbitrary scenarios as they do not consider the causal relationships.
% and 
% relations among the variables. 
%
%%%%%%%% Causal inference approaches that does not use FL
There exist many causal inference-based approaches that learn causal structures or distributions from multiple datasets. \citet{huang2020causal} learn the causal graph from multiple data sets with non-identical variable sets.
\citet{tikka2019causal} perform causal effect identification from multiple incomplete data sources while dealing with confounding and selection bias.
\citet{gresele2022causal} utilize information from 
multiple datasets with overlapping variable sets to obtain counterfactual inference. \citet{bareinboim2013general} propose causal knowledge transportability by incorporating data from multiple causal domains. 
% ~\citet{rahman2024modular, xia2021causal}  employ deep generative models for learning structural causal models from observational and interventional distributions. 
These methods assume access to all datasets and fail to adapt when data is decentralized.
%
% However, due to low sample size in our data, we can not obtain the correct causal effect $P(y|do(x))$ except specific identifiable cases~\cite{bareinboim2015recovering}.
% \citet{tikka2019causal} proposes an approach to alleviate the 
%  selection bias problem, one type of data scarcity by learning from multiple datasets. \red{\cite{??} propose estimating causal effects from multiple observational and interventional datasets or their distributions.}
%
%

%%%%%%%% Approaches that use both.
Recently, researchers have proposed different causal inference methods in the federated learning setup~\cite{bhattacharya2021differentiable,turnbull2024constraint, pena2016learning,mian2023nothing,khellaf2025federated}.
\citet{ng2022towards} estimate the Bayesian network structure from data that is partitioned across different parties with continuous optimization using the alternating direction method of multipliers.
\citet{ye2024federated} propose federated learning of generalized linear causal networks from distributed datasets by simulating an annealing process and searching over the space of topological sorts. 
% The optimal graphical structure compatible with a sort is found by distributed optimization.
% 
\citet{gao2021federated} employ federated learning to learn the underlying causal structure and the causal mechanisms from local heterogeneous data generated from additive noise models. \citet{li2024federated} propose an algorithm for structure and orientation learning
utilizing summary statistics from distributed heterogeneous data.
% where they  instead of proxy data

\citet{han2021federated} utilize a semi-parametric density ratio weighting approach to provide treatment effect estimation where multiple clients contain heterogeneous covariate distributions.
\citet{xiong2023federated} infer the average treatment effects
by computing summary statistics locally using propensity scores and aggregating those across sites to obtain asymptotically normal point and variance estimators.
%
%
Finally, \citet{qiao2023collaborative} estimate causal effect with data aggregated from multiple self-interested parties while rewarding them 
based on their unique statistical properties relating to a modified variant of the Shapley value.
\citet{vo2022adaptive} divide the objective function into multiple components
to estimate causal effects with federated training.
The above works focus on estimation of specific queries and are not suitable for high-dimensional data in general.
% where explicit probability estimation is not feasible.
%
% \citet{khellaf2025federated} compare meta-analysis ATE estimators, one-shot federated estimators and gradient-based
% federated estimators and derive their 
% asymptotic variances under a linear
% outcome model. 
 %
\section{Problem Description }
\begin{definition}[Structural causal model (SCM)~\citep{pearl2009causality}]
% \textbf{Definition 1} (Structural causal model, (SCM)).
An SCM $\mathcal{M}$ is a $5$-tuple 
{$ \mathcal{M}=(\mathcal{V}, \mathcal{N}, \mathcal{U}, \mathcal{F}, P(.) )$,} where each observed variable $V_i\in\mathcal{V}$ is realized as an evaluation of the function $f_i^*\in\mathcal{F}$ which looks at a subset of the remaining observed variables $Pa_i\subset \mathcal{V}$, an unobserved exogenous noise variable $E_i\in \mathcal{N}$, and an unobserved confounding (latent) variable $U_i\in\mathcal{U}$. 
This refers to the \textbf{semi-Markovian causal model}.
$P(.)$ is a product joint distribution over all unobserved variables $\mathcal{N}\cup\mathcal{U}$. 
\end{definition}


\begin{definition}[Acyclic Directed Mixed Graph (ADMG)]
Each SCM induces a directed graph called the \emph{causal graph},
or acyclic directed mixed graph (ADMG)
with $\mathcal{V}$ as the vertex set. The directed edges are determined by which variables directly affect which other variable by appearing explicitly in that variable's function. Thus the causal graph is $G=(\mathcal{V},\mathcal{E})$ where $V_i\rightarrow V_j$ iff $V_i\in Pa_j$. The set $Pa_j$ is called the parent set of $V_j$. We assume this directed graph is acyclic (DAG). Under the semi-Markovian assumption, each unobserved confounder can appear in the equation of exactly two observed variables. We represent the existence of an unobserved confounder $[U=U_X= U_Y]\in \mathcal{U}$ between $X,Y$ in the SCM by adding a bidirected edge $X\leftrightarrow Y$ to the causal graph. These graphs are no longer DAGs although still acyclic. $V_i$ is called an ancestor of $V_j$ if there is a directed path from $V_i$ to $V_j$. Then $V_j$ is said to be a descendant of $V_i$. The set of ancestors of $V_i$ in graph $G$ is denoted by $An_G(V_i)$.
\textbf{C-components:}
Given an ADMG $G$, a maximal subset of nodes where any two nodes are connected by bidirected paths is called a \textbf{c-component}; $C(G)$ denotes the set of all c-components of $G$. For any $S\in C(G)$, $P(S|\Do(\V\setminus S))$ is called a c-factor. We assume that we have access to the ADMG through some causal structure learning algorithm and expert knowledge. 
\end{definition}

   
% \end{definition}


\begin{definition}[Causal effect and do-intervention]
% \textbf{Causal effect, Layer 1, Layer 2:}
A do-intervention $do(v_i)$ replaces the functional equation of $V_i$ with $V_i=v_i$ without affecting other equations. The distribution induced on the observed variables after such an intervention is called an interventional distribution, shown by $P(\mathcal{V}|do({v_i}))$. $P(\mathcal{V}|do({\emptyset}))=P(\mathcal{V})$ is called the observational distribution. 
\end{definition}

\begin{definition}[Deep causal generative models (DCM)~\cite{kocaoglu2018causalgan,xia2021causal,rahman2024modular}]
\label{def:scm}
	A neural net architecture $\mathbb{G}$ is called a deep causal generative model (DCM) for an ADMG $G=(\mathcal{V},\mathcal{E})$ if it is composed of a collection of neural nets, one  $f_i$ (or interchangeably $f_{V_i}$) for each $V_i\in\mathcal{V}$ such that 
		i) \emph{each $f_i$ accepts a sufficiently high-dimensional noise vector $N_i$,} 
		ii) \emph{the output of $f_j$ is input to $f_i$ iff $V_j\in Pa_G(V_i)$,}
		iii) \emph{$N_i=N_j$ iff $V_i\leftrightarrow V_j$. }
        {A DCM is trained to learn a proxy of the true SCM.}
\end{definition}

DCM generators are represented as $\G=\{f_{1},...,f_{n}\}$ parameterized by $\Theta= \{\theta_1, ... , \theta_{|\V|}\}$ where $n=|\V|$.
Similar to the original data distribution, $P(\V)$, we define $\mP(\V)$ to be the distribution induced by the $\theta$ parameterized DCM.
% We define $\mP(.)$ as the distribution induced by the DCM.
Noise vectors $N_i$ replace both the exogenous noises and the unobserved confounders in the true SCM. They are of sufficiently high dimension to induce the observed distribution. We say that a DCM is \emph{representative enough for an SCM} if the neural networks have sufficiently many parameters to induce the observed distribution induced by the SCM. 
For the neural architectures of variables in the same c-component, we can consider conditional GANs~\citep{mirza2014conditional}, as they are effective in matching the joint distribution by feeding the same prior noise $N_i=N_j$ (as confounders) into multiple generators. 
% For variables that are not confounded ($N_i\neq N_j$), we can use conditional models such as an MLP, VAE or diffusion model.
Let $v=[v_1, v_2,\dots, v_{n}]$ such that $V_i \in \V$. $D(v)$ denotes the real samples and $D(\hat{v})\sim P_{\theta}(v)$ denotes the fake samples generated by the DCM:
\begin{equation*}
\begin{split}
% \hat{v} &\sim P_{\theta}(v)\\
% \hat{v} &= \{f_i(\hat{pa}(V_i), u_{V_i}\}; \forall f_i \in \{f_V: V\in \V\} \\
\hat{v}_i &= f_i(\hat{pa}(V_i), u_{V_i});  f_i \in \{f_V: V\in \V\}, u_{V_i}\sim {N}(0,I) \\
% v\sim P^r;\\
\end{split}
\end{equation*}
The critic and generator (WGAN) loss functions are:
\begin{equation}
\label{eq:dcm-gen}
    \begin{split}
        L_D &= \mathbb{E}_{v\sim P}[D(v)]- \mathbb{E}_{v\sim P_{\theta}}[D(v)]\\
        L_G&= W(P, P_{\theta}) = -\mathbb{E}_{u\sim P(u)} [D(\hat{v})]
    \end{split}
\end{equation}
%
The gradient updates in this case are:
\begin{equation}
\label{eq:single-loss}
    % f_{V}^{(t+1)} = f_{V}^{(t)} - \eta \frac{\partial L}{\partial f_{V}}
     f_{V}^{(t+1)} = f_{V}^{(t)} - \eta \frac{\partial L_G}{\partial f_{V}};  \{f_V: V\in \V \} 
\end{equation}
%
%
With Definition~\ref{def:scm}, we have the following, similar to~\cite{xia2021causal}:
%%%%%%%%%%%%% DCM theoretical guarnatee .%%%%%%%%%%%%%
\begin{theorem}[\cite{kocaoglu2018causalgan,xia2021causal,rahman2024modular}]
	\label{th:identifiability}
	Consider any SCM $\mathcal{M}=(G, \mathcal{N}, \mathcal{U}, \mathcal{F}, P(.) )$.  A DCM $\G=\{f_{1},...,f_{n}\}$ for $G$ entails the same identifiable interventional distributions as the SCM $\mathcal{M}$ if it entails the same observational distribution.  
	\end{theorem}
%
Thus, even with high-dimensional variables, given a causal graph, in principle, any identifiable interventional query can be sampled from, with a DCM that fits the joint distribution.
%
%
% \begin{definition}[Interventional sampling with DCM]
% \label{def:dcm-sample}
% Given that variables {$\V$} are connected as a directed acyclic graph
% and we have conditional generative models trained to learn the distributions $P(v_i|pa(v_i))$, we can perform \textbf{ancestral sampling} from the joint distribution, $P(\vb) = \prod_{V_i\in \V} P(v_i|pa(v_i))$ by making one pass through each 
% model in the topological order while sampling from the conditional distributions~\citep{bishop2006pattern}.
% \red{wrong def}.
% \red{How intervention is performed?}
% \end{definition}
%
\begin{definition}[Interventional sampling with DCM]
\label{def:dcm-sample}
Given the generators in a DCM,  to perform a hard intervention $do(X=x)$
and produce samples accordingly, we manually set the values for the intervened variables as $X=x$ instead of using their neural networks. Then, we feed those values forward into their children's mechanisms and execute \textbf{ancestral sampling}~\citep{bishop2006pattern} to generate the rest of the variables.
\end{definition}
%
%
%
\textbf{Modular-DCM~\citep{rahman2024modular}:} Modular-DCM uses the c-factorization to modularize the DCM learning. 
%
% in equation~\ref{eq:c-fact}
% . Since we have access to only $P(\mathcal{V})$ dataset,
Even though they point out the fact that each c-factor is an interventional distribution, they suggest learning a proxy distribution of each c-factor involving more variables than the c-component. For complicated graphs, the suggested proxy distribution might include all variables in the causal graph. This becomes wasteful especially in our considered federated learning setup.
Here, we show that c-component based modularity is sufficient to learn the DCM and match the joint distribution $P(v)$ which has remained unexplored to date.
% . This c-component based modularity
% and the closest work~\cite{rahman2024modular} considered additional models and data of proxy variables.


\textbf{Federated Learning (FL):}
We consider a federated learning setting where {$\Cl$} clients participate at each round of a training process coordinated at a central server. Training data is independent and identically distributed (iid)  and is decentralized across multiple clients such that each client dataset $D^{\cl}$ is sampled from a joint distribution $P(.)$. 
For a case of two variables, each sample in $D^{\cl}$ is denoted as $(x,y)\in sup(X) \times sup(Y)$ with {$sup(X), sup(Y)$} being the support of input $X$ and output $Y$.
Clients collaboratively train a mechanism: $F(\fw, x ): sup(X) \rightarrow sup(Y)$ to learn the conditional distribution $P(y|x)$. The global optimization problem is formulated as in the FedAvg algorithm~\cite{mcmahan2017communication}: 
\begin{equation*}
\label{eq:fed1mech}
\min_{\fw} L(\fw) = \sum_{\cl=1}^{\Cl}  L_{\cl}(\fw) 
= \sum_{\cl=1}^{\Cl} \mathbb{E}_{(x, y) \sim D^{\cl}} l(F(\fw,x), y)
\end{equation*}
In the classic FedAvg~\cite{mcmahan2017communication} algorithm, the server samples a subset of the $\Cl$ clients and broadcasts the mechanism parameters $\fw^t$ to those clients during round $t$. After performing local gradient updates, these clients return the optimized mechanisms $\fw^t_{\cl}$ to the server. The server aggregates the local models to obtain a global model. In this paper, we consider a more general case where each client can share multiple models.


% Given a learned SCM and its ADMG $G$, what is the most efficient way to transfer to a new SCM where the new SCM might have i) change in its conditional distributions. ii) change in the edges iii) change in the number of variables.

\textbf{Problem setup:}
% \red{Addressed problem is not very clear.}
We assume that data sets for all clients are generated from the same underlying SCM.
% containing the same set of variables and mechanisms. 
% We relax this assumption later. 
Each dataset $D^{\cl}$, $\cl=1,\dots,\Cl$, is collected from client $\cl$'s environment with joint distribution $P(v)$, which is assumed to be generated from an unknown SCM $\M$. 
% We assume there exists a high-dimensional variable $\X \in \V$.
% Due to low sample size, any client is unable the learn the correct generation mechanism $f_{\X}$ such that $f_{\X}(pa(\X)) \sim P(\X|pa(\X))$.
% Due to data heterogeneity, we assume that there exists a heterogenous variable $\Vc$ such that for any client $i$, client $j$: $P^{i}(v') \neq P^{j}(v')$ 
% We relax for arbitrary mechanism set later.
 % \cite{vo2022adaptive}
Our tasks are: i) to learn a DCM $\hat \M$ proxy to the true SCM $\M^{}$ without exchanging any client data such that the observational distribution $P(v)$ is matched, and ii) to estimate causal effects between any pair $X,Y \in \V$ or sample from the corresponding interventional distribution $P(y|do(x))$. Formally, in any client $\cl\in \Cl$, our goal is to find a DCM $\hat{\M}$ s.t.
$\arg\min_{\hat \M}  d(P(v), {P}_{\theta}(\hat{v})) $.
% \begin{equation}        
%     % \arg\min_{\hat \M}  d(P(y|do(x)), \hat{P}(y|do(x))) 
%         \arg\min_{\hat \M}  d(P(v), {P}_{\theta}(\hat{v})) 
% \end{equation}
% \red{how do I estimate causal effect?}
Since we do not want any spurious correlation in our prediction, our goal is always to obtain the interventional distributions (e.g., $P(y|do(x))$ for arbitrary $X,Y \in \V$), and not the conditional distributions (e.g., $P(y|x)$).

% \begin{figure}[t!]
%     \centering
% %
% \begin{subfigure}[c]{0.45\linewidth}
%   \centering
% \begin{tikzpicture}[scale=0.7, transform shape]
%     %\tikzstyle{every node}=[font=\tiny]
%     \tikzstyle{every node}=[]
%     \node   [ ] (x) {$W$};
%     \node [below =0.8cm of x] (w1) {$Z$};
%     \node [below left =0.3cm and 0.4 of x] (y1) {$Y_1$};
%     \node [right =1cm of x] (w2) {$X$};
%     \node [below =0.8cm of w2] (y) {$Y_2$};
%     \draw[ thick] (x) to  (w1); 
%     \draw[ thick] (x) to  (y1); 
%     \draw[ thick] (w1) to  (y1); 
%     \draw[ thick] (w1) to  (w2); 
%     \draw[ thick] (w2) to  (y); 
%     \path[bidirected] (x) edge[bend left=35] (w2);
%     \path[bidirected] (w1) edge[bend right=35] (y);
% \end{tikzpicture}
% \caption{Graph example}
% \label{fig:fail-ex}
% \end{subfigure}
% \caption{Causal graphs}
% \end{figure}



\textbf{Assumptions:}
i) The observational dataset contains iid samples and is distributed across multiple clients that collectively can represent the correct joint distribution $P(v)$;
 ii) we have access to the ADMG;
  iii) the causal model is semi-Markovian; iv) each generator $f_i$, $\forall i$, in the
DCM can correctly learn the target distribution
(see App.~\ref{violate-ass} for details). 

% containing our expensive mechanism $f_m$ (i.e.,  $f_m \in \mathcal{F}$) that we can train globally while training rest of the mechanism locally.

% Our objective is to approximate the true SCM with $\hat \M^c$ such that $P^{c}(v)= \hat{P}^{c}(v)$ with the same causal graph $G^c$.




% \begin{figure}
%     \centering
%     \includegraphics[width=1\linewidth]{Fig.s/Mod2/fed.pdf}
%     \caption{Enter Caption}
%     \label{fig:enter-label}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=1.1\linewidth]{Fig.s/Mod2/root.pdf}
%     \caption{Enter Caption}
%     \label{fig:enter-label}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=1\linewidth]{Fig.s/Mod2/architectures.pdf}
%     \caption{Applications of modularity}
%     \label{fig:enter-label}
% \end{figure}



%%%%%%% Methodology
% Minimum modularity (how modular-dcm can be improved
% Theoretical guarantee of minimum modularity and why can we not modularize further.
% Connect it with federated learning : Collaborate based on agreed upor architecture
% If pre-specified architecture, how do we adapt local training,




\section{Methodology }

Suppose that due to privacy concerns or financial incentives, clients do not want to share any data and wish to share as few model weights as possible. 
% propose to learn the generative mechanisms for a set of variables.
Given a set of models proposed for federated training by the client $\cpc$, \fedcm %acts as a decider of whether 
evaluates whether it is possible to learn an SCM by \emph{only participating in the FL process with this set and using local data for training the remaining mechanisms}.
% training the mechanisms of the proposed set in a FL setup and later connecting them with locally trained mechanisms.
If their intended mechanisms are not sufficient to learn the whole SCM, \fedcm rejects the proposal and offers the minimal superset of $\cpc$ that is required to be learned collaboratively. For this purpose, we establish a maximal modularity concept considering the causal relations among the variables in the causal graph, more specifically the c-components. 
% and proof its correctness with a 
% First we consider the idea case where there is no
% First, irrespective of data characteristics, we focus on what the clients 
% want to share with each other and we verify if that would allow them to learn the true causal models. 
\par
\textbf{Challenges of training a DCM in FL:}
Given the causal graph $G(\V)$, we have $\{f_i\}_{i=1}^{|\V|}$ mechanisms in the causal model. To learn $\{f_i\}$ for all $i$, we are required to learn the true joint distribution $P(v)$ by training local models on client data and sharing necessary model weights. To be precise, consider the task of approximating the structural causal models in different setups shown in Figs.~\ref{Case 1}--\ref{Case 4}. Fig.~\ref{Case 1} shows the most common FL setup in which the goal is to predict pneumonia $Y$ from X-ray images ($\X$). In Fig.~\ref{Case 2}, we have access to patient symptoms $X$ (with $X\rightarrow \X$) and patient age $A$. Older patients have a higher risk of developing different symptoms ($A\rightarrow X$) and are more likely to be diagnosed with pneumonia ($A\rightarrow Y$).
In most hospital scenarios, the goal is to obtain pneumonia prediction ($Y$) in the form of causal effect estimation.
Data are decentralized and suppose that clients proposed to share only the model weights of the mechanism $\cpc = \{f_{Y}\}$.
Below, we show a few examples and then build a formal characterization of the set of mechanisms that need to be trained globally.


\textbf{Trivial Solution:}
The trivial solution would be to always reject the client-proposed set and train the whole DCM: $\{f_i\}_{V_i\in \V}$ globally in a FL setup.
That would ensure that $P(v)$ is matched. To obtain that, we need to minimize the following loss function (similar to Equation~\ref{eq:dcm-gen}):
\begin{equation*}
% \label{eq:triv-sol2}
\begin{split}
\min_{\theta} L(\theta)  &= \sum_{\cl=1}^{\Cl}  L_{\cl}(\theta) 
 = \sum_{\cl=1}^{\Cl} \sum_{V_i\in \V} \mathbb{E}_{[pa(v_i),v_i] \sim D^{\cl}} l( \hat{v}_i, v_i)\\
   % & \hat{v} =  f_i(\theta_i, pa(v), U_i )
   \hat{v}_i &= f_{\theta_i}({pa}(V_i), u_{V_i}); u_{V_i}\sim {N}(0,I) 
\end{split}
\end{equation*}
However, this will be computationally expensive and unnecessary.
We aim to find the smallest set of mechanisms $\F$ such that $f_{\X} \in \F$ and 
% it is necessary\mk{the notion of necessity is unclear. why do we ever have to train globally?}
we need to train them with global information to match $P(v)$. 

\begin{figure}[t!]
\vspace{-4mm}
    \centering
\begin{subfigure}[c]{0.15\linewidth}
  \centering
\begin{tikzpicture}[scale=0.7, transform shape]
    %\tikzstyle{every node}=[font=\tiny]
    \tikzstyle{every node}=[]
    \node   [] (x) {${\X}$};
    \node [ right =0.8cm of x] (w2) {${Y}$};
    % \node [ above =0.6cm of x] (s) {$S$};
    % \draw[ thick] (s) to (x)   ; 
    \draw[ thick] (x) to  (w2); 
\end{tikzpicture}
\caption{Case 1}
\label{Case 1}
\end{subfigure}
% \begin{subfigure}[c]{0.27\linewidth}
%   \centering
% \begin{tikzpicture}[scale=0.7, transform shape]
%     %\tikzstyle{every node}=[font=\tiny]
%     \tikzstyle{every node}=[]
%     \node   [] (x) {${\X}$};
%     \node [ right =0.8cm of x] (w2) {${Y}$};
%     \node [ left =0.8cm of x] (z) {${X}$};
%     % \node [ above =0.6cm of x] (s) {$S$};
%     \draw[ thick] (z) to  (x); 
%     % \draw[ thick] (s) to (x)   ; 
%     \draw[ thick] (x) to  (w2); 
% \end{tikzpicture}
% \caption{Case 2}
% \label{Case 2}
% \end{subfigure}
\begin{subfigure}[c]{0.27\linewidth}
\vspace{-4mm}
  \centering
\begin{tikzpicture}[scale=0.7, transform shape]
    %\tikzstyle{every node}=[font=\tiny]
    \tikzstyle{every node}=[]
    \node   [] (x) {${\X}$};
    \node [ right =0.8cm of x] (w2) {${Y}$};
    \node [ left =0.8cm of x] (z) {${X}$};
    \node [ above =0.1cm of x] (w) {$A$};
    % \node [ above =0.5cm of x] (s) {$S$};
    \draw[ thick] (z) to  (x); 
    % \draw[ thick] (s) to (x)   ; 
    \draw[ thick] (x) to  (w2);
    \draw[ thick] (w) to  (z); 
    \draw[ thick] (w) to  (w2); 
\end{tikzpicture}
\caption{Case 2}
\label{Case 2}
\end{subfigure}
\begin{subfigure}[c]{0.27\linewidth}
  \centering
\begin{tikzpicture}[scale=0.7, transform shape]
    %\tikzstyle{every node}=[font=\tiny]
    \tikzstyle{every node}=[]
    \node   [] (x) {${\X}$};
    \node [ right =0.8cm of x] (w2) {${Y}$};
    \node [ left =0.8cm of x] (z) {${X}$};
    % \node [ above =0.6cm of x] (s) {$S$};
    \draw[ thick] (z) to  (x); 
    % \draw[ thick] (s) to (x)   ; 
    \draw[ thick] (x) to  (w2); 
    \path[bidirected] (z) edge[bend left=35] 
    node[pos=0.5,sloped,font=\small, align=center] {A}
    (w2);
\end{tikzpicture}
\caption{Case 3}
\label{Case 3}
\end{subfigure}
\begin{subfigure}[c]{0.27\linewidth}
  \centering
\begin{tikzpicture}[scale=0.7, transform shape]
    %\tikzstyle{every node}=[font=\tiny]
    \tikzstyle{every node}=[]
    \node   [] (x) {${\X}$};
    \node [ right =0.8cm of x] (w2) {${Y}$};
    \node [ left =0.8cm of x] (z) {${X}$};
    \node [ below =0.6cm of x] (s) {$M$};
    \draw[ thick] (z) to  (x); 
    \draw[ thick] (z) to (s); 
    \draw[ thick] (x) to  (w2); 
    \path[bidirected] (z) edge[bend left=35] 
    node[pos=0.5,sloped,font=\small, align=center] {A}(w2);
    \path[bidirected] (x) edge[bend left=35]
    node[pos=0.5,sloped,font=\small, align=center] {H}(s);
\end{tikzpicture}
\caption{Case 4}
\label{Case 4}
\end{subfigure}
%
\vspace{-2mm}
\caption{$\X$: X-ray image, $Y$: pneumonia prediction, $X$: symptoms, $A$: age. All clients maintain the same graph. }
\label{fig:four-graphs}
\vspace{-4mm}
\end{figure}


Fig.~\ref{Case 1}: Since $P(y|do(\x))= P(y|\x)$ for this graph, federated learning of only $f_y(\x); \forall \x\in \X$ is sufficient. Thus, we accept the client-proposed set and define $\F= \{f_{y}\}$. We train the model $f_{\theta}\in \F$ on local data $D^{\cl}$ before sending it to the server so that, when aggregated from all clients, we get a global model for $P(y|\x)$.
In Fig.~\ref{Case 2}, we can accept the client-proposed set and perform FL to learn the mechanism $f_{Y}(A,\X)$ collaboratively and the remaining mechanisms $f_{X}, f_{A}, f_{\X}$ locally. Thus, $\F=\{f_y\}$.

% \begin{equation}          
% P(y|x) = \sum_s P(y, s|x) = \sum_s P(y|s, x) P(s|x)
% \end{equation}   
% $P(y|s, x)$ is learned from the local dataset $D^s$
%  of client $S = s$.
% Case 2 (Fig.~\ref{Case 2}): 
% Suppose, we have information about patient symptoms $Z$ now. Symptoms affect how the x-ray would look like ($Z\rightarrow X)$ but 
% pneumonia prediction ($Y$) should depend on $X$ not on $Z$. Thus, no $Z\rightarrow Y$. 
% For prediction $P(y|do(z))= \sum_{x}P(y,x|do(z))= \sum_x P(x|do(z)) P(y|do(z),x) = \sum_x P(x|z) P(y|x) $. Thus,  client $i$ need to learn how i) $x= f_{\X}(s);\forall x\in sup(\X)$ and $\forall s\in sup({S})$ are generated and ii) $y= f_Y(x);\forall x\in sup(\X)$ and $\forall y\in sup({Y})$ are generated. Thus, $\F = \{f_{\X}, f_Y\}$.
% % \red{partial support and positivity violation is different. Assume positivity for now.}

In both Figs.~\ref{Case 3} and~\ref{Case 4}, patient age $A$ is unobserved and represented as a bi-directed edge $X\leftrightarrow Y$. This forms the c-component $\{X,Y\}$. Note that in this graph $X \not\indep Y|\X$ due to the unobserved confounder $A$. As suggested in the client-proposed set, if we train only $f_Y$ independently with $\X$ as input, we have {$y= f_Y(\x, u_Y )$} which would create a wrong independence: $X \indep Y|\X$. Here, $X$ and $Y$ share a joint $P(x, y)$ that must be matched to be consistent with the full joint $P(\V)$.
Training $f_{Y}$ globally and $f_{X}$ locally would not allow feeding the same noise $U_A$ and matching $P(x, y)$.
Rather, we need to train both $f_X(u_A)$ and $f_Y(u_A,\x)$ together with the same confounding noise $U_A\sim \mathcal{N}(0, I)$. Thus, we reject $\cpc$ and offer the minimal set as $\F=\{f_x, f_y\}$ that must participate in FL. Similarly, if $\cpc=\{f_{\X}\}$ in Fig.~\ref{Case 4}, we have to reject it and propose $\F=\{f_{\X}, f_M\}$ due to the unobserved confounder $H$.
%
%
 % Thus, we need to train both $z=f_Z(u)$ and $y= f_Y(x,u)$ together with the same confounding noise $U\sim \mathcal{N}(0, I)$. Therefore, the FL training set \red{$\F= \{f_z, f_x, f_y\}$}.
 % Even though we have an additional measured confounder $A$, the FL training set $\F = \{f_{\X}, f_Y\}$ stays the same. 
% As FL will train the global model locally, using age $A$ attribute might help the model overfit to local data (\red{doesnt age has same distribution across clients?}). Thus, we do not want age as a feature in our models.
%
% \textbf{Trivial Solution 2:} Since $P(v')$ changes across clients due to data heterogeneity, we might want to train only the mechanism  $f_{\Vc}$ globally according to equation~\ref{eq:fed1mech} and train $\{f_i\}_{v_i\in V\setminus \{\Vc\}}$ locally.  If we have bi-directed edge $\Vc \leftrightarrow W$ in the causal graph, $\Vc$ and $W$ do not cause each other, but they share a joint $P(v', w)$ that must be matched to be consistent with the full joint $P(V)$.
% As a result, both $f_{\Vc}$ and $f_{W}$ need to be trained together with the same confounding noise $U$. Training $f_{\Vc}$ globally and $f_{W}$ locally would not allow feeding the same confounding noise and matching $P(v', w)$. In Fig.~\ref{fig:fail-ex}, if $\Vc=Y_2$, we can not train $f_{Y_2}$ globally and $f_{Z}$ locally as we have to feed the same confounding noise to both models.
% \red{What is the issue of training only X and Ch(X), i.e, generation and classification. How do we connect with other parts of the DCM? So we utilize c-factorization to modularize X and Ch(X) mechanisms from rest of the DCM.}


% \textbf{Valid Solution:}
% We can obtain a valid solution for Fig.~\ref{fig:fail-ex} by considering the c-components. We train $\{f_Z, f_{Y_2}\}$ globally and $\{f_{W}, f_{X}, f_{Y_1}\}$ locally.  Below, we generalize the idea of federated learning based on c-components.


% If $C$ is child of both $X$ and $Y$ them it implies that based on both $X,Y$, the sample belongs to a client. For example: a specific X-ray image without major symptoms has equal likelihood of being assingned to both emegency and regular client. Howver, if a clinician predicts somenthings  


\begin{definition}[Maximal Modularity Set]
    A set of mechanisms $\F$ constitutes a maximal modularity set if, whenever any mechanism $f\in\F$ is trained independently with a loss function $L(f)$ while the other mechanisms are trained with the loss function $L(\F\setminus\{f\})$, the resulting mechanisms are not guaranteed to match the joint distribution $P(v)$.
\end{definition}


\subsection{Modular learning of DCM}
Now, we characterize the maximal modularity set based on the causal relations among observed and unobserved variables in the given causal graph.
Suppose we have a causal graph containing observed variables and unobserved confounders as their parents (ADMG) that represents a semi-Markovian model and is consistent with the observational distribution $P(v)$. 
\citet{tian2002general} propose a method to factorize the joint distribution $P(v)$ into c-factors based on the c-component modules of the given ADMG.
% Given an acyclic directed mixed graph (ADMG) of a semi-Markovian model,  ~\citet{tian2002general} utilizes the c-component sub-graph modules of the ADMG and factorizes the joint distribution $P(\mathcal{V})$ into c-factors: the joint distributions of each c-component $S_i$ intervened on their parents, i.e., $P(s_j|\Do(pa(s_i)))$.
\begin{equation}
\label{eq:c-fact}
	\begin{split}
		P(v)&= \prod_{s_i\in C(G)} P(s_i|\Do(pa(s_i)))
	\end{split}
\end{equation}
Here, $C(G)$ is the set of all c-components and $P(s_i|\Do(pa(s_i)))$ is the c-factor corresponding to c-component $S_i$. Now, we establish a connection between c-factorization and training deep causal generative models (DCM, Definition~\ref{def:scm}). Note that in DCM we train the generative mechanism of all variables: $\G=\{f_V: V\in \V\}$ with a single loss function $L(\G)$ (Equation~\ref{eq:single-loss}) such that ${P}_{\theta}(\hat{v})$ matches the empirical joint distribution $P(v)$.
%
%
%
% \red{Condtion/intervention for c-factor.}
%
The factorization in Equation~\ref{eq:c-fact} suggests that we can modularize the training process of mechanisms into c-components. 
If we can enforce our approximated SCM $\hat{\mathcal{M}}$ represented with the DCM $\G$  to match each of the c-factors, the joint distribution implied by the DCM 
% (Def~\ref{def:dcm-sample})
will also match $P({v})$. More precisely, we can have $|C(G)|$ different loss functions $\{L_{S_i}(\{f_{V\in S_i}\}): S_i\in C(G)\}$ and use them to independently train mechanisms in each c-component $\{f_V: V\in S_i\}$. The gradient updates in this case are:
\begin{equation}
        f_{V}^{(t+1)} = f_{V}^{(t)} - \eta \frac{\partial L_{S_i}}{\partial f_{V}},  \text{ for } V\in S_i, \text{ and } S_i\in C(G)
\end{equation}
%
The main goal of this paper is to leverage federated learning to approximate the structural causal model. The above c-component based modularization allows us to train mechanisms in a c-component i.e., $\{f_V:V\in S_i\}$ together but independently from other c-components $C(G)\setminus S_i$. Therefore, mechanisms in c-component $S_i$ can utilize this opportunity to join collaborative federated training without affecting other mechanisms in the DCM.
Thus, based on the client-proposed set $\cpc$, we can find a partition of the c-components, i.e., $C(G) = C_l(G) \cup C_g(G)$, where $C_g(G)$ is the minimal set of c-components such that $\cpc \subseteq \{f_V: V\in C_g(G)\}$. We can rewrite Equation~\ref{eq:c-fact} as
% local:$C_l(G)$ and global:$C_g(G)$ set of c-components such that $\cpc \subseteq \{f_V: V\in C_g(G)$\}.
% 
\begin{equation*}
    \begin{split}
		P(v)&= 
        \prod_{s_i\in C_l(G)} P(s_i|\Do(pa(s_i)))
        \prod_{s_j\in C_g(G)} P(s_j|\Do(pa(s_j)))
	\end{split}
\end{equation*}
%
If $\cpc = \{f_V: V\in C_g(G)\}$ then we accept $\cpc$ as global models for FL. Otherwise, we offer $\F= \{f_V: V\in C_g(G)\}$ as the minimal set for collaborative training and elect them as global models.

Now that we have selected the minimal subset of c-components as global models $\F$, we update our training process. We train all mechanisms $\{f_V\}_{V\in \V}$ in the DCM jointly with a single loss function $L(\{f_V\}_{V\in \V})$ as usual (Equation~\ref{eq:single-loss}) to match the observational distribution. Additionally, for the global mechanisms $f_V\in \F$, we update their model weights with another loss function $L_{S_i}(\{f_{V\in S_i}\})$ aggregated with the original one.
%
% \begin{equation}
% \begin{split}
%        f_{V}^{(t+1)} &= f_{V}^{(t)} - \eta \frac{\partial L}{\partial f_{V}}
%        \text{if } V\notin \F\\
%     f_{V}^{(t+1)} &= f_{V}^{(t)} - \eta \frac{\partial (L+L_{S_i})}{\partial f_{V}},  \text{ if } V\in \F 
% \end{split}
% \end{equation}
%
Let $v=[v_1,v_2,..., v_m]$ such that $V_i \in C_g(G)$. $P_{\theta}^{in}(v)$ is the fake interventional distribution of DCM and $P^{inr}(v)$ is the  \textbf{in}terventional \textbf{r}eal data distribution. 
\begin{equation}
    \begin{split}
        \hat{v}& \sim P_{\theta}^{in}(v) ; v\sim P^{inr}(v)\\
         \hat{v} &= \{f_j(\hat{pa}(v_j), u_{V_j})\}; \forall {f_j\in \{f_V:{V\in C_g(G)}\}}
    \end{split}
\end{equation}
The critic and generator loss functions ($L_D^{in}, L^{in}_G$) for matching interventional distributions are as follows:
\begin{equation}
    \begin{split}
    \label{eq:intv-gen-loss}
        L_D^{in} &= \mathbb{E}_{v\sim P^{inr}}[D^{in}(v)]- \mathbb{E}_{\hat{v}\sim P^{in}_{\theta}}[D^{in}(\hat{v})]\\
   L^{in}_G&=  W(P^{inr}, P^{in}_{{\theta}}) = -\mathbb{E}_{u\sim P(u)} [D^{in}(\hat{v})]
    \end{split}
\end{equation}
 And the gradient updates for DCM after training on both observational and interventional data are as follows:
\begin{equation}
    f_{V}^{(t+1)} =
    \begin{cases}
        f_{V}^{(t)} - \eta \frac{\partial L_G}{\partial f_{V}}, & \text{if } f_V\notin \F \\
        f_{V}^{(t)} - \eta \frac{\partial (L_G+L^{in}_G)}{\partial f_{V}}, & \text{if } f_V\in \F
    \end{cases}
\end{equation}
\begin{figure*}[t!]
    \vspace{-5mm}
    \centering
    \includegraphics[width=1.\linewidth]{Figures/algo_simulation.pdf}
    \caption{Algorithm simulation of  \fedcm between two clients. Here $\cpc=\{f_x\}$ and $\F=\{f_x,f_y\}$.
    % \red{Not sure if the figure makes enough sense.}
    }
    \label{fig:end-to-end}
        \vspace{-3mm}
\end{figure*}
%
% In this way, we can train 
This gives us an effective approach to learn a proxy of the SCM by training all mechanisms on local data while training the models in $\F$ collaboratively with other clients. The loss function for DCM generators, $L_G = W(P, P_{\theta})$, is optimized to match the local training distribution $P(v)$, and the additional loss function for generators $f_V \in \F$, $L^{in}_G = W(P^{inr}, P^{in}_{{\theta}})$, is optimized to match the global c-factor distribution $P(s_i|\Do(pa(s_i)))$ where $S_i\in C_g(G)$. 

\par Note that the c-factor is an interventional distribution, and optimizing the second loss function $L^{in}_G$ would require real samples from that interventional distribution $P^{inr}$. However, we have only observational samples as training data, which we use to optimize the first loss function. 
How can we utilize observational samples to obtain samples from the c-factor interventional distribution and thus train the global models with them? In the next section, we provide a systematic approach as part of our novel \fedcm framework.
% One major challenge in the current approach is that to train the global models $\F$ modularly, we need to match their c-factor $ P(s_i|\Do(pa(s_i))$ where $S_i\subseteq C_g(G)$.

\subsection{Interventional training data and where to find them?}
\label{sec:intv-data-find}
Given access to only observational data $D\sim P(v)$, we aim to minimize the {loss function} to train DCM mechanisms in each c-component $S_i\in C_g(G)$ utilizing federated learning. Equation~\ref{eq:intv-gen-loss} can be written as follows.
\begin{equation}
\label{eq:intv}
\begin{split}
               L^{in}_G & =  W(P^{inr}, P^{in}_{{\theta}}) \\
               &= W(P(S_i|do(pa(S_i))), \mP(S_i|do(pa(S_i))))
   \end{split}
\end{equation}
%
% Let $\Vc$ be the heterogeneous variable and $S_{\Vc}$ be the c-component containing $\Vc$. 
% Our main idea is to train the c-component $S_{i} \in C_g(G)$ utilizing federated learning, and train rest of the c-components $S_i \in C_l(G)= C(G)\setminus C_g(G)$ locally.
% \red{Talk about generating interventional data based on the c-factors and then use them to train local DMC.}
%
Equation~\ref{eq:intv} is a comparison between fake and real interventional distributions. To train DCM mechanisms $f\in \F$ and learn this distribution, we need to compare their generated fake interventional samples, $\hat{D}^{in}$, against real interventional samples, $D^{in}$. Even though we do not have access to any real interventional data, we use the concept of causal effect identifiability to generate semi-synthetic interventional data and use them for our training.
% Below, we describe more precisely.



\begin{proposition}[~\cite{tian2002general, shpitser2008complete}]
\label{prop:c-fact-dox}
If $C(G\setminus X)=\{S\}$ and $S \in C(G)$ then,
% \begin{equation}
    $P_x(y) = \sum_{s \setminus y} \prod_{V_i \in S} P(v_i | v_{\pi}^{(i-1)})$
% \end{equation}
\end{proposition}
This is a modification of c-factorization. Here, the condition implies that after removing the intervened variables from the original graph $G$, there exists a single c-component in the modified graph $G\setminus X$ that was also a c-component in the original graph $G$. This scenario occurs when there are no bi-directed edges from $X$ to the c-component $S$. Then, we can utilize the above estimand for our causal query.
In our case, for each query $P(S_i|do(pa(S_i)))$, we consider the causal graph to be $G=Pa(S_i) \cup S_i$, which satisfies the condition in Proposition~\ref{prop:c-fact-dox}. Thus, we can obtain:
\begin{equation}
\label{eq:step6}
    P(s_i| do(pa(s_i)))=\prod_{\{j|V_j\in S_i\}} P(v_j|v_{\pi}^{(j-1)} )
\end{equation}
%
% as $C(G\setminus Pa(S_i))= S_i; S_i\in G$
% As our case satisfies the condition
% \begin{proposition}[Identifiability~\cite{tian2002general}]
% Let us consider a c-component $S_i$ and we need to all train mechanisms $f_{V_j} \in S_i$.
%  Here, the c-factor $P(s_i|do(pa(s_i))$ is identifiable, i.e., we can uniquely estimate it as a function of the observational distribution $P(v)$ as the intervention set $\mathbf{X}= Pa(S_i)$ is located outside the c-component. We can estimate the c-factor with the following formula:
% \begin{equation}
% \label{eq:step6}
%     % P(s_i| do(pa(s_i))=\prod_{V_j\in S_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))
%     P(s_i| do(pa(s_i))=\prod_{\{j|V_j\in S_i\}} P(v_j|v_{\pi}^{j-1} )
% \end{equation}
% \end{proposition}
%
Intuitively, identifiability gives us a way to express an identifiable interventional distribution as a function of observational probability distributions. However, we cannot train DCM {generators} with numeric values of probability tables; rather, we need samples from the corresponding distribution.
% are required.

Note that Equation~\ref{eq:step6} is a product of a set of conditional distributions in the form $P(v_j|v_{\pi}^{(j-1)})$. 
We follow~\cite{rahman2024conditional} to generate samples from each such distribution by training a conditional model $M_j$ that takes values of all ancestral dependent variables: $D[v_{\pi}^{(j-1)}]$ as input and generates $v_j$ as outputs. 
We train $|S_i|$ number of conditional models $\glob=\{M_1,...,M_{|S_i|}\}$ with observational data each parameterized by 
% $\Phi=\{\phi_1, ..., \phi_{|S_i|}\}$. $P_{\Phi}(v)$ 
${w}=\{w_1, ..., w_{|S_i|}\}$.
$P_{w}(v)$ 
is the distribution learned by $\glob$.
%
In the loss below, $d(\cdot,\cdot)$ can be any loss function that measures the distance between the distributions of two sets of samples.
\begin{equation}
\begin{split}
 L_{FL} = &d(P(S|do(pa(S))),  {P}_{w}(S|do(pa(S))) )  \\
= & \sum_{i:V_i\in S} d(P(v_i| v_{\pi}^{(i-1)}), 
 P_{w_i}(v_i| v_{\pi}^{(i-1)})) 
%  L_1 = &d(P(S|do(pa(S))),  {P}_{\Phi}(S|do(pa(S))) )  \\
% = & \sum_{i:V_i\in S} d(P(v_i| v_{\pi}^{(i-1)}), 
%  P_{\phi_i}(v_i| v_{\pi}^{(i-1)}))    
\end{split}
\end{equation}
% There are $|S_i|$ such conditional distributions 
% We can train a conditional model $M_j$ for each of these conditional distributions in Equation~\ref{eq:step6} with 
% the $\Phi=\{\phi_1, \phi_2, ..., \phi_m\}$ parameterized models (trained in Algorithm~\ref{alg:getRealIntvData})%
After convergence, we connect these $|S_i|$ trained models according to the conditional distributions in Equation~\ref{eq:step6}.
\begin{equation}
    \hat{v}_j = M_j(\hat{v}_{\pi}^{(j-1)}, pa(S_i)); \quad [{V}_j, {V}_{\pi}^{(j-1)}]\subseteq S_i.
\end{equation}
Finally, we feed the output of one model as input to other models, i.e., perform ancestral sampling, to generate samples from the interventional distribution $P(s_i| do(pa(s_i)))$.
% \red{This is illustrated in Algorithm~\ref{alg:getRealIntvData}: getRealIntvData(.), lines 3-5.}

Recall that we elected $\F$ as global models, but they require interventional data for training. Therefore, we elect $\glob$ for collaborative training as a proxy for $\F$, as we can train models in $\glob$ on observational data using federated training. Later, we can generate interventional data from $\glob$ to train the models in $\F$.
Since the proxy models are trained only on observational data $D[Pa(S_i), S_i]$, and only the models $M_{V_j}; V_j\in S_i, S_i\subset \F$ and their corresponding weights are shared during FL, privacy is preserved.
% as much as $\F$.

\subsection{End-to-end \fedcm framework}
Here, we connect all pieces of our framework together. This is simulated in Fig.~\ref{fig:end-to-end} for a frontdoor graph with $\cpc=\{f_X\}$.

\textbf{Model initialization:}
All clients are assumed to agree upon a common causal graph, $G$. 
Each client initiates two sets of models.
The first set (yellow) contains $\G=\{f_{V}\}_{V\in \V}$ to act as the local DCM (set of connected generators) according to $G$. 
Weights of each model $f_i$ in the DCM are initialized as $\theta_i$.
The local DCM will generate fake observational data, $\hat{D}[{\V}]\sim {P}_{\theta}(\hat{v})$ (Fig.~\ref{fig:end-to-end}: left-top) and fake interventional data, $\hat{D}[{\V}]\sim {P}_{\theta}(v|do(x)), X\subseteq \V$ (Fig.~\ref{fig:end-to-end}: left-bottom).


We obtain all c-components $S_i\in C(G)$ from the causal graph.
Based on the c-components in the causal graph, \fedcm either accepts the client-proposed set of models, $\cpc$, or proposes $\F$: the c-component mechanisms of $C_g(G)$ that contain $\cpc$. Since the corresponding c-factors are interventional distributions, we initiate a second set of models (orange) as a proxy, $\glob= \{M_j\}_{V_j\in C_g(G)}$, to generate synthetic interventional data from c-factors, i.e., $\tilde{D}[Pa(S_i), S_i]\sim {P}_{w}(s_i|do(pa(s_i)))$.
Causal identifiability, discussed in Section~\ref{sec:intv-data-find}, ensures that $\{M_j\}_{V_j\in C_g(G)}$ jointly generate interventional data even though they are trained only on observational data. Each client performs the same process and joins federated learning only for the second set of models.

% \textbf{Client update:}
% In each client $c$, we train the models in c-components that are personalized and non-heterogeneous ($\{{S_i}\}_i\setminus \{S_{\Vc}\}$) using the function \textbf{LocalTraining(.)}. We save these models locally.

% \red{till here}

% This local training for each c-component can be performed in parallel.
% Calling \textbf{LocalTraining(.)} trains conditional models 
% To make the second set of models $\{M_j\}_{V_j\in C_g(G)}$ generate realistic synthetic interventional data, we train them locally with observational data $D[\V]$ and allow them in federated learning to collaboratively learn the data distribution. 
%
% That is, each client locally takes one step of gradient descent on the current global models $\glob= \{M_{j}\}_{V_j \in \F}$  using its local data, and the server then takes a weighted average of the resulting models. 
% \textbf{Server executes:}
% First, each source computes gradients from all sources and subsequently updates the model. Next, the server broadcasts the new the local gradient,  using its own data and sends to the server. The server, then, collects these model to all the sources.
\textbf{Training  global conditional models:}
We perform federated training for the set of models $\glob= \{M_{j}\}_{V_j \in \F}$
to learn conditional distributions collaboratively and utilize them to generate samples from the c-factor $P(s_i|do(pa(s_i)))$.
First, the model weights $w_j$ of each function $M_j \in \glob$ are updated as $w[V_j] \gets w[V_j] - \eta \nabla \ell$ after training on local observational data. Next, each client sends the model weights to the server to be aggregated as the global model.
The server receives the model weights from each client and takes a weighted average of the sent models. For the weights of each function $M_{j}$ in $\glob$, the server performs $w_{t+1}[V_j] \gets \sum_{k=1}^K \frac{n_k}{n} w_{t+1}^{k}[V_j]$ at round $t$.
Finally, the server broadcasts the new global models to each client and they update their local models accordingly.





% \textbf{Obtaining GAN training data:}
% Now, given that we have generated real interventional data for $P(s_i|do(pa(s_i))$ with $\glob$, we can train the mechanisms in the DCM $\{f_{V_j}\}_{V_j\in \F}$ on those. We generate fake data from the GAN architecture and perform adversarial training considering the generated interventional data as the real dataset. We have the original training data as the real observational data.
% Now that we have obtained real interventional samples from $P(s_i| do(pa(s_i))$, we can utilize those as training data to train mechanisms $f_{V_j}; V_j\in S_i$. 
% \textbf{Obtaining GAN generated fake data:}
% as the real observational data.
% \textbf{Training generators and critics:}
% Now, we can compare the generated fake interventional dataset  as Algorithm~\ref{alg:getFakeIntvData} with generated real interventional dataset as Algorithm~\ref{alg:getRealIntvData} and train the models in $\{f_{V_j}\}_{V_j\in S_i}$ accordingly.
% \textbf{GAN convergence:}
% Now, we can back propagate on $L^{\theta}$ and $L^{\theta'}$ 
% % , \forall i:V_i\in S 
% which lets us update each parameter $\theta_i': \forall_i$ and $\theta$ independently. 
\textbf{Training  client local DCM  architecture:}
The first training dataset $\tilde{D}^{in}[Pa(S_i),S_i]$ is the synthetic (real) interventional data for $P(s_i|do(pa(s_i)))$ generated with the conditional models in $\glob$, and the second training dataset $D[\V]$ is the original observational training data.
We train critic 1 to distinguish between DCM $\G$ generated fake $\hat{D}[\V]$ vs real observational data $D[\V]$ and critic 2 to distinguish between fake $\hat{D}[Pa(S_i), S_i]$ vs synthetic $\tilde{D}[Pa(S_i), S_i]$ interventional data. 
Instead of the loss function for c-factors in Equation~\ref{eq:intv}, we now have:
\begin{equation}
               \tilde{L}_{G}^{in} =d(P_{w}(S_i|do(pa(S_i))), P_{\theta}(S_i|do(pa(S_i))))
\end{equation}
 By the triangle inequality (valid when $d$ is a metric, such as the Wasserstein distance), the above loss lets us upper bound the original DCM loss function in Equation~\ref{eq:intv}:
 % $ d(P(S_i|do(pa(S_i))), P_{\theta}(S|do(pa(S_i)))) 
 % \leq 
 %               d(P(S_i|do(pa(S_i))), P_{w}(S|do(pa(S_i)))) 
 %         +  d(P_{w}(S_i|do(pa(S_i))), P_{\theta}(S_i|do(pa(S_i)))) 
 %         \implies {L}_{G}^{in} \leq L_{FL} + \tilde{L}_{G}^{in}  $ 
\begin{equation}
    \begin{split}
        & d(P(S_i|do(pa(S_i))), P_{\theta}(S_i|do(pa(S_i))))  \\
               &\leq 
               d(P(S_i|do(pa(S_i))), P_{w}(S_i|do(pa(S_i)))) \\
        & +  d(P_{w}(S_i|do(pa(S_i))), P_{\theta}(S_i|do(pa(S_i)))) \\
        & \implies {L}_{G}^{in} \leq L_{FL} + \tilde{L}_{G}^{in}   
    \end{split}
\end{equation}
%
 %   
\begin{theorem}
    Let $A$ be any algorithm that, given a partition $\{S_1, S_2, \dots, S_k\}$ of the nodes of a causal graph $G$, trains a deep causal generative model sequentially on $S_1, S_2, \dots, S_k$ in that order to fit $P(S_i|S_1,S_2,\dots, S_{i-1})$, respectively.
\begin{enumerate}
    \item If $S_i$ are \textit{c-components} of the graph $G$, then $A$ is \textit{consistent}, i.e., it fits the joint distribution correctly for any execution.
%
\item Conversely, when $S_i$ are not \textit{c-components}, there exists a graph $G$ for which algorithm $A$ may fail for any order $S_1,S_2,\dots, S_k$, i.e., there exists a training execution that is \textit{inconsistent}, and algorithm $A$ will not fit the joint distribution.
\end{enumerate}
% a sequence $\{S_1, S_2, \dots, S_k\}$, and 
\end{theorem}
%
%
% to actually learn the mechanisms located in a c-component, they 
% our maximum modularity result shows that we do not need access to $W$. Since $P(x,y|do(z))= P(x) P(y|x,z)$, we can sample from this distribution and use those samples as training data to train $G_X,G_Y$. 
% The proof is provide in Appendix~\ref{theo-analysis}


% As we are minimizing the loss functions $L_1$ and $L_2$, our target loss function $L_0$ will be minimized as well.



% \subsection{Efficient learning of c-factors: $P(s_i|do(pa(S_i))$ }
% \label{subsec:c-factor-train}
% % We can apply step 7 of the ID algorithm for the case in modular-DCM when rule-2 does not apply. 

% Equation~\ref{eq:c-fact} suggests that we have to match each of the c-factors in the product. This can be done independently and in-parallel. 
% Thus, we first focus on how a specific c-factor can be matched by training the mechanisms of the c-component. 
% The main idea is that we train a set of models $\{M_j\}$ to learn conditional distributions and utilize them to generate samples from the c-factor $P(s_i|do(pa(s_i))$. 






% \subsection{Connection with Federated Learning}


% Calling \textbf{LocalTraining(.)} similarly, we train models in the heterogeneous c-component $S_{\Vc}$ but send those models to the server to be aggregated with the global model.  That is, each client locally takes one step of gradient descent on the current global models $\{f_{V_j}\}_{V_j \in S_{\Vc}}$  using its local data, and the server then takes a weighted average of the resulting models. 





% When client $c$ calls \textbf{LocalTraining(.)} for a c-component $S$, it makes $E$ training passes over its local dataset $D^c$ of $\mathcal{B}$ batches to train models $\{f_{V}\}_{V \in S}$. Client $c$ compares the fake samples $D^F \sim P_{\theta}(s|do(pa(s))$ with real interventional data $D^R \sim P(s|do(pa(s))$ to obtain a loss function $\ell$ (discussed in Section~\ref{subsec:c-factor-train}). The model weights $w[V]$ of each function $f_{V}$ in $S$ is updated as $w[V] \gets w[V] - \eta \nabla \ell$.
% This local training for each c-component is performed in parallel.



% \red{First, each source computes gradients from all sources and subsequently updates the model. Next, the server broadcasts the new the local gradient,  using its own data and sends to the server. The server, then, collects these model to all the sources.}




% \begin{figure}[t!]
%     \centering
%         \begin{subfigure}[c]{0.45\linewidth}
%   \centering
% \begin{tikzpicture}[scale=0.7, transform shape]
%     %\tikzstyle{every node}=[font=\tiny]
%     \tikzstyle{every node}=[]
%     \node   [] (x) {${Z}$};
%     \node [right =1.cm of x] (w1) {$\mathbf{X}$};
%     \node [ right=1cm of w1] (w2) {${Y}$};
%     \draw[ thick] (x) to  (w1); 
%     \draw[ thick] (w1) to  (w2); 
%     \path[bidirected] (x) edge[bend left=35] (w2);
% \end{tikzpicture}
% % \caption{Generalized format}
% \end{subfigure}
% %
% \caption{Causal graphs}
% \label{fig:gen-form}
% \end{figure}


\begin{figure}[t!]
    \vspace{-4mm}
    \centering
    \includegraphics[width=0.9\linewidth]{Figures/cond_model.pdf}
      \vspace{-2mm}
    \caption{TVD of $P(X|Z)$ w/ 1000 training samples}
    \label{fig:vary-dim}
    \vspace{-3mm}
\end{figure}
%
\section{Experiments}
We illustrate the performance of the \fedcm algorithm on synthetic and real-world IHDP data with extensive analysis. 
We provide additional experiments for non-identifiable causal effects in Appendix~\ref{uncertainty}.
Our code is
\href{https://github.com/Musfiqshohan/Fedcm}{publicly available}.
%
\par \textbf{Setup:}
We select the front-door causal graph in Figure~\ref{fig:frontdoor} for our synthetic experiments. $\leftrightarrow$ implies that there is an unobserved confounder between $Z$ and $Y$.
We evaluate the performance of our algorithm mainly on two setups.
First, we consider that each client has 1000 training samples and the training datasets have increasing support size of $X$, i.e., $|X|\in \{20, 128, 256\}$.
For the second setup, we fix the support of $X$ as $|X|=20$ and evaluate our algorithm when each client has dataset size $|D^c|\in \{500, 1000, 1500, 2000\}$. $Z$ and $Y$ are considered to be binary. 
Suppose, clients wish to share only the model $f_X$, i.e.,  $\cpc= \{f_X\}$. Since $X$ is a c-component itself, we accept it and perform federated training for $f_X$ while training all $f_Z, f_X,f_Y$ on local data.
%
%
%
\textbf{Baseline (NoFL):}
We consider a baseline where each client follows every step of our algorithm exactly but trains only on local data and does not communicate with other clients.
%
\par\textbf{Varying support size of $X$:} 
% \red{  For dimensionality =20, the loss goes down in Figure 4 when we increase the number of samples from 500 to 2000.
% Thus, our claim is that the performance drop is due to low sample size, not because the variable is continuous. }
%
We illustrate the total variation distance (TVD) between the true distribution of $P(x|z)$ and the model approximated distribution for varying support size but for a fixed number of samples in Fig.~\ref{fig:vary-dim}.
The orange, green and red bars represent $X$ having support size equal to 256, 128 and 20, respectively. The hatched bars represent \fedcm, and smooth bars represent the baseline NoFL. 
For client 1, the TVD for \fedcm (hatched bar) reduces from $0.22$ to $0.19$ to $0.08$ as the support size of $X$ reduces from $256$ to $128$ to $20$, respectively.
This behavior remains consistent for all clients (client 1-5).
Note that \fedcm has lower TVD in (almost) all cases compared to NoFL (smooth bar).
Since we keep the sample size fixed at 1000, when the support size is larger (ex: $|X|=256$), the small number of samples is not sufficient to accurately learn $P(x|z)$ of a high-dimensional variable $X$. Thus, TVD loss is higher for $|X|=256$ compared to $|X|=20$. Our results show that federated training is highly effective when the mechanisms that participate in FL, i.e., $\F$, are high dimensional.
%
\begin{figure}[t!]
\centering
\begin{subfigure}[c]{0.45\linewidth}
\centering
\begin{tikzpicture}
[scale=0.7, transform shape]
% [>=Stealth, node distance=0.5cm]
  % Nodes
  \node (Z) {${Z}$};
  \node[right=of Z] (X) {${X}$};
  \node[right=of X] (Y) {${Y}$};

  % Directed edges
  \draw[thick] (Z) -- (X);
  \draw[thick] (X) -- (Y);

  % Bidirected edge (Z <--> Y)
  \draw[bidirected, dashed, bend left=50] (Z) to (Y);
\end{tikzpicture}
\caption{Query: $P(y|do(z))$.}
\label{fig:frontdoor}
\end{subfigure}
\hfill
\begin{subfigure}[c]{0.45\linewidth}
\centering
\begin{tikzpicture}
[scale=0.7, transform shape]
% [>=Stealth, node distance=0.5cm]
  % Nodes
  \node (Z) at (0,1) {\texttt{$Z$}};
  \node (T) at (-1,0) {\texttt{$T$}};
  \node (Y) at (1,0) {\texttt{$Y$}};

  % Arrows
  \draw[thick] (Z) -- (T);
  \draw[thick] (Z) -- (Y);
  \draw[thick] (T) -- (Y);
\end{tikzpicture}
\caption{Query: $P(y|do(t))$.}
% 
\label{fig:ihdp}
\end{subfigure}
\caption{Causal graphs: synthetic and real (IHDP) datasets}
\vspace{-4mm}
\end{figure}
%
\par \textbf{Varying sample size:} 
In Figure~\ref{fig:vary-support}, we show our performance on each client having sample sizes 500 (blue), 1000 (orange), 1500 (green) and 2000 (red) while keeping $|X|=20$ fixed. This plot represents how closely \fedcm  approximates the true SCM, as the TVD metric indicates the distance between the true joint distribution $P(v)$ and the distribution implied by the trained DCM $P_{\theta}(v)$.
As the sample size increases to 500, 1000, 1500 and 2000,
the \fedcm TVD for client 1 (hatched) decreases to 0.13, 0.118, 0.105 and 0.094, respectively.
Here, \fedcm has a lower TVD in (almost) all cases compared to the baseline NoFL (smooth bar) even for $X$ having low dimension. 
% However, as the sample size increases, for small support size, the gap becomes significant between federated training and \red{individual} training. 
%
Figure~\ref{fig:vary-support} shows that with small support size (i.e., 20), when clients have a high number of samples (e.g., red, 2000), FL does not significantly improve the training performance as
each client has enough samples to obtain a good estimator for $P(x|z)$.
% as each client has enough samples to learn the corresponding distribution by themselves (individual training). 
Thus, both individual clients (smooth red bars) and \fedcm (hatched red bars) obtain similar performance.
However, if we have a small number of samples (e.g., blue, 500), clients cannot learn an unbiased estimator for the distribution by themselves in individual training. 
Our method exploits the federated learning setup to obtain a better estimator of $P(x|z)$ compared to individual clients.
% Thus, FL plays an important role in performance gain. 
Thus when the sample size decreases (small sample size), even for small support sizes, the performance gap between federated training (hatched blue bars) and individual training (smooth blue bars) increases.
%
% When the sample size is 2000 for each client (red bars), each client has enough samples to obtain a good estimator for $P(x|z)$. 
% Thus, both individual clients (smooth red bars) and \fedcm (hatched red bars) obtain similar performance.
% However, when the sample size is 500 (blue bars), our method exploits the federated learning setup to obtain a better estimator of $P(x|z)$ compared to individual clients. 
% Thus, the performance gap between individual clients  and fedcm  increases.}
% Thus, our algorithm shows better performance when there is data scarcity in each client and proves its effectiveness.
%
% Now, that the DCM has matched the observational distribution $P(v)$ with low TVD, any causal effects such as \red{$P(y|do(x))$} should be close to the true causal effect based on identifiability.
After matching the joint distribution with low TVD, prediction for identifiable causal effect should be close to the ground truth.
%
%%%
% Since the global model aggregates all local models, it generalizes to all clients and should have better performance on average even though it performs worse in some clients. 
%
%
\begin{figure}[t!]
    \vspace{-4mm}
    \centering
    \includegraphics[width=0.9\linewidth]{Figures/dcm_tvd.pdf}
    \vspace{-2mm}
    \caption{TVD of $P(Z,X,Y)$  w/ different sample sizes. 
    % \red{update caption: dim=20, what this mean}
    }
    \label{fig:vary-support}
    \vspace{-3mm}
\end{figure}
%
%
%
%
\subsection{Experimental Analysis on IHDP}

\textbf{Dataset:}
We performed an experiment on a real benchmark dataset, the {Infant Health and Development Program (IHDP)} dataset~\cite{hill2011bayesian}. It contains 747 records with a total of 27 variables of varying data types that increase problem complexity. It has i) 25 covariates: 6 continuous variables and 19 discrete variables, ii) 1 treatment variable: discrete, iii) 1 outcome variable: continuous.
To mimic the federated setup, samples are randomly distributed to the clients. Even though non-iid distribution is not considered here, due to the small number of samples in each client, data heterogeneity will arise.
To compare with our baselines, we aim to estimate the Average Treatment Effect (ATE): $\mathrm{ATE} = \mathbb{E}[Y \mid do(T=1)] - \mathbb{E}[Y \mid do(T=0)]$.
% \begin{equation*}
% ATE = \mathbb{E}[P(Y \mid do(T=1))] - \mathbb{E}[P(Y \mid do(T=0))]
% \end{equation*}
Note that, we assume the causal graph in Figure~\ref{fig:ihdp} for the IHDP dataset.
%
%
Our main baseline, \citet{vo2022adaptive}, assumes that they do not have access to the original set of covariates. Rather, they use some proxy ($X$) of them. They try to obtain a posterior of the covariates $Z$ from the proxy $X$, treatment $T$, and outcome $Y$.
%
In contrast, we consider a one-to-one invertible mapping between covariates and the proxy, i.e., $Z = X$. Hence, we assume full observability of $Z$ and no unobserved confounders.

\textbf{Solution Design:}
We aim to learn a deep causal model consistent with our assumed causal graph: a proxy of the true SCM. Ideally, we train $G_Z$, $G_T$, $G_Y$ to match $P(Z, T, Y)$.
%
However, for our target ATE (same as the baselines), we need to estimate $P(Y \mid do(T=1))$ and $P(Y \mid do(T=0))$. By the backdoor criterion:
$P(Y \mid do(T)) = \sum_z P(Y \mid z, T) P(z)$.
%
% \begin{equation}
% P(Y \mid do(T)) = \sum_z P(Y \mid z, T) P(z)
% \end{equation}
%
Two approaches can be used to estimate the causal effects.
i) Approach 1: Federated learning (FL) trains model $M_Z$ to learn $P(z)$ and model $M_Y$ to learn $P(y \mid z, t)$. Then, generate $Z \sim P(Z)$ using $M_Z$, and sample $Y$ from $M_Y$ using $P(y \mid z, t)$. These $Y$ samples approximate $P(Y \mid do(T))$.
ii) Approach 2: FL trains only $M_Y$ to learn $P(y \mid z, t)$. Use $Z$ directly from the training dataset ($Z \sim P(Z)$) and feed into $M_Y$ to sample $Y \sim P(y \mid z, t)$.
In both cases, the generated $Y$ are used to estimate ATE.
%
%
We ran our algorithm for 200 federated learning rounds, with each client trained for 2000 epochs per round. We utilize a GAN architecture to learn the joint distributions. 
To verify our estimation, we need a reference value for the ATE. As no ground truth exists, we used: i)~\citet{vo2022adaptive}'s implementation, which reports an ATE $\approx 3.98$; and ii) the DoWhy package~\cite{sharma2020dowhy} (propensity score matching on the full dataset), which yielded a similar value.
Thus, we consider 3.98 as our reference ATE.

\textbf{Baselines Comparison:}
We follow the same experimental setup as described by~\citet{vo2022adaptive}, and compare our performance with the results reported in their paper, obtained by their method and other baselines. 
%
We consider the last 20 global rounds of our model training for evaluation. Based on the reference and predicted ATE, we calculate the mean and standard deviation of the ATE error for each client and report the average across all clients.
%
%
% \begin{table}[h!]
% \centering
% \begin{tabular}{@{}ll@{}}
% \toprule
% \textbf{Method} & \textbf{ATE error (ATE = 3.98)} \\
% \midrule
% BART\_ag & $1.3 \pm 0.05$ \\
% X-Learner\_ag & $1.2 \pm 0.09$ \\
% R-Learner\_ag & $1.0 \pm 0.07$ \\
% OthoRF\_ag & $1.3 \pm 0.09$ \\
% TARNet\_ag & $2.5 \pm 0.06$ \\
% CFR-wass\_ag & $2.7 \pm 0.05$ \\
% CFR-mmd\_ag & $2.5 \pm 0.03$ \\
% CEVAE\_ag & $2.1 \pm 0.09$ \\
% FedCI (~\cite{vo2022adaptive}) & $0.5 \pm 0.09$ \\
% CausalRFF (~\cite{vo2022adaptive}) & $0.5 \pm 0.16$ \\
% \textbf{Ours\_3clients\_w/50samples} & $\mathbf{0.418 \pm 0.24}$ \\
% \textbf{Ours\_12clients\_w/50samples} & $\mathbf{0.351 \pm 0.22}$ \\
% \textbf{Ours\_3clients\_w/200samples} & $\mathbf{0.271 \pm 0.16}$ \\
% \bottomrule
% \end{tabular}
% \end{table}
%
\begin{table}[t!]
\vspace{-3mm}
\scalebox{0.9}{
\begin{tabular}{|cccc|}
\hline
\multicolumn{4}{|c|}{Method vs ATE error (Reference ATE = 3.98)}                                                                         \\ \hline
\multicolumn{1}{|c|}{BART\_ag}      & \multicolumn{1}{c|}{$1.3 \pm 0.05$} & \multicolumn{1}{c|}{TARNet\_ag}   & $2.5 \pm 0.06$ \\ \hline
\multicolumn{1}{|c|}{X-Learner\_ag} & \multicolumn{1}{c|}{$1.2 \pm 0.09$} & \multicolumn{1}{c|}{CFR-wass\_ag} & $2.7 \pm 0.05$ \\ \hline
\multicolumn{1}{|c|}{R-Learner\_ag} & \multicolumn{1}{c|}{$1.0 \pm 0.07$} & \multicolumn{1}{c|}{CFR-mmd\_ag}  & $2.5 \pm 0.03$ \\ \hline
\multicolumn{1}{|c|}{OrthoRF\_ag}   & \multicolumn{1}{c|}{$1.3 \pm 0.09$} & \multicolumn{1}{c|}{CEVAE\_ag}    & $2.1 \pm 0.09$ \\ \hline
\multicolumn{1}{|c|}{FedCI}         & \multicolumn{1}{c|}{$0.5 \pm 0.09$} & \multicolumn{1}{c|}{CausalRFF}    & $0.5 \pm 0.16$ \\ \hline
\multicolumn{2}{|c|}{\textbf{Ours\_3clients\_w/50samples}}                & \multicolumn{2}{c|}{$\mathbf{0.418 \pm 0.24}$}     \\ \hline
\multicolumn{2}{|c|}{\textbf{Ours\_12clients\_w/50samples}}               & \multicolumn{2}{c|}{$\mathbf{0.351 \pm 0.22}$}     \\ \hline
\multicolumn{2}{|c|}{\textbf{Ours\_3clients\_w/200samples}}               & \multicolumn{2}{c|}{$\mathbf{0.271 \pm 0.16}$}     \\ \hline
\end{tabular}
}
\vspace{-3mm}
\end{table}
According to~\citet{vo2022adaptive}, these baselines use 50, 100, and 99 samples for train, test, and validation sets per client, with number of clients = 3.
To illustrate the scale of our experiment, we consider 3 setups: i) 3 clients each with 50 samples, ii) 12 clients each with 50 samples, iii) 3 clients each with 200 samples.

We have two main observations. \textbf{Observation 1:} In all setups (i, ii, iii), we obtain relatively small mean ATE error compared to other baselines. 
\textbf{Observation 2:} Sorting ATE errors yields: 
a: 3 clients 50 samples $>$ b: 12 clients 50 samples $>$ c: 3 clients 200 samples. 
a $>$ b shows that our method can benefit from federated learning to reduce ATE error. 
b $>$ c shows how sample size impacts error in FL settings.
The observed standard deviation is likely due to GAN training on small datasets.
Moreover, for the model that matches $P(Z)$ we observe max TVD loss $= 0.0768$ (discrete covariates). For the model that matches $P(Y \mid Z, T)$, we observe Wasserstein distance $= 0.0147$ (continuous $Y$).
These metrics indicate reasonable convergence of the models.


\section{Conclusion}
We explore federated learning to learn a proxy of the structural causal model from distributed datasets. We introduce maximal modularity and propose a framework where a set of models trains to learn the SCM from local data while another set of models participates in the collaborative training to provide global information to the local models. 
% The maximal modularity concept we introduce allows us to modularize the SCM training process and utilize global training. 
% Empirical results show that our approach approximates the joint with low TVD.
After convergence, the DCM can estimate any identifiable causal effects. In our future work, we aim to remove the assumption on having access to the true causal graph.


% \section{Acknowledgement}
% \red{Acknowledgement: Prof's funds and Ilya shpitser and uai reviwers.}


\newpage
% References

\begin{acknowledgements} % will be removed in pdf for initial submission,
						
This research has been supported in part by NSF CAREER 2239375, IIS 2348717, Amazon Research Award and Adobe Research. 
We also thank Ilya Shpitser for sharing helpful insights on the identifiability of c-factors.
\end{acknowledgements}


\bibliography{ref}


\newpage

\onecolumn

\title{FeDCM: Federated Learning of Deep Causal Generative Models\\(Supplementary Material)}
\maketitle

\appendix

\section{Additional Details}
\begin{definition}[Causal Bayesian Network~\cite{pearl1995bayesian}]
\label{def:cbn}
    A Bayesian network considers dependencies among a set of observed variables while a causal Bayesian network considers causal dependencies among them.
When we perform an intervention on a variable (i.e. set a variable to a fixed value), it affects the variables that have causal relations with it and keeps remaining variables unaffected. A Bayesian network does not have the concept of intervention as it considers only correlation and not causal relations. However, suppose the Bayesian network has a consistent structure such that i) after performing an intervention, no other nodes except the descendants are affected due to the intervention ii) and this is true for all possible interventions, we can call it a causal Bayesian network.
\end{definition}

Since our goal is causal effect estimation, our focus in this paper is on causal Bayesian networks.




\subsection{\fedcm{} Main Contributions}
Our contributions in this paper are twofold. First (algorithmic improvement with maximum modularity): our maximum modularity result shows that to learn a c-factor distribution, we do not need access to variables outside the c-component, unlike~\cite{rahman2024modular}, who
% suggest modular learning based on c-components but 
construct a complicated structure called the H-graph and learn a joint distribution larger than the c-factor in many scenarios.
Training a smaller number of models is particularly helpful in a federated setting to reduce communication cost.

Secondly (Adaptation in federated learning setting):
Although there are some recent works~\cite{pawlowski2020deep,rahman2024modular, xia2021causal, chao2023interventional} that learn deep/neural causal models from observational data, all of them assume access to the entire dataset, i.e., they are designed for centralized settings. We propose the first approach to learn a deep causal model in the federated learning setup. 

\subsection{Data Types:}
{Note that in our synthetic experiments, all of our variables are discrete (with finite support size). However, this is a choice made for these specific experiments and not a limitation of our approach. Any variable (covariate, mediator, or outcome variable) except the intervened variable can be discrete, continuous, or high-dimensional (e.g., images), and our method will work when employed with appropriate generative models.
However, when the intervened variable (treatment) is continuous, causal effect estimation is generally non-regular and requires a more careful handling~\cite{balazadeh2022partial}.
}


\subsection{Violation of assumptions and Future Works}
\label{violate-ass}
Below we provide some details on our assumptions and what challenges one might face if assumptions are violated or relaxed.

% \subsubsection{Assumption of iid data}
% % \label{non-iid}
% \blue{First, we focus on the fact that even if client datasets are iid samples from the same distribution, due to finite sample size, the empirical distributions obtained from the client datasets, might look very different, i.e., client data heterogeneity may arise.}

% \red{Next, when we consider non iid data, the challenge is that each client's distribution might differ arbitrarily from the true distribution. If two clients have different probability table for a conditional distribution, it needs to be decided which one is correct. We might want a global model that learns the distribution which is weighted average of the clients. It might also be possible that clients observed different supports of the same joint distribution. Thus, to deal with non-iid data and make the problem setup precise, we would need to establish additional assumptions. Thus, we keep these features for our future work and focus on reducing communication cost with our proposed modularity for federated learning of deep causal models.}


\subsubsection{Assumption of having access to the causal graph}
{The access to ADMG assumption can be relaxed by employing any federated-learning-compatible causal discovery algorithm, as a pre-processing step. We aim to address the challenges associated with non-iid data in our future works.}


\subsubsection{What if some clients only observe a subset of variables?}
 If clients observe variable values located at different parts of the causal graph, it might be challenging to obtain the whole joint distribution. But as our method offers c-component based modular training, clients do not need to observe the whole joint. Additionally, we can resort to approaches such as~\cite{gresele2022causal}, which merge available causal marginal information from given observations of subsets of variables in a causal graph. This is an interesting direction we aim to explore in our future work.


\subsubsection{The impact of model mis-specification on performance}
One interesting future direction is to see how the estimation changes when we have the incorrect causal graph. We might assume incorrectly i) presence of an edge, ii) orientation of an edge, iii) unobserved confounders etc. If iii) happens, i.e., we assume that we have observed all confounders but there still exists some unobserved confounder, then the causal effect will be non-identifiable and we can obtain a bound using maximization and minimization in the NCM/DCM algorithm.

For i) and ii), one solution might be to iterate over all possible choices of edge presence/orientation to obtain the set of candidate causal graphs and then train a DCM for each of them. This will give us a set of possible causal effects. In our future work, we plan to explore how these possibilities will work in a federated learning setup.



\subsection{Complexity Analysis}
\label{appex:complex}

\textbf{Communication complexity:}
The communication cost for a single client is $O(T \cdot |C_{max}(G)| \cdot m)$, where $T$ is the number of global rounds, $m$ is the communication cost for sending all weights of a single neural network, and $C_{max}(G)$ is the largest c-component in the graph.

Below we discuss in detail.

Suppose clients propose a set of mechanisms, $S_{user}$, for collaborative learning. If the corresponding
variables of $S_{user}$ do not
form a whole c-component in the graph, we reject the set and offer a superset $S_{FL}$ that constitutes a
c-component in the graph. Note: a c-component is a maximal subset of nodes that are connected by unobserved confounders.

Let $S_{FL}$ correspond to the largest c-component in the graph, $C_{max}(G)$. This implies that we have to
share model weights for $|C_{max}(G)|$ number of neural networks (NN)
to participate in federated training. If $m$ is the communication cost for sending all weights of a single neural network,
the per-round
communication cost for a client is $O(|C_{max}(G)| \cdot m)$.

If there are $T$ global rounds in total, the communication cost for a single client is $O(T \cdot |C_{max}(G)| \cdot m)$.



\textbf{Server computational complexity:} As the server will aggregate weights of $|C_{max}(G)|$ different models, its
computation time complexity is $O(|C_{max}(G)|)$.

\textbf{Client computational complexity:}
To join the federated learning, each client will have to train $|C_{max}(G)|$ number of NN models.
Next, clients will learn the deep causal model with local data and interventional data generated from
these $|C_{max}(G)|$ NN models.

Suppose the set of all variables in the SCM is $\mathcal{V}$.
As each client learns a deep causal model $\mathbb{G}$ that consists of $|\mathcal{V}|$ NN models,
it would require $O(|\mathcal{V}|)$ time to train them. So, the total complexity of NN model training
would be $O(|C_{max}(G)|+ |\mathcal{V}|)$.




\subsection{Additional experiments}
\label{uncertainty}

We have shared our codes at: 
\href{https://github.com/Musfiqshohan/Fedcm}{github.com/musfiqshohan/fedcm}

\begin{center}
\begin{tikzpicture}[scale=1, transform shape]
    %\tikzstyle{every node}=[font=\tiny]
    \tikzstyle{every node}=[]
    \node   [] (x) {${\X}$};
    \node [ right =0.8cm of x] (w2) {${Y}$};
    \node [ left =0.8cm of x] (z) {${Z}$};
    \node [ below =0.6cm of x] (s) {$W$};
    \draw[ thick] (z) to  (x); 
    \draw[ thick] (z) to (s); 
    \draw[ thick] (x) to  (w2); 
    \draw[ thick] (s) to  (x); 
    \path[bidirected] (z) edge[bend left=35] 
    node[pos=0.5,sloped,font=\small, align=center] {$U_1$}(w2);
    \path[bidirected] (s) edge[bend left=45]
    node[pos=0.5, below, sloped,font=\small, align=center] {$U_2$}(z);
\end{tikzpicture}
\end{center}

The unobserved variables $\{U_1, U_2\}$ have a support size of 3, while the observed variables $\{Z, W, \X, Y\}$ have a support size of 2. In this graph, the unobserved confounder $U_2$ makes the causal effect $P(Y \mid do(Z))$ \textit{non-identifiable}. The goal is to obtain a bound over all possible causal effects such that the true effect lies within it. The challenge is that data is decentralized and user preferred model that is allowed to train globally is $\cpc= \{\X\}$.

We employ \textbf{FeDCM} to design our solution.
Here we have two c-components $\{Z,W,Y\}$ and $\{\X\}$.
We train a conditional model $M$ to learn {$P(\X \mid Z,W)$} in a federated manner, as it belongs to a different c-component. Next, we train the deep causal model: $\mathbb{G}_{Z}, \mathbb{G}_{W}, \mathbb{G}_{Y}, \mathbb{G}_{\X}$ such that:
\begin{itemize}
    \item[i)] the observational distribution $P(z,w,i,y)$ of local data is matched,
    \item[ii)] the interventional distribution $P(\X \mid do(z))$ is matched using samples generated from the globally trained model $M$, and
    \item[iii)] our target query $P(y=1 \mid do(z=1))$ is maximized (or minimized).
\end{itemize}

We use the total variation distance (TVD) to assess how well the distributions are learned. In practice, TVD is never exactly zero due to approximation errors during model training. Therefore, we construct bounds for the causal effect whenever the corresponding joint distribution is matched with TVD $< 0.15$. Below, we provide the bounds for three clients:

\begin{table}[h!]
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
\textbf{TVD $< 0.15$} & \textbf{Ground Truth} $P(y=1 \mid do(z=1))$ & \textbf{Client 1} & \textbf{Client 2} & \textbf{Client 3} \\
\hline
& & Bound [min, max] & Bound [min, max] & Bound [min, max] \\
\hline
$|X|=2$ & 0.186 & [0.125, 0.2692] & [0.0674, 0.2998] & [0.0572, 0.2728] \\
\hline
\end{tabular}
\caption{Bounds on $P(y=1 \mid do(z=1))$ estimated by FeDCM for three clients.}
\end{table}

\section{Theoretical Analysis}
\label{theo-analysis}





% \begin{theorem}
%     Let $A$ be an algorithm that, given a partition $\{S_1, S_2, \dots, S_k\}$ of the nodes of a causal graph $G$, trains a deep causal generative model sequentially on $S_1, S_2, \dots, S_k$ in that order to fit $P(S_i)$, respectively.
% \begin{enumerate}
%     \item If $S_i$ are \textit{c-components} of the graph $G$, then $A$ is \textit{consistent}, i.e., it fits the joint distribution $P(v)$ correctly for any execution.

% \item Conversely, when $S_i$ are not \textit{c-components} of $G$, then there exists a graph $G$, a sequence $\{S_1, S_2, \dots, S_k\}$, and a training execution that is \textit{inconsistent}, i.e., it will not be able to fit the joint distribution $P(v)$.
% \end{enumerate}

% \end{theorem}


\begin{theorem}
    Let $A$ be any algorithm that, given a partition $\{S_1, S_2, \dots, S_k\}$ of the nodes of a causal graph $G$, trains a deep causal generative model sequentially on $S_1, S_2, \dots, S_k$ in that order to fit $P(S_i \mid S_1, S_2, \dots, S_{i-1})$, respectively.
\begin{enumerate}
    \item If $S_i$ are \textit{c-components} of the graph $G$, then $A$ is \textit{consistent}, i.e., it fits the joint distribution correctly for any execution.
%
\item Conversely, when $S_i$ are not \textit{c-components}, then there exists a graph $G$ for which algorithm $A$ may fail for any order $S_1, S_2, \dots, S_k$, i.e., there exists a training execution that is \textit{inconsistent} and algorithm $A$ will not fit the joint distribution.
\end{enumerate}
\end{theorem}


\begin{proof}
Let us define the bi-directed neighbors of $X$, $bnb(X)$, as the set of variables such that for every $V\in bnb(X)$ there exists a bi-directed edge $V\leftrightarrow X \in E$.

\textbf{(1.)} Consider a c-component $S_i\in C(G)$ with $S_i=\{X_1,\dots,X_j,X_{j+1},\dots,X_m\}$. Let variables $X_j$ and $X_{j+1}$ have a bi-directed edge between them, i.e., $X_j\leftrightarrow X_{j+1}$, and let the shared common confounder be $U$.
Since they share a common confounder, $X_j
\notindep X_{j+1}$. Thus we need to feed the same confounding (Gaussian) noise $U\sim N(0,I)$ as input to both $f_{X_j}$ and $f_{X_{j+1}}$ and train them together using the same loss function to match the joint distribution $P(x_j, x_{j+1})$.
Now, consider a bi-directed neighbor of $X_{j+1}$: $X_{j+2}\in bnb(X_{j+1})$. As they share another common confounder, $X_{j+1}\notindep X_{j+2}$. This implies that
we need to feed the same confounding noise as input to both $f_{X_{j+1}}$ and $f_{X_{j+2}}$ and train them together to match the joint distribution $P(x_{j+1}, x_{j+2})$. However, $X_j\leftrightarrow X_{j+1}$ demands that we train $f_{X_j}$ and $f_{X_{j+1}}$ together, and $X_{j+1}\leftrightarrow X_{j+2}$ demands that we train $f_{X_{j+1}}$ and $f_{X_{j+2}}$ together. Thus, to preserve both dependencies, we have to train all of $f_{X_j}, f_{X_{j+1}}$, and $f_{X_{j+2}}$ together. This set gradually expands to the whole c-component $S_i$, i.e., all variables that are connected by bi-directed edges. Thus, if we train the mechanisms for all variables in a c-component together, and do the same for all c-components $\{S_1, S_2,\dots, S_k\}$, we can match the joint $P(v)$.



\textbf{(2)}
To prove the converse statement, consider a causal graph that consists of a single c-component forming a cycle:

\begin{equation*}
    X_1\leftrightarrow X_2\leftrightarrow ... \leftrightarrow X_n \leftrightarrow X_1;
\end{equation*}

We proved in (1.) that if we have a partition $\{S_1\}=\{X_1,...,X_n\}$ where $S_1$ is a c-component then algorithm $A$ is consistent, i.e., it fits the joint distribution correctly.

Now, suppose, we can modularize the training process further based on any partition $\{S_1, S_2, ... , S_k\}$ of the c-component $\{X_1,...,X_n\}$ where $k>1$.

Note that each variable $X_i$ in this c-component cycle has at least two unobserved confounders as parents.
Without loss of generality, let us assume that $S_i=\{X_i\}$.
If we want to train $S_i$ modularly, separated from the rest, we have to cut $X_i$ off from at least one of its neighbors in the c-component cycle.
This implies that $f_{X_i}$ receives both $U_{X_{i-1}, X_i}$ and $U_{X_i, X_{i+1}}$ as inputs in the true SCM, but we are training a model to learn a proxy of $f_{X_i}$ while giving it signal from only one neighbor. So, there is no guarantee that $f_{X_i}$ will utilize both of the confounders.

No matter which subset $S_i$ we start with for modular training, the neural network of $X_i$ might ignore one of the unobserved confounders associated with $X_{i-1}\leftrightarrow X_i$ and $X_{i}\leftrightarrow X_{i+1}$ and use the other confounder in attempting to match its dependence with all other variables, i.e., $P(S_i \mid S_{\pi_{i-1}})$%
\footnote{Here, $\pi_{i-1}$ denotes all variables before $S_i$ in the topological order.}.
As we are training $S_i$ modularly, cut off from one of its neighbors, it will try to get all signals from the confounder associated with the remaining neighbor. Modularizing further than the c-component level will prevent $X_i$ from learning the proper dependence with both of its neighbors $X_{i-1}$ and $X_{i+1}$. Thus, for the considered cyclic c-component graph, whatever modularization is performed and whatever training order is adopted, the joint distribution will not be fit.

\end{proof}

% \red{give example of the cyclic graph. X<->Y<->Z<->T<-X; No matter which variable we start training from P(x) or maybe P(T) etc. The NN of that variable may ignore one of the unobs. conf. P(x,y,z,t)
% P(x)p(y|x)
% whatever it is .
% Training of the first guy may prevent proper connections with both its neightbors.}


%  \red{for cycle c-component we can not factorize anymore. 
%  Claim: whats the algorithm that can modularize for any given graph st the algorith  is sound. Write carefully. 
% %
%  For any graph, modularization algorithm A is sound.
%  %
%  Converesely, there are graphs for which no algorithm can modularize further.
%  }







\subsection{Additional discussion}  Suppose, mechanisms in $S_i$ are not trained together.
First let us consider the case when any neighboring  pair $X_j, X_{j+1}\in S_i$ are trained independently, i.e., $f_{X_j}$ is trained independently from $f_{X_{j+1}}$. When they are trained together, the generative mechanisms are as follows:
\begin{equation}
\label{eq:first-eq}
    \begin{split}
    x_j &= f_j(pa(x_j), u, \mathcal{U}_{X_j}\setminus\{u\})\\
    x_{j+1} &= f_{j+1}(pa(x_{j+1}), u, \mathcal{U}_{X_{j+1}}\setminus\{u\})
    \end{split}
\end{equation}
Here, $U$ is the shared confounder, $\mathcal{U}_{X_j}\setminus\{u\}$ are the remaining confounders that affect $X_j$, and $\mathcal{U}_{X_{j+1}}\setminus\{u\}$ are the remaining confounders that affect $X_{j+1}$.

When $f_{X_j}$ and $f_{X_{j+1}}$ are trained independently, the generative mechanisms become as follows:
\begin{equation}
    \begin{split}
    x_j &= f_j(pa(x_j), u', \mathcal{U}_{X_j}\setminus\{u'\})\\
    x_{j+1} &= f_{j+1}(pa(x_{j+1}), u, \mathcal{U}_{X_{j+1}}\setminus\{u\})
    \end{split}
\end{equation}
Here, both $u'$ and $u$ are sampled from the same distribution $P(u)$. As $u'$ is fed as input during the training of $f_{X_j}$, which is separate from that of $f_{X_{j+1}}$, $u$ and $u'$ vary independently. This breaks the dependency between $X_j$ and $X_{j+1}$, as they are no longer controlled by the same noise values. Thus $P_{\theta}(x_j, x_{j+1}) \neq P(x_j, x_{j+1})$ and eventually $P_{\theta}(v) \neq P(v)$.


\textbf{(2.2)}
Now consider that mechanisms of
$X_1,..., X_j, X_{j+1},...,X_{m}$ are trained sequentially one by one in this order.
$X_1$ is trained as mentioned before (Equation~\ref{eq:first-eq}):
\begin{equation}
    \begin{split}
    x_1 &= f_1(pa(x_1), u, \mathcal{U}_{X_1}\setminus\{u\})
    \end{split}
\end{equation}

After training $f_1$, we freeze its model weights. As both $f_1$ and $f_2$ share the common confounder $U$, we sample the same  $u\sim P(u)$ and feed it to both $f_1$ and $f_2$. As $f_1$ is frozen, it will use the confounding noise for inference while $f_2$ will use it for training. We can match fake $X_1,X_2$ with real $X_1,X_2$ samples to update model weights of $f_2$.
\begin{equation}
    \begin{split}
    x_2 &= f_2(pa(x_2), u, \mathcal{U}_{X_2}\setminus\{u\})
    \end{split}
\end{equation}

% \begin{equation}
%     \begin{split}
%     x_j &= f_j(pa(x_j), u, \mathcal{U}_{X_j}\setminus\{u\}\\
%     x_{j+1} &= f_{j+1}(pa(x_{j+1}), u, \mathcal{U}_{X_{j+1}}\setminus\{u\}
%     \end{split}
% \end{equation}

Suppose both $\{X_1,X_3\} \subseteq bnb(X_2)$ and the shared confounder between $X_1$ and $X_2$ is $U$ while the confounder between $X_2,X_3$ is $U'$.
Now, as we have already trained $f_1$ but not yet $f_3$, while training $f_2$, even if we feed both $U$ and $U'$ as input to $f_2$, there is no guarantee that $f_2$ would use $U'$ at all. It is possible that $f_2$ utilizes only $U$ to match the joint distribution $P_{\theta}(x_1,x_2)= P(x_1,x_2)$ but not $P_{\theta}(x_1,x_2, x_3)= P(x_1,x_2, x_3)$. If $f_3$ were already trained, we could freeze both $f_1$ and $f_3$, feed them $U$ and $U'$ for inference, and match $P_{\theta}(x_1,x_2,x_3)$ with $P(x_1,x_2,x_3)$ to update the model weights of $f_2$. However, to train $f_3$, we would need $f_2$ to be pre-trained, which creates a cyclic dependency. Thus, eventually, sequential training fails to match the joint distribution $P(v)$.


% \end{proof}





% \begin{theorem}[impossibility theorem]
% Suppose that the positivity and faithfulness assumptions hold. Given an acyclic directed mixed graph, $G$ (ADMG) consistent with a semi-Markovian SCM, $\M$, the variables in a c-component $S$ form an maximal modularity set for the deep causal generative model $\G$ defined over $\M$. Any further modularization fails to match the joint $P(v)$.
% \end{theorem}


% \begin{proof}[proof sketch:] 
% Each pair of variables $X_i,X_j$ in the c-component share an unobserved common confounder. Thus, their mechanisms need to be trained together with the same confounding noise. As these pairs are not disconnected in the graph and at least two pairs have a shared variable, the whole group combining all these pairs, that is, the c-component has to be trained together. We provide the proof in Appendix~\ref{theo-analysis}.
% \end{proof}



% \section{More general cases}


% \subsection{Setup 2: Suppose, clients do not have any specific restriction on sharing mechanisms}. Their goal is to learn the SCM. In that scenario, 
% due to the dimensionality of high-dimensional variables, local client data is not enough to learn their mechanism properly while sufficient for low-dimensional variables. For example, digits can be predicted from different images with small number of images. However, such local data is not enough to learn the image generation mechanism. Thus, FL is required to learn their mechanisms. We can utlize \fedcm to understand what minimal mechanism set we have to train. If we factorize the observational distribution as before:
% \red{Equation}
% This expression implies that we have to learn all mechainsms in its c-component with federeated learning.


% \subsection{Setup 3: Now, we consider the extreme case of data scarcity causing positivity violation for the high-dimensional variables.} In such case, client $i$ never seen specific images and thus the generative mechanism $f$ dont know how to generate such images. At the same time, Ch(image) also never seen such images in their local training data. Thus, learning only the mechanism of the high-dimensional variable is not enough, we need to learn the mechanism of children variable as well. Thus, our mechanism would be c-component of image and c-component of its children.


% Although $\X$ is high-dimensional, local data is not sufficient to learn its true generative mechanism, all mechanisms except $\X$ and its dependent ones are computationally less expensive. Each client's local data is sufficient to learn them. 

% Suppose for client $i$ has few samples for a specific support $\X=x$ for variable $X$ (ex: image of digit 9). Client $i$ needs to learn how to train mechanism $f_{\X}(.)$ to generate $X=x$ (i.e. all possible appearance of 9). Also, we need to learn how the child ($S=Ch(\X)$) mechanisms of $\X$ and their neighbors ($N=Nbr(S))$: $f_{S\cup N} (x); $ should output for $X=x$ (classify all images of 9). 



% \textbf{Computation complexity} of a client depends on parameters such as: $\alpha:$ clients that perform computation on each round, $E:$ number of epochs on local data at each round, $B:$ local minibatch size and $|{f_{V_j}\}_{V_j \in S_{\Vc}}}|:$ number of       models contained in the heterogeneous c-component $S_{\Vc}$.

% \textbf{Defining the Neighborhood Mechanisms for FL}
% We need to learn the i) mechanism for generating the new $X=x$ given its parent ($Pa(X)$) values  and ii) the mechanisms that determine what values its children ($Ch(X)$ should take for the previously unseen $X=x$. For example, suppose the clients maintain a causal graph: $color (C) \rightarrow Image (X) \rightarrow digits (d)$. 
% Both clients has images of colors: $R,G,B$ and digits: $0-9$ but images in their local data are collected from different environments.
% Client 1 has images of $0-4$ w/ all red color while client 2 has images of $5-9$ with green color. 





% % Suppose the graph can be generalized in the format: $W \rightarrow {X} \rightarrow {Z} \rightarrow {Y}; {X} \leftrightarrow {Y}$ as shown in Fig.~\ref{fig:gen-form}.





% % ID-GEN should be able to be utilized easily
% % to obtain c-component based modularity in modular-DCM.
% % Previously we discussed about applications such as adaptation to distribution shift, dcm for time-series, transportability etc.

% % Also \cite{jung2024estimating} show that any
% % g-identifiable causal effect can be expressed as a function of generalized multi outcome sequential back-door adjustments that are amenable to estimation.


% % To obtain a DCM we need to train $|V|$ models for $V$ variables and match all of the following terms.
% % \begin{equation}
% %     P(V) = \prod_{i\in \{n\}} P(S_i| do(pa(S_i))
% % \end{equation}



\section{Algorithms}


\begin{algorithm}[H]
\caption{Fed-DCM Algorithm}
\begin{algorithmic}[1]
\STATE \textbf{Input:} Dataset $\mathcal{D}$, Causal graph $\mathcal{G}$, Variables $\mathbf{V} = \{V_1, V_2, \dots, V_n\}, n = |\mathbf{V}|$

\STATE \textbf{Client initialization:}
\FOR{each $ V \in \mathbf{V}$}
    \STATE Initialize weights of $f_{V}(Pa(V), U_{V} )$ as $w[V]$
\ENDFOR
\STATE $[\mathbf{S}_i, \text{Pa}(\mathbf{S}_i)] \leftarrow \text{c\_component\_partition}(\mathcal{G})$
\STATE $S_{\Vc} \gets$ the c-component $S \in \{S_i\}_i$ s.t. $\cpc \subseteq S$ (i.e., $C_g(G)$)

\STATE \textbf{Server executes: }
\STATE Initialize model weights $w_0[V];$ for all $V\in S_{\Vc}$.
\FOR{each round $t = 1, 2, \dots$}
    % \STATE $m \gets \max(C \cdot K, 1)$
    \STATE $C_t \gets$ (random set of $\max(\alpha C, 1)$ clients)
    \FOR{each client $k \in C_t$ \textbf{in parallel}}
        \STATE $w_{t+1}^{k}$ $\gets \textsc{ClientUpdate}(k, w_t)$
        % \FOR{each variable $V\in S_{\Vc}$ }
        % \STATE $w_{t+1}^{k}[V] \gets  keep[V] $
        % \ENDFOR
    \ENDFOR
    \FOR{each variable $V\in S_{\Vc}$ }
    \STATE $w_{t+1}[V] \gets \sum_{k=1}^K \frac{n_k}{n} w_{t+1}^{k}[V]$
    \ENDFOR
\ENDFOR
\vspace{1em}


\STATE \textbf{LocalTraining}($w, \mathcal{B}, S, Pa(S)$): \textit{(Models in a cc)}
\FOR{each local epoch $e$ from $1$ to $E$}
    \FOR{each batch $b \in \mathcal{B}$}
    \STATE Sample $pa(\mathbf{S}) \sim \text{Uniform}(\text{support}(\text{Pa}(\mathbf{S})))$
    \STATE $D^R[\mathbf{S}] = $ getRealIntvData($b, \mathcal{G}, \mathbf{S}, pa(\mathbf{S})$)
    \STATE $D^F[\mathbf{S}] = $ getFakeIntvData($f_{V_i\in \mathbf{V}}, \mathbf{S}, pa(\mathbf{S})$)
    \STATE $\ell$ = dist($D^F, D^R$)
    \FOR{each $V\in S$}
        \STATE $w[V] \gets w[V] - \eta \nabla \ell$
    \ENDFOR
    \ENDFOR
\ENDFOR
\STATE \textbf{return} $w$

\vspace{1em}
\STATE \textbf{ClientUpdate($c, w$):} \textit{(Run on client $c$)}
\STATE $\mathcal{B} \gets$ (split $D^c$ into batches of size $B$)

\FOR{each $S \in \{S_i\}_i\setminus \{S_{\Vc}\}$}
\STATE $w = \textbf{LocalTraining($w, \mathcal{B}, S, Pa(S)$)}$ \textbf{in parallel}
\STATE Save $\{w[V]\}_{V\in S}$ locally.
\ENDFOR
\STATE $w = \textbf{LocalTraining($w, \mathcal{B}, S_{\Vc}, Pa(S_{\Vc})$)}$ \textbf{in parallel}
\STATE \textbf{return} $\{w[V]\}_{V\in S_{\Vc}}$ to server
% \STATE \textbf{return } SCM mechanisms $f_{V}$
\end{algorithmic}
\end{algorithm}






    \begin{algorithm}[H]
\caption{getRealIntvData($\mathcal{D}, \mathcal{G}, \mathbf{S}, pa(\mathbf{S})$)}
\label{alg:getRealIntvData}
\begin{algorithmic}[1]
\STATE \textbf{Input:} Dataset $\mathcal{D}$, Causal graph $\mathcal{G}$, C-component $\mathbf{S}$, blanket $Pa(\mathbf{S})$.
\STATE $An= V_{\pi^{j-1}} \cap (S_i \cup pa(S_i))$; $\pi_{\mathcal{G}}$ be the ancestral order.
\FOR{each $V_j \in \mathbf{S}$}
\STATE Train $M_j(An)$ on $\mathcal{D}$ such that $M_j(An) \sim P(v_j|An)$
\ENDFOR
\STATE Fix $ pa(\mathbf{S}) $ in $M_{j: V_j \in \mathbf{S}}$ and ancestral sample to obtain $ D^R[\mathbf{S}] \sim P(\mathbf{S} \mid \text{do}(\text{Pa}(\mathbf{S}))) $
\STATE \textbf{Return } $D^R[\mathbf{S}]$
\end{algorithmic}
\end{algorithm}



\begin{algorithm}[t!]
\caption{getFakeIntvData($\mathbb{G}_{V_i\in \mathbf{V}}, \mathbf{S}, pa(\mathbf{S})$)}
\label{alg:getFakeIntvData}
\begin{algorithmic}[1]
\STATE \textbf{Input:}  DCM $\mathbb{G}_{V_i\in \mathbf{V}}$, C-component $\mathbf{S}$, blanket $Pa(\mathbf{S})$.
\STATE Fix $ pa(\mathbf{S}) $ in $\mathbb{G}_{V_i \in \mathbf{V}}$ and ancestral sample to obtain $ D^F[\mathbf{S}] \sim P(\mathbf{S} \mid \text{do}(\text{Pa}(\mathbf{S}))) $
\STATE \textbf{Return } $D^F[\mathbf{S}]$
\end{algorithmic}
\end{algorithm}



% \begin{algorithm}[t!]
% \caption{Client Executes}
% \begin{algorithmic}[1]
% \STATE \textbf{Input:} Dataset $\mathcal{D}$, Causal graph $\mathcal{G}$, Variables $\mathbf{V} = \{V_1, V_2, \dots, V_n\}, n = |\mathbf{V}|,$ High-dim variable $\X$.

% % \STATE \textbf{Client initialization:}
% \FOR{each $ V \in \mathbf{V}$}
%     \STATE Initialize weights of $f_{V}(Pa(V), U_{V} )$ as $w[V]$
% \ENDFOR
% \STATE $[\mathbf{S}_i, \text{Pa}(\mathbf{S}_i)] \leftarrow \text{c\_component\_partition}(\mathcal{G})$

% % \STATE $S_{\Vc} =$ Find c-component $S \in \{{S}\}_i $ s.t. $\Vc \in S$


% \vspace{1em}

% \vspace{1em}
% \STATE \textbf{ClientUpdate($c, w$):} \textit{(Run on client $c$)}
% \STATE $\mathcal{B} \gets$ (split $D^c$ into batches of size $B$)

% \FOR{each $ {S} \in \{{S_i}\}_i\setminus \{S_{\Vc}\} $ }
% \STATE $w = \textbf{LocalTraining($w, \mathcal{B}, S_{i}, Pa(S_{i})$)}$ \textbf{in parallel}
% \STATE Save $\{w[V]\}_{V\in S}$ locally.
% \ENDFOR
% \STATE $w = \textbf{LocalTraining($w, \mathcal{B}, S_{\Vc}, Pa(S_{\Vc})$)}$ \textbf{in parallel}
% \STATE \textbf{return} $\{w[V]\}_{V\in S_{\Vc}}$ to server
% % \STATE \textbf{return } SCM mechanisms $f_{V}$
% \end{algorithmic}
% \end{algorithm}



%     \begin{algorithm}[t!]
% \caption{LocalTraining($w, \mathcal{B}, S, Pa(S)$))}
% \begin{algorithmic}[1]

% % \STATE \textbf{}: \textit{(Models in a cc)}


% \STATE \textbf{Input:} Dataset $\mathcal{D}$, Causal graph $\mathcal{G}$, C-component $\mathbf{S}$, blanket $Pa(\mathbf{S})$.


% \textbf{getRealIntvData}($\mathcal{D}, \mathcal{G}, \mathbf{S}, pa(\mathbf{S})$):


% \STATE $An= V_{\pi^{j-1}} \cap (S_i \cup pa(S_i))$; $\pi_{\mathcal{G}}$ be the ancestral order.
% \FOR{each $V_j \in \mathbf{S}$}
% \STATE Train $M_j(An)$ on $\mathcal{D}$ such that $M_j(An) \sim P(v_j|An)$
% \ENDFOR
% \STATE Fix $ pa(\mathbf{S}) $ in $M_{j: V_j \in \mathbf{S}}$ and ancestral sample to obtain $ D^R[\mathbf{S}] \sim P(\mathbf{S} \mid \text{do}(\text{Pa}(\mathbf{S}))) $
% \STATE \textbf{Return } $D^R[\mathbf{S}]$



% \vspace{1mm}
% \FOR{each local epoch $i$ from $1$ to $E$}
%     \FOR{each batch $b \in \mathcal{B}$}
% \STATE Sample   $pa(\mathbf{S}_i)  \sim \text{Uniform}(\text{support}(\text{Pa}(\mathbf{S}_i)))$
%     \STATE $D^R[\mathbf{S}] = $ getRealIntvData($b, \mathcal{G}, \mathbf{S_i}, pa(\mathbf{S_i})$)
%     \STATE $D^F[\mathbf{S}] = $ getFakeIntvData($f_{V_i\in \mathbf{V}}, \mathbf{S_i}, pa(\mathbf{S_i})$))
%     \STATE $\ell$ = dist($D^F, D^R$)
%     \FOR{each $V\in S$}
%         \STATE $w[V] \gets w[V] - \eta \nabla \ell$
%     \ENDFOR
%     \ENDFOR
% \ENDFOR

% \STATE \textbf{return} $w$
% \label{alg:getRealIntvData}
% \end{algorithmic}
% \end{algorithm}



% 
% \begin{algorithm}[t!]
% \caption{Server executes($\mathbb{G}_{V_i\in \mathbf{V}}, \mathbf{S}, pa(\mathbf{S})$)}
% \begin{algorithmic}[1]
% \STATE Initialize model weights $w_0[V];$ for all $V\in S_{\Vc}$.
% \FOR{each round $t = 1, 2, \dots$}
%     % \STATE $m \gets \max(C \cdot K, 1)$
%     \STATE $C_t \gets$ (random set of $max(\alpha C, 1)$ clients)
%     \FOR{each client $k \in C_t$ \textbf{in parallel}}
%         \STATE $w_{t+1}^{k}$ $\gets \textsc{ClientUpdate}(k, w_t)$
%         % \FOR{each variable $V\in S_{\Vc}$ }
%         % \STATE $w_{t+1}^{k}[V] \gets  keep[V] $
%         % \ENDFOR
%     \ENDFOR
%     \FOR{each variable $V\in S_{\Vc}$ }
%     \STATE $w_{t+1}[V] \gets \sum_{k=1}^K \frac{n_k}{n} w_{t+1}^{k}[V]$
%     \ENDFOR
% \ENDFOR
% \label{alg:getFakeIntvData}
% \end{algorithmic}
% \end{algorithm}
% 

% \begin{algorithm}[t!]
% \caption{getFakeIntvData($\mathbb{G}_{V_i\in \mathbf{V}}, \mathbf{S}, pa(\mathbf{S})$)}
% \begin{algorithmic}[1]
% \STATE \textbf{Input:}  DCM $\mathbb{G}_{V_i\in \mathbf{V}}$, C-component $\mathbf{S}$, blanket $Pa(\mathbf{S})$.
% \STATE Fix $ pa(\mathbf{S}) $ in $\mathbb{G}_{V_i \in \mathbf{V}}$ and ancestral sample to obtain $ D^F[\mathbf{S}] \sim P(\mathbf{S} \mid \text{do}(\text{Pa}(\mathbf{S}))) $
% \STATE \textbf{Return } $D^F[\mathbf{S}]$
% \label{alg:getFakeIntvData}
% \end{algorithmic}
% \end{algorithm}


% \onecolumn
% \section{Issues need fix}
% \begin{itemize}
%     \item For overlapping variables, don't we need their parents?
%     \item Why is this restricted to gans only? Does that mean this is minimum component that is dependent on a gan architecture?
%     \item Heterogenous and personalized.
%     \item cite more recent DCM/NCM works. Replace old works with new ones.
%     \item Rephrase SCM def and intro.
%     \item FL def and eq.

%     \item \textbf{Trivial Solution 1:} joint $P(v', w)$\red{*}; confounding noise $U$ \red{*}
%     \item Equation~\ref{eq:triv-sol2} $U$.
%     \item Section 3.1 taken from Modular-DCM paper.
% \end{itemize}


\section*{Mathematical Notation}
The table below lists and defines the mathematical symbols used throughout this paper:

\begin{longtable}{|c|l|}
\hline
\textbf{Symbol} & \textbf{Description} \\ \hline
\endfirsthead
\hline
\textbf{Symbol} & \textbf{Description} \\ \hline
\endhead
\hline
\endfoot

% Add rows for each symbol
$\X$ & The given/detected variable heterogeneous across clients. \\ \hline
$\mathcal{C}$ & Set of all clients \\ \hline
$C(G)$ & Set of all c-components \\ \hline
$C_l(G)$ & Set of c-components trained using only local data \\ \hline
$C_g(G)$ & Set of c-components trained using federated (global) training \\ \hline
$S_i$ & i-th c-component in the graph after factorization \\ \hline
$\mathbb{G}$ & DCM generators \\ \hline
$\cpc$ &  Client proposed set   \\ \hline
$\F$ & Set of mechanisms that we select for federated learning. \\ \hline
$\glob$ & Global models \\ \hline
$\M$ & SCM \\ \hline
$f^*$ & True mechanism of SCM \\ \hline
$f$ & Mechanism of DCM \\ \hline
$\mathcal{V}$ & Set of all causal variables in a structural causal model \\ \hline
$\nabla f(x)$ & Gradient of $f(x)$ \\ \hline
$\partial$ & Partial derivative \\ \hline

% $\sum_{i=1}^n x_i$ & Summation over $n$ terms \\ \hline
% $\arg\max_x f(x)$ & Argument that maximizes $f(x)$ \\ \hline
% $\lambda$ & Regularization parameter \\ \hline
% $\alpha, \beta$ & Model hyperparameters \\ \hline

\end{longtable}







\end{document}
