%%%%%%%% ICML 2025 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%

\documentclass{article}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2025} with \usepackage[nohyperref]{icml2025} above.
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage[noend]{algorithmic}

% Use the following line for the initial blind version submitted for review:
\usepackage{icml2025}

% If accepted, instead use the following line for the camera-ready submission:
% \usepackage[accepted]{icml2025}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{xspace} % provides \xspace, which the notation macros below rely on

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}



\newcommand{\indep}{\perp \!\!\! \perp}
\newcommand{\blue}{\textcolor{blue}}
\newcommand{\red}{\textcolor{red}}
\newcommand{\orng}{\textcolor{orange}}
\newcommand{\mk}[1]{{\color{red} MK: \{#1\}}}
\newcommand{\mr}{\textcolor{orange}}
\newcommand{\h}{\mathcal{H}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\Oc}{\mathcal{O}}
\newcommand{\Xc}{\mathcal{X}}
\newcommand{\Xp}{{\hat{\mathbf{X}}}}
\newcommand{\xp}{{\hat{\mathbf{x}}}}
\newcommand{\Gp}{{\hat{{G}}}}
\newcommand{\Pp}{\orng{\hat{P}}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\vb}{\mathbf{v}}
\newcommand{\gray}{\textcolor{gray}}
\newcommand{\lgray}{\textcolor{lightgray}}
\newcommand{\mbf}{\mathbf}
% \newcommand{\rm}{\mathrm}
\newcommand{\Do}{\text{do}}
\newcommand{\data}{\mathcal{D}}
\newcommand{\idgen}{\text{ID-GEN}\xspace}
\newcommand{\idtrain}{\text{ID-Train}\xspace}
\newcommand{\idcdag}{\text{IDC-GEN}\xspace}
\newcommand{\condgm}{\texttt{ConditionalGMs(.)}\xspace}
\newcommand{\idmerge}{\texttt{MergeNetwork(.)}\xspace}
\newcommand{\stepsev}{\texttt{Update(.)}\xspace}
\newcommand{\idpar}{\text{ID}$(\y,\x, P, G)$\xspace}
\newcommand{\idgenpar}{\idgen$(\mathbf{Y}, \mathbf{X},  G,  \data, {\Xp}, \Gp)$\xspace}
\newcommand{\ydox}{P_{\mathbf{x}}(\mathbf{y})\xspace}



% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
\icmltitlerunning{Federated DCM}

\begin{document}

\twocolumn[
\icmltitle{Federated DCM}

% It is OKAY to include author information, even for blind
% submissions: the style file will automatically remove it for you
% unless you've provided the [accepted] option to the icml2025
% package.

% List of affiliations: The first argument should be a (short)
% identifier you will use later to specify author affiliations
% Academic affiliations should list Department, University, City, Region, Country
% Industry affiliations should list Company, City, Region, Country

% You can specify symbols, otherwise they are numbered in order.
% Ideally, you should not use this facility. Affiliations will be numbered
% in order of appearance and this is the preferred way.
\icmlsetsymbol{equal}{*}

\begin{icmlauthorlist}
\icmlauthor{Firstname1 Lastname1}{equal,yyy}
\icmlauthor{Firstname2 Lastname2}{equal,yyy,comp}
\icmlauthor{Firstname3 Lastname3}{comp}
\icmlauthor{Firstname4 Lastname4}{sch}
\icmlauthor{Firstname5 Lastname5}{yyy}
\icmlauthor{Firstname6 Lastname6}{sch,yyy,comp}
\icmlauthor{Firstname7 Lastname7}{comp}
%\icmlauthor{}{sch}
\icmlauthor{Firstname8 Lastname8}{sch}
\icmlauthor{Firstname8 Lastname8}{yyy,comp}
%\icmlauthor{}{sch}
%\icmlauthor{}{sch}
\end{icmlauthorlist}

\icmlaffiliation{yyy}{Department of XXX, University of YYY, Location, Country}
\icmlaffiliation{comp}{Company Name, Location, Country}
\icmlaffiliation{sch}{School of ZZZ, Institute of WWW, Location, Country}

\icmlcorrespondingauthor{Firstname1 Lastname1}{first1.last1@xxx.edu}
\icmlcorrespondingauthor{Firstname2 Lastname2}{first2.last2@www.uk}

% You may provide any keywords that you
% find helpful for describing your paper; these are used to populate
% the "keywords" metadata in the PDF but will not be shown in the document
\icmlkeywords{Machine Learning, ICML}

\vskip 0.3in
]

% this must go after the closing bracket ] following \twocolumn[ ...

% This command actually creates the footnote in the first column
% listing the affiliations and the copyright notice.
% The command takes one argument, which is text to display at the start of the footnote.
% The \icmlEqualContribution command is standard text for equal contribution.
% Remove it (just {}) if you do not need this facility.

%\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
\printAffiliationsAndNotice{\icmlEqualContribution} % otherwise use the standard text.

\begin{abstract}
This document provides a basic paper template and submission guidelines.
Abstracts must be a single paragraph, ideally between 4--6 sentences long.
Gross violations will trigger corrections at the camera-ready phase.
\end{abstract}



\section{Introduction}



\blue{There is pain:}

% What happens if we dont have causality 

% What happens if we dont have deep causal generative model.

% What happens if we dont have efficient training of deep-scm.

\blue{Why do we need a causal model in federated learning?}
\begin{itemize}

\item What does it mean to have Trivial FL for causal models? Discuss its computation complexity and challenges for that.

    \item 
In federated learning, we might have generative models in local clients, for example an image generator or transformer which performs next token prediction (ex: cell phones).
Each client might be interested in different classified outputs.
Even though the clients share some common mechanisms that they learn through the federated setup, each of them generally has its own complex causal system that contains the learned mechanism. There exists no work that discusses the connection or dynamics between the other variables in the system and the federated mechanism and that performs further prediction. For example, in Figure~\ref{?}, the learned image is an intermediate variable. The input and the output of the variables are confounded, and we want the causal effect of the image input on the image classifier output.

\item Local PLM finetuning involves other variables. And finetuning is expensive for edge devices. fine-tuning PLMs
through FL requires the clients and server to frequently exchange model parameters or gradients, usually on a scale of
millions or even billions of parameters. Method which can deal with these: adapter tuning , prefix tuning , LoRA and BitFit

\item Bias Variance tradeoff with causal approach feature distribution shift and data scarcity. Causal graph will also allow us feature reduction.
\end{itemize}


\blue{Why do we need different causal models at different clients}
\begin{itemize}
    \item \cite{9766407} says ``Due to the differences of clients,
a single global model may not perform well on all clients, so the
personalized federated learning method, which trains a personalized model for each client that better suits its individual needs,
becomes a research hotspot. \dots{} sharing the
model architecture results in the leakage of the model design,
which may be the intellectual property rights of participating
organizations.'' They also generate samples, which is not secure. So, we only adopt their motivation of keeping the model architecture private.

\item Performance on across various tasks in different clients is required.

\item Personalized federated learning (PFL).
\end{itemize}

\blue{Why full dcm is not feasible and modular-dcm required?}
\begin{itemize}
    \item We need to learn a causal model. Existing approach would suggest sending gradients of all models/mechanisms. But the clients are generally edge devices with limited resources. Thus sending gradients of all models is not feasible due to communication overhead.
However, maybe we can just send gradients for the hardest mechanisms and train rest of the mechanisms locally. Local data is enough to learn those mechanisms. Generator-discriminator relationship might be an example of such cases.

\item ``This scenario is prevalent in real-world situations''. Especially for LLMs, these challenges include (1) limited access to the model parameters due
to the high encapsulation, (2) computational and memory
costs for local clients, and (3) communication overhead in the
FL system. 

\item \cite{wang2024why} trains a neural network layer by layer modularly in a federated learning setup.

\item Edge devices have limited resources. We do not want all models to be transferred.

\end{itemize} 

\blue{When federated learning requires two model training and when one model is sufficient.} 
Otherwise, everyone would just suggest to train a single model.

\begin{itemize}

\item \cite{jothimurugesan2023federated} present the first FL solutions that employ multiple models to address FL under distributed concept
drift. Our solutions aim to create one model for each new
concept so that all clients under the same concept can train
that model collaboratively, similar to what is done for personalized or clustered FL.

    \item \cite{wang2024flora} FLORA, a federated fine-tuning algorithm based on LoRA that can perform noise-free aggregation of local LoRA modules.

\item FlexLORA~\cite{bai2024federated} scalable method to fully leverage local client resources for enhancing the global model’s generalization ability. By synthesizing a full-size LoRA weight from individual client contributions and employing Singular Value Decomposition (SVD) for weight redistribution, FlexLoRA fully leverages heterogeneous client resources.

\item \cite{yang2024dual} learn personalized federated foundation models on clients while effectively handling
test-time distribution shifts simultaneously. By co-working with a foundation model, a global adapter
and a local adapter jointly tackle the test-time distribution shifts and client-specific
personalization.

\item \cite{mclaughlin2024personalized}  frame representation learning as a
generative modeling task, where representations are trained with a classifier based
on the global feature distribution. Their algorithm efficiently generates personalized models by adapting global generative classifiers
to their local feature distributions. 

\item Knowledge distillation. Teacher student federated distillation. Different nodes of the causal graph are different representation of the data/different task.
    
\end{itemize}


\red{even though there might be bias in one client, after many iterations it will converge to a point where there is no bias. So, bias/spurious correlation propagation is not a good project proposal.}

Also, we utilize partial identification and remove the weakest edges based on the constraints.


\red{Do we need FL for DCM training or need DCM for federated training? Or why do we need DCM in federated learning.}

\blue{Is there any other method that can solve the same problem, or that we can build upon?}
\begin{itemize}
    \item \cite{tang2024fusefl} provide a causal view to understand the gap between multi-round FL and OFL, showing that
augmenting intermediate features from other clients helps improve OFL. They are the first to use causality to analyze the data heterogeneity of OFL. They have code as well. We might build upon their work.

\item
\cite{makhija2024a} propose personalized federated learning utilizing Bayesian principles for improved robustness and reliability, particularly in contexts where data is scarce. We might utilize its approach.
\end{itemize}

\blue{What are the proposed solutions?}

\begin{itemize}
    \item If we follow a causal graph in each client and train a causal model on the data, we can perform interventions and send gradients of the interventional distributions. This would hide the true causal relationships among features from outsiders. For example, even though each client knows that some birds are more likely to live in specific habitats, we can generate images for different habitats while hiding their locations.
\end{itemize}



\section{Related works}

\cite{hu2024fissionvae} uses federated setup to generate images with VAE and GANs.

\cite{song2021federated} uses CycleGAN in a federated setup for an image translation task.


\cite{ng2022towards} proposes federated Bayesian network structure learning with continuous optimization.



\section{Problem Definition}
Given a learned SCM and its ADMG $G$, what is the most efficient way to transfer to a new SCM, where the new SCM might have (i) a change in its conditional distributions, (ii) a change in the edges, or (iii) a change in the number of variables?

% \begin{figure}
%     \centering
%     \includegraphics[width=1\linewidth]{Figures/Mod2/fed.pdf}
%     \caption{Enter Caption}
%     \label{fig:enter-label}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=1.1\linewidth]{Figures/Mod2/root.pdf}
%     \caption{Enter Caption}
%     \label{fig:enter-label}
% \end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=1\linewidth]{Figures/Mod2/architectures.pdf}
%     \caption{Applications of modularity}
%     \label{fig:enter-label}
% \end{figure}



\section{Methodology}

\subsection{Training $P(C|do(Pa(C)))$}

We can apply step 7 of the ID algorithm for the case in modular-DCM when rule-2 does not apply. Suppose the graph can be generalized in this format: $\mathbf{X} \leftrightarrow \mathbf{Y}; \mathbf{X} \rightarrow \mathbf{Z} \rightarrow \mathbf{Y}$.
Step 1: Sample Gaussian noise $N$. Step 2: Generate $\mathbf{X}$ with $M_{X}(N)$.
Step 3: Feed generated $\mathbf{X}$ and corresponding $\mathbf{Z}$ to train a model $M(\mathbf{X},Z)$. 
Step 4: Feed the generated $\mathbf{X}$ and random $\mathbf{Z}$ to generate $\mathbf{Y}$ as inference. Step 5: Train $M_{\mathbf{Y}}(N,\mathbf{Z})$ by matching with $\mathbf{Y}$. This will ensure that $M_{\mathbf{Y}}$ is being trained on $P(\mathbf{X,Y|do(Z)})$.


ID-GEN should be able to be utilized easily
to obtain c-component based modularity in modular-DCM.
Previously we discussed applications such as adaptation to distribution shift, DCMs for time series, transportability, etc.

Also \cite{jung2024estimating} show that any
g-identifiable causal effect can be expressed as a function of generalized multi outcome sequential back-door adjustments that are amenable to estimation.


To obtain a DCM we need to train $|V|$ models for $V$ variables and match all of the following terms.
\begin{equation}
    P(V) = \prod_{i\in \{n\}} P(S_i| do(pa(S_i)))
\end{equation}


Suppose, we need to update any mechanism $M_{V_j} \in S_i$.
 Here $P(S_i|do(pa(S_i)))$ is identifiable from observational data since the intervention set $\mathbf{X}= Pa(S_i)$ is located outside the c-component $S_i$. 

We can identify and estimate the c-factor as following:
\begin{equation}
\label{eq:step6}
    P(S_i| do(pa(S_i)))=\prod_{V_j\in S_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))
\end{equation}
If we sample interventional data $D^2$ for the variables in $S_i$ according to the above distribution, we can utilize it as training data to train the mechanisms $M_{V_j}; V_j\in S_i$. This implies that we do not need interventional data; rather, observational data is sufficient to train our required models. 

It appears that we need to train $|S_i|$ many models to sample from the distribution in equation~\ref{eq:step6}. However, we can reduce the training cost. Let $\mathbf{Z_i}=\{V_j: V_{\pi^{j-1}} \cap Pa(S_i) =\emptyset \}$, i.e., $Pa(S_i)$ are not ancestors of such $V_j \in \mathbf{Z_i}$.

\begin{equation}
\begin{split}
    &\prod_{V_j\in S_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))\\
    = &\prod_{V_j\in \mathbf{Z}_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))\\
    & \times \prod_{V_j\in S_i\setminus \mathbf{Z}_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))\\
    = &\prod_{V_j\in \mathbf{Z}_i} P(v_j|
    (v_{\pi^{j-1}} \cap S_i) \cup (v_{\pi^{j-1}} \cap pa(S_i)))\\
    & \times \prod_{V_j\in S_i \setminus \mathbf{Z}_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))\\
    = &\prod_{V_j\in \mathbf{Z}_i} P(v_j|
    (v_{\pi^{j-1}} \cap S_i))\\
    & \times \prod_{V_j\in S_i \setminus \mathbf{Z}_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))\\
    = &P(\mathbf{Z}_i)
    \prod_{V_j\in S_i \setminus \mathbf{Z}_i} P(v_j|v_{\pi^{j-1}} \cap (S_i \cup pa(S_i)))
\end{split}
\end{equation}

Thus, we would need to train $|S_i \setminus \mathbf{Z}_i|$ number of models to generate the required interventional data.

We can call it a blanket, which keeps the mechanisms inside this blanket independent of any change that occurs outside the blanket.


Let $|S_i \setminus \mathbf{Z}_i|=M$ and $|S_i|=N$. We need to train $N$ models in the DCM with data generated from $M$ models. Here we can compress even further. If the number of interventions is $|I|$, then we need to train $|I|+1$ models, which can be represented as a chain of nodes with each intervention being the parent of one node.


 Suppose, $P(V)$ represents the original data distribution such that $D[V]\sim P(V)$.
Let $P_{\theta}(V)$ be the distribution implied by the $\theta$ parameterized DCM.
Let $Q_{\theta'}(V)$ be the distribution learned by the $\theta'$ parameterized models trained in ID-GEN algorithm.

\begin{equation}
    \begin{split}
     L_{\theta'_i:V_i\in S} &= |Q_{\theta'_i}(S|do(pa(S))) -
        P(S|do(pa(S)))| 
        \\
        L_{\theta'_i:V_i\in S} &= |Q_{\theta'_i}(V_i| V_{\pi^{i-1}} \cap S, pa(S)) -
        P(V_i| V_{\pi^{i-1}} \cap S, pa(S))|\\
        %
    \end{split}
\end{equation}
The loss function for the DCM is as follows:
\begin{equation}
               L^{\theta} = 
               |P_{\theta}(S|do(pa(S)))  
               - Q_{\theta'}(S|do(pa(S)))|
\end{equation}

Now, we can backpropagate on $L^{\theta}$ and $L_{\theta'_i}$ for all $i$ with $V_i\in S$, which lets us update each parameter $\theta'_i$ and $\theta$ independently. 



\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{Figures/swig_type_modularization.pdf}
    \caption{Enter Caption}
    \label{fig:enter-label}
\end{figure}



\subsection{Theoretical Guarantees}
We prove the necessary number of model updates and the necessary number of variables required for the adaptation to a new SCM. This is possible due to the c-blanket of the c-component.


\subsection{Fed-DCM in Practice}
In practice, we might face situations in which only the mechanisms of some specific variables are eligible to be learned in the federated setup. In this section, we show how we can learn the causal model in such a scenario.


Figure~\ref{fig:enter-label} (left): modularizing up to the c-component level.
We match $P(x,y,z)$.
\begin{equation}
    \begin{split}
P(x,y,z) &= P(x,y|do(z)) P(z|do(x))\\
&= P(x) P(y|x,z) P(z|x)
    \end{split}
\end{equation}

Figure~\ref{fig:enter-label} (right):
\begin{equation}
\begin{split}
   &\text{Let } X=X';\\
    &P(X)= P(X');\\
    &P(X,X') = P(X)
\end{split}
\end{equation}

Now match $P(x, x', y, z)$.

\begin{equation}
    \begin{split}
&P(x, x', y, z) \\
&=P(x,x') P(z|do(x)) P(y|do(x',z))\\
&=P(x,x') P(z|x) P(y|x',z)\\
&=P(x) P(z|x) P(y|x,z)\\
&= P(x,y,z)
    \end{split}
\end{equation}
This implies that matching $P(x, x', y, z)$ is equivalent to matching $P(x,y,z)$.

Now, we estimate the causal effect for the new graph:

\begin{equation}
\begin{split}
&P(y| do(x))\\
&=\sum_{x',z} P(z|x) P(x') P(y|x',z)\\
&=\sum_{z} P(z|x) \sum_{x'} P(x') P(y|x',z)
\end{split}
\end{equation}
which equals the causal effect in the original causal graph (Figure~\ref{fig:enter-label} right).


\section{Experiments}
We show the applications of our algorithms in three directions:

i) \textbf{Efficient adaptation to distribution shift:} 
Suppose we observe a distribution shift in the coming data.
This can be represented as soft intervention in the causal graph which
changed some mechanism. We can locate the c-component $S_i$ and fine-tune it to adapt to the distribution shift.

ii) \textbf{Transportability of mechanisms}:
Suppose we want to transport some part of our model to a different domain. We can compare our training domain and test domain to determine which mechanisms/blankets stay invariant. We can transport that part and train the rest of the mechanisms in the causal graph.  
We can use Reddit data for this purpose. We train our DCM on a weight-gain subreddit and transport it to a weight-loss subreddit.

iii) \textbf{Federated Learning}: We have some pre-trained setup. Next, we have a new output variable. We add it to the SCM and train only the necessary part of the causal graph.


multimodal learning or when auxiliary information can improve the classification task.


\subsection{Real-world experiment 1}
COALA~\cite{zhuang2024coala}, at the task level, we
extend to a broader spectrum of 15 CV tasks, including classification, object detection, segmentation, pose estimation,
face recognition. At the data level, support
semi-supervised FL, unsupervised FL, and multi-domain
FL with feature distribution shifts among local training data. At the model level, clients can train multiple models with varying
parameters and architectures.


\subsection{Real-world experiment 2}
\cite{zhang2023federated} leverage Stable Diffusion to synthesize high quality training data on the server based on the text embeddings collected from clients. They generate prompts based on the characteristics of the client’s data, which are used as inputs to a specific text encoder to obtain corresponding text embeddings. Once all text embeddings are collected from the clients, the server performs embedding aggregation
and then synthesizes a high-quality substitute training dataset. This public synthetic dataset serves as a proxy for the clients’ private data and can be used to train a global model on the server.



\clearpage
\section{Algorithms}

\begin{algorithm}[t!]
\caption{getRealIntvData($\mathcal{D}, \mathcal{G}, \mathbf{S}, pa(\mathbf{S})$)}
\begin{algorithmic}[1]
\STATE \textbf{Input:} Dataset $\mathcal{D}$, Causal graph $\mathcal{G}$, C-component $\mathbf{S}$, blanket $Pa(\mathbf{S})$.
\STATE $An= V_{\pi^{j-1}} \cap (S_i \cup pa(S_i))$; $\pi_{\mathcal{G}}$ be the ancestral order.
\FOR{each $V_j \in \mathbf{S}$}
\STATE Train $M_j(An)$ on $\mathcal{D}$ such that $M_j(An) \sim P(v_j|An)$
\ENDFOR
\STATE Fix $ pa(\mathbf{S}) $ in $M_{j: V_j \in \mathbf{S}}$ and ancestral sample to obtain $ D^R[\mathbf{S}] \sim P(\mathbf{S} \mid \text{do}(\text{Pa}(\mathbf{S}))) $
\STATE \textbf{Return } $D^R[\mathbf{S}]$
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t!]
\caption{getFakeIntvData($\mathbb{G}_{V_i\in \mathbf{V}}, \mathbf{S}, pa(\mathbf{S})$)}
\begin{algorithmic}[1]
\STATE \textbf{Input:}  DCM $\mathbb{G}_{V_i\in \mathbf{V}}$, C-component $\mathbf{S}$, blanket $Pa(\mathbf{S})$.
\STATE Fix $ pa(\mathbf{S}) $ in $\mathbb{G}_{V_i \in \mathbf{V}}$ and ancestral sample to obtain $ D^F[\mathbf{S}] \sim P(\mathbf{S} \mid \text{do}(\text{Pa}(\mathbf{S}))) $
\STATE \textbf{Return } $D^F[\mathbf{S}]$
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t!]
\caption{Fed-DCM Algorithm}
\begin{algorithmic}[1]
\STATE \textbf{Input:} Dataset $\mathcal{D}$, Causal graph $\mathcal{G}$, Variables $\mathbf{V} = \{V_1, V_2, \dots, V_n\}, n = |\mathbf{V}|$
% \FOR{each $ \mathbf{S} \in [\mathbf{S}_i] $}
    \FOR{each $ V \in \mathbf{V}$}
        \STATE Initialize $\mathbb{G}_{V}(Pa(V), \red{U_{V}} )$
    \ENDFOR
% \STATE Connect input and output of $ G_{V_i} $ for all $ V_i $, according to directed edges $ \rightarrow $ in $ \mathcal{G} $
% % \ENDFOR
% \FOR{each bi-directed edge $ \leftrightarrow $ in $ \mathcal{G} $}
% \STATE $ U \sim \mathcal{N}(0, I) $
% \ENDFOR
\STATE $[\mathbf{S}_i, \text{Pa}(\mathbf{S}_i)] \leftarrow \text{c\_component\_partition}(\mathcal{G})$
\FOR{each $ \mathbf{S} \in \{\mathbf{S}\}_i $}
    \STATE Sample   $pa(\mathbf{S}_i)  \sim \text{Uniform}(\text{support}(\text{Pa}(\mathbf{S}_i)))$
    \STATE $D^F[\mathbf{S}] = $ getFakeIntvData($\mathbb{G}_{V_i\in \mathbf{V}}, \mathbf{S_i}, pa(\mathbf{S_i})$)
    \STATE $D^R[\mathbf{S}] = $ getRealIntvData($\mathcal{D}, \mathcal{G}, \mathbf{S_i}, pa(\mathbf{S_i})$)
    \STATE Loss = Dist($D^F, D^R$)
\ENDFOR
\STATE \textbf{Return } $\mathbb{G}_{V}$
\end{algorithmic}
\end{algorithm}



\clearpage


\section*{Accessibility}
Authors are kindly asked to make their submissions as accessible as possible for everyone including people with disabilities and sensory or neurological differences.
Tips of how to achieve this and what to pay attention to will be provided on the conference website \url{http://icml.cc/}.

\section*{Software and Data}

If a paper is accepted, we strongly encourage the publication of software and data with the
camera-ready version of the paper whenever appropriate. This can be
done by including a URL in the camera-ready copy. However, \textbf{do not}
include URLs that reveal your institution or identity in your
submission for review. Instead, provide an anonymous URL or upload
the material as ``Supplementary Material'' into the OpenReview reviewing
system. Note that reviewers are not required to look at this material
when writing their review.

% Acknowledgements should only appear in the accepted version.
\section*{Acknowledgements}

\textbf{Do not} include acknowledgements in the initial version of
the paper submitted for blind review.

If a paper is accepted, the final camera-ready version can (and
usually should) include acknowledgements.  Such acknowledgements
should be placed at the end of the section, in an unnumbered section
that does not count towards the paper page limit. Typically, this will 
include thanks to reviewers who gave useful comments, to colleagues 
who contributed to the ideas, and to funding agencies and corporate 
sponsors that provided financial support.

\section*{Impact Statement}

Authors are \textbf{required} to include a statement of the potential 
broader impact of their work, including its ethical aspects and future 
societal consequences. This statement should be in an unnumbered 
section at the end of the paper (co-located with Acknowledgements -- 
the two may appear in either order, but both must be before References), 
and does not count toward the paper page limit. In many cases, where 
the ethical impacts and expected societal implications are those that 
are well established when advancing the field of Machine Learning, 
substantial discussion is not required, and a simple statement such 
as the following will suffice:

``This paper presents work whose goal is to advance the field of 
Machine Learning. There are many potential societal consequences 
of our work, none which we feel must be specifically highlighted here.''

The above statement can be used verbatim in such cases, but we 
encourage authors to think about whether there is content which does 
warrant further discussion, as this statement will be apparent if the 
paper is later flagged for ethics review.


% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite
\nocite{langley00}

\bibliography{references}
\bibliographystyle{icml2025}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\appendix
\onecolumn
\section{You \emph{can} have an appendix here.}

You can have as much text here as you want. The main body must be at most $8$ pages long.
For the final version, one more page can be added.
If you want, you can use an appendix like this one.  

The $\mathtt{\backslash onecolumn}$ command above can be kept in place if you prefer a one-column appendix, or can be removed if you prefer a two-column appendix.  Apart from this possible change, the style (font size, spacing, margins, page numbering, etc.) should be kept the same as the main body.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}


% This document was modified from the file originally made available by
% Pat Langley and Andrea Danyluk for ICML-2K. This version was created
% by Iain Murray in 2018, and modified by Alexandre Bouchard in
% 2019 and 2021 and by Csaba Szepesvari, Gang Niu and Sivan Sabato in 2022.
% Modified again in 2023 and 2024 by Sivan Sabato and Jonathan Scarlett.
% Previous contributors include Dan Roy, Lise Getoor and Tobias
% Scheffer, which was slightly modified from the 2010 version by
% Thorsten Joachims & Johannes Fuernkranz, slightly modified from the
% 2009 version by Kiri Wagstaff and Sam Roweis's 2008 version, which is
% slightly modified from Prasad Tadepalli's 2007 version which is a
% lightly changed version of the previous year's version by Andrew
% Moore, which was in turn edited from those of Kristian Kersting and
% Codrina Lauth. Alex Smola contributed to the algorithmic style files.
