%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{hyperref}
\usepackage{url}

\usepackage{url}
\usepackage{amsfonts}
\usepackage{amsthm}% blackboard math symbols
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{colortbl}
\usepackage{tabulary}
\usepackage{subcaption}
\usepackage{pgf} % for calculating the values for gradient
\usepackage{graphics}
\usepackage{booktabs}       % professional-quality tables
\usepackage{multirow}
\usepackage{pgfplots}

\usepackage{hyperref}
\usepackage{etoolbox}
\usepackage{collcell}
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{natbib} % has a nice set of citation styles and commands
\usepackage{xcolor}         % colors
\usepackage[ruled,linesnumbered,noend]{algorithm2e}
\usepackage{pifont}
\usepackage{adjustbox}
\usepackage{makecell}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{wrapfig}
\newcommand\notejo[1]{\textcolor{red}{#1}}
\newcommand{\indep}{\perp \!\!\! \perp}
%\newtheorem{defin}{Definition}
\newtheorem{proofsketch}{Proof (Sketch)}
\newtheorem{assum}{Assumption}
%\newtheorem{prop}{Proposition}
%\newtheorem{conj}{Conjecture}
\newtheorem{rem}{Remark}

\newcommand{\SumNode}{\mathsf{S}}
\newcommand{\ClientNode}{\mathsf{C}}
\newcommand{\ProductNode}{\mathsf{P}}
\newcommand{\Node}{\mathsf{N}}
\newcommand{\Leaf}{\mathsf{L}}
\newcommand{\graph}{\mathcal{G}}
\newcommand{\ch}[1]{\operatorname{ch}(#1)}
\newcommand{\pa}[1]{\operatorname{pa}(#1)}
\newcommand{\SPN}{\mathfrak{C}}

% TODONOTES
\usepackage{xargs}
%\usepackage[colorinlistoftodos,textsize=small,disable]{todonotes} % Disabled
%\setlength{\marginparwidth}{1.7cm}  % TODO: comment out when todonotes is disabled
\usepackage[colorinlistoftodos,textsize=tiny]{todonotes} % Enabled
\newcommandx{\todoc}[2][1=]{{\todo[linecolor=orange,backgroundcolor=orange!25,bordercolor=orange,#1]{\tiny
      TODO: #2}}}
\newcommandx{\unsure}[2][1=]{{\todo[linecolor=yellow,backgroundcolor=yellow!25,bordercolor=yellow,#1]{\tiny
      UNSURE: #2}}}
\newcommandx{\change}[2][1=]{{\todo[linecolor=blue,backgroundcolor=blue!25,bordercolor=blue,#1]{\tiny
      CHANGE: #2}}}
\newcommandx{\info}[2][1=]{{\todo[linecolor=green,backgroundcolor=green!25,bordercolor=green,#1]{\tiny
      INFO: #2}}}
\newcommandx{\improvement}[2][1=]{{\todo[linecolor=violet,backgroundcolor=violet!25,bordercolor=violet,#1]{\tiny
      IMPROVEMENT: #2}}}
\newcommandx{\fb}[2][1=]{{\todo[inline,linecolor=lime,backgroundcolor=lime!25,bordercolor=lime,#1]{\tiny
      FB: #2}}}
\newcommandx{\thiswillnotshow}[2][1=]{{\todo[disable,#1]{THIS WILL NOT SHOW:
      #2}}}
\newcommand{\expec}{\mathbb{E}}

\newtheorem{defin}{Definition}
\newtheorem{prop}{Proposition}
\newtheorem{conj}{Conjecture}
\newtheorem{fact}{Fact}

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\definecolor{qUmr}{HTML}{AB0392}
\definecolor{4iCr}{HTML}{3492EB}
\definecolor{6dQr}{HTML}{15A123}
\definecolor{allrev}{HTML}{f59342}

\title{Scaling Probabilistic Circuits via Data Partitioning}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jonas.seng@tu-darmstadt>?Subject=Federated Circuits}{Jonas Seng}{}}
\author[1,2]{Florian P. Busch}
\author[4]{Pooja Prasad}
\author[4]{Devendra Singh Dhami}
\author[5]{Martin Mundt}
\author[1,2,3]{Kristian Kersting}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Department\\
    Technical University Darmstadt
}
\affil[2]{%
    Hessian Center for AI (hessian.AI)
}
\affil[3]{%
     German Research Center for AI (DFKI)
}
\affil[4]{%
    Department of Mathematics and Computer Science\\
    Eindhoven University of Technology
  }
\affil[5]{%
    Department of Mathematics and Computer Science\\
    University of Bremen
}
  
  \begin{document}
\maketitle

\begin{abstract}
Probabilistic circuits (PCs) enable us to learn joint distributions over a set of random variables and to perform various probabilistic queries in a tractable fashion. Though the tractability property allows PCs to scale beyond non-tractable models such as Bayesian Networks, scaling training and inference of PCs to larger, real-world datasets remains challenging. To remedy the situation, we show how PCs can be learned across multiple machines by recursively partitioning a distributed dataset, thereby unveiling a deep connection between PCs and federated learning (FL). This leads to federated circuits (FCs)---a novel and flexible FL framework that (1) allows one to scale PCs in distributed learning environments, (2) trains PCs faster, and (3) unifies, for the first time, horizontal, vertical, and hybrid FL in one framework by re-framing FL as a density estimation problem over distributed datasets. We demonstrate FCs' capability to scale PCs on various large-scale datasets. Also, we show FCs' versatility in handling horizontal, vertical, and hybrid FL within a unified framework on multiple classification tasks.
\end{abstract}

\section{Introduction}
% In the domain of probabilistic models~\citep{goodfellow2014generative,kingma2013auto}, probabilistic graphical models (PGMs; \citet{poon2011sum,pearl1985bayesian}) allow for a broad set of probabilistic queries, such as marginalization and conditioning.
% In addition, such models provide the user with a measure of uncertainty due to their probabilistic nature.
% Famously, Bayesian Networks (BNs)~\citep{pearl1985bayesian} represent a joint probability distribution by modeling random variables (RVs) as nodes and dependencies among RVs as edges in a graph.
% However, while this graphical representation is intuitive and interpretable, it poses limits on computational feasibility for various probabilistic queries~\citep{Park_2006}.
% Hence, \textit{tractability} of probabilistic models is a desirable property, i.e., probabilistic queries can be computed in polynomial time.

Probabilistic Circuits (PCs) are a family of models that provide tractable inference for various probabilistic queries~\citep{domingos2011spns, Choi2020ProbabilisticCA}.
%i.e., probabilistic queries can be computed in polynomial time.
This is achieved by representing a joint distribution by a computation graph on which certain structural properties are imposed.
% For example, Sum-Product Networks (SPNs) consist of simple arithmetic operations (sums and products) that compute mixtures or products of tractable base distributions~\citep{domingos2011spns}.
% Here, a product operation models a (context-specific) independency between variables, while sum operations allow for different mixtures of distributions over the same set of variables, thus modeling dependencies.
While PCs offer significant computational advantages over traditional probabilistic models such as Bayesian networks~\citep{pearl1985bayesian}, further performance gains can be realized by optimizing the compactness of PC representations and tailoring them to specific hardware architectures~\citep{perharz2020einsum,liu2024scalingtractableprobabilisticcircuits}.
% A common scheme in that regime is parallelizing computations in the forward- and backward pass during inference and training by building a compact layer-wise computation graph, similar to neural networks. This representation allows efficient GPU-based training of PCs. Prominent examples of this paradigm are EiNets~\citep{perharz2020einsum} and PyJuice~\citep{liu2024scalingtractableprobabilisticcircuits}.
However, another natural way to scale up PCs, namely distributing the model over multiple machines, is so far underexplored.
While models like neural networks can be partitioned over multiple machines with relatively little effort, partitioning PCs is more challenging as they come with certain structural constraints to ensure the validity of the represented joint distribution. Interestingly, we find an inherent connection between the structure of PCs and the paradigm of federated learning (FL).
In PCs, sum nodes combine probability distributions over the same set of variables via a mixture.
This resembles the horizontal FL setting~\citep{konevcny2016federated,li2020federatedChallenges}, where all clients hold the same features but different samples.
In contrast, the case of vertical FL~\citep{yang2019federated, wu2020privacy}, in which the same samples are shared but features are split across clients, can be linked to the product nodes used in PCs, which combine distributions over disjoint sets of variables.
Consequently, the hybrid FL~\citep{Zhang2020HybridFLAlgosAndImplementations} setting, where both samples and features are separated across clients, can be represented by a combination of sum and product nodes.
Thus, PCs are well positioned to connect all three FL settings in a unified way -- an endeavor considered hard to achieve in the FL community~\citep{Li_2023FLSurvey, wen2023federated}.
% For more details on FL, see App.~\ref{app:federated_learning}.
We provide additional details on FL and its associated challenges in App.~\ref{app:federated_learning}.
% This connection turns out to be beneficial for both sides as it allows us to efficiently scale up PCs and to design a highly flexible FL framework that can handle horizontal FL~\citep{konevcny2016federated,li2020federatedChallenges}, vertical FL~\citep{yang2019federated, wu2020privacy}, and hybrid FL~\citep{Zhang2020HybridFLAlgosAndImplementations} in a unified way -- an endeavor considered hard to achieve in the FL community~\citep{Li_2023FLSurvey, wen2023federated}.
%This paper aims to scale probabilistic models to large problems; therefore, PCs are a sensible choice.

%\todo{Figure 1: remove + and =, consider adding different colors for samples 1/2 and 3/4 and same for features. Add HFL/VFL to figure if used in caption}
\begin{figure*}[t]
    \centering
      % \includegraphics[width=.9\columnwidth]{federated-circuits/images/concept.pdf}
    \includegraphics[width=.9\textwidth]{federated-circuits/images/concept_new.pdf}
    \caption{\textbf{Scaling PCs via Federated Circuits.} We scale PCs by splitting a dataset $\mathcal{D}$ into $n$ subsets $\{\mathcal{P}_i\}_{i=1}^n$ s.t. $\mathcal{D} = \bigcup_{i=1}^n \mathcal{P}_i$. Each subset is assigned to a client (i.e., machine) $c_j$, and the resulting federated circuit (FC) is learned jointly by a set of clients. As a novel framework for federated learning (FL), FCs can perform horizontal FL (samples are split across clients), vertical FL (features are split across clients), and hybrid FL (mix of horizontal and vertical).}
    %In FL, data is either split across data samples (HFL), features (VFL), or both (hybrid FL). We can represent all settings using FedPCs by using a sum node to combine samples in HFL, a product node to combine features in HFL, or a combination of sum and product nodes to represent any arbitrary combination of splits among features and samples in hybrid FL. The hybrid FL framework allows us to distribute data amongst hardware to optimize efficiency (Best viewed in color).}
    \label{fig:arch}
\end{figure*}

% As a result of the connection between PCs and FL, we introduce federated circuits (FCs), a novel FL framework that re-frames FL as a density estimation problem over a set of datasets distributed over multiple machines (called clients subsequently). 
As a result of this connection, we introduce \textit{federated circuits (FCs)}, a novel FL framework that re-frames FL as a density estimation problem over a set of datasets distributed over multiple machines (subsequently called clients). 
%Each client holds a local dataset, and we aim to learn a joint distribution over the union of datasets held by all participating clients. 
FCs naturally handle all three FL settings and, therefore, provide a flexible way of scaling up PCs by learning a joint distribution over a dataset arbitrarily partitioned across a set of clients (see Fig.~\ref{fig:arch} for an illustration).
Imposing the same structural properties as for PCs, FCs achieve tractable computation of probabilistic queries like marginalization and conditioning across multiple machines.
Based on this, we propose a highly communication-efficient learning algorithm that leverages the semi-ring structure within the design of FCs.
%In horizontal FL settings, FCs even operate at nearly zero communication cost, which is highly valuable in slow or unreliable communication networks. 
Our experimental evaluation shows that FCs outperform EiNets~\citep{perharz2020einsum} and PyJuice~\citep{liu2024scalingtractableprobabilisticcircuits} on large-scale density estimation tasks, demonstrating the benefits of scaling up PCs. Additionally, FCs outperform or achieve competitive results on various classification tasks in all federated settings compared to state-of-the-art neural network-based and tree-based methods, demonstrating their effectiveness in FL.
We make the following contributions:
\textbf{(1)} We introduce FCs, a communication-efficient and scalable FL framework unifying horizontal, vertical, and hybrid FL by mapping the semantics of PCs to FL. \textbf{(2)}  We practically instantiate FCs to FedPCs and demonstrate how the FC framework can be leveraged to scale up PCs to large real-world datasets. \textbf{(3)} We propose a one-pass training scheme for FedPCs that is compatible with a broad set of learning algorithms. \textbf{(4)} We provide extensive experiments demonstrating the effectiveness of our approach for learning large-scale PCs and performing FL. We consider classification and density estimation on tabular and image data.
%\begin{enumerate}
%    \item[\textbf{(1)}] We introduce FCs, a communication-efficient and scalable FL framework unifying horizontal, vertical, and hybrid FL by mapping the semantics of PCs to FL.
%    \item[\textbf{(2)}] We practically instantiate FCs to FedPCs and demonstrate how the FC framework can be leveraged to scale up PCs to large real-world datasets.
%    \item[\textbf{(3)}] We propose a one-pass training scheme for FedPCs that is compatible with a broad set of learning algorithms.
    % \item[\textbf{(3)}] We show that learning FCs is highly communication efficient in horizontal, vertical, and hybrid FL.
%    \item[\textbf{(4)}] We provide extensive experiments demonstrating the effectiveness of our approach for learning large-scale PCs and performing FL. We consider classification and density estimation on tabular and image data.
%\end{enumerate}

We proceed as follows: After touching upon related work, we provide the probabilistic view on FL and introduce FCs. Before concluding, we present our extensive experimental evaluation of FedPCs.
Our code is publicly available at \url{https://github.com/J0nasSeng/federated-spn.git}.

\section{Preliminaries and Related Work}
% \fb{background instead of rw, talk about PC stuff here, briefly introduce FL, motivate the use/application of FL by further scaling PCs, use PC semantic for density estimation in FLs $\rightarrow$ we get all three FL cases automatically; also: much more possibilities than regular FL (more than just predictions; can also do marginal queries, MPE, ...)}
%Federated Circuits are related to two lines of research -- Federated Learning and Probabilistic Circuits -- which have not been combined into a single framework yet. %\notejo{Add more about FedTree}
%We now briefly introduce PCs and FL and revise related work.
In the following, we briefly introduce PCs and FL and give an overview of relevant related work.

\textbf{Probabilistic Circuits.}
PCs encode a probability distribution as a computation graph that allows for tractable inference of a wide range of queries such as conditioning and marginalization. 
\cite{perharz2015theoreticalPC} define a PC over random variables $\mathbf{X}$ as a tuple $(\graph, \phi)$ where $\graph = (V, E)$ is a rooted, Directed Acyclic Graph (DAG) and $\phi: V \rightarrow 2^{\mathbf{X}}$ is the \textit{scope} function assigning a subset of random variables to each node in $\graph$. For each internal node $\Node$ of $\graph$, the scope is defined as $\phi(\Node) = \cup_{\Node' \in \ch{\Node}} \phi(\Node')$. Each leaf node $\Leaf$ computes a distribution/density over its scope. 
All internal nodes of $\graph$ are either a sum node $\SumNode$ or a product node $\ProductNode$. Each sum node computes a convex combination of its children, and each product node computes a product of its children. With slight abuse of notation we write $\SumNode = \sum_{\Node \in \ch{\SumNode}} w_{\SumNode, \Node}\Node$, and $\ProductNode = \prod_{\Node \in \ch{\ProductNode}}\Node$.
%To ensure tractability,
%of probabilistic queries such as marginalization, 
If a PC is \textit{decomposable}, this implies tractability. Decomposability requires that for all $\ProductNode \in V$ it holds that $\phi(\Node) \cap \phi(\Node') = \emptyset$ for all distinct $\Node, \Node' \in \ch{\ProductNode}$. Further, the \textit{smoothness} condition implies that a PC represents a valid distribution. That is, for each sum node $\SumNode \in V$ it holds that $\phi(\Node) = \phi(\Node')$ for all $\Node, \Node' \in \ch{\SumNode}$~\citep{domingos2011spns,peharz2015theoretical,sanchez2021sum}.
%Decomposable and smooth PCs are often referred to as Sum-Product Networks (SPNs) \citep{domingos2011spns,peharz2015theoretical,sanchez2021sum}.

% There is a vast number of similar models that can be unified in the framework of PCs, such as Arithmetic Circuits \citep{darwiche2003AC}, Probabilistic Decision Graphs \citep{jaeger2004PDG} and SPNs.
% They have been successfully applied to image generation/classification~\citep{perharz2020einsum}, speech recognition~\citep{peharz2014modeling}, and language tasks~\citep{cheng2014language}.
% Other approaches for scaling PCs have been considered in the past.
Several works have tackled the goal of scaling PCs.
\citet{peharz2020random} have shown that learned PC structures can be replaced by large, random structures to scale to larger problems.
%On the architecture side, it was shown that large, random structures can be used to scale to larger problems more easily~\citep{peharz2020random}.
Changes in the model layout, such as parallelizable layers via einsum-operations~\citep{perharz2020einsum} and a reduction in IO operations~\citep{liu2024scalingtractableprobabilisticcircuits}, were also shown to increase the speed of computation drastically.
\citet{lius2022calingviaLV} improved the performance of PCs by latent variable distillation using deep generative models for additional supervision during learning. Other approaches allow for negative mixture weights~\citep{loconte2024subtractive}, introduce complex weights~\citep{wang2025IncPC} or use hypernetworks~\citep{Shih2021HyperSPN}.

%\textbf{Federated Learning.}
\textbf{Federated Learning.}
In federated learning (FL), a set of data owners (or clients) aim to collaboratively learn an ML model without sharing their data. One distinguishes between horizontal, vertical, and hybrid FL depending on how data is partitioned. In horizontal FL, a dataset $\mathbf{D} \in \mathbb{R}^{n \times d}$ is partitioned such that each client holds the same $d$ features but different, non-overlapping sets of samples. In vertical FL, $\mathbf{D}$ is partitioned such that each client holds the same $n$ samples but different, non-overlapping subsets of the $d$ features. Hybrid FL describes a combination of horizontal and vertical FL where clients can hold both different (but possibly overlapping) sets of samples and features~\citep{wen2023federated, Li_2023FLSurvey}.

For all three FL settings, specifically tailored methods have been proposed to enable collaborative learning of models.
The most common scheme in horizontal FL is to average the models of all clients regularly during training \citep{McMahan2016FedAvg, karimireddy2020mime, Karimireddy2020SCAFFOLD, Sahu2018FedProx}. However, model averaging requires each client to share the same model structure. In vertical FL, clients hold different feature sets; thus, there is no guarantee that the model structure can be shared among clients. In these cases, tree-based and neural models are the predominant choice and are typically learned by sharing data statistics or feature representations among clients \citep{Kourtellis2016VHT, Cheng2021Secureboost, vepakomma2018split,Cellabos2020SplitNN, Tianyi2020VAFL, Liu2019FedBCD}. 
%Recently, neural network-based approaches communicating feature representations across clients have emerged \citep{vepakomma2018split,Cellabos2020SplitNN, Tianyi2020VAFL, Liu2019FedBCD}.  
Similar to tree-based vertical FL, tree-based hybrid FL approaches share data statistics (such as histograms) or model properties (such as split rules) among clients \citep{li2023fedtree, li2024effective}. However, tree-based approaches often require complex training procedures.
%Concerning hybrid FL (all clients hold the same features but different samples) ...
%However, FedPC is the first framework that can perform HFL, VFL and hybrid FL. Also, FedPCs learn a joint distribution over client data rather than splitting criteria, making them both, generative and discriminative.

%In the following, we present an elegant and effective federated learning framework called federated circuits (FCs). FCs unify horizontal, vertical, and hybrid FL by hierarchically learning mixtures (horizontal part) and fusing marginals (vertical part). FCs are compatible with a broad set of training algorithms. Further, FCs allow us to tremendously scale up PCs.
%In the following section, we show how federated circuits -- inspired by ideas from FL -- learn large-scale PCs and how federated circuits -- inspired by modeling approaches of PCs -- perform horizontal, vertical, and hybrid FL within one unified framework.

\section{Federated Circuits}
%\fb{figure for this section??}
This work aims to scale up PCs by splitting data and the model across multiple machines, thus harnessing the availability of compute clusters to train PCs in a federated fashion.
In the following, we present an elegant and effective way to achieve that using our novel federated learning framework called federated circuits (FCs) that unifies horizontal, vertical, and hybrid FL.
%by hierarchically learning mixtures (horizontal part) and fusing marginals (vertical part). %FCs are compatible with a broad set of training algorithms. 

%More formally, given a dataset $\mathbf{D}$ and a set of clients $\mathcal{C}$ where each $C \in \mathcal{C}$ holds a partition $\mathbf{D}_C$ of $\mathbf{D}$, we aim to learn the joint distribution $p(\mathbf{X})$ over random variables $\mathbf{X}$ (i.e., the features of $\mathbf{D}$).
%Note that the partitioning of $\mathbf{D}$ is not further specified. Hence, each client might only hold a subset of random variables $\mathbf{X}_C \subseteq \mathbf{X}$ with support $\mathcal{X}_C$. This can be interpreted as each $C \in \mathcal{C}$ holding a dataset $\mathbf{D}_C \sim p_C$ where $p_C$ is a joint distribution over $\mathbf{X}_C$ which is related to $p(\mathbf{X})$. Given this problem, we now briefly state our modeling assumptions, followed by bridging federated learning and probabilistic circuits, resulting in our novel framework of federated circuits. 

%In this section, we undertake a comprehensive reevaluation of the Federated Learning problem through a probabilistic lens.
%Our approach involves treating a dataset distributed across multiple clients as a unified virtual dataset. We break it down recursively along sample and feature dimensions, effectively breaking down the FL problem into smaller instances. Depending on the concrete parameterization of FCs, splitting along the sample dimension and feature dimension can be interpreted in different ways such as learning mixtures or identifying independent features. In the subsequent sections, we demonstrate that this straightforward yet potent approach enables a versatile and communication-efficient framework to address HFL, VFL, and hybrid FL.

%\subsection{Federated Learning for PCs and PCs for Federated Learning}
\subsection{Problem Statement \& Modeling Assumptions}
Given a dataset $\mathbf{D}$ and a set of clients $\mathcal{C}$ where each $c \in \mathcal{C}$ holds a subset $\mathbf{D}_c$ of $\mathbf{D}$, we aim to learn the joint distribution $p(\mathbf{X})$ over random variables $\mathbf{X}$ (i.e., the features of $\mathbf{D}$; $\mathbf{D}$ is an $n \times |\mathbf{X}|$ matrix).
The partitioning of $\mathbf{D}$ is not further specified. Hence, each client might only hold a subset of random variables $\mathbf{X}_c \subseteq \mathbf{X}$ with support $\mathcal{X}_c$. This can be interpreted as each $c \in \mathcal{C}$ holding a dataset $\mathbf{D}_c \sim p_c$ where $p_c$ is a joint distribution over $\mathbf{X}_c$ which is related to $p(\mathbf{X})$.
We introduce two critical modeling assumptions relevant for learning a joint distribution $p(\mathbf{X})$ from a dataset $\mathbf{D}$ partitioned across a set of machines.

%In the realm of FL, learning the joint $p(\mathbf{X})$ hinges on a central question: How can we effectively merge local distributions, $p_C$, to accurately represent each client's data and thus the full dataset $\mathbf{D}$? This naturally prompts the notion of treating these local distributions as marginals of an undisclosed global joint distribution spanning all client data. Yet, to validate this view, two critical assumptions must be made.

\begin{assum}[Mixture Marginals]\label{assum:decomposition}
    There exists a joint distribution $p$ such that the relation $\int_{\mathbf{X} \setminus \mathbf{X}_S} p(x) = \sum_{l \in L} q(L=l) \cdot p_{S}(x | L=l)$ holds for all $x \in \mathcal{X}$. Here, $\mathbf{X}_{S} \subseteq \mathbf{X}$ is a subset of the union of client random variables $\mathbf{X} = \cup_{c \in \mathcal{C}} \mathbf{X}_c$. Further, $\mathcal{X} = \bigtimes_{i \in \{1, ..., |\mathbf{X}|\}} \mathcal{X}_i$ is the support of $\mathbf{X}$, each $p_{S}$ is defined over $\mathbf{X}_S \subseteq \mathbf{X}$ and $q$ is a prior over a latent $L$.
\end{assum}

%In words, Assumption \ref{assum:decomposition} means that there exists a joint distribution over the union of random variables $\mathbf{X}$ of all clients s.t. we can construct the marginal distribution over any subset $\mathbf{X}_S \subseteq \mathbf{X}$ of random variables as a latent variable model using the local distributions of all clients sharing said subset of random variables. 
To illustrate, consider a subset of variables $\mathbf{X}_S \subseteq \mathbf{X}$ shared among all clients and its complement $\mathbf{X}_{S^-} = \mathbf{X} \setminus \mathbf{X}_S$.
% The marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ should be representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$.
Assumption \ref{assum:decomposition} ensures that the marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ is representable as a mixture of all client distributions $p_c(\mathbf{X}_S)$ over $\mathbf{X}_S$.
% If Assumption \ref{assum:decomposition} would not hold, the information stored on the clients' data subsets would not be sufficient to learn $p(\mathbf{X})$.
Without Assumption~\ref{assum:decomposition}, the information stored on the clients' data subsets might not be sufficient to learn $p(\mathbf{X})$.
%Assume data $\mathbf{D}$ is partitioned across clients s.t. there exists a subset of variables $\mathbf{X}_S \subseteq \mathbf{X}$ that is shared among all clients $\mathcal{C}$. Then, the marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ where $\mathbf{X}_{S^-} = \mathbf{X} \setminus \mathbf{X}_S$ should be representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$ since the mixture accumulates all information about $\mathbf{X}_S$ that is held by the clients. 
%If data $\mathbf{D}$ is partitioned horizontally across clients, different local distributions $p_C$ on clients might be induced. However, for any subset of variables $\mathbf{X}_S \subset \mathbf{X}$, there exists only one marginal $\int_{\mathbf{X} \setminus \mathbf{X}_S} p(\mathbf{X})$; hence, this operation does not respect the fact that clients can have different distributions over the same variables as there only exists exactly one marginal. Assumption \ref{assum:decomposition} ensures that the marginal w.r.t. any subset of variables that are shared by a subset of clients $\mathcal{C}' \subseteq \mathcal{C}$ can be represented as the mixture of the client's distributions $p_C$ where $C \in \mathcal{C}'$.

%\fb{Suggestion for previous paragraph:}
%\color{blue}
%To illustrate, consider a subset of variables $\mathbf{X}_S \subseteq \mathbf{X}$ shared among all clients and its inverse $\mathbf{X}_{S^-} = \mathbf{X} \setminus \mathbf{X}_S$.
% The marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ should be representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$.
%Here, Assumption \ref{assum:decomposition} ensures that the marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ is representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$.
% the information stored on the client's data partitions is sufficient to learn $p(\mathbf{X})$.
%\color{black}

%Now that we have considered the case of clients sharing a subset of random variables let us shift to the case where different clients hold different subsets of random variables. 
A key assumption in FL is that data cannot be exchanged among clients. However, dependencies among variables residing on different clients might still exist. 
%Since data is kept private, these cannot be uncovered via standard techniques such as independence tests. 
To enable learning these ``hidden'' dependencies while keeping data private, we make the following assumption:

\begin{assum}[Cluster Independence]\label{assum:cluster_independence}
    Given disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_n$ and a joint distribution $p(\mathbf{X}_1, \cdots, \mathbf{X}_n)$, assume that a latent $L$ can be introduced s.t. the joint can be represented as $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \sum_l q(L=l) \prod_{i=1}^n p(\mathbf{X}_i | L=l)$ where $q$ is a prior distribution over the latent $L$.
\end{assum}

Note that independence is only assumed within clusters in the data. Thus, the latent variable (which can be thought of as a ``cluster selector'') allows capturing dependencies among variables residing on different clients.
%The parameters $\theta$ of the mixture over latents (i.e., clusters) can be learned, e.g., by optimizing log-likelihood. Also note that setting the cardinality of the support set of the latent to one yields a product distribution, which amounts to assuming mutual independence among variables residing on different clients. Thus, 
Distributions of the form in Assumption \ref{assum:cluster_independence} are strictly more expressive than the product distribution and allow more complex modeling.

Further, we want to emphasize that Assumptions \ref{assum:decomposition} and \ref{assum:cluster_independence} are common throughout PC literature. Assumption \ref{assum:decomposition} forms the basis of the validity of marginalization, and Assumption \ref{assum:cluster_independence} plays a crucial role in constructing structure learning algorithms for PCs. For more details, refer to App. \ref{app:assumptions}.

%\begin{fact}\label{prop:mixture_of_prods}
%A joint distribution $p$ over disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_n$ of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \sum_l p_{\theta}(L=l) \prod_{i=1}^n p(\mathbf{X}_i | L=l)$ is strictly more expressive than a distribution of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \prod_{i=1}^n p(\mathbf{X}_i)$. We provide proof in the App. \ref{app:proofs}.
%\end{fact}

%\begin{proofsketch}
%It has to be shown that (1) if $|\text{supp}\{L\}| = 1$, then $p(\mathbf{X}_1, \dots, \mathbf{X}_n)$ is a product distribution and (2) $L$ captures dependencies among random variables residing on different clients. (1) can be easily verified as $p_{\theta}$ becomes a point mass and. To see that (2) holds, one can intuitively think of $n$ 2-dimensional Gaussians, all with covariance matrix $\Sigma = \mathbf{I}$ but different mean vectors $\mathbf{\mu}_i$ where $\mathbf{I}$ is the identity matrix. Letting $L$ determine the mean of each Gaussian allows us to introduce dependencies among the two dimensions of the Gaussians. For a rigorous proof, see App. A. \qed
%\end{proofsketch}

%With Assumption \ref{assum:cluster_independence} we follow the principle of maximum entropy: While allowing for dependencies among client's random variables, we seek the joint distribution with the highest entropy (hence highest information) s.t. the marginals of each client can be computed from said joint distribution. Multiple joint distributions can yield these marginals, but non-maximum entropy options demand additional assumptions or require prior knowledge, restricting possible solutions. By assuming independence of all random variables within a cluster, we ensure that the highest entropy distribution can be constructed easily using a mixture of product distributions. For independent random variables, the highest entropy distribution is the product distribution. 
%Further, we highlight that Assumption \ref{assum:cluster_independence} aligns with the principle of maximum entropy: Although we allow dependencies among the clients' variables, we aim to find the joint distribution with maximum entropy \textit{within} clusters s.t. all marginals are preserved. More formally, we aim to find a joint where all clusters consist of independent components and $\int_{\mathbf{X} \setminus \mathbf{X}_i} p(\mathbf{X}) = p(\mathbf{X}_i)$ for any $\mathbf{X}_i$ holds.
%For more details, refer to App. B.

% \subsection{Federated Circuits}
\subsection{Bridging Probabilistic Circuits and Federated Learning}
\label{subsec:fcs}

%Since we aim to train PCs over multiple machines, we must learn PCs on partitioned datasets.
%Equipped with the stated modeling assumptions from above, we now show the inherent connection between PC semantics and FL. This connection ultimately allows us to train PCs on data partitioned over a set of clients.
We now illustrate an inherent connection between PC semantics and FL, allowing us to scale PCs on large datasets by partitioning the data over a set of clients.
%\fb{Suggestion for previous paragraph:}
%\color{blue}
%We now illustrate an inherent connection between PC semantics and %FL.
%This will allow us to train PCs on data partitioned over a set of %clients and thus greatly increase the scaling potential of PCs.
%\color{black}

\textbf{Sum Nodes and Horizontal FL.} In horizontal FL, each client is assumed to hold the same set of features, i.e., $\mathbf{X}_c = \mathbf{X}_{c'}$ for all $c, c' \in \mathcal{C}$. However, each client holds different samples. 
Prominent horizontal FL methods aggregate the \textit{model parameters} of locally learned models regularly during training.
However, the horizontal FL setting also precisely corresponds to the interpretation of sum nodes in PCs: A sum node splits a dataset into multiple disjoint clusters. This results in a mixture distribution representing the data that is learned from the disjoint clusters. 
Thus, instead of aggregating model parameters, we aggregate the \textit{distributions} learned by each client on its subset.

\begin{defin}[Horizontal FL]\label{def:HFL}
    Assume a set of samples $\mathbf{D}_c \sim p_c$ on each client $c \in \mathcal{C}$, a joint distribution $p$ adhering to Assumption \ref{assum:decomposition} and that $\mathbf{X}_c = \mathbf{X}_{c'}$ for all $c, c' \in \mathcal{C}$ s.t. $c \neq c'$. We define horizontal FL as fitting a mixture distribution $\hat{p} = \sum_{c \in \mathcal{C}} q(c) \cdot \hat{p}_c$ such that $d(\hat{p}, p)$ and $d(p_c, \hat{p}_c)$ are minimal for all $c \in \mathcal{C}$ where $d$ is a distance metric and $\hat{p}_c$ are local distribution estimates.
\end{defin}

This view on horizontal FL has an appealing positive side effect: Aggregating model parameters can lead to divergence during training if the clients' data distributions significantly differ. Since we aggregate distributions in mixtures, we naturally can handle heterogeneous client distributions. Also, since clients can train models independently, the communication cost of the training is minimized.

\textbf{Product Nodes \& Vertical FL.} In vertical FL, each client is assumed to hold a disjoint set of features, i.e., $\mathbf{X}_c \cap \mathbf{X}_{c'} = \emptyset$ for all $c, c' \in \mathcal{C}$ s.t. $c \neq c'$. In contrast to horizontal FL, all clients hold different features belonging to the same sample instances.
%Since all clients hold different features, existing vertical FL approaches have to share some kind of data statistics (e.g., feature representations) to learn a model collaboratively. 
As in horizontal FL, there is a semantic connection between vertical FL and PCs. Product nodes in PCs compute a product distribution defined on a disjoint set of random variables. Thus, a product node separates the data along the feature dimension, corresponding to the vertical FL setting. However, a product node assumes the random variables of the child distributions to be independent of each other. Obviously, this is an unrealistic assumption for vertical FL, where features held by different clients might be statistically dependent. Assumption \ref{assum:cluster_independence} can be exploited to capture such dependencies, and a mixture of products of independent clusters can be formed. See Sec. \ref{subsec:FedPCs} for details.

\begin{defin}[Vertical FL]\label{def:VFL}
    Assume a set of samples $\mathbf{D}_c \sim p_c$ on each data owner $c \in \mathcal{C}$, the existence of a joint distribution $p$ adhering to Assumptions \ref{assum:decomposition} and \ref{assum:cluster_independence} and that $\mathbf{X}_c \cap \mathbf{X}_{c'} = \emptyset$ holds for all $c, c' \in \mathcal{C}$ s.t. $c \neq c'$. We define vertical FL as estimating a joint distribution $\hat{p}$ s.t. $d(p, \hat{p})$ is minimal and $\int_{\mathbf{X} \setminus \mathbf{X}_c} \hat{p}(x) = \hat{p}_{c}(x)$ for all $x \in \mathcal{X}$ where $d$ is a distance metric and $\hat{p}_c$ are estimates of client distributions.
\end{defin}

%\begin{defin}[Hybrid FL]\label{def:FL}
%    Given a data partition $\mathbf{D}_C \sim p_C$ on each client $C \in \mathcal{C}$ s.t. $\mathbf{D} = \bigcup_{C \in \mathcal{C}} \mathbf{D}_C$ and assuming the existence of a joint distribution $p$ from Assumption \ref{assum:decomposition}, we define hybrid federated learning as fitting a joint distribution $\hat{p}$ over all random variables $\mathbf{X}$ s.t. $d(p, \hat{p})$ and $d(p_C, \hat{p}_C)$ are minimal for some distance metric $d$ and all $p_C, \hat{p}_C$ where $p_C$ and $\hat{p}_C$ are the true and estimated marginal distribution over client random variables $\mathbf{X}_C \subseteq \mathbf{X}$.
%\end{defin}
\textbf{PCs \& Hybrid FL.}
Given Defs. \ref{def:HFL} and \ref{def:VFL}, hybrid FL is a combination of both. 
%Note that this definition does not make any assumptions on the topology of the communication network. Both, centralized as well as decentralized FL approaches can be taken without violating Assumption \ref{assum:decomposition} as long as the marginalization of the joint distribution $p$ can be represented as a mixture.
%With Definition \ref{def:FL}, we can draw the central connection between learning PCs and performing (hybrid) FL: 
%While $\mathbf{D}$ is assumed to come from a distribution $p(\mathbf{X})$, each client $C \in \mathcal{C}$ only holds a partition $\mathbf{D}_C$. Since there is no further assumption on the partitioning, the set of random variables and samples can vary across clients. Thus, each client $C$ can only learn a local distribution $p_C$ over the random variables and samples contained in its partition $\mathbf{D}_C$.
In terms of PC semantics, this amounts to building a hierarchy of fusing marginals and learning mixtures.
Provided with these probabilistic semantics, we can now formally bridge PCs and FL.
In the following, we distinguish between clients $\mathcal{C}$ and servers $\mathcal{S}$ and define the set of machines participating in training as $\mathcal{N} = \mathcal{C} \cup \mathcal{S}$. Bringing everything together and abstracting from the probabilistic interpretation, we define \textbf{federated circuits} (FCs) as follows.
%Horizontal FL can now be derived as a special case of Definition \ref{def:FL}.

%Definition \ref{def:HFL} is appealing in various ways: With horizontal FL, we obtain a straightforward way to scale PCs simply by partitioning $\mathbf{D}$ along the sample dimension, followed by learning a mixture of PCs. 
%For FL, Definition \ref{def:HFL} reduces the problem of horizontal FL to fitting a mixture model given estimates of local distributions $\hat{p}_C$. 
%From a FL viewpoint, Definition \ref{def:HFL} shifts the burden of merging distributional information of a set of clients from the model parameter space to the distribution space (or parameter space of the mixture model). This approach empowers independent training of local models without communication and eliminates the need to aggregate model parameters as it is typically done~\citep{McMahan2016FedAvg, Sahu2018FedProx, Karimireddy2020SCAFFOLD}.

%Similar to the horizontal case, vertical FL also becomes a special case of Definition \ref{def:FL}.

% TODO: Put it that way: Instead of directly talking about maximum-entropy distributions (and thus leading to independency assumption), state that variables among clients can have dependencies. Hence we propose our more general model/distribution assumption first, then say that this subsumes maximum-entropy distributions which in turn make a minimal set of assumptions (only independence instead of that each client distribution is representable as a mixture AND mixtures are independent).
%In vertical FL, data is partitioned s.t. clients only see \textit{marginals} of the joint distribution w.r.t. the variables they hold as, by definition, there is no overlap in the feature space among clients. The same can occur in hybrid FL since clients do not necessarily share the same feature space (although overlaps are allowed). Assumption \ref{assum:cluster_independence} ensures that dependencies among variables residing on different clients can be learned in vertical and hybrid FL. Also, Assumptions \ref{assum:decomposition} and \ref{assum:decomposition} ensure that all client distributions $p_C$ can be represented as a marginal of the full joint $p(\mathbf{X})$.

\begin{defin}[Federated Circuits]\label{def:fcs}
A \textbf{federated circuit} (FC) is a tuple $(\graph, \psi_{\graph}, \omega)$ where $\graph = (V, E)$ is a rooted, Directed Acyclic Graph (DAG), $\psi_{\graph}: V \rightarrow \mathcal{N}$ assigns each $\Node \in V$ to a client/server $n \in \mathcal{N}$ based on the structure of $\graph$ and $\omega: V \rightarrow O$ assigns an operation $o \in O$ to each node $\Node \in V$ where $o: \text{dom}(\ch{\Node}) \rightarrow \text{dom}(\Node)$ computes the value of $\Node$ given the values of the children of $\Node$.
\end{defin}

FCs extend the definition of PCs in the sense that FCs represent a 
computational graph $\mathcal{G} = (V, E)$ distributed over multiple machines where arbitrary operations can be performed in each node $\Node \in V$. Note that the FC's graph structure is not only a mere computation graph, but also defines the information flow among server(s) and clients, which we refer to as the communication network. Also, FCs are not restricted to the probabilistic interpretation presented above. For example, parameterizing leaves by decision trees and introducing a node $\Node$ that performs averaging yields a bagging model.
%For now, let us dive deeper into the probabilistic interpretation of FCs. To that end, we present a concrete instantiation of FCs leveraging Probabilistic Circuits (PCs) as leaf models, resulting in federated PCs (FedPCs).
%Let us now illustrate FCs for learning Probabilistic Circuits (PCs) in a federated environment, resulting in federated PCs (FedPCs).
% TODO: Here we have to introduce Federated Einsums instead of FedSPNs
\subsection{Federated Probabilistic Circuits}
%\notejo{Describe FedPCs and NFCs in one framework here. Both estimate a density on clients, learning should thus be similar or even equal (except the leaf learning).}
\label{subsec:FedPCs}
Let us now dive deeper into the probabilistic interpretation of FCs. To that end, we present a concrete instantiation of FCs leveraging Probabilistic Circuits (PCs) as leaf models, resulting in federated PCs (FedPCs).
Following the probabilistic interpretation from Sec. \ref{subsec:fcs}, we align the PC structure with the communication network structure to form a federated PC.
\begin{defin}[Federated PC]
A Federated PC (FedPC) is an FC where each leaf node $\ClientNode$ is a density estimator and each node $\Node$ s.t. $\ch{\Node} \neq \emptyset$ is either a sum node ($\SumNode$) or a product node ($\ProductNode$).
\end{defin}
% \todo{namely NFs and PCs}
Note that only the client nodes $\ClientNode$ hold a dataset and we only demand the clients to be parameterized by a density estimator.
%Due to the nature of FedPCs, it is possible to use different density estimators in the leaves.
In order for FedPCs to be computationally efficient, these density estimators should be tractable.
In the following, we parameterize the leaf nodes $\ClientNode$ as PCs.

%The operation assignment $\omega$ is omitted in FedPCs as the operations performed by each node are implicitly defined (sum or product).
The assignment function $\psi$ transforms the PC's computation graph into a distributed computation graph, thus inducing a communication network. 
%Hence, $\mathcal{G}$ defines the model's computation and the communication among participants. 
This establishes a direct correspondence between PC semantics (computation graph) and the communication network structure in FedPCs. Inference is performed as usual in PCs by propagating likelihood values from the leaf nodes to the root node. The only difference is that the result of a node $\Node$ has to be sent to its parent(s) $\mathbf{pa}(\Node)$ over the communication network if $\psi(\Node) \neq \psi(\Node')$ holds for $\Node' \in \mathbf{pa}(\Node)$.

Training FedPCs requires adapting the regular training procedure for PCs because in FL, clients cannot access other clients' data. 
For example, training with Expectation Maximization (EM) requires access to the same samples for all clients, which is incompatible with horizontal and hybrid FL.
Similarly, LearnSPN~\cite {gens2013LearnSPN} requires access to all features due to independence tests performed during training.
%Since a forward pass through a PC requires the same sample to be available on each leaf, prominent learning algorithms such as Expectation Maximization (EM) are not directly applicable in horizontal and hybrid FL settings. For analogous reasons, algorithms like LearnSPN~\cite {gens2013LearnSPN} are not applicable since the use of independence tests during training requires access to all features. 
To solve this, we propose a \textit{one-pass} training procedure for FedPCs.

\SetKwComment{Comment}{/* }{ */}
\begin{algorithm}[t]
\caption{One-Pass Training}\label{alg:fedpcLearning}
\KwData{Clients $\mathcal{C}$, features $\mathbf{X}$, number of clusters $K$, number of product nodes $P$, FedPC}
\KwResult{Trained FedPC}
Set $\mathbf{M} = \mathbf{0}^{|\mathcal{C}| \times |\mathbf{X}|}$ and map $= []$\;
\Comment{mark features per client}
$\mathbf{M}_{i, j} = 1$ if $X^{(j)}$ on client $i$\;
\For{$j, \mathbf{u}$ in enum. of distinct columns $\mathcal{U}$}{
    $\mathbf{S}^{(j)} = \{i : i \in \{1, \dots, |\mathbf{X}|\} \land \text{all}(\mathbf{u} == \mathbf{M}_{:, i}) \}$\;
    \Comment{Group clients per feature}
    $O_{\mathbf{S}^{(j)}} = \text{argwhere}(\mathbf{u} == 1)$\;
    map.append($\mathbf{S}^{(j)}$, $O_{\mathbf{S}^{(j)}}$)\;
}
sums $= []$\;
\For{$\mathbf{S}^{(j)}$, $O_{\mathbf{S}^{(j)}}$ in map}{
    \If{$|O_{\mathbf{S}^{(j)}}| > 1$}{
        s $=$ fedPC.add\_sum($\mathbf{S}^{(j)}$, $O_{\mathbf{S}^{(j)}}$)\;
        sums.add(s)\;
    }
    \Else{
        client\_clusters $=$ cluster\_local\_data($O_{\mathbf{S}^{(j)}}$, $K$)\;
    }
}
products $=$ fedPC.add\_products($P$)\;
\Comment{Construct random structure over non-shared features}
\For{prod in products}{
    prod.children.add(sums)\;
    \For{client, clusters in client\_clusters}{
        prod.children.add\_rand\_subset(clusters)\;
    }
}
fedPC.add\_mixture\_over\_products(products)\;
fedPC.train\_clients()\;
fedPC.infer\_weights()\;
\Return fedPC
\end{algorithm}
%\vspace{-1cm}

\textbf{One-Pass Training.} Our one-pass learning algorithm learns the structure and parameters of FedPCs such that local models can be trained independently (Algo. \ref{alg:fedpcLearning}, Fig.~\ref{fig:algo}).
%First, clients train local models independently, followed by inferring the FedPC parameters residing on the server-/network-side. This can be interpreted as learning to correctly weight the distributions local PCs represent to maximize, e.g., log-likelihood.
Before training, all clients $c \in \mathcal{C}$ share their set of uniquely identifiable features/random variables $\mathbf{X}_c$ with a server, resulting in the feature set indicator matrix $\mathbf{M} \in \{0, 1\}^{|\mathcal{C}| \times |\mathbf{X}|}$ \textbf{(Lines 1-2)}. Feature identifiers can be names of features such as ``account balance'' and must correspond to the same random variable on all clients (thus uniquely identifiable). Then, the server divides the joint feature space $\mathbf{X}$ into disjoint subspaces $\mathbf{S}^{(j)}$. For this, we consider the set of distinct column vectors $\mathcal{U}$ of $\mathbf{M}$ where we denote distinct vectors as $\mathbf{u}$. Since each column of $\mathbf{M}$ indicates the set of clients a feature resides on, we can use each $\mathbf{u} \in \mathcal{U}$ to compute a set of features that are shared across the same set of clients. This results in $|\mathcal{U}|$ distinct feature sets, denoted $\{\mathbf{S}^{(1)}, \dots, \mathbf{S}^{(|\mathcal{U}|)}\}$. Each $O_{\mathbf{S}^{(j)}}$ denotes the set of clients that hold the features in $\mathbf{S}^{(j)}$ \textbf{(Lines 3-7)}. This procedure is illustrated in Fig.~\ref{fig:algo} (top).

Afterward, the FedPC structure is constructed as shown in Fig.~\ref{fig:algo} (bottom): First, we build a mixture (sum node) for each subspace $\mathbf{S}^{(j)}$ where $|O_{\mathbf{S}^{(j)}}| > 1$, i.e., more than one client holds $\mathbf{S}^{(j)}$ \textbf{(Lines 9-12)}. This enables each client to learn a PC over $\mathbf{S}^{(j)}$ independently. After that, $|O_{\mathbf{S}^{(j)}}| = 1$ holds for all remaining $\mathbf{S}^{(j)}$. Also, the scopes of the sum nodes introduced in the FedPC share no features with any of the remaining $\mathbf{S}^{(j)}$
since the server divided the feature space into disjoint subspaces. Therefore, we introduce $P$ product nodes to construct the remaining part of the FedPC.
To this end, we divide the data of all subspaces $\mathbf{S}^{(j)}$ where $|O_{\mathbf{S}^{(j)}}| = 1$ holds into $K$ clusters \textbf{(Line 14)}. Each client learns a dedicated PC for each cluster. To ensure that the FedPC spans the 
entire feature space of the clients, the children of product nodes are set as follows: Each sum node introduced 
in the FedPC becomes a child of each product node. Additionally, for each $\mathbf{S}^{(j)}$ where $|O_{\mathbf{S}^{(j)}}| = 1$ holds, we randomly select a PC learned over one of the $K$ clusters s.t. the scope of each product node spans $\mathbf{X}$, and each PC representing a cluster is the child of at least one product node.
Then, we build a mixture over all product nodes using a sum node \textbf{(Lines 15-20)}. Note that we seek to construct product nodes over independent clusters, which aligns with the maximum entropy principle (see App. \ref{app:max_entropy} for details).
Once the FedPC is constructed, all client-sided PCs are learned. Since clients learn their PCs independently, each client can use an arbitrary learning algorithm (even different ones).
As a last step, the network-sided parameters, i.e., the weights of network-sided sum nodes, of the FedPC are inferred \textbf{(Lines 21-22)}. For each sum node $\SumNode$, the weight $\mathbf{w}_{\SumNode}^{(i)}$ associated with the $i$-th child (i.e., distribution) of $\SumNode$ is set to $\frac{\rho(\Node_i)}{\sum_j \rho(\Node_j)}$. Here, $\rho(\Node_i) = \sum_{\ClientNode \in \ch{\Node_i}} |\mathbf{D}_{\ClientNode}|$ where $\mathbf{D}_{\ClientNode}$ is the dataset used to train the leaf $\ClientNode$. Hence, the network-sided weights can be inferred without any forward or backward pass.
Note that this approach reduces horizontal FL to learning a mixture of the clients' data distributions and vertical FL to learning a mixture over $P$ product nodes.

%Next, we analyze the communication efficiency of our proposed learning algorithm.

\begin{figure*}
    \centering
    \includegraphics[width=.8\textwidth]{federated-circuits/images/Algotithm.pdf}
    \caption{\textbf{One-Pass Training Visualized.} (Top) First, the matrix $\mathbf{M}$ is initialized, representing which features are held by which client. Feature subsets are constructed by considering distinct column vectors $\mathbf{u}$ of $\mathbf{M}$ that represent the same set of clients. This forms a mapping indicating which features are modeled as a mixture over clients. (Bottom) This mapping is utilized by forming mixtures over different clients sharing the same feature set via sum nodes. Features that are not shared over multiple clients will be clustered into $K$ clusters (here $K=2$). The FedPC is formed by creating product nodes containing all sum nodes from the previous steps and at least one of the $K$ clusters. Lastly, the root node is inserted.}
    \label{fig:algo}
    %\vspace{-0.5cm}
\end{figure*}

%\subsection{Federated Circuits and Bagging}
%\label{subsec:bagging}
%We already unveiled the deep connection between PCs and FL that we formalized in the FC framework. Interestingly, FCs have a connection to another prominent modeling approach, namely bagging. If the leaves of a PC are replaced by discriminative learners that learn a conditional distribution $p_C(\mathbf{Y} | \mathbf{X}_C)$ instead of a joint, and the FC performs a simple averaging operation, the FC can be interpreted as a bagging model.

\subsection{Analysis of Communication Efficiency}\label{subsec:CommunicationAnalysis}
%\notejo{This analysis does only hold for FedPCs (and maybe NFCs)!}
Communication efficiency is a key requirement for efficient training when learning models at scale on partitioned data. We therefore now analyze the communication efficiency of FedPCs.

\textbf{Horizontal FL.}
Assume a client set $\mathcal{C}$, each holding a model with $M$ parameters. Further, assume models are aggregated $K$ times during training ($K$ communication rounds). Then, model aggregation-based algorithms like FedAvg, commonly used in horizontal FL, send $\mathcal{O}(M \cdot |\mathcal{C}| \cdot K)$ messages over the network. $K$ usually ranges from hundreds to even thousands, depending on the task and model complexity.
%as each client sends $M$ model parameters to a server in each communication round. 
Training FedPCs with one-pass training, in contrast, only requires $\mathcal{O}(|\mathcal{C}| \cdot (M + 1))$ messages over the network as models are learned locally and independently, followed by setting the parameters ($\mathcal{O}(|\mathcal{C}|)$ messages) of the sum nodes and aggregating the model on the server ($\mathcal{O}(M |\mathcal{C}|)$ messages).

\textbf{Vertical FL.}
In vertical settings, SplitNN-like architectures are commonly used. Assume training a SplitNN architecture for $E$ epochs that outputs a feature vector of size $F$ for each sample of a dataset with $S$ samples, vertically distributed over clients $\mathcal{C}$. The training requires sending $\mathcal{O}(E \cdot |\mathcal{C}| \cdot F \cdot S)$ messages over the network. In contrast, with one-pass training of FedPCs, each client learns a dedicated PC with $M$ parameters for each of the $K$ clusters that are learned. The last layer of the FedPC is a mixture of $P$ products of clusters. The mixture parameters are set after training each client's model.
Aggregating the learned models and setting the network-sided mixture parameters requires $\mathcal{O}(K \cdot M \cdot |\mathcal{C}| + P)$ messages to be sent. 
If $(K \cdot M + \frac{P}{|\mathcal{C}|}) < (E \cdot F \cdot S)$ holds, training FedPCs is more communication efficient than training SplitNN-like architectures. In practice, this is likely to hold: The number of clusters is usually smaller than $100$ while feature vectors can have hundreds of dimensions (i.e., $F > 100$). Further, models should have fewer parameters than samples in the dataset to ensure generalization (i.e., $M < S$). $P$ can be set to an arbitrary value, depending on $|\mathcal{C}|$ and the data.
App. \ref{app:commeff} provides more details and an intuition on communication costs.
%For an intuition on communication costs, we show a plot on communication costs w.r.t. client number, model- and data size in App. C.

\textbf{Hybrid FL.}
In hybrid FL, FedPCs are trained on several subspaces: Some exist on all or a subset of clients (denoted as $R_s$) and some are only available on one client (denoted as $R_d$). Further, denote communication costs of FedPCs in horizontal FL and vertical FL as $C_h$ and $C_v$, respectively. Since the training procedure in hybrid cases essentially performs horizontal FL on shared feature spaces and vertical FL on disjoint feature spaces, $\mathcal{O}(|R_s| \cdot C_h + |R_d| \cdot C_v)$ messages are sent over the network during training.

%\begin{rem}
%    When scaling PCs using FedPCs, we do not aggregate the models after training. This distributes computation load across multiple machines also during inference and further decreases communication costs during training.
%\end{rem}

% \subsection{FedPCs for Large-Scale Problems \fb{remove?}}
% % FedPCs are a novel contribution to the FL community, as they provide a framework for combining probability distributions in a hybrid FL setting. 
% Apart from providing a general framework for combining probability distributions in a hybrid FL setting, the intersection of PCs with FL results in benefits for scaling tractable, probabilistic models to large-scale problems.
% While most previous work has focused on speed or scaling of PCs on single devices, parallelizing across devices not only increases speed of training and inference, which might be crucial in real applications where fast inference is necessary, but, thereby, also allows for training larger models in the same or even a smaller amount of time.
% As we will show in our experiments, leveraging the FL setting for PCs improves computation speed throughout and, particularly for complex problems with a large number of samples and features, can also increase performance by large amounts.
% FedPCs allow for combining different types of probability density estimators, so any future progress on PCs or other models can easily be integrated into our framework, improving performance even further.

\section{Experiments}
% \begin{table}[]
% \resizebox{\columnwidth}{!}{
% \centering
% \begin{tabular}{c|ccccccc}
%        & cent.           & \multicolumn{2}{c}{horizontal}    & \multicolumn{2}{c}{vertical}         & \multicolumn{2}{c}{hybrid}        \\
%        & LL              & LL              & Rel. Runtime    & LL              & Rel. Runtime       & LL              & Rel. Runtime    \\ \hline
% MNIST  & $3352 \pm 3.5$  & $3350 \pm 3.2$  & $\mathbf{0.07} \pm \mathbf{0.01}$ & $3351 \pm 3.8$  & $0.13 \pm 0.01$    & $3349 \pm 3.7$  & $0.13 \pm 0.02$ \\
% Income & $-11.5 \pm 0.1$ & $-11.4 \pm 3.5$ & $\mathbf{0.17} \pm \mathbf{0.02}$ & $-11.9 \pm 3.3$ & $0.236 \pm $0.01\$ & $-12.0 \pm 1.5$ & $0.21 \pm 0.02$ \\
% Cancer & $-38.9 \pm 0.3$ & $-38.5 \pm 1.1$ & $\mathbf{0.21} \pm \mathbf{0.07}$ & $-38.6 \pm 0.5$ & $0.35 \pm 0.05$    & $-38.7 \pm 1.5$ & $0.35 \pm 0.1$  \\
% Credit & $-12.8 \pm 1.0$ & $-13.1 \pm 0.5$ & $0.42 \pm 0.05$ & $-12.5 \pm 2.3$ & $\mathbf{0.31} \pm 0.09$    & $-12.5 \pm 1.3$ & $0.40 \pm 0.13$
% \end{tabular}
% }
% \caption{\textbf{FedPCs speed up training while retaining model performance.} We trained PCs in a centralized setting (cent.) and in all FL settings (using FedPCs) on different datasets and the same structure learning algorithm. We find that FedPCs tremendously speed up training (reported as relative runtime w.r.t. centralized training where relative centralized runtime is 1). Also, FedPCs achieve the same log-likelihood as centralized PC on all datasets, demonstrating that PCs can be learned in federated settings (positive log-likelihoods due to Gaussian leaves).}
% \label{tab:likleihoods}
% \end{table}

\begin{table*}[]
\resizebox{\textwidth}{!}{
\centering
\begin{tabular}{c|cccc|cccc}
       & \multicolumn{4}{c}{Log-Likelihood}    & \multicolumn{4}{c}{Relative Runtime} \\
       & cent.              & horizontal              & vertical    & hybrid              & cent. & horizontal       & vertical              & hybrid    \\ \hline
MNIST  & $3352$\scriptsize{$\pm 3.5$}  & $3350$\scriptsize{$\pm 3.2$}  & $3351$\scriptsize{$\pm 3.8$} & $3349$\scriptsize{$\pm 3.7$} & $1.0$ & $\mathbf{0.07}$\scriptsize{$\pm \mathbf{0.01}$} & $0.13$\scriptsize{$\pm 0.01$}  & $0.13$\scriptsize{$\pm 0.02$} \\
Income & $-11.5$\scriptsize{$\pm 0.1$} & $-11.4$\scriptsize{$\pm 3.5$} & $-11.9$\scriptsize{$\pm 3.3$} & $-12.0$\scriptsize{$\pm 1.5$} & $1.0$ & $\mathbf{0.17}$\scriptsize{$\pm \mathbf{0.02}$} & $0.236$\scriptsize{$\pm 0.01$} & $0.21$\scriptsize{$\pm 0.02$} \\
Cancer & $-38.9$\scriptsize{$\pm 0.3$} & $-38.5$\scriptsize{$\pm 1.1$} & $-38.6$\scriptsize{$\pm 0.5$} & $-38.7$\scriptsize{$\pm 1.5$} & $1.0$ & $\mathbf{0.21}$\scriptsize{$\pm \mathbf{0.07}$} & $0.35$\scriptsize{$\pm 0.05$} & $0.35$\scriptsize{$\pm 0.1$}  \\
Credit & $-12.8$\scriptsize{$\pm 1.0$} & $-13.1$\scriptsize{$\pm 0.5$} & $-12.5$\scriptsize{$\pm 2.3$} & $-12.5$\scriptsize{$\pm 1.3$} & $1.0$ & $0.42$\scriptsize{$\pm 0.05$} & $\mathbf{0.31}$\scriptsize{$\pm 0.09$} & $0.40$\scriptsize{$\pm 0.13$}
\end{tabular}
}
\caption{\textbf{FedPCs speed up training while retaining model performance.} We trained PCs in a centralized setting (cent.) and in all FL settings (using FedPCs) on different datasets and the same structure learning algorithm. We find that FedPCs tremendously speed up training while there is no reduction in log-likelihood. This demonstrates that PCs can be learned in federated settings (for MNIST, log densities are reported). We report relative runtime where centralized runtime is 1.0.}
\label{tab:likleihoods}
\vspace{-0.25cm}
\end{table*}

\begin{figure*}[t]
    \begin{minipage}{0.48\textwidth}
        \includegraphics[width=.85\textwidth]{federated-circuits/images/runtimes.pdf}
        \caption{\textbf{FedPCs speed up training} on large-scale image data (64x64 and 32x32 RGB images) due to parallel training on separate data subsets.}
        \label{fig:runtime}
    \end{minipage}
    \hfill
    \begin{minipage}{.48\textwidth}
        \resizebox{\columnwidth}{!}{%
            \begin{tabular}{l|lll}
                              & CelebA             & Imagenet32          & Imagenet64              \\ \hline
            EiNet              & 4.93 $\pm$ \scriptsize{0.09} & 5.35 $\pm$ \scriptsize{0.06}   & 5.38 $\pm$ \scriptsize{0.06}  \\
            PyJuice           & 4.30 $\pm$ \scriptsize{0.03} & 5.19 $\pm$ \scriptsize{0.01}  & 4.95 $\pm$ \scriptsize{0.03}   \\
            PICs           & 4.11 $\pm$ \scriptsize{ 0.03}                          & 4.46 $\pm$ \scriptsize{0.02}                        & 4.42 $\pm$ \scriptsize{0.03} \\
            IncPC$^*$          & -                              & 4.19                        & 3.90 \\
            LVD            &  4.02 $\pm$ \scriptsize{0.02}                              & 4.39 $\pm$ \scriptsize{0.04}                       & 4.12 $\pm$ \scriptsize{0.03} \\
            FedPC (2 cl.)  & 4.14 $\pm$ \scriptsize{0.07} & 4.50 $\pm$ \scriptsize{0.03}  & 3.84 $\pm$ \scriptsize{0.03} \\
            FedPC (4 cl.)  & 4.10 $\pm$ \scriptsize{0.07} & 4.34 $\pm$ \scriptsize{0.04}  & 3.69 $\pm$ \scriptsize{0.04} \\
            FedPC (8 cl.)  & \underline{3.98} $\pm$ \scriptsize{\underline{0.06}} & \underline{4.29} $\pm$ \underline{\scriptsize{0.03}}  & \underline{3.61} $\pm$ \underline{\scriptsize{0.04}}  \\
            FedPC (16 cl.) & \textbf{3.87} $\pm$ \scriptsize{\textbf{0.04}} & \textbf{4.18} $\pm$ \textbf{\scriptsize{0.04}} & \textbf{3.54} $\pm$ \textbf{\scriptsize{0.06}}
            \end{tabular}
            }
            \captionof{table}{\textbf{FedPCs outperform EiNets and PyJuice on density estimation tasks.} FedPCs achieve better results on density estimation tasks on CelebA, Imagenet32, and Imagenet because they can learn large models distributed across multiple machines. Results reported in bits per dimension (bpd) (lower is better). Values for baselines marked with $^*$ were taken from the respective papers (code not available). Best in \textbf{bold}, 2nd best \underline{underlined}.}
            \label{tab:imageLLs}
    \end{minipage}
    \vspace{-0.5cm}
\end{figure*}

%\todo{add dataset information and setup}
Our empirical evaluation corroborates that FedPCs can be leveraged to scale up PCs effectively via data and model
partitioning. By performing horizontal, vertical and hybrid FL in one unified framework, we obtain high-performing models with the same or improved performance compared to prominent FL baselines. 

%Also, we confirm that FCs perform horizontal, vertical, and hybrid FL in one unified framework, yielding high-performing models. 
%The main goal of this work was to scale up PCs by distributing model training to multiple machines. On this path, we found a deep connection between PCs and FL, resulting in a novel and flexible FL framework of federated circuits. 
%Thus, we evaluate FCs w.r.t. both its upscaling capabilities of PCs and its performance in horizontal, vertical, and hybrid FL.
We aim to answer the following questions: \textbf{(Q1)}~Can FedPCs decrease the required training time and successfully learn a joint distribution over distributed data? \textbf{(Q2)}~Do FedPCs effectively scale up PCs, thus yielding more expressive models?
\textbf{(Q3)}~How do FCs with different parameterizations perform on classification tasks compared to existing FL methods? \textbf{(Q4)}~How does our one-pass learning algorithm compare to training with the EM algorithm?

%\begin{figure}
%    \vspace{-0.1cm}
%    \includegraphics[width=.9\linewidth]{federated-circuits/images/runtimes.pdf}
%    \caption{\textbf{FedPCs speed up training.} Due to parallel training on multiple data partitions, FedPCs tremendously speed up training compared to EiNet (shown in relative speed-up).}
%    \label{fig:runtime}
%    \vspace{-0.7cm}
%\end{figure}

\textbf{Experimental Setup.} To see if FedPCs, an instantiation of FCs, successfully scale up PCs, we follow \citet{liu2024scalingtractableprobabilisticcircuits} and perform density estimation on three large-scale, high-resolution image datasets: Imagenet, Imagenet32 (both 1.2M samples), and CelebA (200K samples). The datasets were partitioned horizontally over 2--16 clients. We compare FedPCs against EiNets~\citep{perharz2020einsum}, PyJuice~\citep{liu2024scalingtractableprobabilisticcircuits}, Probabilistic Integral Circuits (PICs)~\citep{gala2024PICs}, Inception PCs (IncPCs)~\citep{wang2025IncPC}, and Latent Variable Distillation (LVD)~\citep{liu2023LVD} on 5 different seeds.

To evaluate FCs in FL scenarios, we selected three tabular datasets that cover various application domains and data regimes present in the real world: one credit fraud dataset ($\sim 300$K samples), a medical dataset (breast cancer detection; $<1000$ samples), and the popular Income dataset ($>1$M samples). The selected datasets for FL cover low-data, medium-data, and large-data regimes (see App.~\ref{app:exp_details} for more details). Both balanced (breast cancer) and imbalanced (income, credit) datasets are included in our evaluation. We selected tabular datasets as they are well suited to investigate FCs in horizontal, vertical, and hybrid settings and represent various real-world applications. We compare FCs to multiple strong and widely used baselines. As a neural network architecture parameterization, we use TabNet~\citep{arik2020tabnetattentiveinterpretabletabular} which is tailored to tabular datasets. We train the networks with the widely used FedAvg (horizontal FL) and SplitNN (vertical FL) frameworks. Additionally, we compare FCs to FedTree~\citep{li2023fedtree} since tree models excel at tabular datasets. For details, see App. \ref{app:exp_details}.

\textbf{\textbf{(Q1)} FedPCs learn joint distributions over 
partitioned data in less time.} First, we validate that FedPCs
correctly and efficiently perform density estimation on partitioned datasets 
distributed over multiple clients. To this end, multiple datasets were distributed over a set of clients corresponding to horizontal (5 clients),
vertical (2 clients), and hybrid FL (2 clients). To demonstrate that FedPCs are also robust against label shifts, a common regime in FL, each client received data from only a subset of classes in the horizontal case, and local PCs were learned over the client samples. 
%Afterward, we aggregated the PCs on server side and constructed the FC by adding a sum-node (the root of the FedPC) with weights set as $w_C = \frac{|\mathbf{D}_C|}{\sum_{C \in \mathcal{C}} |\mathbf{D}_C|}$. 
In the vertical case, we split data s.t. feature spaces of clients 
are disjoint, but
each client 
holds the same samples. In hybrid settings, data was distributed s.t. both feature- and sample-spaces among 
clients have overlaps (but no full overlap).
For all tabular datasets, the leaves of the FedPC were parameterized with MSPNs~\citep{molina2018MSPNs}, a member of the 
PC model family capable of performing density estimation on mixed data domains (i.e., continuous and discrete random variables). 
We chose MSPNs as the centralized models, which were learned using \textsc{learnSPN}, a recursive greedy structure learning algorithm for SPNs~\citep{gens2013LearnSPN}. For MNIST, EiNets with Gaussian densities were used as PC instantiations in all settings. Note that FedPCs were chosen to approximately match the size of centralized models, i.e., no model upscaling was performed.

Tab. \ref{tab:likleihoods} compares log-likelihoods and relative runtime of centralized PC training on the full datasets with log-likelihood scores and relative runtimes achieved by FedPC in different FL settings. FedPCs achieve the same log-likelihoods as centralized PCs on tabular datasets while being tremendously faster in training. Hence, we answer \textbf{(Q1)} affirmatively.
%\begin{figure}[t]
%    \centering
%    \includegraphics[width=\columnwidth]{federated-circuits/images/lls_rt.pdf}
%    \caption{\textbf{FedPCs efficiently scale up PCs.} We performed centralized training of a PC and training of FedPCs with varying number of clients participating (repeated 10 times). Each client trained a separate model on a subset of Imagenet, i.e. the model size increases with the number of clients. Scaling up to more clients lead to a significant decrease in runtime (lower is better) while achieving better negative log-likelihood values (lower is better) due to larger models. \notejo{}}
    %\label{fig:lls_vs_rt}
%\end{figure}

\begin{figure*}[t]
    \centering
    \includegraphics[width=.85\textwidth]{federated-circuits/images/fl_settings_results.pdf}
    \caption{\textbf{FCs are competitive with prominent FL methods in all settings.} FCs achieve competitive performance on various classification tasks compared to prominent horizontal/vertical FL baselines. FCs also handle the more challenging setting of hybrid FL without performance drops. We report the F1 score (higher is better).}
    \label{fig:fl_results}
    \vspace{-0.25cm}
\end{figure*}

\textbf{(Q2) FedPCs effectively scale up PCs.} To examine whether FedPCs can be leveraged to scale up PCs effectively, we compared FedPCs to EiNets, PyJuice, PICs, IncPCs, and LVD on CelebA, Imagenet32, and Imagenet. EiNets, PyJuice, and FedPCs were parameterized with the Poon-Domingos (PD) architecture. PICs define the architecture based on quadrature integration points, while IncPC and LVD use the HCLT structure. FedPCs used EiNets in the clients, and data was distributed among 2, 4, 8, and 16 clients. The FedPC model and baseline models were selected to ensure that each fits within a single GPU (see App.~\ref{app:exp_details} for system details). EiNets and FedPCs were parameterized with Gaussian leaf distributions, while the other baselines were parameterized with categorical distributions. The parameterizations were chosen based on empirical observations; for EiNets and FedPCs, Gaussians worked better than categorical distributions.
%\begin{table}[t]
%\centering
%\begin{tabular}{l|lll}
%                  & CelebA             & Imagenet32          & Imagenet              \\ \hline
%EiNet              & -3.42 \scriptsize{$\pm$ 0.06} & - $\pm$ -   & - $\pm$ -  \\
%PyJuice           & -2.98 \scriptsize{$\pm$ 0.02} & -3.60 $\pm$ \scriptsize{0.01}  & -3.43 $\pm$ \scriptsize{0.02}   \\
%FedPC (2 cl.)  & -2.87 \scriptsize{$\pm$ 0.05} & -2.66 $\pm$ \scriptsize{0.02}  & - $\pm$ - \\
%FedPC (4 cl.)  & -2.84 \scriptsize{$\pm$ 0.05} & -2.56 $\pm$ \scriptsize{0.03}  & - $\pm$ - \\
%FedPC (8 cl.)  & \underline{-2.76} \scriptsize{$\pm$ \underline{0.04}} & - $\pm$ -  & - $\pm$ -  \\
%FedPC (16 cl.) & \textbf{-2.68} \scriptsize{$\pm$ \textbf{0.03}} & - $\pm$ - & - $\pm$ - 
%\end{tabular}
%\caption{\textbf{FedPCs outperform EiNets and PyJuice on density estimation tasks.} FedPCs achieve better results on density estimation tasks on three challenging image datasets (CelebA, Imagenet32, and Imagenet). This is because FedPCs can learn far larger models distributed across multiple machines. Results are reported in nats (higher is better). Best value in \textbf{bold}, 2nd best \underline{underlined}.}
%\label{tab:imageLLs}
%\vspace{-0.5cm}
%\end{table}

For FedPC training, the images were distributed horizontally at random, s.t. each client holds approximately equally large subsets. The leaves and all baselines were trained with EM.
In Tab. \ref{tab:imageLLs}, we show bits per dimension (bpd) achieved by FedPCs and our baselines on the test set. 
It can be seen that with an increasing number of participating clients and, thus, a larger model, the density estimation performance also increases on all three datasets. We posit that this is because larger models exhibit higher expressivity, allowing them to capture statistical characteristics of the data better than smaller models. Also, lower bpd scores achieved on the test set by larger models indicate that no overfitting appeared due to more model parameters. However, note that more exhaustive scaling will likely lead to overfitting. Finding the optimal model size/number of clients in a principled way is beyond the scope of this work and is left for future endeavors.
%For Imagenet and Imagenet32, model performance improves with more participating clients. On CelebA, nats decrease when we scale up to two participating clients. For 8 and 16 clients, nats increases again. We posit that this is because CelebA consists of a low number of relatively homogeneous clusters. Thus, increasing the cluster and model size to 8/16 could lead to overfitting and thus decreasing log-likelihoods. Since Imagenet consists of much more heterogeneous images, larger models and a larger number of clusters are beneficial for learning (see App. \ref{app:results} for more details).
Besides better modeling performance, a larger number of clients reduces training time significantly (see Fig. \ref{fig:runtime}). FedPCs thus efficiently scale tractable probabilistic models to large datasets.

\begin{table}
%\vspace{cm}
    %\begin{table}
    \centering
    \begin{tabular}{c|cc}
    \multicolumn{1}{l|}{} & EM & one-pass \\ \hline
    Synth. Data  & $-53.6$ \scriptsize{$\pm 1.3$} & $-53.2$ \scriptsize{$\pm 1.2$}  \\
    Income  &  $-18.5$ \scriptsize{$\pm 0.1$}        & $-18.0$ \scriptsize{$\pm 0.5$} \\
    Breast-Cancer & $-52.3$ \scriptsize{$\pm 0.2$} & $-55.7$ \scriptsize{$\pm 0.2$} \\
    Credit & $-26.7$ \scriptsize{$\pm 1.2$} & $-28.3$ \scriptsize{$\pm 0.4$}
    \end{tabular}
    \caption{\textbf{One-pass training retains performance.} We trained the same FedPC architecture on various datasets using EM and one-pass training in a vertical setting. The average log-likelihood value of the hold-out test set across 10 runs is reported. 
    %\notejo{Why is 2step better on Income?}
    }
    \label{tab:e2evs2s}
    %\end{table}
    \vspace{-0.7cm}
\end{table}

\textbf{\textbf{(Q3)} FCs achieve state-of-the-art classification results in FL.}
FCs can be parameterized with different models in the leaves. We examine two parameterizations to solve a federated classification task on three tabular datasets. First, we use the FedPC (FC [PC]) from \textbf{(Q1)}, which can be used to solve discriminative tasks leveraging tractable computation of conditionals in PCs. The second FC parameterization we examine is decision trees (FC [DT]), representing an instantiation of a bagging model.
To see how FCs perform in federated classification tasks, we compare FCs to well-known methods for horizontal FL and vertical FL. The experiments were conducted on tabular datasets covering various real-world application domains and distribution properties. 
We employ TabNet and FedTree as strong baselines. In the horizontal FL setting, TabNet was trained using FedAvg; in the vertical FL setting, it was trained in a SplitNN fashion~\citep{Cellabos2020SplitNN}. The results were compared against our one-pass training. FCs yield results comparable to or better than our baselines on all datasets (see Fig.~\ref{fig:fl_results}; App.~\ref{app:results}) while being significantly more flexible than the baselines. 
%since FCs can be trained with the same unified procedure in all FL settings. 
%In contrast, training neural networks requires substantial changes to the training procedure once the FL setting switches. Hence, FCs are more flexible while still competitive or better than prominent FL baselines.

\textbf{\textbf{(Q4)} One-pass training retains performance.} To see how the proposed one-pass training compares to training PCs
with standard optimization algorithms such as EM, we define an FL setup where data exchange is allowed. 
This is necessary as we have to train the PC and FedPC architecture with EM to compare to our one-pass procedure. We used RAT-SPNs~\citep{peharz2020random} as leaf
parameterizations of the FedPC. Then, we trained a FedPC using standard EM (i.e., data exchange was allowed) and another FedPC with the same FedPC architecture on a vertically split dataset using our one-pass procedure. 
We report the final average log-likelihood of the test dataset, both for EM training 
and one-pass training (see Tab. \ref{tab:e2evs2s}). It can be seen that there is no significant decrease in log-likelihood in any case. 
%Interestingly, the one-pass training seems even to be better than EM. We suspect that that it is easier to solve the subtasks of local training independently instead of jointly optimizing all parameters of the FedPC. 
Hence, our results indicate that one-pass training is preferable since it yields comparable model performance while being more communication efficient. 
%one-pass training can be used instead of the more costly EM scheme.

\section{Conclusion}
In this work, we introduced federated circuits that hinge on an inherent connection between PCs and FL. We demonstrated that both the training speed and expressivity of PCs can be increased by learning PCs at scale across partitioned data.
%model performance due to a larger amount of computational resources that can be leveraged.
%Since our framework allows for the integration of various types of density estimators, other models and advances of PCs and other fields can be integrated seamlessly, highlighting the relevance of our federated approach for scaling.
Since our framework allows for the integration of various types of density estimators, other models and advances of PCs and other fields can be integrated seamlessly and can therefore be scaled up straightforwardly.

\textbf{Limitations and Future Work.}
While our experiments showed that scaling PCs can considerably improve training speed and performance, scaling to such large-scale models requires sufficient computational resources.
For future work, investigating other parameterizations for FCs beyond PCs is promising.
Additionally, it would be interesting to investigate how the probabilistic framework for hybrid FL could also benefit more traditional FL applications, apart from scaling PCs.

\section*{Acknowledgements}
This work is supported by the Hessian Ministry of Higher Education, Research, Science
and the Arts (HMWK; ``The Third Wave of AI''). Further, this work benefited from the
National High-Performance Computing project for Computational Engineering Sciences
(NHR4CES). 

The Eindhoven University of Technology authors received support from their
Department of Mathematics and Computer Science and the Eindhoven Artificial Intelligence
Systems Institute.

%\section{Conclusion}
%Various approaches have been proposed in the literature to scale up PCs to larger, real-world datasets. 
%In this work, we proposed to scale PCs by splitting them across multiple machines and training them on partitioned data. This unveiled a deep connection between PCs and FL, leading to a novel and highly flexible federated learning framework, called federated circuits. We demonstrated that FCs can be leveraged to efficiently scale up PCs on three large image datasets, beating existing scaling approaches to scale PCs. Further, our experiments showed that FCs achieve competitive results compared to strong FL baselines in all FL settings while being more flexible than our baselines. This makes FCs an interesting candidate for scaling PCs and various federated learning applications. 
%Numerous methods tackle horizontal, vertical, or hybrid FL individually. In this work, we revisited FL with a probabilistic lens, paving the way for unified FL approaches that address all three settings in a single algorithm. We demonstrated that our framework encompasses both, HFL and VFL and introduced a novel FL method employing PCs, yielding FedPCs. Our findings highlight FedPCs' capacity to efficiently learn joint distribution approximations, even in scenarios where data owners do not share features or samples. Also, we demonstrated that FedPCs are an effective way to scale up PCs to large scale datasets such as Imagenet.
%\textbf{Limitations.} 
%In this work, we presented Federated Circuits, a general framework to tackle FL. 
%While we demonstrated that, depending on its parameterization, FCs are closely related to PCs, employing other models is possible. Further investigation on the effectiveness of federated circuits, e.g., normalizing flows as a parameterization, is interesting and can open FCs for even more applications.

%\textbf{Further Work.} 
%Further promising research avenues include investigating whether the communication efficiency of federated circuits can be further improved with methods from distributed learning such as ring reduce~\citep{lee2020flexreduce}. 
%Further, it would be interesting to assess the effectiveness of Federated Circuits with model classes other than PCs.

\bibliography{bibfile}
%\bibliographystyle{iclr2025_conference}

\appendix
\newpage
\onecolumn

\section{Federated Learning: Goals and Approaches}
\label{app:federated_learning}
Federated Learning (FL) aims to collaboratively learn a machine learning model across a set of clients, each holding a different dataset without sharing the data with a central server (i.e., keeping data private)~\citep{li2020federatedChallenges}. The server only orchestrates the training. In FL, one distinguishes horizontal, vertical, and hybrid settings. In horizontal FL, all clients share the same features but hold different samples; in vertical FL, all clients hold the same samples but different features; and in hybrid FL, a mix of horizontal and vertical FL is assumed. Most FL approaches focus on horizontal scenarios with neural networks as models. The majority of approaches tackle horizontal FL by averaging the model on a central server regularly, thereby integrating distribution information via model merging~\citep{li2020federatedChallenges, McMahan2016FedAvg,Sahu2018FedProx}. This averaging, however, requires that each client trains the exact same neural architecture. For example, if two clients would like to train an image classifier and one client uses a convolutional neural network (CNN; \citealp{lecun1998gradient}) while the other uses a Transformer model~\citep{vaswani2017attention}, it is not possible to average the model parameters due to the different semantics and structure of CNNs and Transformers. In vertical and hybrid FL, the most prominent approaches use neural networks by sharing learned representations instead of the data directly, or by employing tree models that are synchronized by merging learned decision rules~\citep{Shih2021HyperSPN, li2023fedtree}. Sharing learned representations, however, can get costly quickly with respect to communication efficiency, leading to long training times. Merging tree models, on the other hand, is more communication efficient, but often requires additional model assumptions or complex learning algorithms, restricting the expressivity of learned models.

In contrast, Federated Circuits (FC) neither require merging of model parameters, nor do they require representations being shared or complex learning algorithms being employed. By exploiting the structural properties of probabilistic circuits (PCs) and mapping PC semantics to FL, we obtain an elegant learning framework that can handle horizontal, vertical, and hybrid FL in one framework effectively while being communication efficient.

\section{Discussion on Assumptions}
\label{app:assumptions}
As a preliminary to FCs, we introduced two assumptions that allowed us to construct the FC framework. Here, we provide some more background on these assumptions. For clarity, let us state the assumptions again.

\textbf{Assumption 1} (Mixture Marginals).
    There exists a joint distribution $p$ such that the relation $\int_{\mathbf{X} \setminus \mathbf{X}_S} p(x) = \sum_{l \in L} q(L=l) \cdot p_{S}(x | L=l)$ holds for all $x \in \mathcal{X}$. Here, $\mathbf{X}_{S} \subseteq \mathbf{X}$ is a subset of the union of client random variables $\mathbf{X} = \cup_{c \in \mathcal{C}} \mathbf{X}_c$. Further, $\mathcal{X} = \bigtimes_{i \in \{1, ..., |\mathbf{X}|\}} \mathcal{X}_i$ is the support of $\mathbf{X}$, each $p_{S}$ is defined over $\mathbf{X}_S \subseteq \mathbf{X}$ and $q$ is a prior over a latent $L$.

\textbf{Assumption 2} (Cluster Independence).
    Given disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_n$ and a joint distribution $p(\mathbf{X}_1, \cdots, \mathbf{X}_n)$, assume that a latent $L$ can be introduced s.t. the joint can be represented as $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \sum_l q(L=l) \prod_{i=1}^n p(\mathbf{X}_i | L=l)$ where $q$ is a prior distribution over the latent $L$.

As discussed in the main paper, Assumption~\ref{assum:decomposition} ensures that the data that resides on all participating clients is sufficient to learn $p(\mathbf{X})$, at least in the limit of infinitely many available samples. However, this only covers the federated learning perspective of this assumption. There is also a PC perspective on this assumption. For this, let us introduce the induced tree representation of PCs from \citet{zhaoa16CollapsedVarInf}:

\begin{defin}
    \label{def:induced_pc}
        Induced Trees~\citep{zhaoa16CollapsedVarInf}. Given a complete and decomposable PC $s$ over $\mathbf{X} = \{X_1, \dots, X_n\}$, $\mathcal{T} = (\mathcal{T}_V, \mathcal{T}_E)$ is called an induced tree PC from $s$ if
    \begin{enumerate}
        \item $\Node \in \mathcal{T}_V$ where $\Node$ is the root of $s$.
        \item for all sum nodes $\SumNode \in \mathcal{T}_V$, exactly one child of $\SumNode$ in $s$ is in $\mathcal{T}_V$, and the corresponding edge is in $\mathcal{T}_E$.
        \item for all product nodes $\ProductNode \in \mathcal{T}_V$, all children of $\ProductNode$ in $s$ are in $\mathcal{T}_V$, and the corresponding edges are in $\mathcal{T}_E$.
    \end{enumerate}
\end{defin}

We can use Def. \ref{def:induced_pc} to represent decomposable and complete PCs as mixtures~\citep{zhaoa16CollapsedVarInf}.

\begin{prop}[Induced Tree Representation]
    \label{prop:induced_tree}
        Let $\tau_s$ be the total number of induced trees in $s$. Then the output at the root of $s$ can be written as $\sum_{t=1}^{\tau_s} \prod_{(k, j) \in \mathcal{T}_{t E}} w_{k j} \prod_{i=1}^n p_t(X_i = x_i)$, where $\mathcal{T}_t$ is the $t$-th unique induced tree of $s$ and $p_t(X_i)$ is a univariate distribution over $X_i$ in $\mathcal{T}_t$ as a leaf node.
\end{prop}

Using Prop.~\ref{prop:induced_tree}, we see that any decomposable and smooth PC can be represented as a mixture without any hierarchy, i.e., we can collapse the PC structure into a structure of depth one. Since marginalizing a decomposable and smooth PC again yields a decomposable and smooth PC, and since the marginalized PC can therefore be represented as a mixture over its induced trees, Assumption~\ref{assum:decomposition} is a standard assumption in the PC literature.

Also, Assumption~\ref{assum:cluster_independence} can be viewed from a PC perspective. In popular structure learning algorithms such as LearnSPN~\citep{gens2013LearnSPN}, a PC is learned by alternating data clustering with testing for independent subsets of features. Thus, the ultimate goal of algorithms like LearnSPN is to find clusters in which subsets of random variables are considered independent in order to maximize log-likelihood. Therefore, Assumption~\ref{assum:cluster_independence} is closely related to LearnSPN and, thus, a common assumption in PC modeling.



%\section{Notation}
%\textcolor{4iCr}{
%The following table provides an overview of all symbols used throughout the paper, each with a brief description.}
%\begin{table}[h!]
%\begin{tabular}{l|l}
%Symbol & Meaning                                                                                                                                                       \\ \hline
%$\mathbf{X}$      & Set of random variables                                                                                                                                       \\
%$\mathbf{X}_c$   & Set of random variables on client c                                                                                                                           \\
%$\mathbf{D}$      & Dataset                                                                                                                                                       \\
%$\mathbf{D}_c$   & Dataset on client c                                                                                                                                           \\
%$\mathcal{C}$      & set of clients                                                                                                                                                \\
%$p$      & joint distribution                                                                                                                                            \\
%$p_c$   & marginal distribution over all random variables held by client c                                                                                              \\
%$\hat{p}$  & distribution from data                                                                                                                                        \\
%$\Node$      & node in PC/FC                                                                                                                                                 \\
%$\ClientNode$      & client node in FC                                                                                                                                             \\
%$\SumNode$, $\ProductNode$   & Sum/Product node in PC/FC                                                                                                                                     \\
%$\psi$    & scope function in PC/FC                                                                                                                                       \\
%$\omega$  & \begin{tabular}[c]{@{}l@{}}function assigning compute nodes to nodes of FC. \\ Defines alignment between FC structure and communication network.\end{tabular}
%\end{tabular}
%\end{table}


\section{Proofs}
\label{app:proofs}
In this section, we give full proofs of the propositions stated in the paper.

%\subsection{Fact 1}
%A joint distribution $p$ over disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_c$ of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_c) = \sum_l p_{\theta}(L=l) \prod_k^c p(\mathbf{X}_k | L=l)$ is strictly more expressive than a distribution of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_c) = \prod_k^c p(\mathbf{X}_k)$.

%\begin{proof}
%We have to prove two things here: (1) A mixture consisting of one component equals the product distribution for the distribution family assumed in Proposition 1 and (2) a latent variable model is strictly more expressive than the product distribution.

%\textbf{(1):} For a latent $L$ with $|\text{supp}\{L\}| = 1$ (hence $p(L)$ is a point mass), $\sum_l p_{\theta}(L=l) \prod_{k=1}^c p(\mathbf{X}_k | L=l) = \prod_{k=1}^c p(\mathbf{X}_k)$ holds as for $p_{\theta}(L=l) = 1$ for the only $l \in \text{supp}\{L\}$. Also, if there is only one mixture component, conditioning on the only component has no effect, i.e. $p(\mathbf{X}_k | L=l) = p(\mathbf{X}_k)$.

%\textbf{(2):} Assume an $n$-dimensional space $\mathcal{X}_k = \mathcal{X}_{k_1} \times \dots \times \mathcal{X}_{k_n}$ for each set of variables $\mathbf{X}_k$ and a $c \times n \times m$ tensor $\mathbb{X}$ of random variables where each $\mathbb{X}_k$ corresponds to a matrix/set of random variables $\mathbf{X}_k = (X_{1 1}, \dots, X_{n m})$, i.e. there exist $m$ random variables per dimension of $\mathcal{X}_k$. Further assume a distribution $p_{\theta_{k i j}}$ for each $\mathbb{X}_{k i j}$ parameterized by $\theta_{k i j}$ and that $\mathbb{X}_{k i j} \indep \mathbb{X}_{k' l j}$ holds for all $k \neq k'$ and $l \neq i$. Note that this does not forbid dependencies among variables within each matrix $\mathbb{X}_k$. Due to our independence assumption we can define distributions $p_{\mathbf{\theta}_{j}} = \prod_{k=1}^c p(\mathbb{X}_{k : j})$ for each $j$. Since each of these distributions is defined over $\mathcal{X}$, we can introduce a latent $L$ with support $\{1, \dots, m\}$ and associated prior $p_{\theta}(L)$, yielding a mixture of $c$ components over vectorized random variables. Hence we can write $p(\mathbb{X}) = \sum_{l=1}^C p_{\theta}(L=l) \cdot p(\mathbb{X} | L=l)$. This can be rewritten as $p(\mathbb{X}) = \sum_{l=1}^c p_{\theta}(L=l) \cdot p(\mathbb{X}_l)$. As each $p(\mathbb{X}_l)$ is a product distribution over random variables corresponding to some mixture component $j$, rewriting yields $p(\mathbb{X}) = \sum_{l=1}^c p_{\theta}(L=l) \cdot \prod_{j=1}^c p(\mathbb{X}_{l : j})$. Using (1), setting $|\text{supp}\{L\}| = 1$ and setting the number of mixtures also to $1$ yields a special case, namely the product distribution over the only defined mixture component $j$, i.e. $\prod_j p(\mathbb{X}_{l : j})$. Hence a mixture as we have defined it is strictly more expressive as a single product distribution.
%\end{proof}

%\begin{proofsketch}
%We consider the two cases that $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent and that they are not mutually independent. Note that the joint entropy can be written as $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X} | \mathbf{Y}) + h(\mathbf{Y})$. In the case of mutual independence, this reduces to $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) + h(\mathbf{Y})$. Hence it has to be shown that $h(\mathbf{X} | \mathbf{Y}) < h(\mathbf{X})$ holds if $\mathbf{X}$ and $\mathbf{Y}$ are not mutually independent. Intuitively, if $\mathbf{Y}$ lets us infer something about $\mathbf{X}$, the conditional entropy should be lower than the marginal entropy. Exploiting that the mutual information $I(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) - h(\mathbf{X} | \mathbf{Y}) \geq 0$, it follows that entropy is maximized if $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent. The full proof can be found in App. A. \qed
%\end{proofsketch}

\subsection{FedPCs and Principle of Maximum Entropy}
\label{app:max_entropy}
Assumption 2 aligns with the principle of maximum entropy: we aim to find the joint distribution with maximum entropy \textit{within} clusters while allowing for dependencies among clients’ random variables and ensuring the marginals for each client are preserved. Although multiple joint distributions can preserve the marginals, non-maximal entropy solutions introduce additional assumptions or prior knowledge, limiting flexibility. By assuming independence of all variables within a cluster, we efficiently construct the maximum entropy distribution via a mixture of product distributions.
For independent variables, the product distribution maximizes entropy, as can be shown by leveraging the joint and conditional differential entropy.
Given random variables $\mathbf{X} = X_1, \dots, X_n$ and a density $p$ defined over support $\mathcal{X} = \mathcal{X}_1 \times \cdots \times \mathcal{X}_n$, the joint differential entropy is defined as:
\begin{equation}
    h(\mathbf{X}) = - \int_{\mathcal{X}} p(x_1, \dots, x_n) \, \log p(x_1, \dots, x_n)
\end{equation}
The conditional differential entropy for two sets of random variables $\mathbf{X}$ and $\mathbf{Y}$ and a joint distribution $p(\mathbf{X}, \mathbf{Y})$ defined over support $\mathcal{X} \times \mathcal{Y}$ is defined analogously: 
\begin{equation}
    h(\mathbf{X} | \mathbf{Y}) = - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \, \log p(\mathbf{x} | \mathbf{y})
\end{equation}
Given two sets of random variables $\mathbf{X}$, $\mathbf{Y}$ with densities $p(\mathbf{X})$ and $p(\mathbf{Y})$ and support $\mathcal{X}$, $\mathcal{Y}$ respectively, the joint $p(\mathbf{X}, \mathbf{Y}) = p(\mathbf{X}) \cdot p(\mathbf{Y})$ is the maximum entropy distribution if $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent.
\begin{proof}
 We consider the two cases that $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent and that they are not mutually independent. The joint entropy can be written as $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X} | \mathbf{Y}) + h(\mathbf{Y})$. In the case of mutual independence, this reduces to $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) + h(\mathbf{Y})$. Hence it has to be shown that $h(\mathbf{X} | \mathbf{Y}) < h(\mathbf{X})$ holds if $\mathbf{X}$ and $\mathbf{Y}$ are not mutually independent:
 \begin{align*}
     & h(\mathbf{X} | \mathbf{Y}) < h(\mathbf{X}) \\
     \equiv & - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x} | \mathbf{y}) < - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x}) \\
     \equiv & - \bigg( \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x} | \mathbf{y}) - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x}) \bigg) < 0 \\
     \equiv & - \bigg( \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} \frac{p(\mathbf{x} | \mathbf{y})}{p(\mathbf{x})} \bigg) < 0
 \end{align*}
Since $\mathbf{X} \indep \mathbf{Y}$ does not hold, where $\indep$ means mutual independence, $\frac{p(\mathbf{x} | \mathbf{y})}{p(\mathbf{x})} \neq 1$ at least for some $\mathbf{x}, \mathbf{y}$. The mutual information $I(\mathbf{X}, \mathbf{Y}) = \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} \frac{p(\mathbf{x}, \mathbf{y})}{p(\mathbf{x}) \cdot p(\mathbf{y})}$ can be represented as $I(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) - h(\mathbf{X} | \mathbf{Y})$ and satisfies $I(\mathbf{X}, \mathbf{Y}) \geq 0$, with equality if and only if $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent. Since $- \Big( \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} \frac{p(\mathbf{x} | \mathbf{y})}{p(\mathbf{x})} \Big) = h(\mathbf{X} | \mathbf{Y}) - h(\mathbf{X}) = - I(\mathbf{X}, \mathbf{Y})$ and $\mathbf{X}$ and $\mathbf{Y}$ are not mutually independent, it follows that $h(\mathbf{X}) > h(\mathbf{X} | \mathbf{Y})$.
 
\end{proof}

%\section{Algorithms}\label{app:algos}
%In this section we provide pseudo-code for the end-to-end training algorithm, the two-step training algorithm and the FedSPN structure construction in hybrid FL scenarios.

%\subsection{EM Training}
%In vertical FL settings, a full forward and backward pass can be computed in FedPCs. Thus, we provide a distributed EM training algorithm here.
%\begin{algorithm}[h!]
%\caption{EM Training}\label{alg:e2e}
%\KwData{FedPC-parameter tuple $\langle s, p\rangle$}
%\KwData{Distributed Dataset $\mathbf{D}$}
%\KwResult{Trained FedPC $s$}
%$g \gets 0$ \\
%\For{random batch $\mathbf{x}$ from $\mathbf{D}$}{
%    $\ell \gets \text{log}(s(\mathbf{x}))$ \\
%    $\nabla_p s(\mathbf{x}) \gets$ distributed\_backward$(\ell, \mathbf{x}, s, p)$ %\\
%    em\_step($p$, $\nabla_p s(\mathbf{x})$)  
%}
%\end{algorithm}

%\begin{algorithm}[h!]
%\caption{Distributed Backward}\label{alg:dist_back}
%%\KwData{FedPC-parameter tuple $\langle s, p\rangle$}
%\KwData{Batch $\mathbf{x}$}
%\KwData{Log-likelihood $\ell$}
%\KwResult{Trained FedPC $s$}
%$g \gets 0$ \\
%gradients $\gets []$ \\
%\For{sum node $\SumNode \in s$}{
%    $g_{\pa{\SumNode}} \gets []$ \\
%    \For{$N \in \pa{\SumNode}$}{
%        \If{$N \not \in \phi(\SumNode)$}{
%            obtain $\nabla_N(\mathbf{x}) \ell$ from $\phi(\SumNode)$
%        }
%        \Else{
%            compute $\nabla_N(\mathbf{x}) \ell$
%        }
%        add $\nabla_N(\mathbf{x}) \ell$ to $g_{\pa{\SumNode}}$
%    }
    
%    compute $g_{p(\SumNode)} \gets \sum_{g \in g_{\pa{\SumNode}}} %\nabla_{p(\SumNode)} \sum_{c \in \ch{\SumNode}} p_c(\SumNode) c(\mathbf{x})$ \\
%    add $\langle p(\SumNode), g_{p(\SumNode)} \rangle$ to gradients
%}
%\Return gradients
%\end{algorithm}

\newpage
%\subsection{One-Pass Training}
%\begin{algorithm}[h!]
%\caption{One-Pass Training}\label{alg:2step}
%\KwData{Client PCs $s$, Distributed Dataset $\mathbf{D}$, Setting $\sigma \in \{$ horizontal, vertical, hybrid $\}$}
%\KwResult{Trained FedPC $s_f$}
%\For{Client PC $s_c \in s$}{
%    train $s_c$ with client data $\mathbf{D}_c$ \\
%}
%$s_f \gets$ construct\_fpc($s$, $\sigma$) \\
%$\mathbf{l} \gets (s_1(\mathbf{D}_1), \dots, s_c(\mathbf{D}_c))$ \\
%EM($s_f$, $\mathbf{l}$)
%\end{algorithm}
%
%\begin{algorithm}[h!]
%\caption{Split Feature Space}\label{alg:split_features}
%\KwData{Clients $\mathcal{C}$}
%\KwResult{Split Feature Space}
%$M \gets \{\}$ \\
%$\mathbf{X} \gets \bigcup_{C \in \mathcal{C}} \mathbf{X}_C$ \\
%\For{$X \in \mathbf{X}$}{
%    $O \gets \{C: C \in \mathcal{C} | X \in \mathbf{X}_C\}$ \\
%    $M[X] \gets O$ \\
%}
%$\hat{O} \gets$ set($M$.values()) \\
%\For{$O' \in \hat{O}$}{
%    $F_{O'} \gets \{X: \langle X, O \rangle \in M | O = O'\}$ \\
%    send $F_{O'}$ to all clients in $O'$
%}
%\end{algorithm}
%
%\begin{algorithm}[h!]
%\caption{Build Mixture of Products}\label{alg:fed_spn_head}
%\KwData{Client PCs $s$, number of clusters per client $N$, number of product nodes $P$}
%\KwResult{FedPC structure $s_f$}
%    product\_nodes $\gets \{\}$ \\
%    not\_visited $\gets$ union($\{s[c]$ for $c \in \{1, \dots |s|\}\}$) \\
%    product\_nodes $\gets \{\}$ \\
%    \For{$p \in \{1, \dots, P\}$}{
%        nodes $\gets \{\}$ \\
%        \For{$c \in \{1, \dots, |s|\}$}{
%            $i \sim \mathcal{U}(0, N)$ \\
%            add $s[c][i]$ to nodes \\
%            remove $s[c][i]$ from not\_visited
%        }
%        prod $\gets$ ProductNode(nodes) \\
%        add prod to product\_nodes
%    }
%    \If{$|not\_visited| > 0$}{
%        prod $\gets$ ProductNode(not\_visited) \\
%        add prod to product\_nodes
%    }
%    pc = SumNode(product\_nodes) \\
%    \Return pc
%\end{algorithm}
%
%\begin{algorithm}[h!]
%\caption{Construct FedPC}\label{alg:fed_spn_head}
%\KwData{Client PCs $s$, Number of clusters per client $N$, Number of product nodes $P$, Mapping Feature Space to Clients $M$, Setting $\sigma \in \{$ horizontal, vertical, hybrid $\}$}
%%\KwData{Number of clusters per client $N$ (if vertical)}
%%\KwData{Number of product nodes $P$ (if vertical)}
%%\KwData{Mapping Feature Space to Clients $M$}
%%\KwData{Setting $\sigma \in \{$ horizontal, vertical, hybrid $\}$}
%\KwResult{FedPC structure $s_f$}
%\If{$\sigma$ = horizontal}{
%    pc = SumNode($s$)
%}
%\If{$\sigma$ = vertical}{
%    pc = build\_mixture\_of\_products($s$, $N$, $P$)
%}
%\Else{
%    mixtures $\gets \{ \}$ \\
%    pcsVistied $\gets \{ \}$ \\
%    mixtureSpaces $\gets \{ \langle \mathbf{X}, O \rangle \in M : |O| > 1 \}$ \\
%    \For{$\langle \mathbf{X}, O \rangle \in \text{mixtureSpaces}$}{
%        $s' \gets \{n \in s |$ scope($n$) $= O \}$ \\
%        add $s'$ to pcsVisited \\
%        add SumNode($s'$) to mixtures
%    }
%    
%    nodes $\gets$ mixtures $\cup \{s' \in s | s' \not \in$ pcsVisited$\}$ \\
%    pc = ProductNode(nodes) \\
%    \Return pc
%}
%\end{algorithm}
%
\clearpage
\section{Communication Efficiency}
\label{app:commeff}
Communication efficiency is a critical property when it comes to learning models across multiple machines, as it is done in FL. Here, in addition to our theoretical results, we more intuitively provide further details on the communication efficiency of FCs. For that, we plot the communication cost in Megabytes (MB) required to train a FedPC vs. FedAvg/SplitNN in horizontal/vertical FL settings with datasets of different sizes (1M and 100M samples). Regardless of the number of samples in the dataset, FedPCs are more communication efficient compared to our baselines in both horizontal and vertical settings (see Fig. \ref{fig:comm_effic}).

\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.4]{federated-circuits/images/communication_cost.pdf}
    \caption{\textbf{FedPCs are communication-efficient.} We compare communication cost in Megabytes (MB) sent over the network during one full training of a model (0.5M/50M parameters) on a dataset (1M/100M samples) using results from Section 3.4. Results are shown on log-scale. It can be seen that FedPCs significantly reduce communication cost of training.}
    \label{fig:comm_effic}
\end{figure}

\section{Experimental Details}
\label{app:exp_details}

In this section, we provide a detailed overview of our experimental setup. In all experiments, we consider a cross-silo FL setup with one orchestrating server. Note that in the case of FedPCs, the server does not perform model averaging. Instead, it orchestrates the local model training and construction of the FedPC model. 

\subsection{Datasets}
The following describes the datasets used in our experiments. If not stated differently, the datasets were distributed across clients as follows: 

In horizontal cases, we either split samples randomly across clients (done for all binary classification tasks) or we distribute a subset of the dataset corresponding to a certain label (e.g. the 0 in MNIST) to one client. 

In vertical cases, we split tabular datasets randomly along the feature-dimension, i.e. each client gets all samples but a random subset of features assigned. For image data, we split the images into non-overlapping patches which were then distributed to the clients.

In hybrid cases, we split tabular datasets along both the feature and the sample dimension. We do this s.t. at least two clients have at least one randomly chosen feature in common (but hold different samples thereof). For image data, we split images into overlapping patches, sample a subset of the dataset and assign the resulting subsets to clients.

\textbf{Income Dataset.}
We used the Income dataset from \url{https://www.kaggle.com/datasets/wenruliu/adult-income-dataset}. This dataset represents a binary classification problem with 14 features and approximately 450K samples in the train and 900 samples in the test set. We encoded discrete variables to numerical values using TargetEncoder from sklearn. Additionally, missing values were imputed using the median of the corresponding feature. Further we standardized all features.

\textbf{Breast Cancer Dataset.}
We used the Breast Cancer dataset from \url{https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data}. It represents a binary classification problem with 31 features and 570 samples. We split the dataset into 450 training samples and 120 test samples. We standardized all features for training.

\textbf{Credit Dataset.}
We used the Give Me Some Credit dataset from \url{https://www.kaggle.com/c/GiveMeSomeCredit}. The dataset represents a binary classification task with 10 features, 1.5M training samples and 100K test samples. We encoded discrete variables to numerical values using TargetEncoder from sklearn. Additionally, missing values were imputed using the median of the corresponding feature. Further we standardized all features.

\textbf{MNIST.}
We used the MNIST dataset provided by pytorch. It contains 70K hand-written digits between 0 and 9 as 28x28 images (60K train, 10K test). We standardized all features as preprocessing.

\textbf{Imagenet/Imagenet32.}
We used the Imagenet dataset provided by pytorch. It consists of about 1.2M images showing objects of 1000 classes. The images come in different resolutions; we resized each image to 64x64 (Imagenet) and 32x32 (Imagenet32) pixels, applied center cropping, and standardized all features as preprocessing. We distributed samples randomly across clients as a simple dataset partitioning.

\subsection{Discretization}
In our experimental setup, FCs and Einets were parameterized with Gaussian leaves and fitted on RGB image data. Since image data is discrete (takes integer values from 0-255) and Gaussians are defined over a continuous domain and thus define a probability \textit{density} rather than a probability \textit{mass} function, we have to discretize the Gaussian leaves to obtain the probability for a given image $\mathbf{x}$. Therefore, we construct 255 buckets, discretizing a Gaussian with parameters $\mu$ and $\sigma$ by computing the probability mass as $p(x) = \Phi(\frac{x - \mu + \frac{1}{255}}{\sigma}) - \Phi(\frac{x - \mu}{\sigma})$. The computation graph will remain fixed since the probabilistic semantics of PCs hold for densities and probability mass functions.

\subsection{Training \& Hyperparameters}
\textbf{PyJuice.} For PyJuice we follow the training setup of~\citep{liu2024scalingtractableprobabilisticcircuits} and train the models on randomly cropped 32x32 patches. For Imagenet32, this corresponds to the full image, while for CelebA and Imagenet, this corresponds to a random 32x32 block of the image. We used EM as an optimization procedure.

\textbf{FCs and Einets.} For FCs and Einets, we followed the setup from~\citep{perharz2020einsum} and trained all models on the full image (i.e., 32x32 for Imagenet32 and 64x64 for CelebA/Imagenet). We used EM as an optimization procedure.

For PICs and LVD, we followed the experimental setup described in the corresponding papers. The code of IncPCs was not publicly available.

We ran all experiments with 5 different seeds (0-4). The following tables show the settings of all relevant hyperparameters for each dataset and FL setting.
\begin{table}[h!]
\centering
\begin{tabular}{c|ccccc}
FL-Setting                  & Dataset & Structure & Threshold & min\_num\_instances & glueing       \\ \hline
\multirow{3}{*}{horizontal} & Income  & learned   & 0.3       & 200                 & -             \\
                            & Credit  & learned   & 0.5       & 200                 & -             \\
                            & Cancer  & learned   & 0.4       & 300                 & -             \\
\multirow{3}{*}{vertical}   & Income  & learned   & 0.4       & 100                 & combinatorial \\
                            & Credit  & learned   & 0.5       & 50                  & combinatorial \\
                            & Cancer  & learned   & 0.4       & 300                 & combinatorial \\
\multirow{3}{*}{hybrid}     & Income  & learned   & 0.4       & 100                 & combinatorial \\
                            & Credit  & learned   & 0.5       & 50                  & combinatorial \\
                            & Cancer  & learned   & 0.4       & 300                 & combinatorial
\end{tabular}
\caption{Hyperparameters used in our experiments for all tabular datasets.}
\end{table}

\begin{table}[h!]
\centering
\begin{tabular}{c|cccc}
                      & MNIST             & Imagenet32   & Imagenet         & CelebA\\ \hline
num\_epochs           & 5                 & 10             & 10     & 10\\
batch\_size           & 64                & 64             & 64     & 64\\
online\_em\_frequency & 5                 & 10             & 50     & 10\\
online\_em\_stepsize  & 0.1               & 0.25           & 0.5      & 0.25\\
Structure             & poon-domingos     & poon-domingos  & poon-domingos    & poon-domingos\\
pd\_num\_pieces       & 8                 & 8              & 8     & 8\\
K                     & 10                & 40             & 40     & 40\\
Leaf Distribution     & Gaussian          & Gaussian       & Gaussian     & Gaussian\\
min\_var              & $1 \cdot 10^{-3}$ & $1 \cdot 10^{-3}$ & $1 \cdot 10^{-3}$   & $1 \cdot 10^{-3}$\\
max\_var              & $0.25$ & $0.25$  & $0.25$ & $0.25$
\end{tabular}
\caption{Hyperparameters used in our experiments for image datasets.}
\end{table}

\subsection{Model Details}
For our density estimation tasks on image data, we employ the PD architecture~\citep{poon2011sum}. In horizontal settings, the region graph will be the same for all clients since its construction is deterministic and only based on the number of features to be modelled. In vertical and hybrid settings, the region graphs can vary among clients, depending on how the features are distributed among clients.

Further, the model size varies with the number of clients: The more clients participate in the training, the larger the model can be. The model sizes used in our experiments can be seen in Tab. \ref{tab:model_size}.

\begin{table}[h!]
\centering
\begin{tabular}{c|cccc}
                     & Imagenet32   & Imagenet         & CelebA\\ \hline
cent.                & 152M         & 387M & 387M\\
2 clients            & 304M         & 775M & 775M\\
4 clients            & 608M         & 1.5B & 1.5B\\
8 clients            & 1.2B         & 3.1B & 3.1B\\
16 clients           & 2.4B         & 6.2B & 6.2B\\
\end{tabular}
\caption{Number of parameters of our models, depending on the dataset and the number of clients.}
\label{tab:model_size}
\end{table}


\subsection{Hardware}
All experiments were conducted on Nvidia DGX machines with Nvidia A100 (40GB) GPUs, AMD EPYC 7742 64-Core Processor and 2TiB of RAM.

\section{Further Results}
\label{app:results}
Here, we provide further experimental details on FCs. 

%\textbf{Model Parameter Ablation.} 
%To validate our results, we provide an additional ablation study on the effect the model size (measured in the number of parameters) has on the final model performance. To this end, we trained models of different sizes (1.2M, 34M, and 99M parameters) on CelebA. We used equally clustered data (2, 4, 8, or 16 clusters) and trained a mixture of EiNets in each run to ensure that no other effects affect the result. We find that the model parameters have a significant effect on the final model performance (reported as log-likelihood) and larger models achieve better log-likelihood values. Thus, our ablation confirms that scaling PCs is crucial to obtaining high-quality density estimates on complex data. For detailed results, see Tab. \ref{tab:ablation}.

%\begin{table}[h!]
%\begin{tabular}{l|llll}
            %& 2 clusters           & 4 clusters            & 8 clusters           & 16 clusters           \\ \hline
%1.2M param. & -3692.40 $\pm$ 67.07 & -3263.54 $\pm$ 102.60 & -3668.98 $\pm$ 87.66 & -5145.27 $\pm$ 64.28  \\
%34M param.  & 1659.57 $\pm$ 65.02  & 1154.19 $\pm$  55.31  & 481.02 $\pm$ 103.37  & -1104.55 $\pm$ 109.69 \\
%99M param.  & \textbf{5011.55} $\pm$ \textbf{95.57}  & \textbf{4388.37} $\pm$ \textbf{67.94}   & \textbf{3727.43} $\pm$ \textbf{71.29}  & \textbf{2208.78} $\pm$ \textbf{38.82}                     
%\end{tabular}
%\caption{\textbf{Model size significantly influences log-likelihood.} We trained mixtures of EiNets of various sizes on the same clustering of CelebA to validate our results from the main paper. The model size has a crucial influence on the final model performance and larger models achieve better log-likelihoods.}
%\label{tab:ablation}
%\end{table}

\textbf{Further Discussion of Density Estimation Results.} 
There are two possible ways of improving the modeling performance of PCs. One way is to improve the modeling itself (e.g., by defining different architectures, allowing for negative or complex mixture weights, etc.). Since PCs are a special kind of neural network, another way of improving modeling performance is to exploit neural scaling laws~\citep{caballero2023broken, li2021mitigating}.

Recent works on neural scaling laws suggest that increasing the model size from a small to a mid-sized model yields significant gains in terms of model performance while adding parameters to an already large model has minor effects~\citep{caballero2023broken, li2021mitigating}. This means that a saturation effect occurs when increasing the number of model parameters. Thus, in our setting, observing a larger gain when scaling from 1 to 2 clients and less improvement when going from e.g., 8 to 16 clients is well aligned with recent findings.

\textbf{FL Classification Results.} We compare FCs to several baselines in horizontal, vertical, and hybrid FL. In horizontal FL, we compare against FedAvg (using TabNet~\citep{arik2020tabnetattentiveinterpretabletabular}) and FedTree~\citep{li2023fedtree}; in vertical FL, we compare against SplitNN (also using TabNet) and FedTree. In hybrid FL, we compare different parameterizations of FCs (FedPCs and FCs parameterized with decision trees). We find that FCs are competitive with or outperform the selected baselines in all FL settings (see Tab.~\ref{tab:full_FL_results}). This makes them a very flexible FL framework that still yields high-performing models.
% \begin{table}[]
% \begin{tabular}{c|cccccc}
%                              & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
%                              & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
% FedAvg {[}TabNet{]} (5 cl.)  & $0.92 \pm 0.03$ & $0.92 \pm 0.03$ & $0.71 \pm 0.11$ & $0.48 \pm 0.04$ & $0.68 \pm 0.06$ & $0.51 \pm 0.03$ \\
% FedAvg {[}TabNet{]} (10 cl.) & $0.92 \pm 0.04$ & $0.91 \pm 0.05$ & $0.56 \pm 0.12$ & $0.47 \pm 0.06$ & $0.64 \pm 0.06$ & $0.52 \pm 0.03$ \\
% FedTree (5 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.91 \pm 0.01$ & $0.63 \pm 0.01$ & $0.88 \pm 0.01$ & $0.82 \pm 0.02$ \\
% FedTree (10 cl.)             & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
% FC {[}PC{]} (5 cl.)          & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.02$ & $0.68 \pm 0.02$ & $0.87 \pm 0.02$ & $0.80 \pm 0.01$ \\
% FC {[}PC{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.95 \pm 0.02$ & $0.93 \pm 0.01$ & $0.66 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.02$ \\
% FC {[}DT{]} (5 cl.)          & $0.95 \pm 0.03$ & $0.93 \pm 0.02$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.89 \pm 0.01$ & $0.83 \pm 0.01$ \\
% FC {[}DT{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.93 \pm 0.03$ & $0.92 \pm 0.01$ & $0.97 \pm 0.02$ & $0.89 \pm 0.01$ & $0.83 \pm 0.02$ \\
% SplitNN {[}TabNet{]}         & -               & -               & -               & -               & -               & -              
% \end{tabular}
% \caption{\textbf{FCs achieve competing performance in horizontal setting.}}
% \end{table}

% \begin{table}[]
% \begin{tabular}{c|cccccc}
%                              & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
%                              & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
% SplitNN {[}TabNet{]} (2 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.56 \pm 0.25$ & $0.42 \pm 0.17$ \\
% SplitNN {[}TabNet{]} (3 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.62 \pm 0.20$ & $0.56 \pm 0.16$ \\
% FedTree (2 cl.)              & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
% FedTree (3 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
% FC {[}PC{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.84 \pm 0.02$ & $0.74 \pm 0.01$ \\
% FC {[}PC{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.01$ & $0.92 \pm 0.01$ & $0.66 \pm 0.02$ & $0.84 \pm 0.01$ & $0.74 \pm 0.01$ \\
% FC {[}DT{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.83 \pm 0.02$ & $0.67 \pm 0.02$ \\
% FC {[}DT{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.03$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.67 \pm 0.02$ \\
% FedAvg {[}TabNet{]}          & -               & -               & -               & -               & -               & -              
% \end{tabular}
% \caption{\textbf{FCs achieve competing performance in vertical setting.}}
% \end{table}

% \begin{table}[]
% \begin{tabular}{c|cccccc}
%                      & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
%                      & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
% FC {[}PC{]} (2 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.82 \pm 0.02$ & $0.71 \pm 0.01$ \\
% FC {[}PC{]} (3 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.02$ & $0.80 \pm 0.01$ & $0.70 \pm 0.01$ \\
% FC {[}DT{]} (2 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
% FC {[}DT{]} (3 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.93 \pm 0.01$ & $0.54 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
% FedAvg {[}TabNet{]}  & -               & -               & -               & -               & -               & -               \\
% SplitNN {[}TabNet{]} & -               & -               & -               & -               & -               & -               \\
% FedTree              & -               & -               & -               & -               & -               & -              
% \end{tabular}
% \caption{\textbf{FCs can handle hybrid FL settings without performance deterioration.}}
% \end{table}

\begin{table}[h!]
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|cccccc}
&                             & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
&                             & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
\multirow{9}{*}{\rotatebox{90}{Horizontal FL}} & FedAvg {[}TabNet{]} (5 cl.)  & $0.92 \pm 0.03$ & $0.92 \pm 0.03$ & $0.71 \pm 0.11$ & $0.48 \pm 0.04$ & $0.68 \pm 0.06$ & $0.51 \pm 0.03$ \\
& FedAvg {[}TabNet{]} (10 cl.) & $0.92 \pm 0.04$ & $0.91 \pm 0.05$ & $0.56 \pm 0.12$ & $0.47 \pm 0.06$ & $0.64 \pm 0.06$ & $0.52 \pm 0.03$ \\
& FedTree (5 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.91 \pm 0.01$ & $0.63 \pm 0.01$ & $0.88 \pm 0.01$ & $0.82 \pm 0.02$ \\
& FedTree (10 cl.)             & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
& FC {[}PC{]} (5 cl.)          & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.02$ & $0.68 \pm 0.02$ & $0.87 \pm 0.02$ & $0.80 \pm 0.01$ \\
& FC {[}PC{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.95 \pm 0.02$ & $0.93 \pm 0.01$ & $0.66 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.02$ \\
& FC {[}DT{]} (5 cl.)          & $0.95 \pm 0.03$ & $0.93 \pm 0.02$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.89 \pm 0.01$ & $0.83 \pm 0.01$ \\
& FC {[}DT{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.93 \pm 0.03$ & $0.92 \pm 0.01$ & $0.97 \pm 0.02$ & $0.89 \pm 0.01$ & $0.83 \pm 0.02$ \\
& SplitNN {[}TabNet{]}         & -               & -               & -               & -               & -               & -              \\ \hline
\multirow{9}{*}{\rotatebox{90}{Vertical FL}} & SplitNN {[}TabNet{]} (2 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.56 \pm 0.25$ & $0.42 \pm 0.17$ \\
& SplitNN {[}TabNet{]} (3 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.62 \pm 0.20$ & $0.56 \pm 0.16$ \\
& FedTree (2 cl.)              & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
& FedTree (3 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
& FC {[}PC{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.84 \pm 0.02$ & $0.74 \pm 0.01$ \\
& FC {[}PC{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.01$ & $0.92 \pm 0.01$ & $0.66 \pm 0.02$ & $0.84 \pm 0.01$ & $0.74 \pm 0.01$ \\
& FC {[}DT{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.83 \pm 0.02$ & $0.67 \pm 0.02$ \\
& FC {[}DT{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.03$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.67 \pm 0.02$ \\
& FedAvg {[}TabNet{]}          & -               & -               & -               & -               & -               & -              \\ \hline
\multirow{7}{*}{\rotatebox{90}{Hybrid FL}} & FC {[}PC{]} (2 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.82 \pm 0.02$ & $0.71 \pm 0.01$ \\
& FC {[}PC{]} (3 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.02$ & $0.80 \pm 0.01$ & $0.70 \pm 0.01$ \\
& FC {[}DT{]} (2 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
& FC {[}DT{]} (3 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.93 \pm 0.01$ & $0.54 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
& FedAvg {[}TabNet{]}  & -               & -               & -               & -               & -               & -               \\
& SplitNN {[}TabNet{]} & -               & -               & -               & -               & -               & -               \\
& FedTree              & -               & -               & -               & -               & -               & -        
\end{tabular}
}
\caption{\textbf{All classification results of FL experiments.} Here, we show the detailed performances of FC, FedAvg, SplitNN, and FedTree in all three FL settings. It can be seen that FCs, while being much more flexible than our baselines, still achieve competitive or better results on various classification tasks.}
\label{tab:full_FL_results}
\end{table}
\end{document}
