
\documentclass{article} % For LaTeX2e
\usepackage{iclr2025_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}

\usepackage{url}
\usepackage{amsfonts}% blackboard math symbols
\usepackage{amsthm}% theorem-like environments
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{colortbl}
\usepackage{tabulary}
\usepackage{subcaption}
\usepackage{pgf} % for calculating the values for gradient
\usepackage{graphics}
\usepackage{booktabs}       % professional-quality tables
\usepackage{multirow}
\usepackage{pgfplots}

\usepackage{hyperref}
\usepackage{etoolbox}
\usepackage{collcell}
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{natbib} % has a nice set of citation styles and commands
\usepackage{xcolor}         % colors
\usepackage[ruled,linesnumbered,noend]{algorithm2e}
\usepackage{pifont}
\usepackage{adjustbox}
\usepackage{makecell}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{wrapfig}
% Author note: typesets its argument in red.
\newcommand\notejo[1]{\textcolor{red}{#1}}
% Two \perp symbols pulled together with negative thin spaces
% (presumably the usual independence sign; math mode only).
\newcommand{\indep}{\perp \!\!\! \perp}
% Theorem-like environments, each with its own counter.
% The commented-out declarations below are superseded by the active
% \newtheorem lines for defin/prop/conj further down in the preamble.
%\newtheorem{defin}{Definition}
\newtheorem{proofsketch}{Proof (Sketch)}
\newtheorem{assum}{Assumption}
%\newtheorem{prop}{Proposition}
%\newtheorem{conj}{Conjecture}
\newtheorem{rem}{Remark}

% Notation for circuit graphs (all math mode).
% Node symbols are set in sans-serif: S = sum node, P = product node,
% N = generic node, L = leaf; C is presumably a client node -- TODO confirm.
\newcommand{\SumNode}{\mathsf{S}}
\newcommand{\ClientNode}{\mathsf{C}}
\newcommand{\ProductNode}{\mathsf{P}}
\newcommand{\Node}{\mathsf{N}}
\newcommand{\Leaf}{\mathsf{L}}
% Calligraphic G, used for the rooted DAG underlying a circuit.
\newcommand{\graph}{\mathcal{G}}
% \ch{<node>}: upright operator "ch" applied to a node (children of the node).
\newcommand{\ch}[1]{\operatorname{ch}(#1)}
% \pa{<node>}: upright operator "pa" applied to a node (presumably its parents).
\newcommand{\pa}[1]{\operatorname{pa}(#1)}
% Fraktur C; presumably denotes a whole circuit -- TODO confirm against usage.
\newcommand{\SPN}{\mathfrak{C}}

% TODONOTES
\usepackage{xargs}
%\usepackage[colorinlistoftodos,textsize=small,disable]{todonotes} % Disabled
%\setlength{\marginparwidth}{1.7cm}  % TODO: comment out when todonotes is disabled
\usepackage[colorinlistoftodos,textsize=tiny]{todonotes} % Enabled
% Review-note helpers built on \todo (todonotes) and \newcommandx (xargs).
% Every helper has the signature  \name[<extra \todo options>]{<note text>};
% the optional argument defaults to empty and is appended to the preset
% option list. All notes except \thiswillnotshow are set in \tiny and start
% with a fixed tag (TODO, UNSURE, ...).
%
% Margin notes, distinguished by colour:
\newcommandx{\todoc}[2][1=]{%
  {\todo[linecolor=orange,backgroundcolor=orange!25,bordercolor=orange,#1]%
    {\tiny TODO: #2}}%
}
\newcommandx{\unsure}[2][1=]{%
  {\todo[linecolor=yellow,backgroundcolor=yellow!25,bordercolor=yellow,#1]%
    {\tiny UNSURE: #2}}%
}
\newcommandx{\change}[2][1=]{%
  {\todo[linecolor=blue,backgroundcolor=blue!25,bordercolor=blue,#1]%
    {\tiny CHANGE: #2}}%
}
\newcommandx{\info}[2][1=]{%
  {\todo[linecolor=green,backgroundcolor=green!25,bordercolor=green,#1]%
    {\tiny INFO: #2}}%
}
\newcommandx{\improvement}[2][1=]{%
  {\todo[linecolor=violet,backgroundcolor=violet!25,bordercolor=violet,#1]%
    {\tiny IMPROVEMENT: #2}}%
}
% Same pattern, but passes the `inline' option to \todo.
\newcommandx{\fb}[2][1=]{%
  {\todo[inline,linecolor=lime,backgroundcolor=lime!25,bordercolor=lime,#1]%
    {\tiny FB: #2}}%
}
% Passes the `disable' option to \todo, so the note is suppressed.
\newcommandx{\thiswillnotshow}[2][1=]{%
  {\todo[disable,#1]{THIS WILL NOT SHOW: #2}}%
}
% Blackboard-bold E (expectation symbol; math mode).
\newcommand{\expec}{\mathbb{E}}

% Further theorem-like environments, each with its own counter.
\newtheorem{defin}{Definition}
\newtheorem{prop}{Proposition}
\newtheorem{conj}{Conjecture}
\newtheorem{fact}{Fact}

% Margin markers: place the word FIX / NEW in the margin via \marginpar.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
% Colours of the cell-shading gradient used by the \gradient... macros.
\definecolor{high}{HTML}{77b300}  % colour at the top of the scale (largest values)
\definecolor{low}{HTML}{0066ff}  % colour at the bottom of the scale (smallest values)
\newcommand*{\opacity}{65}% last percentage in the mix high!<n>!low!\opacity, i.e. how much of the mixed colour is kept (rest is white)
%======================================
% Per-table bounds of the colour scale. The \gradient<Name> macro of the same
% name maps values in [\minval<Name>, \maxval<Name>] to a colour and prints
% values outside that range without a background.
% Name = data set (Cancer, Credit, Income) + metric (Acc, F; presumably
% accuracy and F1 score -- TODO confirm).
\newcommand*{\minvalCancerAcc}{0.89}% smallest value on the Cancer/Acc colour scale
\newcommand*{\maxvalCancerAcc}{0.98}% largest value on the Cancer/Acc colour scale
\newcommand*{\minvalCancerF}{0.87}% smallest value on the Cancer/F colour scale
\newcommand*{\maxvalCancerF}{0.98}% largest value on the Cancer/F colour scale
\newcommand*{\minvalCreditAcc}{0.56}% smallest value on the Credit/Acc colour scale
\newcommand*{\maxvalCreditAcc}{0.93}% largest value on the Credit/Acc colour scale
\newcommand*{\minvalCreditF}{0.46}% smallest value on the Credit/F colour scale
\newcommand*{\maxvalCreditF}{0.69}% largest value on the Credit/F colour scale
\newcommand*{\minvalIncomeAcc}{0.56}% smallest value on the Income/Acc colour scale
\newcommand*{\maxvalIncomeAcc}{0.88}% largest value on the Income/Acc colour scale
\newcommand*{\minvalIncomeF}{0.41}% smallest value on the Income/F colour scale
\newcommand*{\maxvalIncomeF}{0.83}% largest value on the Income/F colour scale

%======================================
% \gradientCancerAcc{<value>}
% Prints <value> in a table cell and shades the cell on the low->high scale.
%   * <value> outside [\minvalCancerAcc, \maxvalCancerAcc]: printed without
%     a background colour.
%   * <value> > 0.93: full `high' colour (mix percentage 100).
%   * otherwise: mix percentage round(100*(<value>-min)/(max-min)).
% <value> must be a plain decimal number (it is compared as a TeX dimension).
% Side effect: the mix percentage is stored globally in \tempa.
% Every line ends in % so that no stray spaces end up in the cell, and the
% saturation test is a single if/else so that values strictly between 0.93
% and 0.94 are no longer dropped.
\newcommand{\gradientCancerAcc}[1]{%
    \ifdimcomp{#1pt}{>}{\maxvalCancerAcc pt}{#1}{%
    \ifdimcomp{#1pt}{<}{\minvalCancerAcc pt}{#1}{%
        \ifdimcomp{#1pt}{>}{0.93pt}{%
            % Saturated part of the scale.
            \xdef\tempa{100}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }{%
            % Linear interpolation between \minvalCancerAcc and \maxvalCancerAcc.
            \pgfmathparse{int(round(1*(100*(#1/(\maxvalCancerAcc-\minvalCancerAcc))-(\minvalCancerAcc*(100/(\maxvalCancerAcc-\minvalCancerAcc))))))}%
            \xdef\tempa{\pgfmathresult}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }%
    }}%
}
 
 %\newcommand{\gradientCancerAcc}[1]{
 %   % The values are calculated linearly between \minval and \maxval
 %   \ifdimcomp{#1pt}{>}{\maxvalCancerAcc pt}{#1}{
 %   \ifdimcomp{#1pt}{<}{\minvalCancerAcc pt}{#1}{
 %           \pgfmathparse{int(round(1*(100*(#1/(\maxvalCancerAcc-\minvalCancerAcc))-(%\minvalCancerAcc*(100/(\maxvalCancerAcc-\minvalCancerAcc))))))}
 %           \xdef\tempa{\pgfmathresult}
 %           \cellcolor{high!\tempa!low!\opacity} #1
 %   }}
 %}
 
% \gradientCancerF{<value>}
% Prints <value> in a table cell and shades the cell on the low->high scale.
%   * <value> outside [\minvalCancerF, \maxvalCancerF]: printed without
%     a background colour.
%   * <value> > 0.93: full `high' colour (mix percentage 100).
%   * otherwise: mix percentage round(100*(<value>-min)/(max-min)).
% <value> must be a plain decimal number (it is compared as a TeX dimension).
% Side effect: the mix percentage is stored globally in \tempa.
% The range test uses \maxvalCancerF / \minvalCancerF directly. Macro names
% cannot contain digits, so writing "\minvalCancerF1 pt" appends the digit 1
% to the constant (0.87 -> 0.871) and wrongly excludes the minimum value.
% Every line ends in % so that no stray spaces end up in the cell, and the
% saturation test is a single if/else so that values strictly between 0.93
% and 0.94 are no longer dropped.
\newcommand{\gradientCancerF}[1]{%
    \ifdimcomp{#1pt}{>}{\maxvalCancerF pt}{#1}{%
    \ifdimcomp{#1pt}{<}{\minvalCancerF pt}{#1}{%
        \ifdimcomp{#1pt}{>}{0.93pt}{%
            % Saturated part of the scale.
            \xdef\tempa{100}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }{%
            % Linear interpolation between \minvalCancerF and \maxvalCancerF.
            \pgfmathparse{int(round(1*(100*(#1/(\maxvalCancerF-\minvalCancerF))-(\minvalCancerF*(100/(\maxvalCancerF-\minvalCancerF))))))}%
            \xdef\tempa{\pgfmathresult}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }%
    }}%
}
 
% \gradientCreditAcc{<value>}
% Prints <value> in a table cell and shades the cell on the low->high scale.
%   * <value> outside [\minvalCreditAcc, \maxvalCreditAcc]: printed without
%     a background colour.
%   * <value> > 0.91: full `high' colour (mix percentage 100).
%   * otherwise: mix percentage round(0.1*100*(<value>-min)/(max-min));
%     the factor 0.1 compresses the non-saturated part of the scale.
% <value> must be a plain decimal number (it is compared as a TeX dimension).
% Side effect: the mix percentage is stored globally in \tempa.
% Every line ends in % so that no stray spaces end up in the cell, and the
% saturation test is a single if/else so that values strictly between 0.91
% and 0.92 are no longer dropped.
\newcommand{\gradientCreditAcc}[1]{%
    \ifdimcomp{#1pt}{>}{\maxvalCreditAcc pt}{#1}{%
    \ifdimcomp{#1pt}{<}{\minvalCreditAcc pt}{#1}{%
        \ifdimcomp{#1pt}{>}{0.91pt}{%
            % Saturated part of the scale.
            \xdef\tempa{100}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }{%
            % Scaled linear interpolation between \minvalCreditAcc and \maxvalCreditAcc.
            \pgfmathparse{int(round(0.1*(100*(#1/(\maxvalCreditAcc-\minvalCreditAcc))-(\minvalCreditAcc*(100/(\maxvalCreditAcc-\minvalCreditAcc))))))}%
            \xdef\tempa{\pgfmathresult}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }%
    }}%
}

% \gradientCreditF{<value>}
% Prints <value> in a table cell and shades the cell on the low->high scale.
%   * <value> outside [\minvalCreditF, \maxvalCreditF]: printed without
%     a background colour.
%   * <value> > 0.63: full `high' colour (mix percentage 100).
%   * otherwise: mix percentage round(0.8*100*(<value>-min)/(max-min));
%     the factor 0.8 compresses the non-saturated part of the scale.
% <value> must be a plain decimal number (it is compared as a TeX dimension).
% Side effect: the mix percentage is stored globally in \tempa.
% The range test uses \maxvalCreditF / \minvalCreditF directly. Macro names
% cannot contain digits, so writing "\minvalCreditF1 pt" appends the digit 1
% to the constant (0.46 -> 0.461) and wrongly excludes the minimum value.
% Every line ends in % so that no stray spaces end up in the cell, and the
% saturation test is a single if/else so that values strictly between 0.63
% and 0.64 are no longer dropped.
\newcommand{\gradientCreditF}[1]{%
    \ifdimcomp{#1pt}{>}{\maxvalCreditF pt}{#1}{%
    \ifdimcomp{#1pt}{<}{\minvalCreditF pt}{#1}{%
        \ifdimcomp{#1pt}{>}{0.63pt}{%
            % Saturated part of the scale.
            \xdef\tempa{100}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }{%
            % Scaled linear interpolation between \minvalCreditF and \maxvalCreditF.
            \pgfmathparse{int(round(0.8*(100*(#1/(\maxvalCreditF-\minvalCreditF))-(\minvalCreditF*(100/(\maxvalCreditF-\minvalCreditF))))))}%
            \xdef\tempa{\pgfmathresult}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }%
    }}%
}
 
% \gradientIncomeAcc{<value>}
% Prints <value> in a table cell and shades the cell on the low->high scale.
%   * <value> outside [\minvalIncomeAcc, \maxvalIncomeAcc]: printed without
%     a background colour.
%   * <value> > 0.83: full `high' colour (mix percentage 100).
%   * otherwise: mix percentage round(100*(<value>-min)/(max-min)).
% <value> must be a plain decimal number (it is compared as a TeX dimension).
% Side effect: the mix percentage is stored globally in \tempa.
% Every line ends in % so that no stray spaces end up in the cell, and the
% saturation test is a single if/else so that values strictly between 0.83
% and 0.84 are no longer dropped.
\newcommand{\gradientIncomeAcc}[1]{%
    \ifdimcomp{#1pt}{>}{\maxvalIncomeAcc pt}{#1}{%
    \ifdimcomp{#1pt}{<}{\minvalIncomeAcc pt}{#1}{%
        \ifdimcomp{#1pt}{>}{0.83pt}{%
            % Saturated part of the scale.
            \xdef\tempa{100}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }{%
            % Linear interpolation between \minvalIncomeAcc and \maxvalIncomeAcc.
            \pgfmathparse{int(round(1*(100*(#1/(\maxvalIncomeAcc-\minvalIncomeAcc))-(\minvalIncomeAcc*(100/(\maxvalIncomeAcc-\minvalIncomeAcc))))))}%
            \xdef\tempa{\pgfmathresult}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }%
    }}%
}
 
% \gradientIncomeF{<value>}
% Prints <value> in a table cell and shades the cell on the low->high scale.
%   * <value> outside [\minvalIncomeF, \maxvalIncomeF]: printed without
%     a background colour.
%   * <value> > 0.72: full `high' colour (mix percentage 100).
%   * otherwise: mix percentage round(100*(<value>-min)/(max-min)).
% <value> must be a plain decimal number (it is compared as a TeX dimension).
% Side effect: the mix percentage is stored globally in \tempa.
% The range test uses \maxvalIncomeF / \minvalIncomeF directly. Macro names
% cannot contain digits, so writing "\minvalIncomeF1 pt" appends the digit 1
% to the constant (0.41 -> 0.411) and wrongly excludes the minimum value.
% Every line ends in % so that no stray spaces end up in the cell, and the
% saturation test is a single if/else so that values strictly between 0.72
% and 0.73 are no longer dropped.
\newcommand{\gradientIncomeF}[1]{%
    \ifdimcomp{#1pt}{>}{\maxvalIncomeF pt}{#1}{%
    \ifdimcomp{#1pt}{<}{\minvalIncomeF pt}{#1}{%
        \ifdimcomp{#1pt}{>}{0.72pt}{%
            % Saturated part of the scale.
            \xdef\tempa{100}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }{%
            % Linear interpolation between \minvalIncomeF and \maxvalIncomeF.
            \pgfmathparse{int(round(1*(100*(#1/(\maxvalIncomeF-\minvalIncomeF))-(\minvalIncomeF*(100/(\maxvalIncomeF-\minvalIncomeF))))))}%
            \xdef\tempa{\pgfmathresult}%
            \cellcolor{high!\tempa!low!\opacity}#1%
        }%
    }}%
}

% Additional named colours. NOTE(review): the names look like reviewer
% identifiers plus an "all reviewers" colour, presumably for colour-coding
% revisions -- confirm before removing.
\definecolor{qUmr}{HTML}{AB0392}
\definecolor{4iCr}{HTML}{3492EB}
\definecolor{6dQr}{HTML}{15A123}
\definecolor{allrev}{HTML}{f59342}


% \title{Efficiently Scaling Probabilistic Circuits via Data Partitioning}
\title{Scaling Probabilistic Circuits via Data \\ Partitioning}
% \title{Scaling Training Probabilistic Circuits via  Data Partitioning}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

\author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.  Funding acknowledgements go at the end of the paper.} \\
Department of Computer Science\\
Cranberry-Lemon University\\
Pittsburgh, PA 15213, USA \\
\texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
\And
Ji Q. Ren \& Yevgeny LeNet \\
Department of Computational Neuroscience \\
University of the Witwatersrand \\
Joburg, South Africa \\
\texttt{\{robot,net\}@wits.ac.za} \\
\AND
Coauthor \\
Affiliation \\
Address \\
\texttt{email}
}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.

%\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}


\maketitle

\begin{abstract}
Probabilistic circuits (PCs) enable us to learn joint distributions over a set of random variables and to perform various probabilistic queries in a tractable fashion. Though the tractability property allows PCs to scale beyond non-tractable models such as Bayesian Networks, scaling training and inference of PCs to larger, real-world datasets remains challenging. To remedy the situation, we show how PCs can be learned across multiple machines by recursively partitioning a distributed dataset, thereby unveiling a deep connection between PCs and federated learning (FL). This leads to federated circuits (FCs)---a novel and flexible FL framework that (1) allows one to scale PCs in distributed learning environments, (2) trains PCs faster, and (3) unifies, for the first time, horizontal, vertical, and hybrid FL in one framework by re-framing FL as a density estimation problem over distributed datasets. We demonstrate FCs' capability to scale PCs on various large-scale datasets. Also, we show FCs' versatility in handling horizontal, vertical, and hybrid FL within a unified framework on multiple classification tasks.
\end{abstract}

\section{Introduction}
% In the domain of probabilistic models~\citep{goodfellow2014generative,kingma2013auto}, probabilistic graphical models (PGMs; \citet{poon2011sum,pearl1985bayesian}) allow for a broad set of probabilistic queries, such as marginalization and conditioning.
% In addition, such models provide the user with a measure of uncertainty due to their probabilistic nature.
% Famously, Bayesian Networks (BNs)~\citep{pearl1985bayesian} represent a joint probability distribution by modeling random variables (RVs) as nodes and dependencies among RVs as edges in a graph.
% However, while this graphical representation is intuitive and interpretable, it poses limits on computational feasibility for various probabilistic queries~\citep{Park_2006}.
% Hence, \textit{tractability} of probabilistic models is a desirable property, i.e., probabilistic queries can be computed in polynomial time.

Probabilistic Circuits (PCs) are a family of models that provide tractable inference for various probabilistic queries~\citep{domingos2011spns, Choi2020ProbabilisticCA}.
%i.e., probabilistic queries can be computed in polynomial time.
This is achieved by representing a joint distribution by a computation graph on which certain structural properties are imposed.
% For example, Sum-Product Networks (SPNs) consist of simple arithmetic operations (sums and products) that compute mixtures or products of tractable base distributions~\citep{domingos2011spns}.
% Here, a product operation models a (context-specific) independency between variables, while sum operations allow for different mixtures of distributions over the same set of variables, thus modeling dependencies.
While PCs offer significant computational advantages over traditional probabilistic models such as Bayesian networks~\citep{pearl1985bayesian}, further performance gains can be realized by optimizing the compactness of PC representations and tailoring them to specific hardware architectures~\citep{perharz2020einsum,liu2024scalingtractableprobabilisticcircuits}.
% A common scheme in that regime is parallelizing computations in the forward- and backward pass during inference and training by building a compact layer-wise computation graph, similar to neural networks. This representation allows efficient GPU-based training of PCs. Prominent examples of this paradigm are EiNets~\citep{perharz2020einsum} and PyJuice~\citep{liu2024scalingtractableprobabilisticcircuits}.
However, another natural way to scale up PCs---distributing the model over multiple machines---is so far underexplored.
While models like neural networks can be partitioned over multiple machines with relatively little effort, partitioning PCs is more challenging as they come with certain structural constraints to ensure the validity of the represented joint distribution. Interestingly, we find an inherent connection between the structure of PCs and the paradigm of federated learning (FL).
In PCs, sum nodes combine probability distributions over the same set of variables via a mixture.
This resembles the horizontal FL~\citep{konevcny2016federated,li2020federatedChallenges} setting, where all clients hold the same features but different samples.
In contrast, the case of vertical FL~\citep{yang2019federated, wu2020privacy}, in which the same samples are shared but features are split across clients, can be linked to the product nodes used in PCs, which combine distributions over disjoint sets of variables.
Consequently, the hybrid FL~\citep{Zhang2020HybridFLAlgosAndImplementations} setting, where both samples and features are separated across clients, can be represented by a combination of sum and product nodes.
Thus, PCs are well positioned to connect all three FL settings in a unified way -- an endeavor considered hard to achieve in the FL community~\citep{Li_2023FLSurvey, wen2023federated}.
% This connection turns out to be beneficial for both sides as it allows us to efficiently scale up PCs and to design a highly flexible FL framework that can handle horizontal FL~\citep{konevcny2016federated,li2020federatedChallenges}, vertical FL~\citep{yang2019federated, wu2020privacy}, and hybrid FL~\citep{Zhang2020HybridFLAlgosAndImplementations} in a unified way -- an endeavor considered hard to achieve in the FL community~\citep{Li_2023FLSurvey, wen2023federated}.
%This paper aims to scale probabilistic models to large problems; therefore, PCs are a sensible choice.

%\todo{Figure 1: remove + and =, consider adding different colors for samples 1/2 and 3/4 and same for features. Add HFL/VFL to figure if used in caption}
\begin{figure}[t]
    \centering
      % \includegraphics[width=.9\columnwidth]{federated-circuits/images/concept.pdf}
      \includegraphics[width=.9\columnwidth]{federated-circuits/images/concept_new.pdf}
    \caption{\textbf{Scaling PCs via Federated Circuits.} We scale PCs by splitting a dataset $\mathcal{D}$ into a set of $n$ partitions $\{\mathcal{P}_i\}_{i=1}^n$ s.t. $\mathcal{D} = \bigcup_{i=1}^n \mathcal{P}_i$. Each partition is assigned to a client (i.e., machine) $c_j$, and the resulting federated circuit (FC) is learned jointly by a set of clients. FCs represent a novel framework for federated learning (FL), capable of performing horizontal FL (samples are split across clients), vertical FL (features are split across clients), and hybrid FL (mix of horizontal and vertical).}
    %In FL, data is either split across data samples (HFL), features (VFL), or both (hybrid FL). We can represent all settings using FedPCs by using a sum node to combine samples in HFL, a product node to combine features in HFL, or a combination of sum and product nodes to represent any arbitrary combination of splits among features and samples in hybrid FL. The hybrid FL framework allows us to distribute data amongst hardware to optimize efficiency (Best viewed in color).}
    \label{fig:arch}
\end{figure}

% As a result of the connection between PCs and FL, we introduce federated circuits (FCs), a novel FL framework that re-frames FL as a density estimation problem over a set of datasets distributed over multiple machines (called clients subsequently). 
As a result of this connection, we introduce \textit{federated circuits (FCs)}, a novel FL framework that re-frames FL as a density estimation problem over a set of datasets distributed over multiple machines (subsequently called clients). 
%Each client holds a local dataset, and we aim to learn a joint distribution over the union of datasets held by all participating clients. 
FCs naturally handle all three FL settings and, therefore, provide a flexible way of scaling up PCs by learning a joint distribution over a dataset arbitrarily partitioned across a set of clients (see Fig.~\ref{fig:arch} for an illustration).
Imposing the same structural properties as for PCs, FCs achieve tractable computation of probabilistic queries like marginalization and conditioning across multiple machines.
To this end, we propose a highly communication-efficient learning algorithm that leverages the semi-ring structure within the design of FCs.
%In horizontal FL settings, FCs even operate at nearly zero communication cost, which is highly valuable in slow or unreliable communication networks. 
Our experimental evaluation\footnote{Code available at \url{https://anonymous.4open.science/r/federated-spn-5FDC}.} shows that FCs outperform EiNets~\citep{perharz2020einsum} on large-scale density estimation tasks, demonstrating the benefits of scaling up PCs. Additionally, FCs outperform or achieve competitive results on various classification tasks in all federated settings compared to state-of-the-art neural network-based and tree-based methods, demonstrating their effectiveness in FL.
Overall, we make the following contributions:
\begin{enumerate}
    \item[\textbf{(1)}] We introduce FCs, a communication-efficient and scalable FL framework unifying horizontal, vertical, and hybrid FL by mapping the semantics of PCs to FL.
    \item[\textbf{(2)}] We practically instantiate FCs to FedPCs and demonstrate how the FC framework can be leveraged to scale up PCs to large real-world datasets.
    \item[\textbf{(3)}] We propose a novel one-pass training scheme for FedPCs that is compatible with a broad set of learning algorithms.
    % \item[\textbf{(3)}] We show that learning FCs is highly communication efficient in horizontal, vertical, and hybrid FL.
    \item[\textbf{(4)}] We provide extensive experiments demonstrating the effectiveness of our approach for learning large-scale PCs and performing FL. We consider multiple domains (tabular data, image data) and tasks (classification, density estimation).
\end{enumerate}

We proceed as follows: After touching upon related work, we provide the probabilistic view on FL and introduce FCs. Before concluding, we present our extensive experimental evaluation of FedPCs. 


\section{Preliminaries and Related Work}
% \fb{background instead of rw, talk about PC stuff here, briefly introduce FL, motivate the use/application of FL by further scaling PCs, use PC semantic for density estimation in FLs $\rightarrow$ we get all three FL cases automatically; also: much more possibilities than regular FL (more than just predictions; can also do marginal queries, MPE, ...)}
%Federated Circuits are related to two lines of research -- Federated Learning and Probabilistic Circuits -- which have not been combined into a single framework yet. %\notejo{Add more about FedTree}
%We now briefly introduce PCs and FL and revise related work.
In the following, we briefly introduce PCs and FL and give an overview of relevant related work.

\subsection{Probabilistic Circuits}
PCs encode a probability distribution as a computation graph that allows for tractable inference of a wide range of queries such as conditioning (partial evidence) and marginalization.
\citet{perharz2015theoreticalPC} define a PC over random variables $\mathbf{X}$ as a tuple $(\graph, \phi)$ where $\graph = (V, E)$ is a rooted, Directed Acyclic Graph (DAG) and $\phi: V \rightarrow 2^{\mathbf{X}}$ is the \textit{scope} function assigning a subset of random variables to each node in $\graph$. For each internal node $\Node$ of $\graph$ the scope is defined as the union of scopes of its children $\ch{\Node}$. Each leaf node $\Leaf$ computes a distribution/density over its scope. All internal nodes of $\graph$ are either a sum node $\SumNode$ or a product node $\ProductNode$ where each sum node computes a convex combination of its children, i.e., $\SumNode = \sum_{\Node \in \ch{\SumNode}} w_{\SumNode, \Node}\Node$, and each product node computes a product of its children, i.e., $\ProductNode = \prod_{\Node \in \ch{\ProductNode}}\Node$.
To ensure tractability of probabilistic queries such as marginalization, a PC must be \textit{decomposable}. Decomposability requires that for all $\ProductNode \in V$ it holds that $\phi(\Node) \cap \phi(\Node') = \emptyset$ for all distinct children $\Node, \Node' \in \ch{\ProductNode}$. To further ensure that a PC represents a valid distribution, \textit{smoothness} must hold, i.e., for each sum node $\SumNode \in V$ it holds that $\phi(\Node) = \phi(\Node')$ for all $\Node, \Node' \in \ch{\SumNode}$~\citep{perharz2015theoreticalPC}. Decomposable and smooth PCs are often referred to as Sum-Product Networks (SPNs) \citep{domingos2011spns,peharz2015theoretical,sanchez2021sum}.

% There is a vast number of similar models that can be unified in the framework of PCs, such as Arithmetic Circuits \citep{darwiche2003AC}, Probabilistic Decision Graphs \citep{jaeger2004PDG} and SPNs.
% They have been successfully applied to image generation/classification~\citep{perharz2020einsum}, speech recognition~\citep{peharz2014modeling}, and language tasks~\citep{cheng2014language}.
% Other approaches for scaling PCs have been considered in the past.
Several works have tackled the goal of scaling PCs.
On the architecture side, it was shown that large, random structures can be used to scale to larger problems more easily~\citep{peharz2020random}.
Changes in the model layout, such as parallelizable layers and the einsum-operation~\citep{perharz2020einsum} and a reduction in IO operations~\citep{liu2024scalingtractableprobabilisticcircuits}, were also shown to drastically reduce computation time.
\cite{lius2022calingviaLV} improved the performance of PCs by latent variable distillation, where deep generative models give additional supervision during the learning process.

%\textbf{Federated Learning.}
\subsection{Federated Learning}
In federated learning (FL), a set of data owners (or clients) aims to collaboratively learn an ML model without sharing their data. One distinguishes between horizontal, vertical, and hybrid FL depending on how data is partitioned. In horizontal FL, a dataset $\mathbf{D} \in \mathbb{R}^{n \times d}$ is partitioned s.t. each client holds the same $d$ features but different, non-overlapping sets of samples. In vertical FL, $\mathbf{D}$ is partitioned s.t. each client holds the same $n$ samples but different, non-overlapping subsets of the $d$ features. Hybrid FL describes a combination of horizontal and vertical FL where clients can hold both different (but possibly overlapping) sets of samples and features~\citep{wen2023federated, Li_2023FLSurvey}.

For all three FL settings, specifically tailored methods have been proposed to enable collaborative learning of models.
The most common scheme in horizontal FL is to average the models of all clients regularly during training \citep{McMahan2016FedAvg, karimireddy2020mime, Karimireddy2020SCAFFOLD, Sahu2018FedProx}. However, model averaging requires each client to share the same model structure. In vertical FL, clients hold different feature sets; thus, there is no guarantee that the model structure can be shared among clients. In these cases, tree-based and neural models are the predominant choice and are typically learned by sharing data statistics or feature representations among clients \citep{Kourtellis2016VHT, Cheng2021Secureboost, vepakomma2018split,Cellabos2020SplitNN, Tianyi2020VAFL, Liu2019FedBCD}. 
%Recently, neural network-based approaches communicating feature representations across clients have emerged \citep{vepakomma2018split,Cellabos2020SplitNN, Tianyi2020VAFL, Liu2019FedBCD}.  
Similar to tree-based vertical FL, tree-based hybrid FL approaches share data statistics (such as histograms) or model properties (such as split rules) among clients \citep{li2023fedtree, li2024effective}. However, tree-based approaches often require complex training procedures.
%Concerning hybrid FL (all clients hold the same features but different samples) ...
%However, FedPC is the first framework that can perform HFL, VFL and hybrid FL. Also, FedPCs learn a joint distribution over client data rather than splitting criteria, making them both, generative and discriminative.

%In the following, we present an elegant and effective federated learning framework called federated circuits (FCs). FCs unify horizontal, vertical, and hybrid FL by hierarchically learning mixtures (horizontal part) and fusing marginals (vertical part). FCs are compatible with a broad set of training algorithms. Further, FCs allow us to tremendously scale up PCs.
%In the following section, we show how federated circuits -- inspired by ideas from FL -- learn large-scale PCs and how federated circuits -- inspired by modeling approaches of PCs -- perform horizontal, vertical, and hybrid FL within one unified framework.

\section{Federated Circuits}
%\fb{figure for this section??}
This work aims to scale up PCs by splitting data and the model across multiple machines, thus harnessing the availability of compute clusters to train PCs in a federated fashion.
In the following, we present an elegant and effective way to achieve that using our novel federated learning framework called federated circuits (FCs). FCs unify horizontal, vertical, and hybrid FL by hierarchically learning mixtures (horizontal part) and fusing marginals (vertical part). %FCs are compatible with a broad set of training algorithms. 

%More formally, given a dataset $\mathbf{D}$ and a set of clients $\mathcal{C}$ where each $C \in \mathcal{C}$ holds a partition $\mathbf{D}_C$ of $\mathbf{D}$, we aim to learn the joint distribution $p(\mathbf{X})$ over random variables $\mathbf{X}$ (i.e., the features of $\mathbf{D}$).
%Note that the partitioning of $\mathbf{D}$ is not further specified. Hence, each client might only hold a subset of random variables $\mathbf{X}_C \subseteq \mathbf{X}$ with support $\mathcal{X}_C$. This can be interpreted as each $C \in \mathcal{C}$ holding a dataset $\mathbf{D}_C \sim p_C$ where $p_C$ is a joint distribution over $\mathbf{X}_C$ which is related to $p(\mathbf{X})$. Given this problem, we now briefly state our modeling assumptions, followed by bridging federated learning and probabilistic circuits, resulting in our novel framework of federated circuits. 

%In this section, we undertake a comprehensive reevaluation of the Federated Learning problem through a probabilistic lens.
%Our approach involves treating a dataset distributed across multiple clients as a unified virtual dataset. We break it down recursively along sample and feature dimensions, effectively breaking down the FL problem into smaller instances. Depending on the concrete parameterization of FCs, splitting along the sample dimension and feature dimension can be interpreted in different ways such as learning mixtures or identifying independent features. In the subsequent sections, we demonstrate that this straightforward yet potent approach enables a versatile and communication-efficient framework to address HFL, VFL, and hybrid FL.

%\subsection{Federated Learning for PCs and PCs for Federated Learning}
\subsection{Problem Statement \& Modeling Assumptions}
Given a dataset $\mathbf{D}$ and a set of clients $\mathcal{C}$ where each $c \in \mathcal{C}$ holds a partition $\mathbf{D}_c$ of $\mathbf{D}$, we aim to learn the joint distribution $p(\mathbf{X})$ over random variables $\mathbf{X}$ (i.e., the features of $\mathbf{D}$).
The partitioning of $\mathbf{D}$ is not further specified. Hence, each client might only hold a subset of random variables $\mathbf{X}_c \subseteq \mathbf{X}$ with support $\mathcal{X}_c$. This can be interpreted as each $c \in \mathcal{C}$ holding a dataset $\mathbf{D}_c \sim p_c$ where $p_c$ is a joint distribution over $\mathbf{X}_c$ which is related to $p(\mathbf{X})$.

We introduce two critical modeling assumptions relevant for learning a joint distribution $p(\mathbf{X})$ from a dataset $\mathbf{D}$ partitioned across a set of machines.

%In the realm of FL, learning the joint $p(\mathbf{X})$ hinges on a central question: How can we effectively merge local distributions, $p_C$, to accurately represent each client's data and thus the full dataset $\mathbf{D}$? This naturally prompts the notion of treating these local distributions as marginals of an undisclosed global joint distribution spanning all client data. Yet, to validate this view, two critical assumptions must be made.

\begin{assum}[Mixture Marginals]\label{assum:decomposition}
    There exists a joint distribution $p$ such that the relation $\int_{\mathbf{X} \setminus \mathbf{X}_S} p(x) = \sum_{l \in L} q(L=l) \cdot p_{S}(x \mid L=l)$ holds. Here, $\mathbf{X}_{S} \subseteq \mathbf{X}$ is a subset of the union of client random variables $\mathbf{X} = \cup_{c \in \mathcal{C}} \mathbf{X}_c$ with support $\mathcal{X} = \bigtimes_{c \in \mathcal{C}} \mathcal{X}_c$, each $p_{S}$ is defined over $\mathcal{X}_S \subseteq \mathcal{X}$ and $q$ is a prior over a latent $L$.
\end{assum}

%In words, Assumption \ref{assum:decomposition} means that there exists a joint distribution over the union of random variables $\mathbf{X}$ of all clients s.t. we can construct the marginal distribution over any subset $\mathbf{X}_S \subseteq \mathbf{X}$ of random variables as a latent variable model using the local distributions of all clients sharing said subset of random variables. 
To illustrate, consider a subset of variables $\mathbf{X}_S \subseteq \mathbf{X}$ shared among all clients and its complement $\mathbf{X}_{S^-} = \mathbf{X} \setminus \mathbf{X}_S$.
% The marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ should be representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$.
Assumption \ref{assum:decomposition} ensures that the marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ is representable as a mixture of all client distributions $p_c(\mathbf{X}_S)$ over $\mathbf{X}_S$.
If Assumption \ref{assum:decomposition} did not hold, the information stored in the clients' data partitions would not be sufficient to learn $p(\mathbf{X})$.
%Assume data $\mathbf{D}$ is partitioned across clients s.t. there exists a subset of variables $\mathbf{X}_S \subseteq \mathbf{X}$ that is shared among all clients $\mathcal{C}$. Then, the marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ where $\mathbf{X}_{S^-} = \mathbf{X} \setminus \mathbf{X}_S$ should be representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$ since the mixture accumulates all information about $\mathbf{X}_S$ that is held by the clients. 
%If data $\mathbf{D}$ is partitioned horizontally across clients, different local distributions $p_C$ on clients might be induced. However, for any subset of variables $\mathbf{X}_S \subset \mathbf{X}$, there exists only one marginal $\int_{\mathbf{X} \setminus \mathbf{X}_S} p(\mathbf{X})$; hence, this operation does not respect the fact that clients can have different distributions over the same variables as there only exists exactly one marginal. Assumption \ref{assum:decomposition} ensures that the marginal w.r.t. any subset of variables that are shared by a subset of clients $\mathcal{C}' \subseteq \mathcal{C}$ can be represented as the mixture of the client's distributions $p_C$ where $C \in \mathcal{C}'$.

%\fb{Suggestion for previous paragraph:}
%\color{blue}
%To illustrate, consider a subset of variables $\mathbf{X}_S \subseteq \mathbf{X}$ shared among all clients and its inverse $\mathbf{X}_{S^-} = \mathbf{X} \setminus \mathbf{X}_S$.
% The marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ should be representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$.
%Here, Assumption \ref{assum:decomposition} ensures that the marginal $\int_{\mathbf{X}_{S^-}} p(\mathbf{X})$ is representable as a mixture of all client distributions $p_C(\mathbf{X}_S)$ over $\mathbf{X}_S$.
% the information stored on the client's data partitions is sufficient to learn $p(\mathbf{X})$.
%\color{black}

%Now that we have considered the case of clients sharing a subset of random variables let us shift to the case where different clients hold different subsets of random variables. 
A key assumption in FL is that data cannot be exchanged among clients. However, dependencies among variables residing on different clients might still exist. 
%Since data is kept private, these cannot be uncovered via standard techniques such as independence tests. 
To enable learning these ``hidden'' dependencies while keeping data private, we make the following assumption:

\begin{assum}[Cluster Independence]\label{assum:cluster_independence}
    Given disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_n$ and a joint distribution $p(\mathbf{X}_1, \cdots, \mathbf{X}_n)$, assume that a latent $L$ can be introduced s.t. the joint can be represented as $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \sum_l p_{\theta}(L=l) \prod_{i=1}^n p(\mathbf{X}_i | L=l)$ where $p_{\theta}$ is a prior distribution over the latent $L$.
\end{assum}

Note that independence is only assumed within clusters in the data. Thus, the latent variable (which can be thought of as a ``cluster selector'') allows capturing dependencies among variables residing on different clients.
%The parameters $\theta$ of the mixture over latents (i.e., clusters) can be learned, e.g., by optimizing log-likelihood. Also note that setting the cardinality of the support set of the latent to one yields a product distribution, which amounts to assuming mutual independence among variables residing on different clients. Thus, 
Distributions of the form in Assumption \ref{assum:cluster_independence} are strictly more expressive than product distributions and thus allow for more complex modeling:

\begin{fact}\label{prop:mixture_of_prods}
A joint distribution $p$ over disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_n$ of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \sum_l p_{\theta}(L=l) \prod_{i=1}^n p(\mathbf{X}_i | L=l)$ is strictly more expressive than a distribution of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_n) = \prod_{i=1}^n p(\mathbf{X}_i)$. We provide a proof in App.~\ref{app:proofs}.
\end{fact}

%\begin{proofsketch}
%It has to be shown that (1) if $|\text{supp}\{L\}| = 1$, then $p(\mathbf{X}_1, \dots, \mathbf{X}_n)$ is a product distribution and (2) $L$ captures dependencies among random variables residing on different clients. (1) can be easily verified as $p_{\theta}$ becomes a point mass and. To see that (2) holds, one can intuitively think of $n$ 2-dimensional Gaussians, all with covariance matrix $\Sigma = \mathbf{I}$ but different mean vectors $\mathbf{\mu}_i$ where $\mathbf{I}$ is the identity matrix. Letting $L$ determine the mean of each Gaussian allows us to introduce dependencies among the two dimensions of the Gaussians. For a rigorous proof, see App. A. \qed
%\end{proofsketch}

%With Assumption \ref{assum:cluster_independence} we follow the principle of maximum entropy: While allowing for dependencies among client's random variables, we seek the joint distribution with the highest entropy (hence highest information) s.t. the marginals of each client can be computed from said joint distribution. Multiple joint distributions can yield these marginals, but non-maximum entropy options demand additional assumptions or require prior knowledge, restricting possible solutions. By assuming independence of all random variables within a cluster, we ensure that the highest entropy distribution can be constructed easily using a mixture of product distributions. For independent random variables, the highest entropy distribution is the product distribution. 
%Further, we highlight that Assumption \ref{assum:cluster_independence} aligns with the principle of maximum entropy: Although we allow dependencies among the clients' variables, we aim to find the joint distribution with maximum entropy \textit{within} clusters s.t. all marginals are preserved. More formally, we aim to find a joint where all clusters consist of independent components and $\int_{\mathbf{X} \setminus \mathbf{X}_i} p(\mathbf{X}) = p(\mathbf{X}_i)$ for any $\mathbf{X}_i$ holds.
%For more details, refer to App. B.

% \subsection{Federated Circuits}
\subsection{Bridging Probabilistic Circuits and Federated Learning}
\label{subsec:fcs}

%Since we aim to train PCs over multiple machines, we must learn PCs on partitioned datasets.
%Equipped with the stated modeling assumptions from above, we now show the inherent connection between PC semantics and FL. This connection ultimately allows us to train PCs on data partitioned over a set of clients.
We now illustrate an inherent connection between PC semantics and FL.
This will allow us to train PCs on data partitioned over a set of clients and thus greatly increase the scaling potential of PCs.
%\fb{Suggestion for previous paragraph:}
%\color{blue}
%We now illustrate an inherent connection between PC semantics and %FL.
%This will allow us to train PCs on data partitioned over a set of %clients and thus greatly increase the scaling potential of PCs.
%\color{black}

\textbf{Sum Nodes and Horizontal FL.} In horizontal FL, each client is assumed to hold the same set of features, i.e., $\mathbf{X}_c = \mathbf{X}_{c'}$ for all $c, c' \in \mathcal{C}$. However, each client holds a different subset of the data. 
Prominent horizontal FL methods solve this task by aggregating the \textit{model parameters} of locally learned models regularly.
However, the horizontal FL setting also precisely corresponds to the interpretation of sum nodes in PCs: A sum node splits a dataset into multiple disjoint clusters. The distribution over the entire data is then represented as a mixture of the distributions learned from the disjoint clusters. 
Thus, instead of aggregating model parameters, we aggregate the \textit{distributions} learned by each client on its data partition.

\begin{defin}[Horizontal FL]\label{def:HFL}
    Assume a set of samples $\mathbf{D}_c \sim p_c$ on each client $c \in \mathcal{C}$, a joint distribution $p$ adhering to Assumption \ref{assum:decomposition} and that $\mathbf{X}_c = \mathbf{X}_{c'}$ for all $c, c' \in \mathcal{C}$ s.t. $c \neq c'$. We define horizontal FL as fitting a mixture distribution $\hat{p} = \sum_{c \in \mathcal{C}} q(c) \cdot \hat{p}_c$ such that $d(\hat{p}, p)$ and $d(p_c, \hat{p}_c)$ are minimal for all $c \in \mathcal{C}$, where $d$ is a distance metric and the $\hat{p}_c$ are local distribution estimates.
\end{defin}

This view on horizontal FL has an appealing positive side effect: Aggregating model parameters can lead to divergence during training if the clients' data distributions differ significantly. We circumvent the burden of aggregating model parameters by forming a mixture of local models that can be learned independently. Thus, we do not require further assumptions on the clients' distributions. Also, since clients can train models independently, the communication cost of the training is minimized.

\textbf{Product Nodes \& Vertical FL.} In vertical FL, each client is assumed to hold a disjoint set of features, i.e., $\mathbf{X}_c \cap \mathbf{X}_{c'} = \emptyset$ for all $c, c' \in \mathcal{C}$ with $c \neq c'$. In contrast to horizontal FL, all clients hold different features belonging to the same sample instances.
%Since all clients hold different features, existing vertical FL approaches have to share some kind of data statistics (e.g., feature representations) to learn a model collaboratively. 
As in horizontal FL, there is a semantic connection between vertical FL and PCs. Product nodes in PCs compute a product distribution defined on a disjoint set of random variables. Thus, a product node separates the data along the feature dimension, corresponding to the vertical FL setting. However, a product node assumes the random variables of the child distributions to be independent of each other. Obviously, this is an unrealistic assumption for vertical FL, where features held by different clients might be statistically dependent. To capture such dependencies, Assumption \ref{assum:cluster_independence} can be exploited, and a mixture over multiple product distributions can be formed. We will discuss this in detail in Sec. \ref{subsec:FedPCs}.

\begin{defin}[Vertical FL]\label{def:VFL}
    Assume a set of samples $\mathbf{D}_c \sim p_c$ on each data owner $c \in \mathcal{C}$, the existence of a joint distribution $p$ adhering to Assumptions \ref{assum:decomposition} and \ref{assum:cluster_independence} and that $\mathbf{X}_c \cap \mathbf{X}_{c'} = \emptyset$ holds for all $c, c' \in \mathcal{C}$ s.t. $c \neq c'$. We define vertical FL as estimating a joint distribution $\hat{p}$ s.t. $d(p, \hat{p})$ is minimal and $\int_{\mathbf{X} \setminus \mathbf{X}_c} \hat{p}(x) = \hat{p}_{c}(x)$ for all $x \in \mathcal{X}$ where $d$ is a distance metric and $\hat{p}_c$ are estimates of client distributions.
\end{defin}

%\begin{defin}[Hybrid FL]\label{def:FL}
%    Given a data partition $\mathbf{D}_C \sim p_C$ on each client $C \in \mathcal{C}$ s.t. $\mathbf{D} = \bigcup_{C \in \mathcal{C}} \mathbf{D}_C$ and assuming the existence of a joint distribution $p$ from Assumption \ref{assum:decomposition}, we define hybrid federated learning as fitting a joint distribution $\hat{p}$ over all random variables $\mathbf{X}$ s.t. $d(p, \hat{p})$ and $d(p_C, \hat{p}_C)$ are minimal for some distance metric $d$ and all $p_C, \hat{p}_C$ where $p_C$ and $\hat{p}_C$ are the true and estimated marginal distribution over client random variables $\mathbf{X}_C \subseteq \mathbf{X}$.
%\end{defin}
\textbf{PCs \& Hybrid FL.}
Given Defs. \ref{def:HFL} and \ref{def:VFL}, hybrid FL is a combination of both. 
%Note that this definition does not make any assumptions on the topology of the communication network. Both, centralized as well as decentralized FL approaches can be taken without violating Assumption \ref{assum:decomposition} as long as the marginalization of the joint distribution $p$ can be represented as a mixture.
%With Definition \ref{def:FL}, we can draw the central connection between learning PCs and performing (hybrid) FL: 
%While $\mathbf{D}$ is assumed to come from a distribution $p(\mathbf{X})$, each client $C \in \mathcal{C}$ only holds a partition $\mathbf{D}_C$. Since there is no further assumption on the partitioning, the set of random variables and samples can vary across clients. Thus, each client $C$ can only learn a local distribution $p_C$ over the random variables and samples contained in its partition $\mathbf{D}_C$.
In terms of PC semantics, this amounts to building a hierarchy of fusing marginals and learning mixtures.
Provided with these probabilistic semantics, we can now formally bridge PCs and FL.
In the following, we distinguish between clients $\mathcal{C}$ and servers $\mathcal{S}$ and define the set of machines participating in training as $\mathcal{N} = \mathcal{C} \cup \mathcal{S}$. Bringing everything together and abstracting from the probabilistic interpretation, we define \textbf{federated circuits} (FCs) as follows.
%Horizontal FL can now be derived as a special case of Definition \ref{def:FL}.

%Definition \ref{def:HFL} is appealing in various ways: With horizontal FL, we obtain a straightforward way to scale PCs simply by partitioning $\mathbf{D}$ along the sample dimension, followed by learning a mixture of PCs. 
%For FL, Definition \ref{def:HFL} reduces the problem of horizontal FL to fitting a mixture model given estimates of local distributions $\hat{p}_C$. 
%From a FL viewpoint, Definition \ref{def:HFL} shifts the burden of merging distributional information of a set of clients from the model parameter space to the distribution space (or parameter space of the mixture model). This approach empowers independent training of local models without communication and eliminates the need to aggregate model parameters as it is typically done~\citep{McMahan2016FedAvg, Sahu2018FedProx, Karimireddy2020SCAFFOLD}.

%Similar to the horizontal case, vertical FL also becomes a special case of Definition \ref{def:FL}.

% TODO: Put it that way: Instead of directly talking about maximum-entropy distributions (and thus leading to independency assumption), state that variables among clients can have dependencies. Hence we propose our more general model/distribution assumption first, then say that this subsumes maximum-entropy distributions which in turn make a minimal set of assumptions (only independence instead of that each client distribution is representable as a mixture AND mixtures are independent).
%In vertical FL, data is partitioned s.t. clients only see \textit{marginals} of the joint distribution w.r.t. the variables they hold as, by definition, there is no overlap in the feature space among clients. The same can occur in hybrid FL since clients do not necessarily share the same feature space (although overlaps are allowed). Assumption \ref{assum:cluster_independence} ensures that dependencies among variables residing on different clients can be learned in vertical and hybrid FL. Also, Assumptions \ref{assum:decomposition} and \ref{assum:decomposition} ensure that all client distributions $p_C$ can be represented as a marginal of the full joint $p(\mathbf{X})$.

\begin{defin}[Federated Circuits]\label{def:fcs}
A \textbf{federated circuit} (FC) is a tuple $(\graph, \psi_{\graph}, \omega)$ where $\graph = (V, E)$ is a rooted, Directed Acyclic Graph (DAG), $\psi_{\graph}: V \rightarrow \mathcal{N}$ assigns each $\Node \in V$ to a compute node $n \in \mathcal{N}$ based on the structure of $\graph$ and $\omega: V \rightarrow O$ assigns an operation $o \in O$ to each node $\Node \in V$ where $o: \text{dom}(\ch{\Node}) \rightarrow \text{dom}(\Node)$ computes the value of $\Node$ given the values of the children of $\Node$.
\end{defin}

\textcolor{6dQr}{FCs extend the definition of PCs in the sense that FCs represent a 
computational graph $\mathcal{G} = (V, E)$ distributed over multiple machines where arbitrary operations can be performed in each node $\Node \in V$.} Depending on the parameterization of leaves and nodes $\Node$, FCs are not restricted to the probabilistic interpretation presented above. For example, parameterizing leaves by decision trees and introducing a node $\Node$ that performs averaging yields a bagging model.
%For now, let us dive deeper into the probabilistic interpretation of FCs. To that end, we present a concrete instantiation of FCs leveraging Probabilistic Circuits (PCs) as leaf models, resulting in federated PCs (FedPCs).
%Let us now illustrate FCs for learning Probabilistic Circuits (PCs) in a federated environment, resulting in federated PCs (FedPCs).
% TODO: Here we have to introduce Federated Einsums instead of FedSPNs
\subsection{Federated Probabilistic Circuits}
%\notejo{Describe FedPCs and NFCs in one framework here. Both estimate a density on clients, learning should thus be similar or even equal (except the leaf learning).}
\label{subsec:FedPCs}
Let us now dive deeper into the probabilistic interpretation of FCs. To that end, we present a concrete instantiation of FCs leveraging Probabilistic Circuits (PCs) as leaf models, resulting in federated PCs (FedPCs).
Following the probabilistic interpretation from Sec. \ref{subsec:fcs}, we align the PC structure with the communication network structure to form a federated PC.
\begin{defin}[Federated PC]
A Federated PC (FedPC) is an FC where each leaf node $\ClientNode$ is a density estimator and each node $\Node$ s.t. $\ch{\Node} \neq \emptyset$ is either a sum node ($\SumNode$) or a product node ($\ProductNode$).
\end{defin}
% \todo{namely NFs and PCs}
Note that \textcolor{6dQr}{only the client nodes $\ClientNode$ hold a dataset and} we only demand the clients to be parameterized by a density estimator.
%Due to the nature of FedPCs, it is possible to use different density estimators in the leaves.
In order for FedPCs to be computationally efficient, these density estimators should be tractable.
In the following, we parameterize the leaf nodes $\ClientNode$ as PCs.

The operation assignment $\omega$ is omitted in FedPCs as the operations performed by each node are implicitly defined (sum or product).
The assignment function $\psi_{\graph}$ transforms the PC's computation graph into a distributed computation graph aligned to the communication network. This establishes a direct correspondence between PC semantics (computation graph) and the communication network structure in FedPCs. Inference is performed as usual in PCs by propagating likelihood values from the leaf nodes to the root node. The only difference is that the result of a node $\Node$ has to be sent to its parent(s) $\pa{\Node}$ over the communication network if $\psi_{\graph}(\Node) \neq \psi_{\graph}(\Node')$ holds for $\Node' \in \pa{\Node}$.


%\notejo{We can set the parameters of the FC by hand but what about learning them? Problem: Samples are distributed over clients, how can we learn parameters without having access to each sample on each client? Some FedAvg like algo?}
Training FedPCs requires adapting the regular training procedure for PCs. This is mainly because not all clients can access the same samples if data is partitioned horizontally or hybrid. Since a forward pass through a PC requires the same sample to be available on each leaf, prominent learning algorithms such as Expectation Maximization (EM) are not directly applicable in horizontal and hybrid FL settings. In the following, we propose a \textit{one-pass} training procedure of FedPCs that does not require a full forward or backward pass over the model.

\textbf{One-Pass Training.}
% Our one-pass learning algorithm learns the structure and parameters of FedPCs so that local models can be trained independently (see Algo. \ref{alg:fedpcLearning}).
% %First, clients train local models independently, followed by inferring the FedPC parameters residing on the server-/network-side. This can be interpreted as learning to correctly weight the distributions local PCs represent to maximize, e.g., log-likelihood.
% Before training, all clients $c \in \mathcal{C}$ share their feature set/random variables $\mathbf{X}_c$ with a server. Each random variable is assumed to be uniquely identifiable across all clients. Then, the server divides the joint feature space $\mathbf{X}$ into disjoint subspaces $\{\mathbf{X}^{(1)}, \dots, \mathbf{X}^{(s)}\}$ using a unique descriptor vector $\mathbf{u}$. Each subspace $\mathbf{X}^{(j)}$ is constructed s.t. $|\mathbf{X}^{(j)}|$ and the size of $O_{\mathbf{X}^{(j)}} = \{c \in \mathcal{C} | \mathbf{X}^{(j)} \subseteq \mathbf{X}_c \}$ are maximized \textbf{(Line 1-7)}. For example, in Fig. \ref{fig:arch}, the features of partitions 1 and 2 define one subspace as it is the largest subspace covering all clients holding these features (2 clients).

\textcolor{allrev}{
Our one-pass learning algorithm learns the structure and parameters of FedPCs so that local models can be trained independently (Algo. \ref{alg:fedpcLearning}, Fig.~\ref{fig:algo}).
%First, clients train local models independently, followed by inferring the FedPC parameters residing on the server-/network-side. This can be interpreted as learning to correctly weight the distributions local PCs represent to maximize, e.g., log-likelihood.
Before training, all clients $c \in \mathcal{C}$ share their set of uniquely identifiable features/random variables $\mathbf{X}_c$ with a server, resulting in the feature set indicator matrix $\mathbf{M}^{|\mathcal{C}| \times |\mathbf{X}|}$ \textbf{(Lines 1--2)}. Feature identifiers can be names of features such as ``account balance'' and have to correspond to the same random variable on all clients (thus uniquely identifiable). Then, the server divides the joint feature space $\mathbf{X}$ into disjoint subspaces by considering all unique columns ($\mathbf{u}$) in $\mathbf{M}$. Non-unique columns indicate sets of features with cardinality $>1$ held by multiple clients and, thus, can be modeled as a mixture in the FedPC. Hence, the subspaces $\{\mathbf{S}^{(1)}, \dots, \mathbf{S}^{(l)}\}$ represent sets of features shared by a set of clients $\{O_{\mathbf{S}^{(1)}}, \dots, O_{\mathbf{S}^{(l)}}\}$ such that the number of subspaces $l$ is minimized \textbf{(Lines 3--7)}. For example, in Fig.~\ref{fig:arch}, the features of partitions 1 and 2 define one subspace as the largest subspace covering all clients holding these features (2 clients).
}

% Afterward, the FedPC structure is constructed: First, we build a mixture (sum node) for each subspace $\mathbf{X}^{(j)}$ where $|O_{\mathbf{X}^{(j)}}| > 1$, i.e., more than one client holds $\mathbf{X}^{(j)}$ \textbf{(Line 9-12)}. This enables each client to learn a PC over $\mathbf{X}^{(j)}$ independently. After that,$|O_{\mathbf{X}^{(j)}}| = 1$ holds for all remaining $\mathbf{X}^{(j)}$. Also, the scope of the sums nodes introduced in the FedPC share no features with any of the remaining $\mathbf{X}^{(j)}$ 
% since the server divided the feature space into disjoint subspaces. Therefore, we can use Prop. \ref{prop:mixture_of_prods} and introduce $P$ product nodes to construct the remaining part of the FedPC.

\begin{wrapfigure}[30]{R}{.5\textwidth}
\vspace{-0.3cm}
\hspace{0.15cm}
\begin{algorithm}[H]
\caption{One-Pass Training}\label{alg:fedpcLearning}
\KwData{Clients $\mathcal{C}$, features $\mathbf{X}$, cluster size $K$, FedPC fedPC}
\KwResult{Trained fedPC}
$\mathbf{M} = \mathbf{0}^{|\mathcal{C}| \times |\mathbf{X}|}$\;
$\mathbf{M}_{i, j} = 1$ if $X^{(j)}$ on client $i$\;
map $= []$\;
\For{$j, \mathbf{u}$ in enum(unique\_cols($\mathbf{M}$))}{
    $\mathbf{S}^{(j)} = \{i : i \in \{1, \dots, |\mathbf{X}|\} \land \text{all}(\mathbf{u} == \mathbf{M}_{:, i}) \}$\;
    $O_{\mathbf{S}^{(j)}} = \text{argwhere}(\mathbf{u} == 1)$\;
    map.append($\mathbf{S}^{(j)}$, $O_{\mathbf{S}^{(j)}}$)\;
}
sums $= []$\;
\For{$\mathbf{S}^{(j)}$, $O_{\mathbf{S}^{(j)}}$ in map}{
    \If{$|O_{\mathbf{S}^{(j)}}| > 1$}{
        s $=$ fedPC.add\_sum($\mathbf{S}^{(j)}$, $O_{\mathbf{S}^{(j)}}$)\;
        sums.add(s)
    }
    \Else{
        client\_clusters $=$ cluster\_local\_data($O_{\mathbf{S}^{(j)}}$, $K$)\;
    }
}
products $=$ fedPC.add\_products($P$)\;
\For{prod in products}{
    prod.children.add(sums)\;
    \For{client, clusters in client\_clusters}{
        prod.children.add\_rand\_subset(clusters)\;
    }
}
fedPC.add\_mixture\_over\_products(products)\;
fedPC.train\_clients()\;
fedPC.infer\_weights()\;
\Return fedPC
\end{algorithm}
\end{wrapfigure}

Afterward, the FedPC structure is constructed (bottom part of Fig.~\ref{fig:algo}): First, we build a mixture (sum node) for each subspace $\mathbf{S}^{(j)}$ where $|O_{\mathbf{S}^{(j)}}| > 1$, i.e., more than one client holds $\mathbf{S}^{(j)}$ \textbf{(Lines 9--12)}. This enables each client to learn a PC over $\mathbf{S}^{(j)}$ independently. After that, $|O_{\mathbf{S}^{(j)}}| = 1$ holds for all remaining $\mathbf{S}^{(j)}$. Also, the scopes of the sum nodes introduced in the FedPC share no features with any of the remaining $\mathbf{S}^{(j)}$ 
since the server divided the feature space into disjoint subspaces. Therefore, we can use Prop. \ref{prop:mixture_of_prods} and introduce $P$ product nodes to construct the remaining part of the FedPC.

% To this end, we divide the data of all subspaces $\mathbf{X}^{(j)}$ where $|O_{\mathbf{X}^{(j)}}| = 1$ holds into $K$ into clusters \textbf{(Line 14)}. Each client learns a dedicated PC for each cluster. To ensure that the FedPC spans the 

To this end, we divide the data of all subspaces $\mathbf{S}^{(j)}$ where $|O_{\mathbf{S}^{(j)}}| = 1$ holds into $K$ clusters \textbf{(Line 14)}. Each client learns a dedicated PC for each cluster. To ensure that the FedPC spans the
%
\begin{figure}
    \centering
    \includegraphics[width=.9\textwidth]{federated-circuits/images/Algotithm.pdf}
    \caption{\textcolor{4iCr}{\textbf{One-Pass Training Visualized.} (Top) First, the matrix $\mathbf{M}$ is initialized, representing which features lie on which clients. The unique descriptor vector $\mathbf{u}$ groups clients with the same feature subset. This forms a mapping indicating which features are available on each client. (Bottom) This mapping is utilized by first combining features that lie on different clients with sum nodes. Other features will be clustered into $K$ clusters (here $K=2$). The final FedPC is constructed by creating product nodes containing all the sum nodes from the previous steps and at least one of the $K$ clusters. Lastly, the root node (sum node) is inserted.}}
    \label{fig:algo}
\end{figure}%
%
% \begin{wrapfigure}[29]{R}{.5\textwidth}
% \vspace{-0.3cm}
% \hspace{0.15cm}
% \begin{algorithm}[H]
% \caption{One-Pass Training}\label{alg:fedpcLearning}
% \KwData{Clients $\mathcal{C}$, features $\mathbf{X}$, cluster size $K$, FedPC fedPC}
% \KwResult{Trained fedPC}
% $\mathbf{M} = \mathbf{0}^{|\mathcal{C}| \times |\mathbf{X}|}$\;
% $\mathbf{M}_{i, j} = 1$ if $X^{(j)}$ on client $i$\;
% map $= []$\;
% \For{$j, \mathbf{u}$ in enum(unique\_cols($\mathbf{M}$))}{
%     $\mathbf{X}^{(j)} = \{i : i \in \{1, \dots, |\mathbf{X}| \land \text{all}(\mathbf{u} == \mathbf{M}_{:, i})\} \}$\;
%     $O_{\mathbf{X}^{(j)}} = \text{argwhere}(\mathbf{u} == 1)$\;
%     map.append($\mathbf{X}^{(j)}$, $O_{\mathbf{X}^{(j)}}$)\;
% }
% sums $= []$\;
% \For{$\mathbf{X}^{(j)}$, $O_{\mathbf{X}^{(j)}}$ in map}{
%     \If{$|O_{\mathbf{X}^{(j)}}| > 1$}{
%         s $=$ fedPC.add\_sum($\mathbf{X}^{(j)}$, $O_{\mathbf{X}^{(j)}}$)\;
%         sums.add(s)
%     }
%     \Else{
%         client\_clusters $=$ cluster\_local\_data($O_{\mathbf{X}^{(j)}}$, $K$)\;
%     }
% }
% products $=$ fedPC.add\_products($P$)\;
% \For{prod in products}{
%     prod.children.add(sums)\;
%     \For{client, clusters in client\_clusters}{
%         prod.children.add\_rand\_subset(clusters)\;
%     }
% }
% fedPC.add\_mixture\_over\_products(products)\;
% fedPC.train\_clients()\;
% fedPC.infer\_weights()\;
% \Return fedPC
% \end{algorithm}
% \end{wrapfigure}
% entire feature space of the clients, the children of product nodes are set as follows: Each sum node introduced 
% in the FedPC already becomes a child of each product node. Additionally, for each $\mathbf{X}^{(j)}$ where $|O_{\mathbf{X}^{(j)}}| = 1$ holds, we randomly select a PC learned over one of the $K$ clusters s.t. the scope of each product node spans $\mathbf{X}$, and each PC representing a cluster is the child of at least one product node.
% Then, we build a mixture over all product nodes using a sum node \textbf{(Line 15-20)}.
% Once the FedPC is constructed, all client-sided PCs are learned. Since clients learn their PCs independently, each client can use an arbitrary learning algorithm (even different ones).
% As a last step, the network-sided parameters, i.e., the weights of network-sided sum nodes, of the FedPC are inferred \textbf{(Line 21-22)}. For each sum node $\SumNode$, the weight $\mathbf{w}_{\SumNode}^{(i)}$ associated with the $i$-th child (i.e., distribution) of $\SumNode$ is set to $\frac{\rho(\Node_i)}{\sum_i \rho(\Node_i)}$. Here, $\rho(\Node_i) = \sum_{\ClientNode \in \ch{\Node_i}} |\mathbf{D}_{\ClientNode}|$ where $\mathbf{D}_{\ClientNode}$ is the dataset used to train the leaf $\ClientNode$. Hence, the network-sided weights can be inferred without any forward or backward pass. 
% Note that this approach reduces horizontal FL to learning a mixture of the client's data distributions and vertical FL to learning a mixture over $P$ product nodes.
%
entire feature space of the clients, the children of product nodes are set as follows: Each sum node introduced 
in the FedPC becomes a child of each product node. Additionally, for each $\mathbf{S}^{(j)}$ where $|O_{\mathbf{S}^{(j)}}| = 1$ holds, we randomly select a PC learned over one of the $K$ clusters s.t. the scope of each product node spans $\mathbf{X}$, and each PC representing a cluster is the child of at least one product node.
Then, we build a mixture over all product nodes using a sum node \textbf{(Lines 15-20)}.
Once the FedPC is constructed, all client-sided PCs are learned. Since clients learn their PCs independently, each client can use an arbitrary learning algorithm (even different ones).
As a last step, the network-sided parameters, i.e., the weights of network-sided sum nodes, of the FedPC are inferred \textbf{(Lines 21--22)}. For each sum node $\SumNode$, the weight $\mathbf{w}_{\SumNode}^{(i)}$ associated with the $i$-th child (i.e., distribution) of $\SumNode$ is set to $\frac{\rho(\Node_i)}{\sum_j \rho(\Node_j)}$. Here, $\rho(\Node_i) = \sum_{\ClientNode \in \ch{\Node_i}} |\mathbf{D}_{\ClientNode}|$ where $\mathbf{D}_{\ClientNode}$ is the dataset used to train the leaf $\ClientNode$. Hence, the network-sided weights can be inferred without any forward or backward pass. 
Note that this approach reduces horizontal FL to learning a mixture of the clients' data distributions and vertical FL to learning a mixture over $P$ product nodes.

Next, we analyze the communication efficiency of our proposed learning algorithm.

%\subsection{Federated Circuits and Bagging}
%\label{subsec:bagging}
%We already unveiled the deep connection between PCs and FL that we formalized in the FC framework. Interestingly, FCs have a connection to another prominent modeling approach, namely bagging. If the leaves of a PC are replaced by discriminative learners that learn a conditional distribution $p_C(\mathbf{Y} | \mathbf{X}_C)$ instead of a joint, and the FC performs a simple averaging operation, the FC can be interpreted as a bagging model.

\subsection{Analysis of Communication Efficiency}\label{subsec:CommunicationAnalysis}
%\notejo{This analysis does only hold for FedPCs (and maybe NFCs)!}
Communication efficiency is a key requirement for efficient training when learning models at scale on partitioned data, such as in FL. We now analyze the communication efficiency of FedPCs. 

\textbf{Horizontal FL.}
Assume a client set $\mathcal{C}$ where each client holds a model with $M$ parameters. Further, assume models are aggregated $K$ times during training ($K$ communication rounds). Then, model aggregation-based algorithms like FedAvg commonly used in horizontal FL send $\mathcal{O}(M \cdot |\mathcal{C}| \cdot K)$ messages over the network as each client sends $M$ model parameters to a server in each communication round. Training FedPCs with one-pass training, in contrast, only requires $\mathcal{O}(|\mathcal{C}| \cdot (M + 1))$ messages over the network as models are learned locally and independently of each other, followed by setting the parameters ($\mathcal{O}(|\mathcal{C}|)$ messages) of the sum nodes and aggregating the model on the server ($\mathcal{O}(M |\mathcal{C}|)$ messages).

\textbf{Vertical FL.}
In vertical settings, SplitNN-like architectures are commonly used. Assume training a SplitNN architecture for $E$ epochs that outputs a feature vector of size $F$ for each sample of a dataset with $S$ samples, vertically distributed over clients $\mathcal{C}$. The training requires sending $\mathcal{O}(E \cdot |\mathcal{C}| \cdot F \cdot S)$ messages over the network. In contrast, with one-pass training of FedPCs, each client learns a dedicated PC with $M$ parameters for each of the $K$ clusters that are learned. The last layer of the FedPC is a mixture of $P$ products of clusters. The mixture parameters are set after training each client's model. 
Aggregating the learned models and setting the network-sided mixture parameters requires $\mathcal{O}(K \cdot M \cdot |\mathcal{C}| + P)$ messages to be sent. 
If $(K \cdot M + \frac{P}{|\mathcal{C}|}) < (E \cdot F \cdot S)$ holds, training FedPCs is more communication efficient than training SplitNN-like architectures. In practice, this is likely to hold: The number of clusters is usually smaller than $100$ while feature vectors can have hundreds of dimensions (i.e., $F > 100$). Further, models should have fewer parameters than samples in the dataset to ensure generalization (i.e., $M < S$). $P$ can be set to an arbitrary value, depending on $|\mathcal{C}|$ and the data.
App. \ref{app:commeff} provides more details and an intuition on communication costs.
%For an intuition on communication costs, we show a plot on communication costs w.r.t. client number, model- and data size in App. C.

\textbf{Hybrid FL.}
In hybrid FL, FedPCs are trained on several subspaces: There are subspaces present on all or a subset of clients (denoted as $R_s$) and there are subspaces only available on one client (denoted as $R_d$). Further denote communication costs of FedPCs in horizontal FL and vertical FL as $C_h$ and $C_v$, respectively. Since the training procedure in hybrid cases essentially performs horizontal FL on shared feature spaces and vertical FL on disjoint feature spaces, $\mathcal{O}(|R_s| \cdot C_h + |R_d| \cdot C_v)$ messages are sent over the network during training.

\begin{rem}
    When scaling PCs using FedPCs, we do not aggregate the models after training. This distributes computation load across multiple machines also during inference and further decreases communication costs during training.
\end{rem}

% \subsection{FedPCs for Large-Scale Problems \fb{remove?}}
% % FedPCs are a novel contribution to the FL community, as they provide a framework for combining probability distributions in a hybrid FL setting. 
% Apart from providing a general framework for combining probability distributions in a hybrid FL setting, the intersection of PCs with FL results in benefits for scaling tractable, probabilistic models to large-scale problems.
% While most previous work has focused on speed or scaling of PCs on single devices, parallelizing across devices not only increases speed of training and inference, which might be crucial in real applications where fast inference is necessary, but, thereby, also allows for training larger models in the same or even a smaller amount of time.
% As we will show in our experiments, leveraging the FL setting for PCs improves computation speed throughout and, particularly for complex problems with a large number of samples and features, can also increase performance by large amounts.
% FedPCs allow for combining different types of probability density estimators, so any future progress on PCs or other models can easily be integrated into our framework, improving performance even further.

\section{Experiments}
% \begin{table}[]
% \resizebox{\columnwidth}{!}{
% \centering
% \begin{tabular}{c|ccccccc}
%        & cent.           & \multicolumn{2}{c}{horizontal}    & \multicolumn{2}{c}{vertical}         & \multicolumn{2}{c}{hybrid}        \\
%        & LL              & LL              & Rel. Runtime    & LL              & Rel. Runtime       & LL              & Rel. Runtime    \\ \hline
% MNIST  & $3352 \pm 3.5$  & $3350 \pm 3.2$  & $\mathbf{0.07} \pm \mathbf{0.01}$ & $3351 \pm 3.8$  & $0.13 \pm 0.01$    & $3349 \pm 3.7$  & $0.13 \pm 0.02$ \\
% Income & $-11.5 \pm 0.1$ & $-11.4 \pm 3.5$ & $\mathbf{0.17} \pm \mathbf{0.02}$ & $-11.9 \pm 3.3$ & $0.236 \pm $0.01\$ & $-12.0 \pm 1.5$ & $0.21 \pm 0.02$ \\
% Cancer & $-38.9 \pm 0.3$ & $-38.5 \pm 1.1$ & $\mathbf{0.21} \pm \mathbf{0.07}$ & $-38.6 \pm 0.5$ & $0.35 \pm 0.05$    & $-38.7 \pm 1.5$ & $0.35 \pm 0.1$  \\
% Credit & $-12.8 \pm 1.0$ & $-13.1 \pm 0.5$ & $0.42 \pm 0.05$ & $-12.5 \pm 2.3$ & $\mathbf{0.31} \pm 0.09$    & $-12.5 \pm 1.3$ & $0.40 \pm 0.13$
% \end{tabular}
% }
% \caption{\textbf{FedPCs speed up training while retaining model performance.} We trained PCs in a centralized setting (cent.) and in all FL settings (using FedPCs) on different datasets and the same structure learning algorithm. We find that FedPCs tremendously speed up training (reported as relative runtime w.r.t. centralized training where relative centralized runtime is 1). Also, FedPCs achieve the same log-likelihood as centralized PC on all datasets, demonstrating that PCs can be learned in federated settings (positive log-likelihoods due to Gaussian leaves).}
% \label{tab:likleihoods}
% \end{table}

\begin{table}[]
\resizebox{\columnwidth}{!}{
\centering
\begin{tabular}{c|cccc|cccc}
       & \multicolumn{4}{c}{Log-Likelihood}    & \multicolumn{4}{c}{Relative Runtime} \\
       & cent              & horizontal              & vertical    & hybrid              & cent & horizontal       & vertical              & hybrid    \\ \hline
MNIST  & $3352$\scriptsize{$\pm 3.5$}  & $3350$\scriptsize{$\pm 3.2$}  & $3351$\scriptsize{$\pm 3.8$} & $3349$\scriptsize{$\pm 3.7$} & $1.0$ & $\mathbf{0.07}$\scriptsize{$\pm \mathbf{0.01}$} & $0.13$\scriptsize{$\pm 0.01$}  & $0.13$\scriptsize{$\pm 0.02$} \\
Income & $-11.5$\scriptsize{$\pm 0.1$} & $-11.4$\scriptsize{$\pm 3.5$} & $-11.9$\scriptsize{$\pm 3.3$} & $-12.0$\scriptsize{$\pm 1.5$} & $1.0$ & $\mathbf{0.17}$\scriptsize{$\pm \mathbf{0.02}$} & $0.236$\scriptsize{$\pm 0.01$} & $0.21$\scriptsize{$\pm 0.02$} \\
Cancer & $-38.9$\scriptsize{$\pm 0.3$} & $-38.5$\scriptsize{$\pm 1.1$} & $-38.6$\scriptsize{$\pm 0.5$} & $-38.7$\scriptsize{$\pm 1.5$} & $1.0$ & $\mathbf{0.21}$\scriptsize{$\pm \mathbf{0.07}$} & $0.35$\scriptsize{$\pm 0.05$} & $0.35$\scriptsize{$\pm 0.1$}  \\
Credit & $-12.8$\scriptsize{$\pm 1.0$} & $-13.1$\scriptsize{$\pm 0.5$} & $-12.5$\scriptsize{$\pm 2.3$} & $-12.5$\scriptsize{$\pm 1.3$} & $1.0$ & $0.42$\scriptsize{$\pm 0.05$} & $\mathbf{0.31}$\scriptsize{$\pm 0.09$} & $0.40$\scriptsize{$\pm 0.13$}
\end{tabular}
}
\caption{\textbf{FedPCs speed up training while retaining model performance.} We trained PCs in a centralized setting (cent.) and in all FL settings (using FedPCs) on different datasets and the same structure learning algorithm. We find that FedPCs tremendously speed up training (reported as relative runtime w.r.t. centralized training where relative centralized runtime is 1.0) while there is no reduction in log-likelihood. This demonstrates that PCs can be learned in federated settings (positive log-likelihoods due to Gaussian leaves).}
\label{tab:likleihoods}
\end{table}
%\todo{add dataset information and setup}
In our empirical evaluation, we corroborate that FedPCs can be leveraged to effectively scale up PCs via data and model 

\begin{wrapfigure}[17]{R}{.5\textwidth}
    \vspace{-0.1cm}
    \includegraphics[width=\linewidth]{federated-circuits/images/runtimes.pdf}
    \caption{\textbf{FedPCs speed up training.} Due to parallel training on multiple, separate data partitions, FedPCs tremendously speed up training compared to EiNet (shown in relative speed-up).}
    \label{fig:runtime}
\end{wrapfigure}
partitioning. By performing horizontal, vertical and hybrid FL in one unified framework, we obtain high-performing models with the same or improved performance compared to prominent FL baselines. 
%Also, we confirm that FCs perform horizontal, vertical, and hybrid FL in one unified framework, yielding high-performing models. 
%The main goal of this work was to scale up PCs by distributing model training to multiple machines. On this path, we found a deep connection between PCs and FL, resulting in a novel and flexible FL framework of federated circuits. 
%Thus, we evaluate FCs w.r.t. both its upscaling capabilities of PCs and its performance in horizontal, vertical, and hybrid FL.
We aim to answer the following questions: \textbf{(Q1)}~Can FedPCs decrease the required training time and successfully learn a joint distribution over distributed data? \textbf{(Q2)}~Do FedPCs effectively scale up PCs, thus yielding more expressive models?
\textbf{(Q3)}~How do FCs with different parameterizations perform on classification tasks compared to existing FL methods? \textbf{(Q4)}~How does our one-pass learning algorithm compare to training with the EM algorithm?

\textbf{Experimental Setup.} To see if FedPCs, an instantiation of FCs, successfully scale up PCs, we follow \citet{liu2024scalingtractableprobabilisticcircuits} and perform density estimation on three large-scale, high-resolution image datasets:  Imagenet, Imagenet32 (both 1.2M samples), and CelebA (200K samples). The datasets were partitioned over 2-16 clients horizontally. We compare FedPCs to EiNets and PyJuice.

To evaluate FCs in FL scenarios, we selected three tabular datasets that cover various application domains and data regimes present in the real world: one credit fraud dataset ($\sim 300$K samples), a medical dataset (breast cancer detection; $<1000$ samples), and the popular Income dataset ($>1$M samples). The selected datasets for FL cover low-data, medium-data, and large-data regimes\footnote{see App.~\ref{app:exp_details} for more details}. Both balanced (breast cancer) and imbalanced (income, credit) datasets are included in our evaluation. We selected tabular datasets as they are well suited to investigate FCs in horizontal, vertical, and hybrid settings and represent various real-world applications. We compare FCs to FedAvg (horizontal) and SplitNN (vertical), both using TabNet~\citep{arik2020tabnetattentiveinterpretabletabular} as neural network architecture parameterization. Additionally, we compare FCs to FedTree~\citep{li2023fedtree}. For more details on the experimental protocol, see App. \ref{app:exp_details}.

\textbf{\textbf{(Q1)} FedPCs learn joint distributions over 
partitioned data in less time.} First, we validate that FedPCs
correctly and efficiently perform density estimation on partitioned datasets 
distributed over multiple clients. To this end, multiple tabular datasets were distributed over a set of clients corresponding to horizontal (5 clients),
vertical (2 clients), and hybrid FL (2 clients). To demonstrate that FedPCs are also robust against label shifts, a common regime in FL, each client received data from only a subset of classes in the horizontal case, and local PCs were learned over the client samples. 
%Afterward, we aggregated the PCs on server side and constructed the FC by adding a sum-node (the root of the FedPC) with weights set as $w_C = \frac{|\mathbf{D}_C|}{\sum_{C \in \mathcal{C}} |\mathbf{D}_C|}$. 
In the vertical case, we split data s.t. feature spaces of clients 
are disjoint, but
each client 
holds the same samples. In hybrid settings, data was distributed s.t. both feature- and sample-spaces among 
clients have overlaps (but no full overlap).
For all tabular datasets, the leaves of the FedPC were parameterized with MSPNs~\citep{molina2018MSPNs}, a member of the 
PC model family that is capable of performing density estimation on mixed data domains (i.e., continuous as well as discrete random variables). 
We chose MSPNs as the centralized models, which were learned using \textsc{learnSPN}, a recursive greedy structure learning algorithm for SPNs~\citep{gens2013LearnSPN}. For MNIST, EiNets with Gaussian densities were used as PC instantiations in all settings.

Tab. \ref{tab:likleihoods} compares log-likelihood scores and relative runtime of centralized training of a PC on the full datasets with log-likelihood scores and relative runtimes achieved by FedPC in different FL settings. FedPCs successfully reproduce the results of centralized PCs on tabular datasets while being tremendously faster in training. This validates our approach and we answer \textbf{(Q1)} affirmatively.
%\begin{figure}[t]
%    \centering
%    \includegraphics[width=\columnwidth]{federated-circuits/images/lls_rt.pdf}
%    \caption{\textbf{FedPCs efficiently scale up PCs.} We performed centralized training of a PC and training of FedPCs with varying number of clients participating (repeated 10 times). Each client trained a separate model on a subset of Imagenet, i.e. the model size increases with the number of clients. Scaling up to more clients lead to a significant decrease in runtime (lower is better) while achieving better negative log-likelihood values (lower is better) due to larger models. \notejo{}}
    %\label{fig:lls_vs_rt}
%\end{figure}

\textbf{\textbf{(Q2)} FedPCs effectively scale up PCs.} To examine whether FedPCs can be leveraged to scale up PCs effectively, we trained an EiNet, PyJuice, and FedPC on CelebA, Imagenet32, and Imagenet. All models used the Poon-Domingos (PD) architecture. FedPCs were parameterized with EiNets, and data was distributed among 2, 4, 8, and 16 clients. The FedPC model and baseline models (EiNets and PyJuice) were selected to ensure that each fits within a single GPU (see App. \ref{app:exp_details} for system details). \textcolor{qUmr}{All models were parameterized with Categorical leaf distributions.} Before training, data was clustered 
\begin{wraptable}[12]{r}{.5\textwidth}
%\vspace{cm}
    %\begin{table}
    %\centering
    \begin{tabular}{c|cc}
    \multicolumn{1}{l|}{} & EM & one-pass \\ \hline
    Synth. Data  & $-53.6 \pm 1.3$ & $-53.2 \pm 1.2$  \\
    Income  &  $-18.5 \pm 0.1$        & $-18.0 \pm 0.5$ \\
    Breast-Cancer & $-52.3 \pm 0.2$ & $-55.7 \pm 0.2$ \\
    Credit & $-26.7 \pm 1.2$ & $-28.3 \pm 0.4$ 
    \end{tabular}
    \caption{\textbf{One-pass training retains performance.} We trained the same FedPC architecture on various datasets using EM and one-pass training in a vertical setting. The average log-likelihood value of the hold-out test set across 10 runs is reported. 
    %\notejo{Why is 2step better on Income?}
    }
    \label{tab:e2evs2s}
    %\end{table}
\end{wraptable}
on encodings of a pre-trained Vision Transformer~\citep{dosovitskiy2021imageworth16x16words}, and the images were distributed horizontally, s.t. each client holds approximately equally large clusters. To ensure a fair comparison, EiNets and PyJuice were trained using the same clusters. The leaves and all baselines were trained with EM.
In Tab. \ref{tab:imageLLs}, we show nats normalized over samples and dimensions achieved by EiNets, PyJuice, and FedPC on the test set. For Imagenet and Imagenet32, model performance improves with more participating clients. On CelebA, nats decrease when we scale up to two participating clients. For 8 and 16 clients, nats increase again. We posit that this is because CelebA consists of a low number of relatively homogeneous clusters. Thus, increasing the cluster and model size to 8/16 could lead to overfitting and thus decreasing log-likelihoods. Since Imagenet consists of much more heterogeneous images, larger models and a larger number of clusters are beneficial for learning (see App. \ref{app:results} for more details).
Additionally, using a larger number of clients reduces training time significantly (see Fig. \ref{fig:runtime}). FedPCs thus efficiently scale tractable probabilistic models to large datasets.

\textbf{\textbf{(Q3)} FCs achieve state-of-the-art classification results in FL.}
FCs can be parameterized with different models in the leaves. We examine two parameterizations to solve a federated classification task on three tabular datasets. First, we use the FedPC (FC [PC]) from \textbf{(Q1)}, which can be used to solve discriminative tasks leveraging tractable computation of conditionals in PCs. The second FC parameterization we examine is decision trees (FC [DT]), representing an instantiation of a bagging model.
To see how FCs perform in federated classification tasks, we compare FCs to well-known methods for horizontal FL and vertical FL. The experiments were conducted on tabular datasets covering various real-world application domains and distribution properties. 
We employ TabNet and FedTree as strong baselines. In the horizontal FL setting, TabNet was trained using FedAvg; in the vertical FL setting, it was trained in a SplitNN fashion~\citep{Cellabos2020SplitNN}. The results were compared against our one-pass training. FCs yield comparable or even better results than the selected baselines on all datasets (see Fig.~\ref{fig:fl_results}; App. \ref{app:results}) while being significantly more flexible since FCs can be trained with the same unified procedure in all FL settings. In contrast, training neural networks requires substantial changes to the training procedure once the FL setting switches. Hence, FCs are more flexible while still competitive or better than prominent FL baselines.
\begin{table}[t]
\centering
\begin{tabular}{l|lll}
                  & CelebA             & Imagenet32          & Imagenet              \\ \hline
EiNet~\citep{perharz2020einsum}              & 5842.62 $\pm$ 94.9 & 682.82 $\pm$ 3.50   & -5893.59 $\pm$ 84.79  \\
PyJuice~\citep{liu2024scalingtractableprobabilisticcircuits}            & 4228.14 $\pm$ 25.5  & 664.54 $\pm$ 6.41  & -5732.21 $\pm$ 71.25   \\
FedPC (2 clients)  & \textbf{6337.50} $\pm$ \textbf{98.3} & 1044.38 $\pm$ 8.02  & -4971.36 $\pm$ 120.83 \\
FedPC (4 clients)  & \underline{6279.98} $\pm$ \underline{86.9} & 1196.39 $\pm$ 1.50  & -2330.87 $\pm$ 162.17 \\
FedPC (8 clients)  & 6019.53 $\pm$ 96.3 & \textbf{1205.05} $\pm$ \textbf{2.72}  & \underline{-1818.17} $\pm$ \underline{81.12}  \\
FedPC (16 clients) & 5387.62 $\pm$ 82.9 & \underline{1197.69} $\pm$ \underline{10.79} & \textbf{-1157.23} $\pm$ \textbf{74.29} 
\end{tabular}
\caption{\textbf{FedPCs outperform EiNets and PyJuice on density estimation tasks.} FedPCs achieve better results on density estimation tasks on three challenging image datasets (CelebA, Imagenet32 and Imagenet). This is because FedPCs can learn far larger models distributed across multiple machines. Results are reported as log-likelihood values (higher is better). Note that we used Gaussian \textit{densities} as PC leaves; thus, log-likelihood can get positive. Best value in \textbf{bold}, 2nd best \underline{underlined}.}
\label{tab:imageLLs}
\end{table}
\begin{figure}[t]
    \centering
    \includegraphics[width=\textwidth]{federated-circuits/images/fl_settings_results.pdf}
    \caption{\textbf{FCs are competitive to prominent FL methods in all settings.} FCs achieve competitive performance on various classification tasks compared to prominent horizontal/vertical FL baselines. FCs also handle the more challenging setting of hybrid FL without performance drops. We reported the F1 score as we consider binary classification tasks with imbalanced datasets.}
    \label{fig:fl_results}
\end{figure}

\textbf{\textbf{(Q4)} One-pass training retains performance.} To see how the proposed one-pass training compares to training PCs
with standard optimization algorithms such as EM, we define an FL setup where data exchange is allowed. 
This is necessary as we have to train the PC and FedPC architecture with EM to compare to our one-pass procedure. We used RAT-SPNs~\citep{peharz2020random} as leaf
parameterizations of the FedPC. Then, we trained a FedPC using standard EM (i.e., data exchange was allowed) and another FedPC with the same FedPC architecture on a vertically split dataset using our one-pass procedure. 
We report the final average log-likelihood of the test dataset, both for EM training 
and one-pass training (see Tab. \ref{tab:e2evs2s}). It can be seen that there is no significant decrease in log-likelihood in any case. 
%Interestingly, the one-pass training seems even to be better than EM. We suspect that that it is easier to solve the subtasks of local training independently instead of jointly optimizing all parameters of the FedPC. 
Hence, our results indicate that one-pass training is preferable since it is communication efficient. 
%one-pass training can be used instead of the more costly EM scheme.

\section{Conclusion}
In this work, we introduced federated circuits that hinge on an inherent connection between PCs and FL. We demonstrated that both the training speed and expressivity of PCs can be increased by learning PCs at scale across partitioned data.
%model performance due to a larger amount of computational resources that can be leveraged.
Since our framework allows for the integration of various types of density estimators, other models and advances of PCs and other fields can be integrated seamlessly, maintaining the relevance of the federated approach for scaling.

\textbf{Limitations and Future Work.}
While our experiments showed that scaling PCs can considerably improve training speed and performance, scaling to such large-scale models requires sufficient computational resources.
For future work, investigating other parametrizations for FCs beyond PCs is promising.
Additionally, it is interesting how the probabilistic framework for hybrid FL could also benefit more traditional FL applications, apart from scaling PCs.

%\section{Conclusion}
%Various approaches have been proposed in the literature to scale up PCs to larger, real-world datasets. 
%In this work, we proposed to scale PCs by splitting them across multiple machines and training them on partitioned data. This unveiled a deep connection between PCs and FL, leading to a novel and highly flexible federated learning framework, called federated circuits. We demonstrated that FCs can be leveraged to efficiently scale up PCs on three large image datasets, beating existing scaling approaches to scale PCs. Further, our experiments showed that FCs achieve competitive results compared to strong FL baselines in all FL settings while being more flexible than our baselines. This makes FCs an interesting candidate for scaling PCs and various federated learning applications. 
%Numerous methods tackle horizontal, vertical, or hybrid FL individually. In this work, we revisited FL with a probabilistic lens, paving the way for unified FL approaches that address all three settings in a single algorithm. We demonstrated that our framework encompasses both, HFL and VFL and introduced a novel FL method employing PCs, yielding FedPCs. Our findings highlight FedPCs' capacity to efficiently learn joint distribution approximations, even in scenarios where data owners do not share features or samples. Also, we demonstrated that FedPCs are an effective way to scale up PCs to large scale datasets such as Imagenet.
%\textbf{Limitations.} 
%In this work, we presented Federated Circuits, a general framework to tackle FL. 
%While we demonstrated that, depending on its parameterization, FCs are closely related to PCs, employing other models is possible. Further investigation on the effectiveness of federated circuits, e.g., normalizing flows as a parameterization, is interesting and can open FCs for even more applications.

%\textbf{Further Work.} 
%Further promising research avenues include investigating whether the communication efficiency of federated circuits can be further improved with methods from distributed learning such as ring reduce~\citep{lee2020flexreduce}. 
%Further, it would be interesting to assess the effectiveness of Federated Circuits with model classes other than PCs.

\bibliography{bibfile}
\bibliographystyle{iclr2025_conference}

\appendix
\newpage
\section{Notation}
\textcolor{4iCr}{
The following table provides an overview of all symbols used throughout the paper, each with a brief description.}
\begin{table}[h!]
\begin{tabular}{l|l}
Symbol & Meaning                                                                                                                                                       \\ \hline
$\mathbf{X}$      & Set of random variables                                                                                                                                       \\
$\mathbf{X}_c$   & Set of random variables on client c                                                                                                                           \\
$\mathbf{D}$      & Dataset                                                                                                                                                       \\
$\mathbf{D}_c$   & Dataset on client c                                                                                                                                           \\
$\mathcal{C}$      & set of clients                                                                                                                                                \\
$p$      & joint distribution                                                                                                                                            \\
$p_c$   & marginal distribution over all random variables held by client c                                                                                              \\
$\hat{p}$  & distribution from data                                                                                                                                        \\
$\Node$      & node in PC/FC                                                                                                                                                 \\
$\ClientNode$      & client node in FC                                                                                                                                             \\
$\SumNode$, $\ProductNode$   & Sum/Product node in PC/FC                                                                                                                                     \\
$\psi$    & scope function in PC/FC                                                                                                                                       \\
$\omega$  & \begin{tabular}[c]{@{}l@{}}function assigning compute nodes to nodes of FC. \\ Defines alignment between FC structure and communication network.\end{tabular}
\end{tabular}
\end{table}


\section{Proofs}
\label{app:proofs}
In this section we give full proofs for our propositions in the paper.

\subsection{Fact 1}
A joint distribution $p$ over disjoint sets of random variables $\mathbf{X}_1, \cdots, \mathbf{X}_c$ of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_c) = \sum_l p_{\theta}(L=l) \prod_k^c p(\mathbf{X}_k | L=l)$ is strictly more expressive than a distribution of the form $p(\mathbf{X}_1, \cdots, \mathbf{X}_c) = \prod_k^c p(\mathbf{X}_k)$.

\begin{proof}
We have to prove two things here: (1) A mixture consisting of one component equals the product distribution for the distribution family assumed in Proposition 1 and (2) a latent variable model is strictly more expressive than the product distribution.

\textbf{(1):} For a latent $L$ with $|\text{supp}\{L\}| = 1$ (hence $p(L)$ is a point mass), $\sum_l p_{\theta}(L=l) \prod_{k=1}^c p(\mathbf{X}_k | L=l) = \prod_{k=1}^c p(\mathbf{X}_k)$ holds, as $p_{\theta}(L=l) = 1$ for the only $l \in \text{supp}\{L\}$. Also, if there is only one mixture component, conditioning on the only component has no effect, i.e. $p(\mathbf{X}_k | L=l) = p(\mathbf{X}_k)$.

\textbf{(2):} Assume an $n$-dimensional space $\mathcal{X}_k = \mathcal{X}_{k_1} \times \dots \times \mathcal{X}_{k_n}$ for each set of variables $\mathbf{X}_k$ and a $c \times n \times m$ tensor $\mathbb{X}$ of random variables where each $\mathbb{X}_k$ corresponds to a matrix/set of random variables $\mathbf{X}_k = (X_{1 1}, \dots, X_{n m})$, i.e. there exist $m$ random variables per dimension of $\mathcal{X}_k$. Further assume a distribution $p_{\theta_{k i j}}$ for each $\mathbb{X}_{k i j}$ parameterized by $\theta_{k i j}$ and that $\mathbb{X}_{k i j} \indep \mathbb{X}_{k' l j}$ holds for all $k \neq k'$ and $l \neq i$. Note that this does not forbid dependencies among variables within each matrix $\mathbb{X}_k$. Due to our independence assumption we can define distributions $p_{\mathbf{\theta}_{j}} = \prod_{k=1}^c p(\mathbb{X}_{k : j})$ for each $j$. Since each of these distributions is defined over $\mathcal{X}$, we can introduce a latent $L$ with support $\{1, \dots, m\}$ and associated prior $p_{\theta}(L)$, yielding a mixture of $c$ components over vectorized random variables. Hence we can write $p(\mathbb{X}) = \sum_{l=1}^c p_{\theta}(L=l) \cdot p(\mathbb{X} | L=l)$. This can be rewritten as $p(\mathbb{X}) = \sum_{l=1}^c p_{\theta}(L=l) \cdot p(\mathbb{X}_l)$. As each $p(\mathbb{X}_l)$ is a product distribution over random variables corresponding to some mixture component $j$, rewriting yields $p(\mathbb{X}) = \sum_{l=1}^c p_{\theta}(L=l) \cdot \prod_{j=1}^c p(\mathbb{X}_{l : j})$. Using (1), setting $|\text{supp}\{L\}| = 1$ and setting the number of mixtures also to $1$ yields a special case, namely the product distribution over the only defined mixture component $j$, i.e. $\prod_j p(\mathbb{X}_{l : j})$. Hence a mixture as we have defined it is strictly more expressive than a single product distribution.
\end{proof}

%\begin{proofsketch}
%We consider the two cases that $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent and that they are not mutually independent. Note that the joint entropy can be written as $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X} | \mathbf{Y}) + h(\mathbf{Y})$. In the case of mutual independence, this reduces to $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) + h(\mathbf{Y})$. Hence it has to be shown that $h(\mathbf{X} | \mathbf{Y}) < h(\mathbf{X})$ holds if $\mathbf{X}$ and $\mathbf{Y}$ are not mutually independent. Intuitively, if $\mathbf{Y}$ lets us infer something about $\mathbf{X}$, the conditional entropy should be lower than the marginal entropy. Exploiting that the mutual information $I(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) - h(\mathbf{X} | \mathbf{Y}) \geq 0$, it follows that entropy is maximized if $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent. The full proof can be found in App. A. \qed
%\end{proofsketch}

\subsection{Proposition 2}
Assumption 2 aligns with the principle of maximum entropy: we aim to find the joint distribution with maximum entropy \textit{within} clusters while allowing for dependencies among clients’ random variables and ensuring the marginals for each client are preserved. Although multiple joint distributions can preserve the marginals, non-maximal entropy solutions introduce additional assumptions or prior knowledge, limiting flexibility. By assuming independence of all variables within a cluster, we efficiently construct the maximum entropy distribution via a mixture of product distributions.
For independent variables, the product distribution maximizes entropy, as can be shown by leveraging the joint and conditional differential entropy.
Given random variables $\mathbf{X} = X_1, \dots, X_n$ and a density $p$ defined over support $\mathcal{X} = \mathcal{X}_1 \times \cdots \times \mathcal{X}_n$, the joint differential entropy is defined as:
\begin{equation}
    h(\mathbf{X}) = - \int_{\mathcal{X}} p(x_1, \dots, x_n) \, \log p(x_1, \dots, x_n)
\end{equation}
The conditional differential entropy for two sets of random variables $\mathbf{X}$ and $\mathbf{Y}$ and a joint distribution $p(\mathbf{X}, \mathbf{Y})$ defined over support $\mathcal{X} \times \mathcal{Y}$ is defined analogously: 
\begin{equation}
    h(\mathbf{X} | \mathbf{Y}) = - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \, \log p(\mathbf{x} | \mathbf{y})
\end{equation}
Given two sets of random variables $\mathbf{X}$, $\mathbf{Y}$ with densities $p(\mathbf{X})$ and $p(\mathbf{Y})$ and support $\mathcal{X}$, $\mathcal{Y}$ respectively, the joint $p(\mathbf{X}, \mathbf{Y}) = p(\mathbf{X}) \cdot p(\mathbf{Y})$ is the maximum entropy distribution if $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent.
\begin{proof}
 We consider the two cases that $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent and that they are not mutually independent. The joint entropy can be written as $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X} | \mathbf{Y}) + h(\mathbf{Y})$. In the case of mutual independence, this reduces to $h(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) + h(\mathbf{Y})$. Hence it has to be shown that $h(\mathbf{X} | \mathbf{Y}) < h(\mathbf{X})$ holds if $\mathbf{X}$ and $\mathbf{Y}$ are not mutually independent:
 \begin{align*}
     & h(\mathbf{X} | \mathbf{Y}) < h(\mathbf{X}) \\
     \equiv & - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x} | \mathbf{y}) < - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x}) \\
     \equiv & - \bigg( \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x} | \mathbf{y}) - \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} p(\mathbf{x}) \bigg) < 0 \\
     \equiv & - \bigg( \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \text{log} \frac{p(\mathbf{x} | \mathbf{y})}{p(\mathbf{x})} \bigg) < 0
 \end{align*}
Since $\mathbf{X} \indep \mathbf{Y}$ does not hold, where $\indep$ denotes mutual independence, $\frac{p(\mathbf{x} | \mathbf{y})}{p(\mathbf{x})} \neq 1$ at least for some $\mathbf{x}, \mathbf{y}$. The mutual information $I(\mathbf{X}, \mathbf{Y}) = \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \log \frac{p(\mathbf{x}, \mathbf{y})}{p(\mathbf{x}) \cdot p(\mathbf{y})}$ can be represented as $I(\mathbf{X}, \mathbf{Y}) = h(\mathbf{X}) - h(\mathbf{X} | \mathbf{Y})$ and satisfies $I(\mathbf{X}, \mathbf{Y}) \geq 0$, with equality if and only if $\mathbf{X}$ and $\mathbf{Y}$ are mutually independent. Since $- \Big( \int_{\mathcal{X}, \mathcal{Y}} p(\mathbf{x}, \mathbf{y}) \log \frac{p(\mathbf{x} | \mathbf{y})}{p(\mathbf{x})} \Big) = h(\mathbf{X} | \mathbf{Y}) - h(\mathbf{X}) = - I(\mathbf{X}, \mathbf{Y})$ and $I(\mathbf{X}, \mathbf{Y}) > 0$ in the dependent case, it follows that $h(\mathbf{X}) > h(\mathbf{X} | \mathbf{Y})$.
 
\end{proof}

\section{Algorithms}\label{app:algos}
In this section we provide pseudo-code for the distributed EM training algorithm and for the distributed backward pass it relies on.

\subsection{EM Training}
In vertical FL settings, a full forward and backward pass can be computed in FedPCs. Thus, we provide a distributed EM training algorithm here.
\begin{algorithm}[h!]
\caption{EM Training}\label{alg:e2e}
\KwData{FedPC-parameter tuple $\langle s, p\rangle$}
\KwData{Distributed Dataset $\mathbf{D}$}
\KwResult{Trained FedPC $s$}
$g \gets 0$ \\
\For{random batch $\mathbf{x}$ from $\mathbf{D}$}{
    $\ell \gets \text{log}(s(\mathbf{x}))$ \\
    $\nabla_p s(\mathbf{x}) \gets$ distributed\_backward$(\ell, \mathbf{x}, s, p)$ \\
    em\_step($p$, $\nabla_p s(\mathbf{x})$)  
}
\end{algorithm}

\begin{algorithm}[h!]
\caption{Distributed Backward}\label{alg:dist_back}
\KwData{FedPC-parameter tuple $\langle s, p\rangle$}
\KwData{Batch $\mathbf{x}$}
\KwData{Log-likelihood $\ell$}
\KwResult{List of parameter-gradient tuples}
$g \gets 0$ \\
gradients $\gets []$ \\
\For{sum node $\SumNode \in s$}{
    $g_{\pa{\SumNode}} \gets []$ \\
    \For{$N \in \pa{\SumNode}$}{
        \If{$N \not \in \phi(\SumNode)$}{
            obtain $\nabla_N(\mathbf{x}) \ell$ from $\phi(\SumNode)$
        }
        \Else{
            compute $\nabla_N(\mathbf{x}) \ell$
        }
        add $\nabla_N(\mathbf{x}) \ell$ to $g_{\pa{\SumNode}}$
    }
    
    compute $g_{p(\SumNode)} \gets \sum_{g \in g_{\pa{\SumNode}}} \nabla_{p(\SumNode)} \sum_{c \in \ch{\SumNode}} p_c(\SumNode) c(\mathbf{x})$ \\
    add $\langle p(\SumNode), g_{p(\SumNode)} \rangle$ to gradients
}
\Return gradients
\end{algorithm}

\newpage
%\subsection{One-Pass Training}
%\begin{algorithm}[h!]
%\caption{One-Pass Training}\label{alg:2step}
%\KwData{Client PCs $s$, Distributed Dataset $\mathbf{D}$, Setting $\sigma \in \{$ horizontal, vertical, hybrid $\}$}
%\KwResult{Trained FedPC $s_f$}
%\For{Client PC $s_c \in s$}{
%    train $s_c$ with client data $\mathbf{D}_c$ \\
%}
%$s_f \gets$ construct\_fpc($s$, $\sigma$) \\
%$\mathbf{l} \gets (s_1(\mathbf{D}_1), \dots, s_c(\mathbf{D}_c))$ \\
%EM($s_f$, $\mathbf{l}$)
%\end{algorithm}
%
%\begin{algorithm}[h!]
%\caption{Split Feature Space}\label{alg:split_features}
%\KwData{Clients $\mathcal{C}$}
%\KwResult{Split Feature Space}
%$M \gets \{\}$ \\
%$\mathbf{X} \gets \bigcup_{C \in \mathcal{C}} \mathbf{X}_C$ \\
%\For{$X \in \mathbf{X}$}{
%    $O \gets \{C: C \in \mathcal{C} | X \in \mathbf{X}_C\}$ \\
%    $M[X] \gets O$ \\
%}
%$\hat{O} \gets$ set($M$.values()) \\
%\For{$O' \in \hat{O}$}{
%    $F_{O'} \gets \{X: \langle X, O \rangle \in M | O = O'\}$ \\
%    send $F_{O'}$ to all clients in $O'$
%}
%\end{algorithm}
%
%\begin{algorithm}[h!]
%\caption{Build Mixture of Products}\label{alg:fed_spn_head}
%\KwData{Client PCs $s$, number of clusters per client $N$, number of product nodes $P$}
%\KwResult{FedPC structure $s_f$}
%    product\_nodes $\gets \{\}$ \\
%    not\_visited $\gets$ union($\{s[c]$ for $c \in \{1, \dots |s|\}\}$) \\
%    product\_nodes $\gets \{\}$ \\
%    \For{$p \in \{1, \dots, P\}$}{
%        nodes $\gets \{\}$ \\
%        \For{$c \in \{1, \dots, |s|\}$}{
%            $i \sim \mathcal{U}(0, N)$ \\
%            add $s[c][i]$ to nodes \\
%            remove $s[c][i]$ from not\_visited
%        }
%        prod $\gets$ ProductNode(nodes) \\
%        add prod to product\_nodes
%    }
%    \If{$|not\_visited| > 0$}{
%        prod $\gets$ ProductNode(not\_visited) \\
%        add prod to product\_nodes
%    }
%    pc = SumNode(product\_nodes) \\
%    \Return pc
%\end{algorithm}
%
%\begin{algorithm}[h!]
%\caption{Construct FedPC}\label{alg:fed_spn_head}
%\KwData{Client PCs $s$, Number of clusters per client $N$, Number of product nodes $P$, Mapping Feature Space to Clients $M$, Setting $\sigma \in \{$ horizontal, vertical, hybrid $\}$}
%%\KwData{Number of clusters per client $N$ (if vertical)}
%%\KwData{Number of product nodes $P$ (if vertical)}
%%\KwData{Mapping Feature Space to Clients $M$}
%%\KwData{Setting $\sigma \in \{$ horizontal, vertical, hybrid $\}$}
%\KwResult{FedPC structure $s_f$}
%\If{$\sigma$ = horizontal}{
%    pc = SumNode($s$)
%}
%\If{$\sigma$ = vertical}{
%    pc = build\_mixture\_of\_products($s$, $N$, $P$)
%}
%\Else{
%    mixtures $\gets \{ \}$ \\
%    pcsVistied $\gets \{ \}$ \\
%    mixtureSpaces $\gets \{ \langle \mathbf{X}, O \rangle \in M : |O| > 1 \}$ \\
%    \For{$\langle \mathbf{X}, O \rangle \in \text{mixtureSpaces}$}{
%        $s' \gets \{n \in s |$ scope($n$) $= O \}$ \\
%        add $s'$ to pcsVisited \\
%        add SumNode($s'$) to mixtures
%    }
%    
%    nodes $\gets$ mixtures $\cup \{s' \in s | s' \not \in$ pcsVisited$\}$ \\
%    pc = ProductNode(nodes) \\
%    \Return pc
%}
%\end{algorithm}
%
\section{Further Results}
\label{app:results}
Here, we provide further experimental details on FCs. 

\textbf{Model Parameter Ablation.} 
To validate our results, we provide an additional ablation study on the effect the model size (measured in the number of parameters) has on the final model performance. To this end, we trained models of different sizes (1.2M, 34M, and 99M parameters) on CelebA. We used equally clustered data (2, 4, 8, or 16 clusters) and trained a mixture of EiNets in each run to ensure that no other effects affect the result. We find that the model parameters have a significant effect on the final model performance (reported as log-likelihood) and larger models achieve better log-likelihood values. Thus, our ablation confirms that scaling PCs is crucial to obtaining high-quality density estimates on complex data. For detailed results, see Tab. \ref{tab:ablation}.

\begin{table}[h!]
\begin{tabular}{l|llll}
            & 2 clusters           & 4 clusters            & 8 clusters           & 16 clusters           \\ \hline
1.2M param. & -3692.40 $\pm$ 67.07 & -3263.54 $\pm$ 102.60 & -3668.98 $\pm$ 87.66 & -5145.27 $\pm$ 64.28  \\
34M param.  & 1659.57 $\pm$ 65.02  & 1154.19 $\pm$  55.31  & 481.02 $\pm$ 103.37  & -1104.55 $\pm$ 109.69 \\
99M param.  & \textbf{5011.55} $\pm$ \textbf{95.57}  & \textbf{4388.37} $\pm$ \textbf{67.94}   & \textbf{3727.43} $\pm$ \textbf{71.29}  & \textbf{2208.78} $\pm$ \textbf{38.82}                     
\end{tabular}
\caption{\textbf{Model size significantly influences log-likelihood.} We trained mixtures of EiNets of various sizes on the same clustering of CelebA to validate our results from the main paper. The model size has a crucial influence on the final model performance and larger models achieve better log-likelihoods.}
\label{tab:ablation}
\end{table}

\textbf{FL Classification Results.} We compare FCs to several baselines in horizontal, vertical, and hybrid FL. In horizontal FL, we compare against FedAvg (using TabNet~\citep{arik2020tabnetattentiveinterpretabletabular}) and FedTree~\citep{li2023fedtree}; in vertical FL, we compare against SplitNN (also using TabNet) and FedTree. In hybrid FL, we compare different parameterizations of FCs (FedPCs and FCs parameterized with decision trees). We find that FCs are competitive with or outperform the selected baselines in all FL settings (see Tab.~\ref{tab:full_FL_results}). This makes them a very flexible FL framework that still yields high-performing models.
% \begin{table}[]
% \begin{tabular}{c|cccccc}
%                              & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
%                              & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
% FedAvg {[}TabNet{]} (5 cl.)  & $0.92 \pm 0.03$ & $0.92 \pm 0.03$ & $0.71 \pm 0.11$ & $0.48 \pm 0.04$ & $0.68 \pm 0.06$ & $0.51 \pm 0.03$ \\
% FedAvg {[}TabNet{]} (10 cl.) & $0.92 \pm 0.04$ & $0.91 \pm 0.05$ & $0.56 \pm 0.12$ & $0.47 \pm 0.06$ & $0.64 \pm 0.06$ & $0.52 \pm 0.03$ \\
% FedTree (5 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.91 \pm 0.01$ & $0.63 \pm 0.01$ & $0.88 \pm 0.01$ & $0.82 \pm 0.02$ \\
% FedTree (10 cl.)             & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
% FC {[}PC{]} (5 cl.)          & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.02$ & $0.68 \pm 0.02$ & $0.87 \pm 0.02$ & $0.80 \pm 0.01$ \\
% FC {[}PC{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.95 \pm 0.02$ & $0.93 \pm 0.01$ & $0.66 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.02$ \\
% FC {[}DT{]} (5 cl.)          & $0.95 \pm 0.03$ & $0.93 \pm 0.02$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.89 \pm 0.01$ & $0.83 \pm 0.01$ \\
% FC {[}DT{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.93 \pm 0.03$ & $0.92 \pm 0.01$ & $0.97 \pm 0.02$ & $0.89 \pm 0.01$ & $0.83 \pm 0.02$ \\
% SplitNN {[}TabNet{]}         & -               & -               & -               & -               & -               & -              
% \end{tabular}
% \caption{\textbf{FCs achieve competing performance in horizontal setting.}}
% \end{table}

% \begin{table}[]
% \begin{tabular}{c|cccccc}
%                              & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
%                              & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
% SplitNN {[}TabNet{]} (2 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.56 \pm 0.25$ & $0.42 \pm 0.17$ \\
% SplitNN {[}TabNet{]} (3 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.62 \pm 0.20$ & $0.56 \pm 0.16$ \\
% FedTree (2 cl.)              & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
% FedTree (3 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
% FC {[}PC{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.84 \pm 0.02$ & $0.74 \pm 0.01$ \\
% FC {[}PC{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.01$ & $0.92 \pm 0.01$ & $0.66 \pm 0.02$ & $0.84 \pm 0.01$ & $0.74 \pm 0.01$ \\
% FC {[}DT{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.83 \pm 0.02$ & $0.67 \pm 0.02$ \\
% FC {[}DT{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.03$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.67 \pm 0.02$ \\
% FedAvg {[}TabNet{]}          & -               & -               & -               & -               & -               & -              
% \end{tabular}
% \caption{\textbf{FCs achieve competing performance in vertical setting.}}
% \end{table}

% \begin{table}[]
% \begin{tabular}{c|cccccc}
%                      & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
%                      & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
% FC {[}PC{]} (2 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.82 \pm 0.02$ & $0.71 \pm 0.01$ \\
% FC {[}PC{]} (3 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.02$ & $0.80 \pm 0.01$ & $0.70 \pm 0.01$ \\
% FC {[}DT{]} (2 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
% FC {[}DT{]} (3 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.93 \pm 0.01$ & $0.54 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
% FedAvg {[}TabNet{]}  & -               & -               & -               & -               & -               & -               \\
% SplitNN {[}TabNet{]} & -               & -               & -               & -               & -               & -               \\
% FedTree              & -               & -               & -               & -               & -               & -              
% \end{tabular}
% \caption{\textbf{FCs can handle hybrid FL settings without performance deterioration.}}
% \end{table}

\begin{table}[h!]
\resizebox{\textwidth}{!}{
\begin{tabular}{c|c|cccccc}
&                             & \multicolumn{2}{c}{Cancer}        & \multicolumn{2}{c}{Credit}        & \multicolumn{2}{c}{Income}        \\ \hline
&                             & Acc.            & F1              & Acc.            & F1              & Acc.            & F1              \\
\multirow{9}{*}{\rotatebox{90}{Horizontal FL}} & FedAvg {[}TabNet{]} (5 cl.)  & $0.92 \pm 0.03$ & $0.92 \pm 0.03$ & $0.71 \pm 0.11$ & $0.48 \pm 0.04$ & $0.68 \pm 0.06$ & $0.51 \pm 0.03$ \\
& FedAvg {[}TabNet{]} (10 cl.) & $0.92 \pm 0.04$ & $0.91 \pm 0.05$ & $0.56 \pm 0.12$ & $0.47 \pm 0.06$ & $0.64 \pm 0.06$ & $0.52 \pm 0.03$ \\
& FedTree (5 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.91 \pm 0.01$ & $0.63 \pm 0.01$ & $0.88 \pm 0.01$ & $0.82 \pm 0.02$ \\
& FedTree (10 cl.)             & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
& FC {[}PC{]} (5 cl.)          & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.02$ & $0.68 \pm 0.02$ & $0.87 \pm 0.02$ & $0.80 \pm 0.01$ \\
& FC {[}PC{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.95 \pm 0.02$ & $0.93 \pm 0.01$ & $0.66 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.02$ \\
& FC {[}DT{]} (5 cl.)          & $0.95 \pm 0.03$ & $0.93 \pm 0.02$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.89 \pm 0.01$ & $0.83 \pm 0.01$ \\
& FC {[}DT{]} (10 cl.)         & $0.95 \pm 0.02$ & $0.93 \pm 0.03$ & $0.92 \pm 0.01$ & $0.97 \pm 0.02$ & $0.89 \pm 0.01$ & $0.83 \pm 0.02$ \\
& SplitNN {[}TabNet{]}         & -               & -               & -               & -               & -               & -              \\ \hline
\multirow{9}{*}{\rotatebox{90}{Vertical FL}} & SplitNN {[}TabNet{]} (2 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.56 \pm 0.25$ & $0.42 \pm 0.17$ \\
& SplitNN {[}TabNet{]} (3 cl.) & $0.98 \pm 0.01$ & $0.98 \pm 0.01$ & $0.93 \pm 0.01$ & $0.48 \pm 0.01$ & $0.62 \pm 0.20$ & $0.56 \pm 0.16$ \\
& FedTree (2 cl.)              & $0.94 \pm 0.01$ & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.02$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
& FedTree (3 cl.)              & $0.93 \pm 0.01$ & $0.92 \pm 0.01$ & $0.92 \pm 0.01$ & $0.69 \pm 0.01$ & $0.87 \pm 0.01$ & $0.80 \pm 0.01$ \\
& FC {[}PC{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.84 \pm 0.02$ & $0.74 \pm 0.01$ \\
& FC {[}PC{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.01$ & $0.92 \pm 0.01$ & $0.66 \pm 0.02$ & $0.84 \pm 0.01$ & $0.74 \pm 0.01$ \\
& FC {[}DT{]} (2 cl.)          & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.83 \pm 0.02$ & $0.67 \pm 0.02$ \\
& FC {[}DT{]} (3 cl.)          & $0.95 \pm 0.01$ & $0.95 \pm 0.03$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.67 \pm 0.02$ \\
& FedAvg {[}TabNet{]}          & -               & -               & -               & -               & -               & -              \\ \hline
\multirow{7}{*}{\rotatebox{90}{Hybrid FL}} & FC {[}PC{]} (2 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.01$ & $0.82 \pm 0.02$ & $0.71 \pm 0.01$ \\
& FC {[}PC{]} (3 cl.)  & $0.94 \pm 0.01$ & $0.94 \pm 0.01$ & $0.92 \pm 0.01$ & $0.67 \pm 0.02$ & $0.80 \pm 0.01$ & $0.70 \pm 0.01$ \\
& FC {[}DT{]} (2 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.02$ & $0.93 \pm 0.01$ & $0.60 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
& FC {[}DT{]} (3 cl.)  & $0.96 \pm 0.01$ & $0.96 \pm 0.01$ & $0.93 \pm 0.01$ & $0.54 \pm 0.02$ & $0.82 \pm 0.02$ & $0.66 \pm 0.02$ \\
& FedAvg {[}TabNet{]}  & -               & -               & -               & -               & -               & -               \\
& SplitNN {[}TabNet{]} & -               & -               & -               & -               & -               & -               \\
& FedTree              & -               & -               & -               & -               & -               & -        
\end{tabular}
}
\caption{\textbf{All classification results of FL experiments.} Here, we show the detailed performances of FCs, FedAvg, SplitNN, and FedTree in all three FL settings. It can be seen that FCs, while being much more flexible than our baselines, still achieve competitive or better results on various classification tasks.}
\label{tab:full_FL_results}
\end{table}

\begin{table}[h!]
\centering
\begin{tabular}{l|lll}
                  & CelebA             & Imagenet32          & Imagenet              \\ \hline
EiNet~\citep{perharz2020einsum}              & 5.37 & 5.74   & 6.28  \\
PyJuice~\citep{liu2024scalingtractableprobabilisticcircuits}            & 5.56  & 5.75  & 6.27   \\
FedPC (2 clients)  & \textbf{5.31}  & 5.57  & 6.24 \\
FedPC (4 clients)  & \underline{5.32}  & 5.51  & 6.15 \\
FedPC (8 clients)  & 5.35 & \textbf{5.49}  & \underline{6.13}  \\
FedPC (16 clients) & 5.42 & \underline{5.51} & \textbf{6.10} 
\end{tabular}
\caption{\textbf{FedPCs outperform EiNets and PyJuice on density estimation tasks.} FedPCs achieve better results on density estimation tasks on three challenging image datasets (CelebA, Imagenet32 and Imagenet). This is because FedPCs can learn far larger models distributed across multiple machines. Results are reported as bits per dimension (bpd) averaged over 5 runs (lower is better). Note that we used Gaussian \textit{densities} as PC leaves. Best value in \textbf{bold}, 2nd best \underline{underlined}.}
\label{tab:imageBPDs}
\end{table}

\clearpage
\section{Communication Efficiency}
\label{app:commeff}
Communication efficiency is a critical property when learning models across multiple machines, as is done in FL. Here, in addition to our theoretical results, we provide a more intuitive illustration of the communication efficiency of FCs. To this end, we plot the communication cost in Megabytes (MB) required to train a FedPC vs.\ FedAvg/SplitNN in horizontal/vertical FL settings with datasets of different sizes (1M and 100M samples). Regardless of the number of samples in the dataset, FedPCs are more communication efficient than our baselines in both horizontal and vertical settings (see Fig.~\ref{fig:comm_effic}).

\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.4]{federated-circuits/images/communication_cost.pdf}
    \caption{\textbf{FedPCs are communication-efficient.} We compare communication cost in Megabytes (MB) sent over the network during one full training of a model (0.5M/50M parameters) on a dataset (1M/100M samples) using results from Section 3.4. Results are shown on log-scale. It can be seen that FedPCs significantly reduce communication cost of training.}
    \label{fig:comm_effic}
\end{figure}

\section{Experimental Details}
\label{app:exp_details}
\subsection{Datasets}
The following describes the datasets used in our experiments. If not stated differently, the datasets were distributed across clients as follows: 

In horizontal cases, we either split samples randomly across clients (done for all binary classification tasks) or we distribute a subset of the dataset corresponding to a certain label (e.g. the 0 in MNIST) to one client. 

In vertical cases, we split tabular datasets randomly along the feature-dimension, i.e. each client gets all samples but a random subset of features assigned. For image data, we split the images into non-overlapping patches which were then distributed to the clients.

In hybrid cases, we split tabular datasets along both the feature and the sample dimension. We do this s.t. at least two clients have at least one randomly chosen feature in common (but hold different samples thereof). For image data, we split images into overlapping patches, sample a subset of the dataset and assign the resulting subsets to clients.

\textbf{Income Dataset.}
We used the Income dataset from \url{https://www.kaggle.com/datasets/wenruliu/adult-income-dataset}. This dataset represents a binary classification problem with 14 features and approximately 450K samples in the training set and 900 samples in the test set. We encoded discrete variables to numerical values using TargetEncoder from sklearn. Additionally, missing values were imputed using the median of the corresponding feature. Further, we standardized all features.

\textbf{Breast Cancer Dataset.}
We used the Breast Cancer dataset from \url{https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data}. It represents a binary classification problem with 31 features and 570 samples. We split the dataset into 450 training samples and 120 test samples. We standardized all features for training.

\textbf{Credit Dataset.}
We used the Give Me Some Credit dataset from \url{https://www.kaggle.com/c/GiveMeSomeCredit}. The dataset represents a binary classification task with 10 features, 1.5M training samples and 100K test samples. We encoded discrete variables to numerical values using TargetEncoder from sklearn. Additionally, missing values were imputed using the median of the corresponding feature. Further we standardized all features.

\textbf{MNIST.}
We used the MNIST dataset provided by pytorch. It contains 70K hand-written digits between 0 and 9 as 28x28 images (60K train, 10K test). We standardized all features as preprocessing.

\textbf{Imagenet/Imagenet32.}
We used the Imagenet dataset provided by pytorch. It consists of about 1.2M images showing objects of 1000 classes. The images come in different resolutions; we resized each image to 112x112 (Imagenet) and 32x32 (Imagenet32) pixels, applied center cropping, and standardized all features as preprocessing. In our experiments, we used a pre-trained Vision Transformer (ViT)~\citep{dosovitskiy2021imageworth16x16words} to obtain encodings of each image. Then, we applied KMeans to cluster the dataset into $n$ clusters (depending on the number of clients participating). Images of each cluster were then distributed to the clients, defining the client's datasets.

\subsection{Hyperparameters}
The following tables show the setting of all relevant hyperparameters for each dataset and FL setting.
\begin{table}[h!]
\centering
\begin{tabular}{c|ccccc}
FL-Setting                  & Dataset & Structure & Threshold & min\_num\_instances & glueing       \\ \hline
\multirow{3}{*}{horizontal} & Income  & learned   & 0.3       & 200                 & -             \\
                            & Credit  & learned   & 0.5       & 200                 & -             \\
                            & Cancer  & learned   & 0.4       & 300                 & -             \\
\multirow{3}{*}{vertical}   & Income  & learned   & 0.4       & 100                 & combinatorial \\
                            & Credit  & learned   & 0.5       & 50                  & combinatorial \\
                            & Cancer  & learned   & 0.4       & 300                 & combinatorial \\
\multirow{3}{*}{hybrid}     & Income  & learned   & 0.4       & 100                 & combinatorial \\
                            & Credit  & learned   & 0.5       & 50                  & combinatorial \\
                            & Cancer  & learned   & 0.4       & 300                 & combinatorial
\end{tabular}
\caption{Hyperparameters used in our experiments for all tabular datasets.}
\end{table}

\begin{table}[h!]
\centering
\begin{tabular}{c|ccc}
                      & MNIST             & Imagenet(32)           & CelebA\\ \hline
num\_epochs           & 5                 & 25                 & 10\\
batch\_size           & 64                & 64                 & 64\\
online\_em\_frequency & 5                 & 10                 & 10\\
online\_em\_stepsize  & 0.1               & 0.25                & 0.25\\
Structure             & poon-domingos     & poon-domingos      & poon-domingos\\
pd\_num\_pieces       & 4                 & 4                  & 4\\
K                     & 10                & 120                 & 120\\
Leaf Distribution     & Gaussian          & Gaussian           & Gaussian\\
min\_var              & $1 \cdot 10^{-7}$ & $1 \cdot 10^{-7}$  & $1 \cdot 10^{-7}$\\
max\_var              & $1 \cdot 10^{-3}$ & $1 \cdot 10^{-3}$ & $1 \cdot 10^{-3}$
\end{tabular}
\caption{Hyperparameters used in our experiments for image datasets.}
\end{table}

\subsection{Hardware}
All experiments were conducted on Nvidia DGX machines with Nvidia A100 (40GB) GPUs, AMD EPYC 7742 64-Core Processor and 2TiB of RAM.
\end{document}