\documentclass[accepted]{uai2022} 





\usepackage{zref-xr,zref-user}
\zexternaldocument*{vo_458}

\usepackage{setspace}
\usepackage{amsthm}

\usepackage{dsfont}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{float}
\usepackage{upgreek}
\allowdisplaybreaks
\usepackage{bm}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{xcolor}
\usepackage{makecell}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}


\usepackage{stackengine}


\newcommand{\q}{q}
\newcommand{\qbar}{\tilde{\mathbb{Q}}}
\newcommand{\p}{p}
\newcommand{\D}{\mathbb{D}}
\newcommand{\m}{\mathbb{M}}
\newcommand{\e}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
\newcommand{\infq}{\inf_{\q}}
\newcommand{\Ifqp}{\mathbb{I}_f(\q,\p)}
\newcommand{\eq}{\mathbb{E}_{\q}}
\newcommand{\ep}{\mathbb{E}_{\p}}
\newcommand{\ed}{\mathbb{E}_{\D}}
\newcommand{\EM}{\mathbb{E}_{\m}}
\newcommand{\Risk}{\mathbb{L}}
\newcommand{\Riskattained}{\underline{\mathbb{L}}}
\newcommand{\risk}{L}
\newcommand{\riskattained}{\underline{L}}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\maps}{\rightarrow}
\newcommand{\settozero}{\overset{!}{=}0}
\newcommand{\ind}{\perp \!\!\! \perp }

\newcommand{\argmax}{\operatornamewithlimits{arg\max}}
\newcommand{\argmin}{\operatornamewithlimits{arg\min}}
\newcommand{\doo}{\textnormal{do}}
\allowdisplaybreaks



\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{Assumption}



\newcommand{\ymis}{Y^{\textrm{mis}}_i}
\newcommand{\yobs}{Y^{\textrm{obs}}_i}
\newcommand{\Ymis}{\textbf{Y}_{\textrm{mis}}}
\newcommand{\Yobs}{\textbf{Y}_{\textrm{obs}}}
\newcommand{\W}{\textbf{W}}
\newcommand{\X}{\textbf{X}}
\newcommand{\Yzero}{\textbf{Y}(0)}
\newcommand{\Yone}{\textbf{Y}(1)}
\newcommand{\yzero}{Y_i(0)}
\newcommand{\yone}{Y_i(1)}
\newcommand{\varbeta}{\sigma^2_{\beta}}
\newcommand{\betac}{\beta^{{[c]}}}
\newcommand{\betat}{\beta^{{[t]}}}
\newcommand{\x}{\textbf{x}_i}
\newcommand{\xic}{\x^\top\betac}
\newcommand{\xit}{\x^\top\betat}
\newcommand{\betaall}{\boldsymbol{\beta}}
\newcommand{\et}{{\epsilon_i^{[t]}}}
\newcommand{\ec}{{\epsilon_i^{[c]}}}
\newcommand{\epstilde}{\tilde{\epsilon}}
\newcommand{\mumis}{\mu_i^{\textrm{mis}}}
\newcommand{\muobs}{\mu_i^{\textrm{obs}}}
\newcommand{\lambdaobs}{\lambda_i^{\textrm{obs}}}
\newcommand{\lambdamis}{\lambda_i^{\textrm{mis}}}
\newcommand{\qmis}{{Q}_i^{\textrm{mis}}}
\newcommand{\Qmis}{{\textbf{Q}}^{{\textrm{mis}}}}
\newcommand{\qobs}{{Q}_i^{{\textrm{obs}}}}
\newcommand{\Qobs}{{\textbf{Q}}^{{\textrm{obs}}}}
\newcommand{\xiobs}{\xi_i^{\textrm{obs}}}
\newcommand{\ximis}{\xi_i^{\textrm{mis}}}
\newcommand{\xitilde}{\tilde{\textbf{x}}^{\textrm{obs}}_i}
\newcommand{\xitildemis}{\tilde{\textbf{x}}^{\textrm{mis}}_i}
\newcommand{\xtilde}{\tilde{\textbf{x}}}
\newcommand{\Xtilde}{\tilde{\textbf{X}}}
\newcommand{\xibreve}{\breve{\textbf{x}}_i}

\newcommand{\highlight}[2][yellow]{\mathchoice {\colorbox{#1}{$\displaystyle#2$}}{\colorbox{#1}{$\textstyle#2$}}{\colorbox{#1}{$\scriptstyle#2$}}{\colorbox{#1}{$\scriptscriptstyle#2$}}}



\usepackage[american]{babel}


\usepackage{natbib} \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} \usepackage{booktabs} \usepackage{tikz} 



\newcommand{\swap}[3][-]{#3#1#2} 

\title{Appendix:\\Bayesian Federated Estimation of Causal Effects from Observational Data}

\author[1]{\href{mailto:<votv@comp.nus.edu.sg>?Subject=Your UAI 2022 paper}{Thanh~Vinh~Vo}{}}
\author[2]{Young Lee}
\author[3]{Trong Nghia Hoang}
\author[1]{\href{mailto:<leongty@comp.nus.edu.sg>?Subject=Your UAI 2022 paper}{Tze-Yun~Leong}{}}
\affil[1]{School of Computing\\
    National University of Singapore
}
\affil[2]{Harvard University
}
\affil[3]{School of Electrical Engineering and Computer Science\\
    Washington State University
  }
  
  \begin{document}
\maketitle






\appendix









\section{The Preprocessing Procedure}
\label{sec:appendix-preprocessing}
The assumptions were described briefly in Section~\zref{sec:assumptions} of the main text. Here we present the preprocessing procedures to remove duplicated individuals.

\begin{figure}\centering
    \includegraphics[width=0.47\textwidth]{figures/causal-federated-preprocessing.pdf}
    \caption{The secure preprocessing procedures to identify duplicated individuals among multiple sources. $\text{PK}a_i$ ($i=1,\dots,5$), $\text{PK}b_i$ ($i=1,\dots,7$), $\text{PK}c_i$ ($i=1,\dots,4$) are the primary keys of each individual in each source. $a_i$ ($i=1,\dots,5$), $b_i$ ($i=1,\dots,7$), $c_i$ ($i=1,\dots,4$) are the hashed sequences of these individuals.}
    \label{fig:preprocessing}
\end{figure}






The preprocessing procedure is summarized as follows. First, each source uses a one-way hash function (such as MD4, MD5, SHA or SHA256) to hash each individual's primary key and then sends the hashed sequences to a server. By doing this, the individuals' data are secured. Note that the one-way hash function is agreed upon among the sources so that they all use the same function. Then, the server collects the hashed sequences from all sources and performs a matching algorithm to check whether there exist repeated individuals among different sources. For each repeated individual, the server randomly chooses to keep it on a small (predefined) number of sources and informs the other sources to exclude this individual from the training process. The whole procedure ensures that an individual does not exist in a large number of sources, thus preventing the learning of a biased model. We summarize the procedure in Figure~\ref{fig:preprocessing}.

Assumption~\zref{assumption:unique-ident} and the preprocessing procedure are required only for data that are highly repeated across different sources. For data that are not likely to have a high number of repetitions, such as patients from different hospitals in different countries, the above assumption and the preprocessing procedure are not required. Note that the existing methods also need Assumption~\zref{assumption:unique-ident} since they need to combine data and remove repeated individuals.

In this work, we assume that all of the assumptions described in this section are satisfied, and that the preprocessing procedure has been performed where necessary.


\section{The Federated Evidence Lower Bound}
\label{sec:appendix-elbo}

Naively applying variational inference would lead to a non-decomposable ELBO. The proposed ELBO can be decomposed into multiple components, thus enabling federated optimization. We give a full derivation as follows:
\begin{align*}
        &\log \p(\mathbf{y}_{\textrm{obs}}\,|\,\mathbf{X},\mathbf{w}) \\
        &=\log \int p(\mathbf{y}_{\textrm{obs}},\mathbf{g}, \Psi, \Sigma\,|\,\mathbf{X},\mathbf{w}) d\mathbf{g}d\Psi d\Sigma\\
        &=\log \int p(\mathbf{y}_{\textrm{obs}}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X},\mathbf{w})p(\mathbf{g}, \Psi, \Sigma|\mathbf{X},\mathbf{w}) d\mathbf{g}d\Psi d\Sigma.
\end{align*}

From Figure~\ref{fig:the-model}, we see that $\mathbf{g}, \Psi, \Sigma \ind \mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s}$ (for all $\mathsf{s}=1,2,\dots,m$), i.e., $\mathbf{g}, \Psi, \Sigma$ are independent of $\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s}$ when $\mathbf{y}_{\text{obs}}^\mathsf{s}, \mathbf{y}_{\text{mis}}^\mathsf{s}$ are not given. Thus, $p(\mathbf{g}, \Psi, \Sigma|\mathbf{X},\mathbf{w}) = p(\mathbf{g}, \Psi, \Sigma)$.

\begin{figure}[!ht]
\centering
		\includegraphics[width=0.47\textwidth]{figures/model-plate-new}

    \caption{
        Graphical model that summarizes the proposed framework with treatment $\mathbf{w}^\mathsf{s}$, covariate $\mathbf{X}^\mathsf{s}$, and the two potential outcomes $\mathbf{y}_\textrm{mis}^\mathsf{s}$ and $\mathbf{y}_\textrm{obs}^\mathsf{s}$. The quantity $\mathbf{f}^\mathsf{s}$ is idiosyncratic to the sources and $\mathbf{g}$ contains shared characteristics across all the sources. $\Sigma$ and $\Psi$ are shared parameters. Note that this is not a causal graph.
    } \label{fig:the-model}
\end{figure}

In addition, from Figure~\ref{fig:the-model}, 
we also have 
\begin{align*}
    p(\mathbf{y}_{\textrm{obs}}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X},\mathbf{w}) = \prod_{\mathsf{s}=1}^m   p(\mathbf{y}_{\textrm{obs}}^\mathsf{s}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s}).
\end{align*}
Thus,
\begin{align*}
        &\log p(\mathbf{y}_{\textrm{obs}}\,|\,\mathbf{X},\mathbf{w}) \\
&= \log \int q(\mathbf{g}, \Psi, \Sigma)\prod_{\mathsf{s}=1}^m   p(\mathbf{y}_{\textrm{obs}}^\mathsf{s}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s})\\
        &\qquad\qquad\qquad\qquad\qquad \times\frac{p(\mathbf{g}, \Psi, \Sigma)}{q(\mathbf{g}, \Psi, \Sigma)} d\mathbf{g}d\Psi d\Sigma\\
        &\ge \int q(\mathbf{g}, \Psi, \Sigma)\log\Bigg(\prod_{\mathsf{s}=1}^m   p(\mathbf{y}_{\textrm{obs}}^\mathsf{s}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s})\\
        &\qquad\qquad\qquad\qquad\qquad \times\frac{p(\mathbf{g}, \Psi, \Sigma)}{q(\mathbf{g}, \Psi, \Sigma)}\Bigg) d\mathbf{g}d\Psi d\Sigma\\ 
        &= \sum_{\mathsf{s}=1}^m \e_q[\log  p(\mathbf{y}_{\textrm{obs}}^\mathsf{s}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s})] \\
        &\qquad\qquad- \mathbb{D}_{\text{KL}}[q(\mathbf{g}, \Psi, \Sigma)\|p(\mathbf{g}, \Psi, \Sigma)]\\
        &= \sum_{\mathsf{s}=1}^m\Big( \e_q[\log  p(\mathbf{y}_{\textrm{obs}}^\mathsf{s}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s})] \\
        &\qquad\qquad- \frac{1}{m}\mathbb{D}_{\text{KL}}[q(\mathbf{g}, \Psi, \Sigma)\|p(\mathbf{g}, \Psi, \Sigma)]\Big)\\
        &= \sum_{\mathsf{s}=1}^m \mathbf{L}^\mathsf{s},
\end{align*}
where
\begin{align*}
    \mathbf{L}^\mathsf{s} &\vcentcolon=  \e_q[\log  p(\mathbf{y}_{\textrm{obs}}^\mathsf{s}\,|\,\mathbf{g}, \Psi, \Sigma,\mathbf{X}^\mathsf{s},\mathbf{w}^\mathsf{s})] \\
    &\qquad- \frac{1}{m}\mathbb{D}_{\text{KL}}[q(\mathbf{g}, \Psi, \Sigma)\|p(\mathbf{g}, \Psi, \Sigma)].
\end{align*}
Hence, we can divide the ELBO into multiple components, which leads to federated training of the model. Without the proposed model, the ELBO cannot be decomposed into multiple components and hence cannot be trained in a federated setting.



\section{Proof of Lemma~1}
\label{sec:appendix-proof-lem-1}


\begin{proof}
We denote $\xi_0^{\mathsf{s}} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{n_{\mathsf{s}}})$ and $\xi_1^{\mathsf{s}} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{n_{\mathsf{s}}})$. Then, from the model definition (Eq.~(5) in the main text), we have
\begin{align*}
\setlength\arraycolsep{0.0pt} &\begin{bmatrix}
y_1^{\mathsf{s}}(0)&\dots&y_{n_{\mathsf{s}}}^{\mathsf{s}}(0)\\
y_1^{\mathsf{s}}(1)&\dots&y_{n_{\mathsf{s}}}^{\mathsf{s}}(1)
\end{bmatrix} \\
&\qquad=\Psi^{\frac{1}{2}} \begin{bmatrix}
f_1^{\mathsf{s}}(0)+g^{\mathsf{s}}(0)&\dots&f_{n_{\mathsf{s}}}^{\mathsf{s}}(0)+g^{\mathsf{s}}(0)\\
f_1^{\mathsf{s}}(1)+g^{\mathsf{s}}(1)&\dots&f_{n_{\mathsf{s}}}^{\mathsf{s}}(1)+g^{\mathsf{s}}(1)
\end{bmatrix} \\
&\qquad\quad+\Sigma^{\frac{1}{2}}\begin{bmatrix}
\varepsilon_1^{\mathsf{s}}(0)&\dots&\varepsilon_{n_{\mathsf{s}}}^{\mathsf{s}}(0)\\
\varepsilon_1^{\mathsf{s}}(1)&\dots&\varepsilon_{n_{\mathsf{s}}}^{\mathsf{s}}(1)
\end{bmatrix}.
\end{align*}
The above equation is equivalent to the following
\begin{align*}
&\mathbf{Y}^{\mathsf{s}} =  \setlength\arraycolsep{5pt} \begin{bmatrix}
\bm{\mu}_0&\bm{\mu}_1
\end{bmatrix}\!\!
(\Psi^{\frac{1}{2}})^\top +\setlength\arraycolsep{5pt} \begin{bmatrix}
\varepsilon_0^{\mathsf{s}}&\varepsilon_1^{\mathsf{s}}
\end{bmatrix}(\Sigma^{\frac{1}{2}})^\top,
\end{align*}
where 
\begin{align*}
    \bm{\mu}_0 = \mu_0(\mathbf{X}^{\mathsf{s}}) \!+\! \mathbf{g}_0^{\mathsf{s}} \!+\! (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\xi_0^{\mathsf{s}},\\
    \bm{\mu}_1 = \mu_1(\mathbf{X}^{\mathsf{s}}) \!+\! \mathbf{g}_1^{\mathsf{s}} \!+\! (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\xi_1^{\mathsf{s}}.
\end{align*}
Further expanding the right hand side, we have
\begin{align*}
&\mathbf{Y}^{\mathsf{s}} \!=\! \setlength\arraycolsep{5pt}\begin{bmatrix}\mu_0(\mathbf{X}^{\mathsf{s}})\!+\!\mathbf{g}_0^{\mathsf{s}}&\mu_1(\mathbf{X}^{\mathsf{s}})+\mathbf{g}_1^{\mathsf{s}}
\end{bmatrix}(\Psi^{\frac{1}{2}})^\top \\
&\qquad+ (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\setlength\arraycolsep{5pt}\begin{bmatrix}\xi_0^{\mathsf{s}}&\xi_1^{\mathsf{s}}
\end{bmatrix}(\Psi^{\frac{1}{2}})^\top +\setlength\arraycolsep{5pt}\begin{bmatrix}
\varepsilon_0^{\mathsf{s}}&\varepsilon_1^{\mathsf{s}}
\end{bmatrix}(\Sigma^{\frac{1}{2}})^\top\\
&\text{vec}(\mathbf{Y}^{\mathsf{s}}) =  \left(\Psi^{\frac{1}{2}} \otimes \mathbf{I}_{n_{\mathsf{s}}}\right)\begin{bmatrix}\mu_0(\mathbf{X}^{\mathsf{s}})+\mathbf{g}_0^{\mathsf{s}}\\\mu_1(\mathbf{X}^{\mathsf{s}})+\mathbf{g}_1^{\mathsf{s}}
\end{bmatrix}  \\
&\qquad\qquad+\left(\Psi^{\frac{1}{2}} \otimes (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\right)\begin{bmatrix}\xi_0^{\mathsf{s}}\\\xi_1^{\mathsf{s}}
\end{bmatrix} + (\Sigma^{\frac{1}{2}} \otimes \mathbf{I}_{n_{\mathsf{s}}})\begin{bmatrix}
\varepsilon_0^{\mathsf{s}}\\\varepsilon_1^{\mathsf{s}}
\end{bmatrix},
\end{align*}
where $\text{vec}(\cdot)$ denotes the vectorization of a matrix, which converts a matrix into a column vector.


For the second term on the right hand side of the above equation, note that $\xi_0^{\mathsf{s}} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{n_{\mathsf{s}}})$ and $\xi_1^{\mathsf{s}} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{n_{\mathsf{s}}})$, so we have the following
\begin{align*}
&\begin{bmatrix}\xi_0^{\mathsf{s}}\\\xi_1^{\mathsf{s}}
\end{bmatrix} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{2n_{\mathsf{s}}})\\
&\left(\Psi^{\frac{1}{2}} \otimes (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\right)\begin{bmatrix}\xi_0^{\mathsf{s}}\\\xi_1^{\mathsf{s}}\end{bmatrix} \\
&\qquad\sim \mathsf{N}\left(\mathbf{0}, \left(\Psi^{\frac{1}{2}} \otimes (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\right)\mathbf{I}_{2n_{\mathsf{s}}}\left(\!\Psi^{\frac{1}{2}} \otimes (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\right)^\top\right)\\
&\left(\Psi^{\frac{1}{2}} \otimes (\mathbf{K}^{\mathsf{s}})^{\frac{1}{2}}\right)\begin{bmatrix}\xi_0^{\mathsf{s}}\\\xi_1^{\mathsf{s}}
\end{bmatrix} \sim \mathsf{N}\left(\mathbf{0}, \Psi\otimes \mathbf{K}^{\mathsf{s}}\!\right).
\end{align*}
For the last term, note that $\varepsilon_0^{\mathsf{s}} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{n_{\mathsf{s}}})$ and $\varepsilon_1^{\mathsf{s}} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{n_{\mathsf{s}}})$, thus
\begin{align*}
&\begin{bmatrix}\varepsilon_0^{\mathsf{s}}\\\varepsilon_1^{\mathsf{s}}
\end{bmatrix} \sim \mathsf{N}(\mathbf{0}, \mathbf{I}_{2n_{\mathsf{s}}})\\
&\left(\Sigma^{\frac{1}{2}} \!\otimes\! \mathbf{I}_{n_{\mathsf{s}}}\right)\begin{bmatrix}\varepsilon_0^{\mathsf{s}}\\\varepsilon_1^{\mathsf{s}}
\end{bmatrix} \sim \mathsf{N}\left(\mathbf{0}, \left(\Sigma^{\frac{1}{2}} \!\otimes\! \mathbf{I}_{n_{\mathsf{s}}}\right)\mathbf{I}_{2n_{\mathsf{s}}}\left(\Sigma^{\frac{1}{2}} \!\otimes\! \mathbf{I}_{n_{\mathsf{s}}}\right)^\top\right)\\
&\left(\Sigma^{\frac{1}{2}} \otimes \mathbf{I}_{n_{\mathsf{s}}}\right)\begin{bmatrix}\varepsilon_0^{\mathsf{s}}\\\varepsilon_1^{\mathsf{s}}
\end{bmatrix} \sim \mathsf{N}\left(\mathbf{0}, \Sigma\otimes \mathbf{I}_{n_{\mathsf{s}}}\right).
\end{align*}
Consequently, 
\vspace{-6pt}
\begin{align*}
&\text{vec}(\mathbf{Y}^{\mathsf{s}}) \big| \Psi, \Sigma, \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \mathbf{g}^{\mathsf{s}} \\
&\,\,\sim \mathsf{N}\left( \left(\Psi^{\frac{1}{2}} \otimes \mathbf{I}_{n_{\mathsf{s}}}\right)\begin{bmatrix}\mu_0(\mathbf{X}^{\mathsf{s}})\!+\! \mathbf{g}_0^{\mathsf{s}}\\\mu_1(\mathbf{X}^{\mathsf{s}}) \!+\! \mathbf{g}_1^{\mathsf{s}}
\end{bmatrix} \!, \Psi \otimes \mathbf{K}^{\mathsf{s}} \!+\! \Sigma \otimes \mathbf{I}_{n_{\mathsf{s}}}\right)\!,
\end{align*}
which implies that
\begin{align*}
&\begin{bmatrix}
\mathbf{y}^{\mathsf{s}}(0)\\
\mathbf{y}^{\mathsf{s}}(1)
\end{bmatrix}\Big|\Psi, \Sigma, \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \mathbf{g}^{\mathsf{s}} \\
&\,\,\sim \mathsf{N}\left( \left(\Psi^{\frac{1}{2}} \otimes \mathbf{I}_{n_{\mathsf{s}}}\right)\begin{bmatrix}\mu_0(\mathbf{X}^{\mathsf{s}})\!+\! \mathbf{g}_0^{\mathsf{s}}\\\mu_1(\mathbf{X}^{\mathsf{s}})\!+\! \mathbf{g}_1^{\mathsf{s}}
\end{bmatrix} \!, \Psi \otimes \mathbf{K}^{\mathsf{s}} \!+\! \Sigma \otimes \mathbf{I}_{n_{\mathsf{s}}}\right)\!.
\end{align*}
This completes the proof.
\end{proof}

\section{Proof of Lemma~2}

\label{sec:appendix-proof-lem-2}





\begin{proof}
Following the proof of Lemma~1, we note that if the observed treatment $w_i^{\mathsf{s}} = 0$, then the mean of $p(y^{\mathsf{s}}_{i,\textrm{obs}} | \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \Psi, \Sigma, \mathbf{g}^{\mathsf{s}})$ equals the mean of $p(y_i^{\mathsf{s}}(0) | \Psi, \Sigma, \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \mathbf{g}^{\mathsf{s}})$ and the mean of $p(y^{\mathsf{s}}_{i,\textrm{mis}} | \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \Psi, \Sigma, \mathbf{g}^{\mathsf{s}})$ equals the mean of $p(y_i^{\mathsf{s}}(1) | \Psi, \Sigma, \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \mathbf{g}^{\mathsf{s}})$. If the observed treatment $w_i^{\mathsf{s}} = 1$, then the mean of $p(y^{\mathsf{s}}_{i,\textrm{obs}} | \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \Psi, \Sigma, \mathbf{g}^{\mathsf{s}})$ equals the mean of $p(y_i^{\mathsf{s}}(1) | \Psi, \Sigma, \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \mathbf{g}^{\mathsf{s}})$ and the mean of $p(y^{\mathsf{s}}_{i,\textrm{mis}} | \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \Psi, \Sigma, \mathbf{g}^{\mathsf{s}})$ equals the mean of $p(y_i^{\mathsf{s}}(0) | \Psi, \Sigma, \mathbf{X}^{\mathsf{s}}, \mathbf{w}^{\mathsf{s}}, \mathbf{g}^{\mathsf{s}})$. Hence, we have
\begin{align*}
\mu_{\textrm{obs}}(\mathbf{X}^\mathsf{s}) &= (\mathbf{1} - \mathbf{w}^\mathsf{s})\odot\mathbf{m}_0 + \mathbf{w}^\mathsf{s} \odot\mathbf{m}_1,\\
\mu_{\textrm{mis}}(\mathbf{X}^\mathsf{s}) &= \mathbf{w}^\mathsf{s} \odot\mathbf{m}_0 + (\mathbf{1} - \mathbf{w}^\mathsf{s}) \odot\mathbf{m}_1.
\end{align*}
Similarly, for the covariance matrix, each element in $\mathbf{K}_{\textrm{obs}}$, $\mathbf{K}_{\textrm{mis}}$, and   $\mathbf{K}_{\textrm{om}}$ also depends on whether $w_i^{\mathsf{s}} = 0$ or $w_i^{\mathsf{s}} = 1$. So each element in these matrices is computed by the following kernel function
\begin{align*}
k_{\textrm{obs}}(\mathbf{x}_i, \mathbf{x}_j) \!&=\! \big[(1\!-\!w_i)(1\!-\!w_j)\psi_{11} \!+\! w_iw_j\psi_{22} \\
&\,\,\,\,\,+ (1\!-\!w_i)w_j\psi_{12} \!+\! w_i(1\!-\!w_j)\psi_{21}\big] \mathsf{k}(\mathbf{x}_i, \mathbf{x}_j)\\[-0cm]
&\,\,\,\,\, + \big[(1\!-\!w_i)\sigma_{11} \!+\! w_i\sigma_{22}\big] \mathds{1}_{i=j},\\[-0cm]
k_{\textrm{mis}}(\mathbf{x}_i, \mathbf{x}_j) \!&=\! \big[w_iw_j\psi_{11} \!+\! (1\!-\!w_i)(1\!-\!w_j)\psi_{22} \\
&\,\,\,\,\,+ (1\!-\!w_i)w_j\psi_{21} \!+\! w_i(1\!-\!w_j)\psi_{12}\big] \mathsf{k}(\mathbf{x}_i, \mathbf{x}_j)\\[-0cm]
& \,\,\,\,\,+ \big[w_i\sigma_{11} \!+\! (1\!-\!w_i)\sigma_{22}\big] \mathds{1}_{i=j},\\[-0cm]
k_{\textrm{om}}(\mathbf{x}_i, \mathbf{x}_j) &= \big[(1\!-\!w_i)(1-w_j)\psi_{21} \!+\! w_iw_j\psi_{12} \\
&\,\,\,\,\,+ (1\!-\!w_i)w_j\psi_{22} \!+\! w_i(1\!-\!w_j)\psi_{11}\big] \mathsf{k}(\mathbf{x}_i, \mathbf{x}_j) \\[-0cm]
&\,\,\,\,\,+ \big[(1\!-\!w_i)\sigma_{21} \!+\! w_i\sigma_{12}\big] \mathds{1}_{i=j},
\end{align*}
where $\psi_{ab}$ and $\sigma_{ab}$ are the $(a,b)$--th elements of $\Psi$ and $\Sigma$, respectively.

This completes the proof.
\end{proof}

\section{Evaluation Metrics}
\label{sec:eval-metrics}
The two evaluation metrics reported in our experiments are defined as follows: (i) precision in estimation of heterogeneous effects (PEHE): \begin{align*}
    \epsilon_\textrm{PEHE} \vcentcolon= \sum_{\mathsf{s}=1}^m\sum_{i=1}^{n_\mathsf{s}}(\tau^{\mathsf{s}}_i - \hat{\tau}^{\mathsf{s}}_i)^2/(m n_\mathsf{s})
\end{align*}
for evaluating ITE, 
and (ii) absolute error:
\begin{align*}
    \epsilon_\textrm{ATE} \vcentcolon=  |\tau-\hat{\tau}|
\end{align*}
for evaluating ATE, where $\tau_{i}^{\mathsf{s}}$ and $\tau$ are the \textit{true} ITE and \textit{true} ATE, respectively, and $\hat{\tau}_{i}^{\mathsf{s}}$,  $\hat{\tau}$ are their estimates. 


\section{Additional Experimental Results}


In this section, we present some additional results that were skipped in the main text due to limited space.

\subsection{Synthetic Data: DATA-2}
\label{sec:appendix-data-2}
In this section, we present additional experimental results on DATA-2. Again, those results were skipped in the main text due to limited space. In Table~\ref{tab:error-synthetic-2-appendix}, we present additional results of the baselines trained locally ($\mathsf{loc}$) and the baselines trained with bootstrap aggregating ($\mathsf{agg}$). Similar to the experiments on DATA-1 presented in the main text, the results on DATA-2 also show that FedCI achieves much lower errors, especially the error in predicting ITE.

\begin{figure}\centering
    \includegraphics[width=0.47\textwidth]{figures/FedCI-uncertainty-analysis-ihdp.pdf}
\caption{The estimated ATE distribution on source \#1 of the IHDP dataset. The dotted black lines represent the true ATE.}
\label{fig:fedci-uncertainty-analysis-ihdp}
\end{figure}

\begin{table}\centering
\caption{Out-of-sample errors on DATA-2 where top-3 performances are highlighted in bold (lower is better). The dashes (---) in `$\mathsf{loc}$' and `$\mathsf{agg}$' indicate that the numbers are the same as those of `$\mathsf{com}$'.}
\label{tab:error-synthetic-2-appendix}
\setlength{\tabcolsep}{2.1pt}
\scriptsize
\begin{tabular}{@{}lcccccc@{}}
\toprule
\multirow{2}{*}{Method}                                           & \multicolumn{3}{c}{The error of ITE ($\sqrt{\epsilon_\text{PEHE}}$)} & \multicolumn{3}{c}{The error of ATE ($\epsilon_\text{ATE}$)} \\ \cmidrule(lr){2-4}\cmidrule(lr){5-7} 
                                                                  & 1 source      & 3 sources     & 5 sources     & 1 source      & 3 sources     & 5 sources    \\ \cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(lr){5-7}
BART$_\mathsf{loc}$                                                       & ---    & 18.4$\pm$0.3    & 18.3$\pm$0.2    & ---    & 3.37$\pm$0.7    & 2.90$\pm$0.6   \\
X-Learner$_\mathsf{loc}$                                                  & ---    & 22.7$\pm$0.5    & 22.8$\pm$0.5    & ---    & 3.55$\pm$1.3    & 3.09$\pm$0.8   \\
R-Learner$_\mathsf{loc}$                                                  & ---    & 26.3$\pm$0.2    & 26.1$\pm$0.2    & ---    & 19.7$\pm$0.3    & 19.5$\pm$0.3   \\
OthoRF$_\mathsf{loc}$                                                     & ---   & 38.3$\pm$1.4    & 40.0$\pm$0.9    & ---    & 4.09$\pm$0.9    & 4.40$\pm$1.2   \\
TARNet$_\mathsf{loc}$    & ---     & 37.6$\pm$0.6     & 37.1$\pm$0.4     & ---     & 7.31$\pm$0.4     & 7.25$\pm$0.3     \\ 
CFR Wass$_\mathsf{loc}$  & ---     & 37.2$\pm$0.7 & 37.0$\pm$0.5 & ---     & 7.24$\pm$0.3     & 7.12$\pm$0.2     \\ 
CFR MMD$_\mathsf{loc}$   & --- & 37.2$\pm$0.6 & 36.8$\pm$0.4 & ---     & 7.21$\pm$0.4     & 7.11$\pm$0.3     \\
CEVAE$_\mathsf{loc}$                                                     & ---   & 21.4$\pm$0.7    & 19.8$\pm$0.6    & ---    & 2.11$\pm$0.4    & 1.97$\pm$0.2   \\\cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(lr){5-7}
BART$_\mathsf{agg}$                                                 & ---             & 17.9$\pm$0.2              & 17.7$\pm$0.2              & ---             & 3.91$\pm$0.8              & 3.15$\pm$0.7             \\
X-Learner$_\mathsf{agg}$ & ---             & 18.2$\pm$0.4    & 17.1$\pm$0.2    & ---             & 3.43$\pm$1.3    & 3.07$\pm$0.8   \\
R-Learner$_\mathsf{agg}$ & ---             & 26.2$\pm$0.3    & 26.1$\pm$0.2    & ---             & 19.7$\pm$0.4    & 19.6$\pm$0.3   \\
OthoRF$_\mathsf{agg}$    & ---             & 25.0$\pm$1.3    & 17.3$\pm$0.6              & ---             & 4.56$\pm$1.1    & \textbf{1.30$\pm$0.4}             \\
TARNet$_\mathsf{agg}$    & ---     & 36.5$\pm$0.3     & 36.1$\pm$0.3     & ---     & 7.26$\pm$0.3     & 7.18$\pm$0.3     \\ 
CFR Wass$_\mathsf{agg}$  & ---     & 35.2$\pm$0.5 & 35.0$\pm$0.3 & ---     & 7.13$\pm$0.3     & 6.97$\pm$0.2     \\ 
CFR MMD$_\mathsf{agg}$   & --- & 35.2$\pm$0.5 & 35.1$\pm$0.4 & ---     & 7.10$\pm$0.4     & 7.05$\pm$0.2     \\
CEVAE$_\mathsf{agg}$                                                     & ---    & 19.2$\pm$0.8    & 18.3$\pm$0.7    & ---    & 2.02$\pm$0.3    & 1.91$\pm$0.4   \\\cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(l){5-7}
BART$_\mathsf{com}$                                                     & \textbf{18.0$\pm$0.4}             & \textbf{17.7$\pm$0.2}    & 17.4$\pm$0.1    & \textbf{3.54$\pm$1.3}             & 2.94$\pm$0.8    & \textbf{1.84$\pm$0.5}   \\
X-Learner$_\mathsf{com}$    & 21.1$\pm$0.9             & 17.9$\pm$0.4    & \textbf{16.2$\pm$0.2}    & 4.55$\pm$1.4             & 3.29$\pm$1.0    & 2.37$\pm$0.8   \\
R-Learner$_\mathsf{com}$     & 25.9$\pm$0.6             & 23.5$\pm$0.5    & 21.3$\pm$0.4    & 19.0$\pm$0.8             & 15.6$\pm$0.7    & 12.3$\pm$0.6   \\
OthoRF$_\mathsf{com}$        & 37.8$\pm$2.7             & \textbf{10.7$\pm$0.5}    & \textbf{9.83$\pm$0.5}    & 7.88$\pm$2.2             & \textbf{1.99$\pm$0.4}    & 2.36$\pm$0.6   \\
TARNet$_\mathsf{com}$    & 36.1$\pm$0.4     & 35.5$\pm$0.2     & 35.0$\pm$0.2     & 7.11$\pm$0.4     & 7.10$\pm$0.3     & 7.08$\pm$0.2     \\ 
CFR Wass$_\mathsf{com}$  & 35.1$\pm$0.4     & 34.5$\pm$0.2 & 34.1$\pm$0.2 & 7.10$\pm$0.4     & 7.01$\pm$0.3     & 6.90$\pm$0.2     \\ 
CFR MMD$_\mathsf{com}$   & 35.1$\pm$0.4 & 35.0$\pm$0.2 & 34.9$\pm$0.2 & 7.12$\pm$0.4     & 7.02$\pm$0.3     & 7.01$\pm$0.2     \\
CEVAE$_\mathsf{com}$        & \textbf{20.1$\pm$0.5}             & 18.4$\pm$0.6    & 16.6$\pm$0.6    & \textbf{1.50$\pm$0.3}             & \textbf{1.38$\pm$0.4}    & 1.89$\pm$0.2   \\\cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(lr){5-7}
FedCI                                                             & \textbf{9.28$\pm$0.4}     & \textbf{6.34$\pm$0.2}    & \textbf{5.53$\pm$0.1}    & \textbf{2.37$\pm$0.5}    & \textbf{1.47$\pm$0.4}    & \textbf{0.74$\pm$0.2}   \\ \bottomrule
\end{tabular}
\end{table}

\begin{table}\centering
\caption{Out-of-sample errors on the IHDP dataset where top-3 performances are highlighted in bold (lower is better). The dashes (---) in `$\mathsf{loc}$' and `$\mathsf{agg}$' indicate that the numbers are the same as those of `$\mathsf{com}$'.}
\label{tab:error-ihdp-appendix}
\setlength{\tabcolsep}{2.3pt}
\scriptsize
\begin{tabular}{@{}lcccccc@{}}
\toprule
\multirow{2}{*}{Method}                                           & \multicolumn{3}{c}{The error of ITE ($\sqrt{\epsilon_\text{PEHE}}$)} & \multicolumn{3}{c}{The error of ATE ($\epsilon_\text{ATE}$)} \\ \cmidrule(lr){2-4}\cmidrule(lr){5-7} 
                                                                  & 1 source      & 2 sources     & 3 sources     & 1 source      & 2 sources     & 3 sources    \\ \cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(lr){5-7}
BART$_\mathsf{loc}$                                                       & ---    & 5.83$\pm$2.6    & 6.56$\pm$3.3    & ---    & 2.09$\pm$0.9    & 1.38$\pm$0.5   \\
X-Learner$_\mathsf{loc}$                                                  & ---    & 4.14$\pm$1.5    & 4.54$\pm$1.9    & ---    & 1.51$\pm$0.7    & 0.77$\pm$0.5   \\
R-Learner$_\mathsf{loc}$                                                  & ---    & 6.35$\pm$1.9    & 6.16$\pm$2.0    & ---    & 2.13$\pm$0.7    & 1.44$\pm$0.3   \\
OthoRF$_\mathsf{loc}$                                                     & ---    & 4.33$\pm$1.6    & 4.59$\pm$1.9    & ---    & 1.10$\pm$0.6    & 0.75$\pm$0.3   \\
TARNet$_\mathsf{loc}$    & ---     & 3.71$\pm$1.0     & 3.83$\pm$1.1     & ---     & 1.31$\pm$0.5     & 0.98$\pm$0.4     \\ 
CFR Wass$_\mathsf{loc}$  & ---     & 3.35$\pm$0.8 & 3.12$\pm$0.7 & ---     & 0.87$\pm$0.5     & 0.82$\pm$0.4     \\ 
CFR MMD$_\mathsf{loc}$   & --- & 3.40$\pm$0.9 & 3.15$\pm$1.2 & ---     & 1.17$\pm$0.5     & 0.63$\pm$0.3     \\
CEVAE$_\mathsf{loc}$                                                     & ---    & 3.78$\pm$0.7    & 3.93$\pm$0.8    & ---    & 1.91$\pm$0.3    & 2.37$\pm$0.2   \\\cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(lr){5-7}
BART$_\mathsf{agg}$                                                 & ---             & 4.05$\pm$1.9              & 3.69$\pm$1.8              & ---             & 2.09$\pm$1.0              & 1.30$\pm$0.5             \\
X-Learner$_\mathsf{agg}$ & ---             & 3.98$\pm$1.5    & 4.28$\pm$1.9    & ---             & 1.51$\pm$0.7    & 0.83$\pm$0.5   \\
R-Learner$_\mathsf{agg}$ & ---             & 4.76$\pm$1.3    & 4.46$\pm$1.6    & ---             & 1.92$\pm$0.5    & 1.41$\pm$0.2   \\
OthoRF$_\mathsf{agg}$    & ---             & 3.40$\pm$1.1    & 4.26$\pm$1.9              & ---             & 0.87$\pm$0.3    & 1.20$\pm$0.6             \\
TARNet$_\mathsf{agg}$    & ---     & 3.52$\pm$0.9     & 3.81$\pm$1.2     & ---     & 1.23$\pm$0.4     & 0.95$\pm$0.4     \\ 
CFR Wass$_\mathsf{agg}$  & ---     & 3.21$\pm$0.7 & 2.93$\pm$0.9 & ---     & 0.80$\pm$0.3     & 0.71$\pm$0.2     \\ 
CFR MMD$_\mathsf{agg}$   & --- & 3.17$\pm$0.8 & 2.91$\pm$1.3 & ---     & 1.12$\pm$0.5     & 0.57$\pm$0.3     \\
CEVAE$_\mathsf{agg}$    & ---             & 3.63$\pm$0.7    & 3.73$\pm$0.5              & ---             & 0.92$\pm$0.2    & 0.84$\pm$0.5             \\\cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(l){5-7}
BART$_\mathsf{com}$                                                 & 5.98$\pm$2.7             & 4.32$\pm$2.1              & 4.04$\pm$2.0              & 1.80$\pm$1.1             & 2.09$\pm$1.1              & 1.21$\pm$0.6             \\
X-Learner$_\mathsf{com}$ & 4.22$\pm$1.6             & 4.15$\pm$1.5    & 4.06$\pm$1.8    & 1.64$\pm$0.7             & 1.93$\pm$0.8    & 0.84$\pm$0.4   \\
R-Learner$_\mathsf{com}$ & 6.97$\pm$2.1             & 4.43$\pm$1.4    & 4.47$\pm$1.7    & 3.15$\pm$0.5             & 1.34$\pm$0.5    & 1.10$\pm$0.3   \\
OthoRF$_\mathsf{com}$    & 4.49$\pm$1.9             & 3.81$\pm$1.3    & 3.75$\pm$1.5              & 1.86$\pm$0.8             & 1.61$\pm$0.6    & 1.56$\pm$0.8\\ 
TARNet$_\mathsf{com}$    & 4.50$\pm$1.4     & 3.15$\pm$0.8     & 3.79$\pm$1.1     & \textbf{1.52$\pm$0.5}     & 1.18$\pm$0.4     & 0.91$\pm$0.3     \\ 
CFR Wass$_\mathsf{com}$  & \textbf{4.37$\pm$1.2}     & 2.93$\pm$0.6 & 2.85$\pm$0.9 & \textbf{1.18$\pm$0.7}     & \textbf{0.72$\pm$0.2}     & 0.67$\pm$0.1     \\ 
CFR MMD$_\mathsf{com}$   & 4.43$\pm$1.3 & \textbf{2.85$\pm$0.6} & \textbf{2.83$\pm$1.1} & 2.32$\pm$0.8     & \textbf{0.63$\pm$0.2}     & \textbf{0.54$\pm$0.2}     \\
CEVAE$_\mathsf{com}$        & \textbf{3.16$\pm$0.6}             & \textbf{2.34$\pm$0.6}    & \textbf{2.31$\pm$0.7}    & 2.02$\pm$0.4             & \textbf{0.53$\pm$0.1}    & \textbf{0.48$\pm$0.2}   \\\cmidrule(r){1-1}\cmidrule(lr){2-4}\cmidrule(lr){5-7}
FedCI                                                             & \textbf{2.88$\pm$0.8}     & \textbf{2.36$\pm$0.5}    & \textbf{2.35$\pm$0.6}    & \textbf{1.43$\pm$0.7}    & 1.03$\pm$0.4    & \textbf{0.51$\pm$0.2}   \\ \bottomrule
\end{tabular}
\end{table}


\subsection{IHDP Dataset}
\label{sec:appendix-ihdp}

In this section, we present additional experimental results on the IHDP dataset. The results here were not presented in the main text due to limited space. In Table~\ref{tab:error-ihdp-appendix}, we present additional results of the baselines trained locally ($\mathsf{loc}$) and the baselines trained with bootstrap aggregating ($\mathsf{agg}$). Similar to the experiments on synthetic data, the results presented here show that FedCI achieves much smaller errors. This is because FedCI has access to all the data sources in a federated fashion while the `baselines trained locally' ($\mathsf{loc}$) and the `baselines trained with bootstrap aggregating' ($\mathsf{agg}$) only have access to a local data source.

Similar to the experiment on synthetic data, the estimated distribution of ATE in the first source ($\mathsf{s}=1$) is presented in Figure~\ref{fig:fedci-uncertainty-analysis-ihdp}. Again, the figures show that the true ATE is inside the estimated interval and the estimated mean ATE shifts towards its true value (dotted lines) when more data sources are used.

















\end{document}
