% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
%\usepackage{xr} 
%\externaldocument{ma_251}
%------------------------------------------------------------------------------
% Hack to use xr in overleaf.
% See https://www.overleaf.com/learn/how-to/Cross_referencing_with_the_xr_package_in_Overleaf
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}
%   Announces <file> on the terminal/log via \typeout and appends it to the
%   kernel's file list via \@addtofilelist, following the Overleaf recipe
%   linked above. If <file> cannot be found, a "No file" notice is written
%   instead of raising an error.
%   Every line inside the body ends with % so that no stray space tokens are
%   carried along when the macro is expanded.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother
% \myexternaldocument{<basename>}
%   Imports the labels of <basename> through xr's \externaldocument and
%   registers both <basename>.tex and <basename>.aux as dependencies.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
\myexternaldocument{ma_251} %<- external document
%------------------------------------------------------------------------------

% \hypersetup is defined by hyperref, so the package is loaded before it is
% configured. This keeps the preamble valid even if the document class does
% not load hyperref on its own.
\usepackage{hyperref}
\usepackage{url}

% Link appearance: coloured text instead of boxed links, blue throughout.
\hypersetup{
colorlinks   = true, %Colours links instead of ugly boxes
urlcolor     = blue, %Colour for external hyperlinks
linkcolor    = blue, %Colour of internal links
citecolor    = blue %Colour of citations
}

%------------------------------------------------------------------------------
% Put added packages and preambles here
\usepackage{graphicx}
\usepackage{tabularx}
\usepackage{subfigure}
\usepackage{amsmath,amsthm,amsfonts,bm,amssymb}
\usepackage{multirow,colortbl}
\usepackage{enumitem}
\usepackage{wrapfig}
\usepackage{algorithm,algpseudocode}
\usepackage{caption}
\usepackage{booktabs}

% --- Table shading ----------------------------------------------------------
% Light gray (90% white) and a shorthand that shades a single table cell.
\definecolor{Gray}{gray}{0.9}
\newcommand{\gc}{\cellcolor{Gray}}
% --- Data set names ---------------------------------------------------------
% Earlier upper-case plain-text variants are kept below for reference; the
% active definitions typeset the names in small caps.
%% \newcommand{\la}{METR-LA}
%% \newcommand{\bay}{PEMS-BAY}
%% \newcommand{\pmub}{PMU-B}
%% \newcommand{\pmuc}{PMU-C}
\newcommand{\la}{\textsc{metr-la}}
\newcommand{\bay}{\textsc{pems-bay}}
\newcommand{\pmub}{\textsc{pmu-b}}
\newcommand{\pmuc}{\textsc{pmu-c}}
% --- Metric names (small caps) ----------------------------------------------
\newcommand{\fone}{\textsc{f1}}
\newcommand{\prauc}{\textsc{prauc}}
% Note: \rocauc prints "auc", not "rocauc" (the longer form is disabled).
%% \newcommand{\rocauc}{\textsc{rocauc}}
\newcommand{\rocauc}{\textsc{auc}}
\newcommand{\acc}{\textsc{acc}}
% --- Mathematical notation --------------------------------------------------
% \perm: index symbol used in bracket notation such as h[\perm].
\newcommand{\perm}{\texttt{p}}
% \ones: bold 1 (all-ones vector).
\newcommand{\ones}{\bm{1}}
\newcommand{\DD}{\mathcal{D}}
% \gauss / \unif: calligraphic N and U, used for distribution names.
\newcommand{\gauss}{\mathcal{N}}
\newcommand{\unif}{\mathcal{U}}
% \mean: blackboard E (expectation); \real: blackboard R.
\newcommand{\mean}{\mathbb{E}}
\newcommand{\real}{\mathbb{R}}
% --- Editing aids -----------------------------------------------------------
% \todo prints a red "TODO" marker.
\newcommand{\todo}{{\color{red} TODO}}
% \rd, \bl and \no currently pass their argument through unchanged; the
% coloured variants of \rd and \bl are disabled below.
%% \newcommand{\rd}[1]{\textcolor{red}{#1}}
%% \newcommand{\bl}[1]{\textcolor{blue}{#1}}
\newcommand{\rd}[1]{#1}
\newcommand{\bl}[1]{#1}
\newcommand{\no}[1]{#1}

% --- Named operators --------------------------------------------------------
% Unstarred \DeclareMathOperator: upright operator name with sub/superscripts
% set to the side.
\DeclareMathOperator{\softmax}{softmax}
\DeclareMathOperator{\embed}{embedding}
\DeclareMathOperator{\relu}{ReLU}
\DeclareMathOperator{\lstm}{LSTM}
\DeclareMathOperator{\gru}{GRU}
\DeclareMathOperator{\sigmoid}{sigmoid}
\DeclareMathOperator{\entropy}{entropy}
% Distribution names.
\DeclareMathOperator{\gumbel}{Gumbel}
\DeclareMathOperator{\cat}{Cat}
\DeclareMathOperator{\ber}{Ber}
% \unif is defined earlier as a calligraphic U, so the operator form stays
% disabled to avoid a duplicate definition.
%\DeclareMathOperator{\unif}{Uniform}
\DeclareMathOperator{\erf}{erf}
\DeclareMathOperator{\bias}{Bias}
\DeclareMathOperator{\diag}{diag}
% customized commands
% \argmax / \argmin are unstarred, so subscripts are placed to the side
% rather than underneath.
\DeclareMathOperator{\argmax}{\arg\max}
\DeclareMathOperator{\argmin}{\arg\min}
% Starred \DeclareMathOperator*: subscripts are placed underneath in display
% style (limits behaviour), as for optimization problem statements.
\DeclareMathOperator*{\minimize}{\text{minimize}}
\DeclareMathOperator*{\maximize}{\text{maximize}}
\DeclareMathOperator*{\st}{\text{subject to}}

% --- Theorem-like environments (amsthm) -------------------------------------
% definition, remark and example each have their own counter.
\theoremstyle{definition} \newtheorem{definition}{Definition}
\theoremstyle{remark}     \newtheorem{remark}{Remark}
\theoremstyle{remark}     \newtheorem{example}{Example}
% theorem is declared in the "definition" style and numbered document-wide
% (the per-section numbering option is commented out).
\theoremstyle{definition}      \newtheorem{theorem}{Theorem}%[section]
% The environments below use the "plain" style and share the theorem counter.
\theoremstyle{plain}      \newtheorem{conjecture}[theorem]{Conjecture}
\theoremstyle{plain}      \newtheorem{proposition}[theorem]{Proposition}
\theoremstyle{plain}      \newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{plain}      \newtheorem{lemma}[theorem]{Lemma}

\graphicspath{{figs/}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Federated Learning of Models Pre-Trained on Different Features with Consensus Graphs\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Tengfei Ma}
\author[2]{Trong Nghia Hoang}
\author[3,1]{\href{mailto:<chenjie@us.ibm.com>?Subject=Your UAI 2023 paper}{Jie Chen}{}}
% Add affiliations after the authors
\affil[1]{%
    IBM Research
}
\affil[2]{%
    Washington State University
}
\affil[3]{%
    MIT-IBM Watson AI Lab
  }
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

%------------------------------------------------------------------------------
\section{Permutation Ambiguity Example for GRU}\label{app:b}
In Section~\ref{sec:ambiguity}, we discuss that one can arbitrarily permute the latent representations while keeping a local model fixed. Here, we give another example -- the GRU. Let $\mathbf{x}=\{\mathbf{x}_1,\mathbf{x}_2,\ldots,\mathbf{x}_T\}$ be an input sequence. The embedding function $\mathbf{h}=\embed(\mathbf{x})$ implemented as a GRU reads:

\begin{algorithmic}[1]
  \Function {$\mathbf{h}=\gru$}{$\{\mathbf{x}_t\}_{t=1}^T$}
  \State $\mathbf{h}_0=\mathbf{0}$
  \For{$t=1,\ldots,T$}
  \State $\mathbf{z}_t = \sigmoid(\mathbf{W}_z\mathbf{x}_t+\mathbf{U}_z\mathbf{h}_{t-1}+\mathbf{b}_z)$
  \State $\mathbf{r}_t = \sigmoid(\mathbf{W}_r\mathbf{x}_t+\mathbf{U}_r\mathbf{h}_{t-1}+\mathbf{b}_r)$
  \State $\mathbf{n}_t = \tanh(\mathbf{W}_n\mathbf{x}_t+\mathbf{U}_n(\mathbf{r}_t\odot \mathbf{h}_{t-1})+\mathbf{b}_n)$
  \State $\mathbf{h}_t = (\mathbf{1}-\mathbf{z}_t)\odot \mathbf{h}_{t-1} + \mathbf{z}_t\odot \mathbf{n}_t$
  \EndFor
  \State \Return $\mathbf{h}=\mathbf{h}_T$
  \EndFunction
\end{algorithmic}

One can arbitrarily permute the elements of $\mathbf{h}$ through manipulating the GRU parameters properly. To achieve $\mathbf{h}[\perm] = \embed(\mathbf{x}[\perm])$,
\begin{itemize}[leftmargin=*]
\item the gate outputs and bias vectors ($\mathbf{z}_t$, $\mathbf{r}_t$, $\mathbf{n}_t$, $\mathbf{h}_t$, $\mathbf{b}_z$, $\mathbf{b}_r$, $\mathbf{b}_n$) will need to be permuted accordingly ($\mathbf{z}_t[\perm]$, $\mathbf{r}_t[\perm]$, $\mathbf{n}_t[\perm]$, $\mathbf{h}_t[\perm]$, $\mathbf{b}_z[\perm]$, $\mathbf{b}_r[\perm]$, $\mathbf{b}_n[\perm]$);

\item the weight matrices attached to the input ($\mathbf{W}_z$, $\mathbf{W}_r$, $\mathbf{W}_n$) will need to have their rows (i.e., output neurons) permuted ($\mathbf{W}_z[\perm,:]$, $\mathbf{W}_r[\perm,:]$, $\mathbf{W}_n[\perm,:]$); and

\item the weight matrices attached to the hidden states ($\mathbf{U}_z$, $\mathbf{U}_r$, $\mathbf{U}_n$) will need to have both their rows and columns permuted ($\mathbf{U}_z[\perm,\perm]$, $\mathbf{U}_r[\perm,\perm]$, $\mathbf{U}_n[\perm,\perm]$).
\end{itemize}


%------------------------------------------------------------------------------
\section{Proofs and Additional Results of Theorem~\ref{thm:cdf.z}}\label{sec:bernoulli.proof}


%------------------------------------------------------------------------------
\subsection{Distribution of Gumbel Softmax}
The Gumbel softmax reparameterization trick~\citep{Jang2017,Maddison2017} works in the following manner. Let $\cat(\boldsymbol{\pi})$ be the categorical distribution with probability vector $\boldsymbol{\pi}$ and let $\mathbf{g}$, of the same shape as $\boldsymbol{\pi}$, be a vector variable whose elements are i.i.d. $\sim\gumbel(0,1)$. Then, 
\begin{eqnarray}\label{eqn:y}
\mathbf{y} &=& \softmax\left(\frac{1}{\tau}\left(\log\boldsymbol{\pi}+\mathbf{g}\right)\right),\quad\tau>0
\end{eqnarray}
admits a distribution converging to $\cat(\boldsymbol{\pi})$ when $\tau\to0$. Hence, to sample $\ber(\theta)$ approximately but differentiably, it suffices to let $\boldsymbol{\pi}=[\theta,1-\theta]^\top$ and use $y_1$ as the sample. 


As a preliminary, we consider the first entry $y_1$ of the random variable $\mathbf{y}$ defined in~\eqref{eqn:y} for the Gumbel softmax parameterization. Note that for any $\tau>0$, $y_1$ is only approximately binary; the possible values of $y_1$ in fact span the entire interval $[0,1]$. We derive the following CDF for $y_1$. For notational simplicity, $\theta$ denotes a scalar rather than a matrix.

\begin{theorem}\label{thm:cdf.y1}
  For all $\tau>0$, $\theta\in(0,1)$, and $t\in[0,1]$, we have
  \begin{eqnarray}\label{eqn:cdf.y1}
  \Pr(y_1 \le t) &=& \frac{t^{\tau}(1-\theta)}{t^{\tau}(1-\theta)+(1-t)^{\tau}\theta}.
  \end{eqnarray}
\end{theorem}

\textbf{Proof.}
We first consider the case $0<t<1$. Through simple algebraic manipulation, we obtain that $y_1\le t$ is equivalent to
\begin{eqnarray}\label{eqn:le1}
  g_1-g_2 &\le& \tau\log\frac{t}{1-t}-\log\frac{\theta}{1-\theta}.
\end{eqnarray}
Let $g_1=-\log(-\log u)$ and $g_2=-\log(-\log v)$, where $u$ and $v$ are independent and $\sim\unif(0,1)$. Then, \eqref{eqn:le1} is equivalent to
\[
v \ \ge\ u^M \quad\text{where}\quad
M \ =\ \frac{t^{\tau}(1-\theta)}{(1-t)^{\tau}\theta}.
\]
Therefore, recalling that $(u,v)$ is uniform on $[0,1]^2$, we note that the probability that $v\ge u^M$ happens is the double integral
\[
\Pr(v\ge u^M) \ =\ \int_0^1\int_{u^M}^1 1\,dvdu.
\]
This integral is nothing but
\[
1-\int_0^1 u^M\,du \ =\ \frac{M}{1+M},
\]
which completes the proof of~\eqref{eqn:cdf.y1}. The cases of $t=0$ or $1$ obviously hold by continuity.


%------------------------------------------------------------------------------
\subsection{Proof of Theorem~\ref{thm:cdf.z}}
We first consider the case when the distribution with CDF $F$ is finitely supported on $[a,b]$. Through simple algebraic manipulation, we obtain that $z\le t$ is equivalent to $s\ge M$ where $M:=F^{-1}(\theta)+\tau\log(t^{-1}-1)$. If $t < \sigmoid((F^{-1}(\theta)-b)/\tau)$, we see that $M>b$ and thus such $s$ can never occur. Similarly, if $t > \sigmoid((F^{-1}(\theta)-a)/\tau)$, we see that $M<a$, which indicates that $s\ge M$ always happens. Otherwise, when $t$ is within the two extremes, the probability that $s\ge M$ happens is $1-F(M)$, concluding the proof of Theorem~\ref{thm:cdf.z}.

The statement of the theorem regarding the case when the distribution is not finitely supported is obviously true.

To show that the distribution of $z$ converges to $\ber(\theta)$, let us first consider the scenario when the distribution with CDF $F$ is finitely supported. The CDF of $z$ is always continuous but it has three segments connected by two joints: $t_1=\sigmoid((F^{-1}(\theta)-b)/\tau)$ and $t_2=\sigmoid((F^{-1}(\theta)-a)/\tau)$. When $\tau\to0$, the joint $t_1\to0$ and the joint $t_2\to1$ and thus the middle segment has a wider and wider support converging to $[0,1]$. Hence, it suffices to consider only the middle segment. Further, with an analogous argument for other scenarios, it is also true that it suffices to consider only the third case of Theorem~\ref{thm:cdf.z}.

In this case, for any fixed $t\in(0,1)$ and when $\tau\to0$, we have $\tau\log(t^{-1}-1)\to0$ and thus $\Pr(z\le t)\to1-F(F^{-1}(\theta))=1-\theta$. Meanwhile, we cannot push $t\to1$ because then the limit of $\tau\log(t^{-1}-1)$ is undefined. However, we know by definition that $\Pr(z\le1)=1$. Hence, the continuous distribution of $z$ converges to a discrete distribution with $\Pr(z<1)=1-\theta$ and $\Pr(z\le1)=1$. This is the CDF of $\ber(\theta)$.


%------------------------------------------------------------------------------
\section{Tuning Guidance for Temperature $\tau$}\label{sec:tune.tau}
Our tuning guidance for the temperature $\tau$ is motivated from an asymptotic convergence comparison between {\bf ICDF} and Gumbel reparameterization, which is featured in the theorem below.
\begin{theorem}\label{thm:bias}
When $\tau$ is small,
\begin{eqnarray}
\bias(y_1) &=& \frac{1}{6}\tau^2\pi^2\theta(1-\theta)(1-2\theta) \ +\ O(\tau^4), \label{eqn:gumbel.bias.tau}\\
\bias(z) &=& \frac{1}{6}\tau^2\pi^2F''(F^{-1}(\theta)) \ +\ O(\tau^4). \label{eqn:icdf.bias.tau}
\end{eqnarray}
Moreover, when $F$ is the CDF of a normal variable $\sim\gauss(0,\sigma^2)$,
\begin{eqnarray}\label{eqn:icdf.bias.tau.gauss}
\bias(z) &=& -\frac{1}{6\sigma^2}\tau^2\pi^{\frac{3}{2}}\erf^{-1}\left(2\theta-1\right)e^{-(\erf^{-1}(2\theta-1))^2} \ +\  O(\tau^4).
\end{eqnarray}
Its formal proof is detailed later.
\end{theorem}

Theorem~\ref{thm:bias} suggests that the {\bf ICDF} method converges equally fast as does the Gumbel trick -- both on the order of $O(\tau^2)$. On the other hand, the biases depend on $\theta$. Thus, one cannot set temperatures $\tau$ independently of the desired probability $\theta$ to equate the two biases. In practice, $\tau$ is a tunable hyper-parameter and a guidance on the tuning range is therefore necessary.

To begin, we use a subscript to distinguish the two temperatures -- $\tau_{\text{g}}$ for the Gumbel trick and $\tau_{\text{i}}$ for the {\bf ICDF} method -- and write, based on~\eqref{eqn:gumbel.bias.tau} and~\eqref{eqn:icdf.bias.tau.gauss} and ignoring the high order terms,
\[
\frac{\bias(y_1)}{\bias(z)} \ \simeq\ \frac{\tau_{\text{g}}^2\sigma^2}{\tau_{\text{i}}^2}r(\theta) \quad\text{where}\quad
r(\theta)\ =\ \frac{\sqrt{\pi}\theta(1-\theta)(2\theta-1)}{\erf^{-1}(2\theta-1)e^{-(\erf^{-1}(2\theta-1))^2}}.
\]
Note that $r(\theta)$ is symmetric around $\theta=\frac{1}{2}$, is concave, attains maximum $\frac{1}{2}$ when $\theta=\frac{1}{2}$, and attains minimum $0$ when $\theta=0,1$. Hence, if $\tau_{\text{g}}=\tau_{\text{i}}$ and $\sigma=\sqrt{2}$, the bias of the Gumbel trick is (approximately) smaller than that of the {\bf ICDF} method. On the other hand, for a $\sigma>\sqrt{2}$, there exist $\widetilde{\theta}_1<\widetilde{\theta}_2$ such that $\sigma^{-2}=r(\widetilde{\theta}_1)=r(\widetilde{\theta}_2)$ and that $\bias(y_1)\gtrapprox\bias(z)$, whenever $\theta\in[\widetilde{\theta}_1,\widetilde{\theta}_2]$. For example, when $\sigma\approx2.5$, on the interval $\theta\in[0.01,0.99]$, the bias of the Gumbel trick is (approximately) greater than that of the {\bf ICDF} method.

Based on the foregoing, a practical guide is to use the same tuning range of $\tau$ for the {\bf ICDF} method as for the Gumbel trick. A small change of $\sigma$ (e.g., $\sqrt{2}$ versus $2.5$) will entirely flip the landscape of the bias comparison between the two methods. Because the tuning range is much wider than the change of $\sigma$, for simplicity it suffices to fix $\sigma=1$.


%------------------------------------------------------------------------------
\section{Proof of Theorem~\ref{thm:bias} and Additional Results}\label{app:extra}
By the definition of bias, we have
\[
\bias(x)=\mean[x]-\theta
\quad\text{where}\quad
\mean[x] = \int_0^1 t\,d\Pr(x\le t) = 1 - \int_0^1 \Pr(x\le t)\,dt.
\]
Therefore, for Gumbel softmax,
\[
\bias(y_1)=1-\theta-\int_0^1\frac{t^{\tau}(1-\theta)}{t^{\tau}(1-\theta)+(1-t)^{\tau}\theta}\,dt,
\]
and for {\bf ICDF} with any $F$,
\[
\bias(z)=\int_0^1 F(F^{-1}(\theta)+\tau\log(t^{-1}-1))\,dt-\theta.
\]
We now prove Theorem~\ref{thm:bias} in a few parts.


%------------------------------------------------------------------------------
\textbf{Proof of~\eqref{eqn:icdf.bias.tau}.}
Let $s=F^{-1}(\theta)$ and perform a change of variable $m=\log(t^{-1}-1)$. Then,
\[
\bias(z)=\int_0^1[F(s+\tau m)-F(s)]\,dt
=\int_{-\infty}^{\infty}[F(s+\tau m)-F(s)]\frac{e^m}{(1+e^m)^2}\,dm.
\]
We perform Taylor expansion of $F$ around $s$ and obtain
\[
F(s+\tau m)-F(s) = \sum_{n=1}^{\infty}\frac{F^{(n)}(s)}{n!}\tau^nm^n.
\]
Therefore,
\[
\bias(z)=\sum_{n=1}^{\infty}\frac{F^{(n)}(s)}{n!}\tau^n\int_{-\infty}^{\infty}\frac{m^ne^m}{(1+e^m)^2}\,dm
\]
Each integral term is finite and the odd terms vanish because the integrands are odd functions. Thus, for small $\tau$, we are left with
\[
\bias(z)=\frac{F''(s)}{2}\tau^2\int_{-\infty}^{\infty}\frac{m^2e^m}{(1+e^m)^2}\,dm + O(\tau^4).
\]
The definite integral evaluates to $\frac{\pi^2}{3}$; we therefore conclude the proof.


%------------------------------------------------------------------------------
\textbf{Proof of~\eqref{eqn:icdf.bias.tau.gauss}.}
Equation~\eqref{eqn:icdf.bias.tau.gauss} follows directly by substituting
\[
F''(s)=-\frac{s}{\sigma^3\sqrt{2\pi}}e^{-\frac{s^2}{2\sigma^2}}
=-\frac{\erf^{-1}(2\theta-1)}{\sigma^2\sqrt{\pi}}e^{-(\erf^{-1}(2\theta-1))^2}
\]
into~\eqref{eqn:icdf.bias.tau}, where the second equality uses $s=F^{-1}(\theta)=\sqrt{2}\,\sigma\erf^{-1}(2\theta-1)$.


%------------------------------------------------------------------------------
\textbf{Proof of~\eqref{eqn:gumbel.bias.tau}.}
To simplify notation, let $\beta=\theta/(1-\theta)$ and perform a change of variable $m=\log(t^{-1}-1)$. Then,
\[
\int_0^1\frac{t^{\tau}(1-\theta)}{t^{\tau}(1-\theta)+(1-t)^{\tau}\theta}\,dt
=\int_0^1\frac{dt}{1+\beta e^{m\tau}}
=\int_{-\infty}^{\infty}\frac{1}{1+\beta e^{m\tau}}\frac{e^m}{(1+e^m)^2}\,dm.
\]
Denote $h(\tau,m)=[1+\beta e^{m\tau}]^{-1}$. Treating $h$ as a function of $\tau$ and performing Taylor expansion around zero, we obtain
\[
h(\tau,m)=\sum_{n=0}^{\infty}\frac{h^{(n)}(0,m)}{n!}\tau^n.
\]
Therefore,
\[
\int_0^1\frac{t^{\tau}(1-\theta)}{t^{\tau}(1-\theta)+(1-t)^{\tau}\theta}\,dt
=\sum_{n=0}^{\infty}\frac{\tau^n}{n!}\int_{-\infty}^{\infty}h^{(n)}(0,m)\frac{e^m}{(1+e^m)^2}\,dm.
\]
In a moment, we will show that for all $n$,
\begin{equation}\label{eqn:h}
h^{(n)}(0,m) = C_nm^n \quad\text{where $C_n$ is independent of $m$}.
\end{equation}
Suppose that~\eqref{eqn:h} holds. Then, each integral term is finite and the odd terms vanish, because the integrands are odd functions. Therefore, for small $\tau$, we are left with
\[
\int_0^1\frac{t^{\tau}(1-\theta)}{t^{\tau}(1-\theta)+(1-t)^{\tau}\theta}\,dt
=C_0\int_{-\infty}^{\infty}\frac{e^m}{(1+e^m)^2}\,dm
+C_2\frac{\tau^2}{2}\int_{-\infty}^{\infty}\frac{m^2e^m}{(1+e^m)^2}\,dm
+O(\tau^4).
\]
By calculating
\begin{gather*}
C_0=h(0,m)=[1+\beta]^{-1}=1-\theta,
\qquad
C_2=h''(0,m)/m^2=-\theta(1-\theta)(1-2\theta),\\
\int_{-\infty}^{\infty}\frac{e^m}{(1+e^m)^2}\,dm=1,
\qquad
\int_{-\infty}^{\infty}\frac{m^2e^m}{(1+e^m)^2}\,dm=\frac{\pi^2}{3},
\end{gather*}
we conclude that
\[
\bias(y_1)=\frac{\tau^2\pi^2\theta(1-\theta)(1-2\theta)}{6}+O(\tau^4).
\]

It remains to prove~\eqref{eqn:h}. We suppress the argument on $m$ and write $g(\tau)=1+\beta e^{m\tau}$ and $h(\tau)=g(\tau)^{-1}$. By Fa\`{a} di Bruno's formula,
\[
h^{(n)}(0) = \left.\left(\frac{1}{g(\tau)}\right)^{(n)}\right|_{\tau=0}
= \sum_{k=1}^n\frac{(-1)^kk!}{g(0)^{k+1}}\cdot B_{n,k}\Big(g'(0), g''(0), \ldots, g^{(n-k+1)}(0)\Big),
\]
where $B_{n,k}$ is the Bell polynomial. Clearly, $g(0)=1+\beta$ and $g^{(r)}(0)=\beta m^r$ for all $r>0$. Hence, $B_{n,k}$ is a multiple of $m^n$. Therefore, $h^{(n)}(0)$ is a multiple of $m^n$.


%------------------------------------------------------------------------------
\subsection{Additional Result Regarding the Bias}
Theorem~\ref{thm:bias} states results for a small temperature $\tau$. The purpose is to understand the limiting behavior of the bias. Here, we give an additional result for any $\tau>0$. It states that the biases of the two sampling approaches have the same sign. This result is a nontrivial extension of Theorem~\ref{thm:bias} and requires a different proof technique.

\begin{theorem}\label{thm:bias.2}
For any $\tau>0$,
\begin{equation}\label{eqn:gumbel.bias}
  \textstyle
  \bias(y_1) > 0 \text{ when } \theta < \frac{1}{2}, \quad
  \bias(y_1) = 0 \text{ when } \theta = \frac{1}{2}, \quad
  \bias(y_1) < 0 \text{ when } \theta > \frac{1}{2}.
\end{equation}
Moreover, if $F'(x)$ (that is, the pdf) is even and is increasing when $x<0$, then
\begin{equation}\label{eqn:icdf.bias}
  \textstyle
  \bias(z) > 0 \text{ when } \theta < \frac{1}{2}, \quad
  \bias(z) = 0 \text{ when } \theta = \frac{1}{2}, \quad
  \bias(z) < 0 \text{ when } \theta > \frac{1}{2}.
\end{equation}
\end{theorem}

We prove Theorem~\ref{thm:bias.2} in two parts.


%------------------------------------------------------------------------------
\textbf{Proof of~\eqref{eqn:gumbel.bias}.}
Consider
\[
\bias(y_1)=\int_0^1 g(t,\theta)\,dt
\quad\text{where}\quad
g(t,\theta)=1-\theta-\frac{t^{\tau}(1-\theta)}{t^{\tau}(1-\theta)+(1-t)^{\tau}\theta}.
\]
With a brute-force calculation, we have
\[
g(t,\theta)+g(1-t,\theta)=
\frac{[(1-t)^{\tau}-t^{\tau}]^2\theta(1-\theta)(1-2\theta)}{[t^{\tau}(1-\theta)+(1-t)^{\tau}\theta][(1-t)^{\tau}(1-\theta)+t^{\tau}\theta]}.
\]
All factors on the right-hand side are nonnegative (and strictly positive for $t\ne\frac{1}{2}$), except $1-2\theta$. Therefore, when $\theta<\frac{1}{2}$, $g(t,\theta)+g(1-t,\theta)>0$ for all $t\ne\frac{1}{2}$ and hence
\[
\bias(y_1)=\int_0^1\frac{g(t,\theta)+g(1-t,\theta)}{2}\,dt >0.
\]
The other cases ($\theta>\frac{1}{2}$ and $\theta=\frac{1}{2}$) are similarly proved.


%------------------------------------------------------------------------------
\textbf{Proof of~\eqref{eqn:icdf.bias}.}
Consider
\[
\bias(z)=\int_0^1 h(t,\theta)\,dt-\theta
\quad\text{where}\quad
h(t,\theta)=F(F^{-1}(\theta)+\tau\log(t^{-1}-1)).
\]
We have
\[
h(1-t,\theta)=F(F^{-1}(\theta)-\tau\log(t^{-1}-1)).
\]
To simplify notation, let $F^{-1}(\theta)=s$ and $\tau\log(t^{-1}-1)=a$. Then, $h(t,\theta)=F(s+a)$ and $h(1-t,\theta)=F(s-a)$. Let us first consider the case $s<0$ and $a>0$. We see that
\[
F(s+a)-F(s)=\int_{s}^{s+a}F'(m)\,dm
\quad\text{and}\quad
F(s)-F(s-a)=\int_{s-a}^{s}F'(m)\,dm.
\]
For any $b>0$, if $s+b<0$, then by monotonicity, $F'(s+b)>F'(s-b)$. On the other hand, if $s+b\ge0$, then $F'(s+b)=F'(-s-b)>F'(s-b)$. In both cases, the right integral is always smaller than the left integral. In other words,
\[
F(s+a) + F(s-a) > 2F(s).
\]
In fact, the above inequality is also established when $s<0$ and $a<0$. Therefore, whenever $s<0$,
\[
\int_0^1 h(t,\theta)\,dt
=\int_0^1 \frac{h(t,\theta)+h(1-t,\theta)}{2}\,dt
>\int_0^1 F(F^{-1}(\theta))\,dt=\theta.
\]
That is, $\bias(z)>0$. Other cases ($s=F^{-1}(\theta)>0$ and $s=F^{-1}(\theta)=0$) are similarly proved.


%------------------------------------------------------------------------------
\subsection{Empirical Comparison between Gumbel and {\bf ICDF} reparameterization}
Extending the last experiment in Section~\ref{sec:exp}, Table~\ref{tab:time.mem.four.dataset} summarizes the time and memory consumption during the training of global models on the four data sets. The results indicate that our developed {\bf ICDF} reparameterization is more economical than the Gumbel-Softmax approach.

\begin{table}[h]
  \centering
  \caption{Time and memory consumption of F$^3$ (five epochs) with respect to {\bf ICDF} and Gumbel-Softmax reparameterization. Time is in seconds and memory is in MB.}
  \label{tab:time.mem.four.dataset}
  \begin{tabular}{ccccccccc}
    \toprule
    & \multicolumn{2}{c}{\la}
    & \multicolumn{2}{c}{\bay}
    & \multicolumn{2}{c}{\pmub}
    & \multicolumn{2}{c}{\pmuc}\\
    & Time & Memory & Time & Memory & Time & Memory & Time & Memory\\
    \midrule
    {\bf Gumbel-Softmax} & 87.89 & 832.38 & 270.52 & 1896.11 & 42.40 & 348.39 & 84.89 & 1119.13 \\
    {\bf ICDF}  & 79.69 & 568.24 & 157.93 & 1167.19 & 30.16 & 322.59 & 54.07 & 894.63 \\
    \bottomrule
  \end{tabular}
\end{table}


%------------------------------------------------------------------------------
\section{Data Set Description and Preprocessing}\label{sec:data.descrp}
\textbf{\la} and \textbf{\bay.} These are traffic data sets (MIT licensed) used by~\citet{Li2018}. The former was collected from loop detectors in the highway of Los Angeles, CA~\citep{Jagadish2014} and the latter was collected by the California Transportation Agencies Performance Measure System. Both data sets recorded several months of data at the resolution of five minutes. The network graphs are available, which were constructed by imposing a radial basis function on the pairwise distance of sensors at a certain cutoff.

The data sets were originally prepared for forecasting tasks and hence no labeling information exists. We adapt the data for classification. Specifically, we split the time series on the hour, forming hourly windows. We label each window as whether or not it corresponds to rush hour. For proof of concept, we specify 07:00--10:00 and 16:00--19:00 as rush hour and the others non-rush hour. We note that in the original data sets, one of the attributes is time. We remove this attribute to avoid triviality and retain only the speed attribute.

The specification of rush hours may not be highly accurate, but it is a sensible practice to cope with the nonexistence of labeling information. Intuitively, the signal of rush hour comes from reduced traffic speed, but not every location of the network experiences a traffic jam. Hence, the diverse traffic patterns inside the same time window under a single label cause nontrivial challenges for local models to discern. Therefore, the need of a global consensus model is justified and it fits well the federated feature fusion scenario.

\textbf{\pmub} and \textbf{\pmuc.} These are proprietary data sets coordinately provided by multiple data owners of the U.S. power grid. No personally identifiable information is present. The suffixes B and C indicate the interconnects of the grid. The data sets come with thousands of annotated grid events spanning a period of two years; they form the classification labels. Many variables (attributes) of the grid condition are recorded; we select only the voltage magnitude and the current magnitude, because they appear to be the strongest signals for event detection based on domain knowledge, and also because more data are available for these two variables. The grid topology is not available.

For each event, we select a one-second window from the three-minute window that covers the approximate annotated event time, based on the largest z-score. We retain a sampling frequency of 30Hz, even though some data are 60Hz. Furthermore, a large amount of data are missing in the raw data. We impute the series by using \texttt{pandas.DataFrame.interpolate(method = 'linear', limit\_direction = 'both')} from the Python \texttt{pandas} package. This way, a windowed series is complete if it ever has raw data. Even so, many series are entirely empty, which corresponds to the scenario illustrated by Figure~\ref{fig:illustration} of the main text. Classes in these two data sets are rather skewed. For \pmub, we remove a class that consists of only one data point and for \pmuc, we combine classes that contain fewer than 24 data points into a single class.


%------------------------------------------------------------------------------
\section{Experiment Details}\label{sec:exp.detail}
The experiments are conducted on one x86 node of a computing cluster with one A100 NVIDIA GPU. The compute node has eight Intel cores and 128GB memory. For each data set, we perform a 70/10/20 random split for training, validation, and testing, respectively.

For local models, we use LSTM with the same hyperparameters: one hidden layer whose hidden dimension is $16$ and the maximum number of epochs $=200$. We pre-train the local models and freeze their parameters afterward. We train each global model for a maximum of 500 epochs and use early stopping according to the validation loss, with a patience of 50 epochs. For the GNN global model, we use a 2-layer GCN with skip connections. The hidden dimension is set at 8 and we select the learning rate from $\{0.01, 0.001\}$. For missing data, we impute the node features by using zero.


%------------------------------------------------------------------------------
\section{Soft and Hard Feature Alignment}
\label{app:hard_alignment}
Feature alignment can be achieved in two manners. The first approach is a soft alignment, which treats each $\mathbf{P}_i$ as a free parameter matrix to optimize. Such an alignment softens the one-to-one correspondence in the permutation constraint; i.e., each feature in the source can have a weighted correspondence to each of the features in the target. This is the approach used in the main paper.

An alternative approach is a hard alignment, which treats each $\mathbf{P}_i$ as a permutation matrix. Learning permutation matrices is challenging, however, because they correspond to combinatorial structures and are unsuitable for gradient-based training. We follow~\citet{Mena2018,Emami2018} and relax $\mathbf{P}_i$ by a doubly stochastic matrix, which can be differentiably parameterized  by the Sinkhorn--Knopp algorithm~\citep{Sinkhorn1967}. Specifically, starting from a nonnegative square matrix $\mathbf{K}_0$ and column vectors $\mathbf{r}_0=\mathbf{c}_0=\ones$ of matching lengths, define the sequence
\begin{equation}\label{eqn:knight}
  \mathbf{c}_{j+1} = \ones\oslash (\mathbf{K}_0^\top\mathbf{r}_{j}) \text{ and }
  \mathbf{r}_{j+1} = \ones\oslash (\mathbf{K}_0\mathbf{c}_{j+1}), \quad\text{for $j=0,1,\ldots$}
\end{equation}
Then, under a mild condition, $\mathbf{K}_j:=\diag(\mathbf{r}_j)\mathbf{K}_0\diag(\mathbf{c}_j)$ converges to a doubly stochastic matrix. We truncate the sequence at the $T$th step and treat $\mathbf{K}_T$ as an approximation of $\mathbf{P}_i$.

Despite the advocacy by~\citet{Mena2018,Emami2018}, we obtain the following convergence result of Sinkhorn--Knopp, which reveals no free lunch.
\begin{theorem}[informal]\label{thm:speed.informal}
  Under a condition of $\mathbf{K}_0$, there exists a positive integer $J$ and a constant $C_J$ such that for all $j\ge J$,
  \[
  \left\|\begin{bmatrix}\mathbf{K}_j^T\ones\\ \mathbf{K}_j\ones\end{bmatrix}-
  \begin{bmatrix}\ones\\ \ones\end{bmatrix}\right\|
  \le C_J(1+\sigma_2^2)\sigma_2^{2(j-J)},
  \]
  where $\sigma_2\le1$ is the second largest singular value of the limit of $\mathbf{K}_j$. 
\end{theorem}
Since this is not the focus of this paper, we omit the rigorous analysis of this theorem. The result suggests that for a desirable limit being a permutation matrix, whose $\sigma_2=1$, the error $O(\sigma_2^{2j})$ does not drop. In practice, when the limit is expected to be an approximate permutation matrix, $\sigma_2\approx1$ and the convergence is exceedingly slow. The practical usefulness of~\eqref{eqn:knight} depends on the learned quality of $\mathbf{K}_0$.

The soft and hard alignment approaches have pros and cons. The hard approach maintains the correspondence of each feature dimension of the latent vectors while the soft approach does not. Maintaining the dimension correspondence is an advantage, especially for local models that produce disentangled latent representations~\citep{Higgins2018}, because each feature dimension is equipped with a semantic meaning that controls a certain aspect of the data. On the other hand, the soft approach is more straightforward and the hard approach is based on an algorithm that barely converges. In practice, we observe that the two approaches deliver similar performance. We list the results for both approaches in Table~\ref{tab:hard_alignment}, and we visualize the hard alignment matrix learned on each data set in Figure~\ref{fig:alignment}, to help readers understand the feature alignment.

\begin{table}[t]
  \centering
  \caption{Different approaches to feature alignment.}
  \label{tab:hard_alignment}
  \begin{tabular}{lcccccccc}
    \toprule
    & \multicolumn{2}{c}{\la}
    & \multicolumn{2}{c}{\bay}
    & \multicolumn{2}{c}{\pmub}
    & \multicolumn{2}{c}{\pmuc}\\
    & \fone & \rocauc & \fone & \rocauc & \fone & \rocauc & \fone & \rocauc\\
    \midrule
    soft alignment
    & .835 & .975 & .860 & .980 & .390 & .734 & .451 & .725\\
    hard alignment
    & .839 & .973 & .855 & .976 & .390 & .737 & .429 & .721\\
    \bottomrule
  \end{tabular}
\end{table}

\begin{figure}[t]
  \centering
  \subfigure[\la]{
    \includegraphics[width=.24\linewidth]{alignment_final_la_neurips.png}}\hfill
  \subfigure[\bay]{
    \includegraphics[width=.24\linewidth]{alignment_final_bay_neurips.png}}\hfill
  \subfigure[\pmub]{
    \includegraphics[width=.24\linewidth]{alignment_final_pmub_neurips.png}}\hfill
  \subfigure[\pmuc]{
    \includegraphics[width=.24\linewidth]{alignment_final_pmuc_neurips.png}}
  \caption{Examples of learned permutation matrices ($\mathbf{K}_T$). The plots clearly show patterns of a permutation matrix: there is one and only one significant value per row and per column. Because of the slow convergence, we attribute the desirable results of $\mathbf{K}_T$ (at a small $T$) to the success of the learning of $\mathbf{K}_0$. Note also interestingly that a learned permutation may be the identity mapping.}
  \label{fig:alignment}
\end{figure}


%------------------------------------------------------------------------------
\bibliography{reference}

\end{document}
