\section{MoWB Ambiguity Set}\label{sec:dist_model}

\subsection{Problem Separability} 
In this section, we extend the classical Wasserstein ambiguity set to the distributed setting via the novel MoWB ambiguity set $\mathcal{A}_G$ defined next.

\begin{definition}[MoWB ambiguity set]
The Mixture of Wasserstein Balls (MoWB) ambiguity set contains mixture distributions whose constituents are distributions from local Wasserstein balls defined at each client, and is expressed as
\begin{multline} \label{eq:global_amb_set}
    \mathcal{A}_G \coloneqq \bigg \{ \mathbb{Q} \colon \mathbb{Q} = \sum_{g=1}^{G} \alpha_g \mathbb{Q}_g, \ \alpha_g \geq 0, \ \sum_{g=1}^{G} \alpha_g = 1,\\ \ \mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)\bigg \},
\end{multline}
where $\alpha_g$ is client $g$'s weight, and $\mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)$ is the type-$1$ Wasserstein ball of radius $\varepsilon_g$ supported on $\Xi$, centered at $\widehat{\mathbb{P}}_{N_g}$, and defined via cost function $d(\boldsymbol{\xi},\boldsymbol{\xi}^{\prime})$ shown in \eqref{eq:cost_func}.
\end{definition}

\begin{remark} \label{remark:amb_set}
    Observe that when $G=N$, our ambiguity set models worst-case perturbations in individual training samples in a fashion similar to robust optimization (RO). Alternatively, when $G=1$, our ambiguity set reduces to the classical Wasserstein ball $\mathcal{A}_{\varepsilon,1,d}(\Xi)$ defined in \eqref{eq:trad_amb_set}. This suggests that our proposed ambiguity set offers more flexibility in modeling the uncertainty than the classical Wasserstein ambiguity set, which can allow it to achieve improved performance in some settings. This also suggests that our proposed ambiguity set naturally extends the classical Wasserstein ball to the FL setting. Indeed, we show in Proposition \ref{lemma:general_form} that when equipped with the MoWB ambiguity set, the DRO problem enjoys a naturally distributed formulation.
\end{remark}
\begin{proposition}[Problem Separability] \label{lemma:general_form}
    The original DRO problem in \eqref{eq:dro_problem} equipped with the MoWB ambiguity set defined in \eqref{eq:global_amb_set} admits the following reformulation:
    \begin{multline} \label{eq:general_form_eq}
        \inf_{\boldsymbol{w}} \sup_{\mathbb{Q} \in \mathcal{A}_G} \mathbb{E}^{\mathbb{Q}}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})] \\= \inf_{\boldsymbol{w}} \sum_{g=1}^{G} \alpha_g \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})].
    \end{multline}
\end{proposition}
\begin{proof}
    The proof is provided in Appendix~\ref{proof:general_form}.
\end{proof}

\subsection{Out-of-Sample Performance Guarantees} 
Since the MoWB ambiguity set $\mathcal{A}_G$ relies on local Wasserstein balls $\mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)$, it inherits desirable out-of-sample performance guarantees shown by \cite{kuhn2019wasserstein}. Indeed, we show in Proposition \ref{prop:oos} that the true distribution $\mathbb{P} = \sum_{g=1}^G \alpha_g \mathbb{P}_g$ is contained within the MoWB ambiguity set with a certain confidence level, thereby allowing for the reduction of the \textit{true risk} without knowing $\mathbb{P}$. This relies on Assumption \ref{assump:light_tail}, which allows for tighter concentration inequalities for $\mathbb{P}_g$, ensuring that they can indeed be modeled as a perturbation of the empirical distributions $\widehat{\mathbb{P}}_{N_g}$.

\begin{assumption}[Light-tailed Distribution]\label{assump:light_tail}
    The true distribution $\mathbb{P}_g$ of the data at client $g$ is light-tailed. That is, there exists $a_g > 1$ with $A_g \coloneqq \mathbb{E}^{\mathbb{P}_g}[\exp(\|2\boldsymbol{x}\|^{a_g})] < +\infty$.
\end{assumption}

\begin{proposition}[Out-of-Sample Performance]\label{prop:oos}
    Suppose Assumption \ref{assump:light_tail} holds and the local Wasserstein ball radius $\varepsilon_g$ at client $g$ is set as \citep{kuhn2019wasserstein}
    \begin{multline*}
    \allowdisplaybreaks
        \varepsilon_{N_g}(\eta_g) = \left ( \frac{\log(c_{1_g} \eta_g^{-1})}{c_{2_g}N_g} \right )^{\frac{1}{a_g}} \mathbbm{1}_{ \left \{N_g < \frac{\log(c_{1_g} \eta_g^{-1})}{c_{2_g} c_{3_g}} \right \}} \\+ \left ( \frac{\log(c_{1_g} \eta_g^{-1})}{c_{2_g}N_g} \right )^{\frac{1}{P}} \mathbbm{1}_{ \left \{N_g \geq \frac{\log(c_{1_g} \eta_g^{-1})}{c_{2_g} c_{3_g}} \right \}},
    \end{multline*}
    where $c_{1_g}, c_{2_g}, c_{3_g} \in \mathbb{R}_+$ are constants that depend on $a_g$, $A_g$, $P$ (dimension of the feature space), and the transportation cost given by \eqref{eq:cost_func}. Then the MoWB ambiguity set $\mathcal{A}_G$ defined in \eqref{eq:global_amb_set} enjoys the following property
    \begin{equation*}
    \mathbb{P}^N \{ \mathbb{P} \in \mathcal{A}_G \} \geq \prod_{g=1}^G (1 - \eta_g),
    \end{equation*}
where $\eta_g$ is such that $\mathbb{P}^{N_g} \{ \mathbb{P}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi) \} \geq (1 - \eta_g)$.
\end{proposition}
\begin{proof}
    The proof is provided in Appendix~\ref{proof:oos}.
\end{proof}


