\section{Solution Algorithms}\label{sec:algs}
We introduce two algorithms to solve problem \eqref{eq:general_form_eq}. Given that our motivating scenario involves manufacturing plants (clients) with ample local compute resources and reliable communication with a central server, we adopt Assumption \ref{assump:sync_train} to guarantee convergence of our algorithms.
\begin{assumption} [Synchronous Training] \label{assump:sync_train}
    The distributed optimization problem in \eqref{eq:general_form_eq} is solved synchronously. That is, the central server only performs an update step once all the clients have completed solving their local problems and communicated their insights to the central server.
\end{assumption}
\subsection{Subgradient-based Algorithm (SM)}\label{subsec:sm_alg}
The subgradient-based (\textbf{SM}) algorithm begins by initializing the global model parameters $\boldsymbol{w}$. Next, each client $g$ seeks to obtain a subgradient for their inner maximization problem from \eqref{eq:general_form_eq}, to be sent to the server for aggregation and model update. This requires each client $g$ to obtain a worst-case distribution $\mathbb{Q}^{\ast}_g$ from its local ambiguity set $\mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)$, allowing the worst-case risk to be directly expressed as an expectation with respect to $\mathbb{Q}^{\ast}_g$. However, \cite{kuhn2019wasserstein} show that the worst-case distribution cannot be obtained from a type-1 Wasserstein ambiguity set centered around an empirical distribution $\widehat{\mathbb{P}}_{N_g}$ if $\mathcal{X}$ is not compact. Therefore, we make Assumption \ref{assump:feat_supp} \textit{only} for the SM algorithm, ensuring the compactness of $\mathcal{X}$.

\begin{assumption}[Support of Feature Vector] \label{assump:feat_supp}
    The feature vector $\boldsymbol{x}$ is such that: 
        $0 \leq e_p^{\mathsf{T}}\boldsymbol{x} \leq 1 \ \forall p \in [P]$, 
    where $e_p$ are the standard unit vectors.
\end{assumption}

Note that Assumption~\ref{assump:feat_supp} is not very restrictive in practice. Indeed, real-world data is often bounded by sensor ranges, and can therefore be easily normalized. Given that Assumption~\ref{assump:feat_supp} holds and the global model parameters $\boldsymbol{w}$ are fixed, it follows from \cite{2019regularization} that the worst-case distribution $\mathbb{Q}^{\ast}_g$ for client $g$ is
\begin{multline} \label{eq:ext_dist}
        \mathbb{Q}_g^{\ast} = \frac{1}{N_g} \sum_{n_g=1}^{N_g} \bigg ( {\beta_{n_g}^+}^{\ast} \delta_{(\widehat{\boldsymbol{x}}_{n_g}-{\boldsymbol{q}_{n_g}^+}^{\ast}/{\beta_{n_g}^+}^{\ast},\widehat{y}_{n_g})} \\+  {\beta_{n_g}^-}^{\ast} \delta_{(\widehat{\boldsymbol{x}}_{n_g}-{\boldsymbol{q}_{n_g}^-}^{\ast}/{\beta_{n_g}^-}^{\ast},-\widehat{y}_{n_g})} \bigg ),
    \end{multline}
where $\delta_{(\boldsymbol{x},y)}$ is the Dirac measure that assigns probability mass 1 to the sample $\boldsymbol{\xi}=(\boldsymbol{x},y)$, and ${\beta_{n_g}^+}^{\ast}$, ${\beta_{n_g}^-}^{\ast}$, ${\boldsymbol{q}_{n_g}^+}^{\ast}$, and ${\boldsymbol{q}_{n_g}^-}^{\ast}$ are maximizers of the following optimization problem:
\begin{align}\label{eq:subgrad_client_prob}
    \nonumber &\max_{\substack{\beta_{n_g}^+, \beta_{n_g}^- \\ \boldsymbol{q}_{n_g}^+, \boldsymbol{q}_{n_g}^-}} H_g(\boldsymbol{w}) \coloneq\\ &\left \{ \begin{aligned}
        &\max_{\substack{\beta_{n_g}^+, \beta_{n_g}^- \\ \boldsymbol{q}_{n_g}^+, \boldsymbol{q}_{n_g}^-}} && \begin{multlined} \frac{1}{N_g}\sum_{n_g=1}^{N_g} \big ( -(\beta_{n_g}^+ - \beta_{n_g}^-) \widehat{y}_{n_g} \boldsymbol{w}^{\mathsf{T}}\widehat{\boldsymbol{x}}_{n_g} \\- \widehat{y}_{n_g} \boldsymbol{w}^{\mathsf{T}} (\boldsymbol{q}_{n_g}^+ - \boldsymbol{q}_{n_g}^-) \big )\end{multlined}\\
        & \subjectto && \sum_{n_g=1}^{N_g} \big ( ||\boldsymbol{q}_{n_g}^+|| + ||\boldsymbol{q}_{n_g}^-|| + \kappa_g \beta_{n_g}^- \big ) \leq N_g \varepsilon_g \\
        &&& \beta_{n_g}^+ + \beta_{n_g}^- = 1 \qquad \qquad \qquad \  \forall n_g \in [N_g]\\
        &&& 0 \leq \beta_{n_g}^+ \widehat{\boldsymbol{x}}_{n_g}-\boldsymbol{q}_{n_g}^+ \leq \beta_{n_g}^+ \qquad \forall n_g \in [N_g]\\
        &&&0 \leq \beta_{n_g}^- \widehat{\boldsymbol{x}}_{n_g}-\boldsymbol{q}_{n_g}^- \leq \beta_{n_g}^- \qquad \forall n_g \in [N_g]\\
        &&& \beta_{n_g}^+,\beta_{n_g}^- \geq 0 \qquad \qquad \qquad \quad \ \forall n_g \in [N_g]
    \end{aligned} \right.,
\end{align}
where $||\cdot||$ is the norm used in the definition of the transportation cost function \eqref{eq:cost_func}. Armed with the discrete worst-case distribution $\mathbb{Q}^{\ast}_g$, each client $g$ can compute a subgradient, $\boldsymbol{v}_g$, of their local maximization problem. Proposition~\ref{prop:subgrad_comp} presents a closed-form expression for $\boldsymbol{v}_g$.

\begin{proposition}[Local Subgradient Computation]\label{prop:subgrad_comp}
    Suppose the worst-case distribution $\mathbb{Q}^{\ast}_g$ is known to client $g$. Then, they can compute a subgradient $\boldsymbol{v}_g$ for their respective maximization problem from \eqref{eq:general_form_eq} as any vector that obeys
    \begin{equation*}
        \boldsymbol{v}_g \in \frac{1}{N_g} \sum_{n_g=1}^{N_g} \big ( \mathcal{B}^+ + \mathcal{B}^- \big ),
    \end{equation*}
    where $+$ is the Minkowski sum and $\mathcal{B}^+$, $\mathcal{B}^-$ are defined as
    \begin{equation*}
        \mathcal{B}^{\pm} \coloneq \left \{ \begin{aligned}
            & \boldsymbol{0} && \text{if } \widehat{r}_{n_g}^{\pm} < 0\\
            & \mp {\beta_{n_g}^{\pm}}^{\ast} \widehat{y}_{n_g}\widehat{\boldsymbol{z}}_{n_g}^{\pm}&& \text{if } \widehat{r}_{n_g}^{\pm} > 0\\
            & \text{conv}\left ( \{ \boldsymbol{0}, \mp {\beta_{n_g}^{\pm}}^{\ast} \widehat{y}_{n_g}\widehat{\boldsymbol{z}}_{n_g}^{\pm} \} \right ) && \text{if } \widehat{r}_{n_g}^{\pm} = 0
        \end{aligned} \right.,
    \end{equation*}
where $\widehat{\boldsymbol{z}}_{n_g}^{\pm} = \widehat{\boldsymbol{x}}_{n_g}-{\boldsymbol{q}_{n_g}^{\pm}}^{\ast}/{\beta_{n_g}^{\pm}}^{\ast}$, $\widehat{r}_{n_g}^{\pm} =  1 \mp \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^{\pm}$, and $\text{conv}(\Theta)$ is the convex hull of set $\Theta$.
\end{proposition}
\begin{proof}
    Proof is provided in Appendix \ref{proof:subgrad_comp}.
\end{proof}

The subgradients $\boldsymbol{v}_g$ from the clients are then aggregated by the server and used to update the global model $\boldsymbol{w}$, which is broadcast back to the clients. This process repeats for $T$ rounds. The pseudocode of the SM algorithm is given in Algorithm~\ref{alg:subgrad}.
\begin{algorithm}[ht]
    \caption{SM Algorithm}
    \smaller
    \label{alg:subgrad}
    \textbf{Input:} $\boldsymbol{w}^{(1)}$\\
    \textbf{Parameters:} Number of rounds $T$, step-size $\gamma(t)$ at round $t$\\
    \textbf{Output:} $\boldsymbol{w}^{\ast}$
    \begin{algorithmic}[1]
        \FOR{$t = 1, \dots, T$}
        \STATE \textbf{Client Update:}
        \FOR{clients $g = 1, \dots, G$}
        \STATE Solve for $[{\beta_{n_g}^+}^{\ast}, {\beta_{n_g}^-}^{\ast}, {\boldsymbol{q}_{n_g}^+}^{\ast}, {\boldsymbol{q}_{n_g}^-}^{\ast}] \leftarrow \argmax H_g(\boldsymbol{w}^{(t)})$
        \STATE Compute $\mathbb{Q}_g^{\ast} \leftarrow \frac{1}{N_g} \sum_{n_g=1}^{N_g} \Big ( {\beta_{n_g}^+}^{\ast} \delta_{(\widehat{\boldsymbol{x}}_{n_g}-{\boldsymbol{q}_{n_g}^+}^{\ast}/{\beta_{n_g}^+}^{\ast},\widehat{y}_{n_g})} +  {\beta_{n_g}^-}^{\ast} \delta_{(\widehat{\boldsymbol{x}}_{n_g}-{\boldsymbol{q}_{n_g}^-}^{\ast}/{\beta_{n_g}^-}^{\ast},-\widehat{y}_{n_g})} \Big )$
        \STATE Compute any local subgradient $\boldsymbol{v}_g$ via Proposition \ref{prop:subgrad_comp}
        \STATE Send $\boldsymbol{v}_g$ to central server.
        \ENDFOR
        \STATE \textbf{Server Update:}
        \STATE $\boldsymbol{w}^{(t+1)} \longleftarrow \boldsymbol{w}^{(t)} - \gamma(t) \sum_{g=1}^{G} \alpha_g \boldsymbol{v}_g$
        \STATE Broadcast $\boldsymbol{w}^{(t+1)}$ to all clients
        \ENDFOR
    \end{algorithmic}
\end{algorithm}

\textbf{Convergence.} It is known that the subgradient method converges to an optimal objective value under certain conditions \citep{boyd2003}. We present Lemmas \ref{lem:sub_con}, \ref{lem:lip_cont}, and \ref{lem:coer} in Appendix~\ref{sec:lemmas}, proving the convexity, Lipschitz continuity, and coercivity of the objective of problem \eqref{eq:general_form_eq} in $\boldsymbol{w}$. Theorem~\ref{thm:sm_conv} then asserts that these properties satisfy the convergence criteria of the subgradient method given that the step-size diminishes appropriately, proving the convergence of the SM algorithm. We also derive the SM algorithm's worst-case time complexity in Theorem~\ref{thm:subgrad_conv}, showing that it converges in polynomial time with a sublinear number of communication rounds.

\begin{theorem}[SM Convergence]\label{thm:sm_conv}
    The SM Algorithm \ref{alg:subgrad} converges to an optimal solution $\boldsymbol{w}^{\ast}$ of problem \eqref{eq:general_form_eq} within an arbitrary tolerance $\epsilon_1 > 0$, provided the step-size $\gamma(t) \rightarrow 0$ as $t \rightarrow \infty$ and $\sum_{t=1}^{\infty} \gamma(t) = \infty$.
\end{theorem}
\begin{proof}
    Proof is provided in Appendix \ref{proof:sm_conv}.
\end{proof}

\begin{theorem}[SM Time Complexity]\label{thm:subgrad_conv}
Suppose the $\ell_{\infty}$-norm is used in problem \eqref{eq:subgrad_client_prob}, and that it is solved via the barrier method equipped with the log barrier function and Newton updates. Then, the SM algorithm \ref{alg:subgrad} with the diminishing step-size in Theorem \ref{thm:sm_conv} has an overall worst-case time complexity of $\mathcal{O}\left (\epsilon_1^{-2} \left [ N_{g^{\ast}}^{3.5}P^{3.5}\log(\epsilon_2^{-1}) + GP \right ] \right )$ (with $\mathcal{O}(\epsilon_1^{-2})$ communication rounds), where $\epsilon_1$, $\epsilon_2 > 0$ are tolerances 
    on the solutions of the subgradient method and problem \eqref{eq:subgrad_client_prob}, respectively, and $N_{g^{\ast}}$ is the greatest number of samples at any client. 
\end{theorem}

\begin{proof}
    Proof is provided in Appendix \ref{proof:subgrad_conv}.
\end{proof}

\subsection{ADMM-based Algorithm (ADMM)}\label{subsec:admm_alg}
The ADMM-based (\textbf{ADMM}) algorithm requires each client $g$ to solve their local problem and send their optimal local model $\boldsymbol{w}^{\ast}_g$ to the server. There, the local models are aggregated to obtain the optimal global model $\boldsymbol{w}^{\ast}$, which is broadcast to the clients. This repeats for $T$ rounds. To guarantee theoretical convergence, we also create a modified version of this algorithm with strongly convex client objectives, denoted as \textbf{ADMM-SC}. Further detail on this is given in the convergence discussion. Deriving this algorithm begins by introducing a decision variable $\boldsymbol{w}_g$ for each client $g$, and rewriting problem \eqref{eq:general_form_eq} to enforce client consensus as follows.
\begin{equation}\label{eq:admm_orig_obj}
\begin{aligned}
&\inf_{\boldsymbol{w}_g,\boldsymbol{w}} &&\sum_{g=1}^{G} \alpha_g \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w}_g;\boldsymbol{\xi})]\\
    &\subjectto && \boldsymbol{w}_g - \boldsymbol{w} = 0 \qquad \qquad \qquad \forall g \in [G].
\end{aligned}
\end{equation}
Next, we express the Augmented Lagrangian parameterized by scale parameter $\rho$ for the problem in \eqref{eq:admm_orig_obj} as follows:
\begin{multline*}
    \allowdisplaybreaks
    \mathcal{L}_{\rho}(\boldsymbol{w}_1, \dots, \boldsymbol{w}_G, \boldsymbol{w}, \boldsymbol{\mu}_1, \dots, \boldsymbol{\mu}_G) \\= \sum_{g=1}^G \alpha_g\mathcal{L}_{\rho_g}(\boldsymbol{w}_g, \boldsymbol{w}, \boldsymbol{\mu}_g),
\end{multline*}
where $\boldsymbol{\mu}_g$ are client $g$'s scaled Lagrange multipliers, and
\begin{multline*}
    \mathcal{L}_{\rho_g}(\boldsymbol{w}_g, \boldsymbol{w}, \boldsymbol{\mu}_g) = \\\sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w}_g;\boldsymbol{\xi})]
    + \frac{\rho}{2} ||\boldsymbol{w}_g - \boldsymbol{w} + \boldsymbol{\mu}_g||_2^2.
\end{multline*}
Given the Augmented Lagrangian, client $g$ and the server can obtain their model updates by minimizing it with respect to $\boldsymbol{w}_g$ and $\boldsymbol{w}$, respectively. Proposition \ref{prop:admm_client_up} presents a \textit{tractable, convex} problem that is solved by each client for local model updates. Proposition \ref{prop:admm_server_up} presents a closed-form expression for the server's update of the global model.

\begin{proposition}[ADMM Client Update]\label{prop:admm_client_up}
    Provided with the updated global model $\boldsymbol{w}^{\ast}$, client $g$ can obtain their updated local model $\boldsymbol{w}_g^{\ast}$ as the minimizer to the following problem
    \begin{align}
        &  \nonumber J_g (\boldsymbol{w},\boldsymbol{\mu}_g) =  \\
        &\label{eq:admm_2d}  \left \{ \begin{aligned}
            & \min_{\boldsymbol{w}_g,\lambda_g,s_{n_g}}&& \lambda_g \varepsilon_g + \frac{1}{N_g} \sum_{n_g=1}^{N_g}s_{n_g} \\
            &&& \qquad \qquad \qquad  +\frac{\rho}{2} || \boldsymbol{w}_g - \boldsymbol{w}^{\ast} + \boldsymbol{\mu}_g||_2^2\\
            & \subjectto && \ell_H(\boldsymbol{w}_g;(\widehat{\boldsymbol{x}}_{n_g},\widehat{y}_{n_g})) \leq s_{n_g} \ \forall n_g \in [N_g] \\
            &&& \begin{aligned} &\ell_H(\boldsymbol{w}_g;(\widehat{\boldsymbol{x}}_{n_g},-\widehat{y}_{n_g})) - \kappa \lambda_g \leq s_{n_g}\\
            & \qquad \qquad \qquad \qquad \qquad \quad \forall n_g \in [N_g] \end{aligned} \\
            &&& \lambda_g \geq ||\boldsymbol{w}_g||_{\ast}&&
        \end{aligned} \right.,
    \end{align}
where $||\cdot||_{\ast}$ is the dual to the norm used in the transportation cost function \eqref{eq:cost_func}.
\end{proposition}

\begin{proof}
    Proof is provided in Appendix \ref{proof:admm_client_up}.
\end{proof}

\begin{proposition}[ADMM Server Update]\label{prop:admm_server_up}
    Provided with the updated local models $\boldsymbol{w}_g^{\ast}$ and scaled Lagrange multipliers $\boldsymbol{\mu}_g^{\ast}$, the central server can obtain the updated global model $\boldsymbol{w}^{\ast}$ via the following
    \begin{equation*}
        \boldsymbol{w}^{\ast} = \sum_{g=1}^G \alpha_g (\boldsymbol{w}_g^{\ast} + \boldsymbol{\mu}_g^{\ast}).
    \end{equation*}
\end{proposition}

\begin{proof}
    Proof is provided in Appendix \ref{proof:admm_server_up}.
\end{proof}

The server then broadcasts $\boldsymbol{w}^{\ast}$ to the clients, where they update their Lagrange multipliers and the process repeats. We provide the pseudocode for the ADMM algorithm in Algorithm~\ref{alg:admm}.
\begin{algorithm}[ht]
    \caption{ADMM/ADMM-SC Algorithm}
    \smaller
    \label{alg:admm}
    \textbf{Input:} $\boldsymbol{w}^{(1)}$, $\boldsymbol{w}_g^{(1)}$, $\boldsymbol{\mu}_g^{(1)}$\\
    \textbf{Parameters:} Number of rounds $T$, scale parameter $\rho$\\
    \textbf{Output:} $\boldsymbol{w}^{\ast}$
    \begin{algorithmic}[1]
        \FOR{$t = 1, \dots, T$}
        \STATE \textbf{Client Update:}
        \FOR{clients $g = 1, \dots, G$}
        \STATE Solve for $\boldsymbol{w}_g^{(t+1)} \leftarrow \boldsymbol{w}_g^{\ast} \text{ minimizer of } J_g(\boldsymbol{w}^{(t)}, \boldsymbol{\mu}_g^{(t)})$ \eqref{eq:admm_2d} (or  $J_g^{\text{SC}}(\boldsymbol{w}^{(t)}, \boldsymbol{\mu}_g^{(t)})$ in Appendix \ref{sec:admm_sc} for ADMM-SC) 
        \STATE Send $\boldsymbol{w}_g^{(t+1)}$ to central server
        \ENDFOR
        \STATE \textbf{Server Update:}
        \STATE Update $\boldsymbol{w}^{(t+1)} \leftarrow \sum_{g=1}^G \alpha_g (\boldsymbol{w}_g^{(t+1)} + \boldsymbol{\mu}_g^{(t)})$
        \STATE Broadcast $\boldsymbol{w}^{(t+1)}$ to all clients
        \STATE \textbf{Client Update:}
        \FOR{clients $g = 1, \dots, G$}
        \STATE  $\boldsymbol{\mu}_g^{(t+1)} \leftarrow \boldsymbol{\mu}_g^{(t)} + \boldsymbol{w}_g^{(t+1)} - \boldsymbol{w}^{(t+1)}$
        \ENDFOR
        \ENDFOR
    \end{algorithmic}
\end{algorithm}

\textbf{Convergence.} Even if the objective function is closed and proper convex, as we show in Lemma~\ref{lem:clo_pro_con} in Appendix~\ref{sec:lemmas}, it has been established in the literature that the global convergence of the multi-block (i.e., $G \geq 3$) ADMM algorithm is generally not guaranteed. Indeed, a counterexample is presented by \cite{chen2016} demonstrating that the multi-block ADMM with a separable convex objective function can fail to converge. However, the convergence of multi-block ADMM in practical cases such as \citep{tao2011} has motivated works that investigate conditions under which its convergence is guaranteed \citep{deng2017,lin2015}. In Theorem~\ref{thm:admm_conv_cri}, we introduce an additional strongly convex term to be added to each client's objective, and we denote the ADMM algorithm with strongly convex client objectives as ADMM-SC. We then show that ADMM-SC indeed converges as it obeys the criteria given by \cite{lin2015}. Subsequently, we present the worst-case time complexity of ADMM-SC in Theorem~\ref{thm:admm_conv}, showing that it too converges in polynomial time, but requires fewer communication rounds than the SM algorithm.
\begin{theorem}[ADMM-SC Convergence]\label{thm:admm_conv_cri} Suppose the local client problem in \eqref{eq:admm_2d} is modified by adding a $\tau_g||\boldsymbol{w}_g||_2^2$ term to the objective function, where $\tau_g$ is a user-defined hyperparameter, resulting in the modified client problem with a strongly convex objective $J_g^{\text{SC}}(\boldsymbol{w}^{(t)}, \boldsymbol{\mu}_g^{(t)})$ in Appendix~\ref{sec:admm_sc}. Suppose further that the ADMM-SC algorithm in Algorithm~\ref{alg:admm} is used to train the FDR-SVM with the modified objective. Then, ADMM-SC converges to an optimal solution $\boldsymbol{w}^{\ast}$ of the modified problem with arbitrary tolerance $\epsilon_1 > 0$ if $\rho \leq \min_{g=1,\dots,G-1}\Bigg \{ \frac{4\alpha_g \tau_g}{g(2G+1-g)},\frac{4\alpha_G \tau_G}{(G-1)(G+2)}\Bigg \}$.
\end{theorem}

\begin{proof}
    Proof is provided in Appendix \ref{proof:admm_conv_cri}.
\end{proof}

\begin{remark}\label{rem:reg}
    The additional $\tau_g||\boldsymbol{w}_g||_2^2$ terms are redundant regularization terms, potentially impacting the performance of the final model as shown empirically in Section \ref{sec:num_exp}.
 \end{remark}

\begin{theorem}[ADMM-SC Time Complexity]\label{thm:admm_conv}
    Suppose that the $\ell_1$-norm is used in the strongly convex variant of the local model problem \eqref{eq:admm_2d}, and that it is solved via the barrier method with the log barrier function and Newton updates. Then, the ADMM-SC algorithm \ref{alg:admm} equipped with $\rho$ chosen according to Theorem~\ref{thm:admm_conv_cri} has an overall worst-case time complexity of $\mathcal{O}\left (\epsilon_1^{-1} \left [(N_{g^{\ast}}+P)^{3.5}\log(\epsilon_2^{-1}) + GP \right ]\right )$ (with $\mathcal{O}(\epsilon_1^{-1})$ communication rounds), where $\epsilon_1$, $\epsilon_2 > 0$ are tolerances on the solutions of ADMM-SC and the strongly convex variant of the local problem \eqref{eq:admm_2d}, respectively, and $N_{g^{\ast}}$ is the greatest number of samples at any client.
\end{theorem}

\begin{proof}
    Proof is provided in Appendix \ref{proof:admm_conv}.
\end{proof}