\subsection{Proofs of Theoretical Results} \label{sec:proofs}
\subsubsection{Proof of Proposition \ref{lemma:general_form}} \label{proof:general_form}
\begin{proof}
    \begin{subequations}
    \begin{align}
        \inf_{\boldsymbol{w}} \sup_{\mathbb{Q} \in \mathcal{A}_G} \mathbb{E}^{\mathbb{Q}}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})] \label{eq:gen_1a}&= \inf_{\boldsymbol{w}} \sup_{\left \{ \mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi) \right \}_{g=1}^{G}} \mathbb{E}^{\sum_{g=1}^{G}\alpha_g \mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})] \\
        &\label{eq:gen_1b} = \inf_{\boldsymbol{w}} \sup_{\left \{ \mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)\right \}_{g=1}^{G}} \sum_{g=1}^{G} \alpha_g\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})]\\
        &\label{eq:gen_1c} = \inf_{\boldsymbol{w}} \sum_{g=1}^{G} \alpha_g \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})],
    \end{align}
    \end{subequations}
    where \eqref{eq:gen_1a} follows from the definition of the global ambiguity set $\mathcal{A}_G$ in \eqref{eq:global_amb_set}, \eqref{eq:gen_1b} follows from the Law of Total Expectation, and \eqref{eq:gen_1c} follows by recognizing that the maximization problems are separable due to each decision variable only affecting its corresponding term.
\end{proof}

\subsubsection{Proof of Proposition \ref{prop:oos}} \label{proof:oos}
\begin{proof}
    Suppose the assumptions in the Proposition statement hold. Then, as demonstrated by \cite{kuhn2019wasserstein}, we have the following for each client $g$:
    \begin{equation*}
        \mathbb{P}^{N_g} \{ \mathbb{P}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi) \} \geq (1 - \eta_g).
    \end{equation*}
    Therefore, we can obtain the following.
    \begin{subequations}
    \begin{align}
        \mathbb{P}^N\{\mathbb{P} \in \mathcal{A}_G\} &\label{eq:oos1} \geq \prod_{g=1}^{G}\mathbb{P}^{N_g} \{ \mathbb{P}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi) \} \\
        &\label{eq:oos2} \geq \prod_{g=1}^{G} (1 - \eta_g),
    \end{align}
    \end{subequations}
    where \eqref{eq:oos1} follows by noting that the local data and Wasserstein balls at all $G$ clients are mutually independent, and that $\mathbb{P} = \sum_{g=1}^{G} \alpha_g \mathbb{P}_g$. Furthermore, note that \eqref{eq:oos1} is an inequality rather than an equality because, even when some $\mathbb{P}_g$ lies outside its local Wasserstein ball, there is no guarantee that $\mathbb{P}$ cannot be constructed as a mixture of distributions from the local Wasserstein balls $\{\mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)\}_{g=1}^G$. Therefore, in general we may have
    \begin{equation*}
        \mathbb{P}^N\left \{\mathbb{P} \in \mathcal{A}_G \text{ and } \exists g \in [G] : \mathbb{P}_g \notin \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi) \right \} > 0.
    \end{equation*}
\end{proof}

\subsubsection{Proof of Proposition \ref{prop:subgrad_comp}} \label{proof:subgrad_comp}
\begin{proof}
    Firstly let us note the following:
    \begin{subequations}
    \begin{align}
        \partial \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})] &\label{eq:subgrad_1a}\supseteq \partial \mathbb{E}^{\mathbb{Q}_g^{\ast}}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})]\\
        &\label{eq:subgrad_1b}= \mathbb{E}^{\mathbb{Q}_g^{\ast}}[\partial \ell_H(\boldsymbol{w};\boldsymbol{\xi})],
    \end{align}
    \end{subequations}
    where \eqref{eq:subgrad_1a} follows from Lemma 4.4.1 in \citep{convextextbook} by the fact that $\mathbb{Q}_g^{\ast}$ is a maximizer of the supremum on the left-hand side, and \eqref{eq:subgrad_1b} follows from the fact that $\ell_H(\boldsymbol{w};\boldsymbol{\xi})$ is convex and integrable, and $\mathbb{Q}_g^{\ast}$ is a discrete distribution, so that $\mathbb{E}^{\mathbb{Q}_g^{\ast}}[\cdot]$ is a weighted sum.
    
    Now, let us introduce the functions $h_1(\boldsymbol{w})$, and $h_2(\boldsymbol{w})$ to simplify notation as follows:
        \begin{subequations}
        \begin{align}
            \mathbb{E}^{\mathbb{Q}_g^{\ast}} [\ell_H(\boldsymbol{w};\boldsymbol{\xi})]
            &\label{eq:subgrad_2a}= \frac{1}{N_g} \sum_{n_g=1}^{N_g} {\beta_{n_g}^+}^{\ast}\ell_H(\boldsymbol{w};(\widehat{\boldsymbol{z}}_{n_g}^+,\widehat{y}_{n_g})) + {\beta_{n_g}^-}^{\ast}\ell_H(\boldsymbol{w};(\widehat{\boldsymbol{z}}_{n_g}^-,-\widehat{y}_{n_g}))\\
            &\label{eq:subgrad_2c} \coloneqq \frac{1}{N_g} \sum_{n_g=1}^{N_g} h_1(\boldsymbol{w}) + h_2(\boldsymbol{w}),
        \end{align}
        \end{subequations}
        where \eqref{eq:subgrad_2a} uses the definition of $\mathbb{Q}_g^{\ast}$ from Equation \eqref{eq:ext_dist}, and $\widehat{\boldsymbol{z}}_{n_g}^{\pm} = \widehat{\boldsymbol{x}}_{n_g}-{\boldsymbol{q}_{n_g}^{\pm}}^{\ast}/{\beta_{n_g}^{\pm}}^{\ast}$. Now, observe that we can write the subdifferentials of $h_1(\boldsymbol{w})$ and $h_2(\boldsymbol{w})$ with respect to $\boldsymbol{w}$ as follows:
        \begin{equation*}
            \partial h_1(\boldsymbol{w}) = \left \{ \begin{aligned}
                & \boldsymbol{0} && \text{if } 1 - \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^+ < 0\\
                & -{\beta_{n_g}^+}^{\ast} \widehat{y}_{n_g}\widehat{\boldsymbol{z}}_{n_g}^+&& \text{if } 1 - \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^+ > 0\\
                & \text{conv}\left ( \{ \boldsymbol{0}, -{\beta_{n_g}^+}^{\ast} \widehat{y}_{n_g}\widehat{\boldsymbol{z}}_{n_g}^+ \} \right ) && \text{if } 1 - \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^+ = 0
            \end{aligned} \right.
        \end{equation*}
    
        \begin{equation*}
            \partial h_2(\boldsymbol{w}) = \left \{ \begin{aligned}
                & \boldsymbol{0} && \text{if } 1 + \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^- < 0\\
                & {\beta_{n_g}^-}^{\ast} \widehat{y}_{n_g}\widehat{\boldsymbol{z}}_{n_g}^-&& \text{if } 1 + \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^- > 0\\
                & \text{conv}\left ( \{ \boldsymbol{0}, {\beta_{n_g}^-}^{\ast} \widehat{y}_{n_g}\widehat{\boldsymbol{z}}_{n_g}^- \} \right ) && \text{if } 1 + \widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}} \widehat{\boldsymbol{z}}_{n_g}^- = 0
            \end{aligned} \right.
        \end{equation*}
        Therefore, we can use the previous result to obtain the following:
        \begin{equation*}
            \mathbb{E}^{\mathbb{Q}_g^{\ast}}[\partial \ell_H(\boldsymbol{w};\boldsymbol{\xi})] = \frac{1}{N_g} \sum_{n_g=1}^{N_g} \partial h_1(\boldsymbol{w}) + \partial h_2(\boldsymbol{w}),
        \end{equation*}
        where we use the Minkowski sum in the above equation.
\end{proof}

\subsubsection{Proof of Theorem \ref{thm:sm_conv}} \label{proof:sm_conv}
\begin{proof}
    As shown by \cite{nesterov2013}, the subgradient method guarantees convergence assuming the following conditions are met.
    \begin{enumerate}
        \item The objective function is convex.
        \item The objective function is Lipschitz continuous.
        \item The step-size diminishes at an appropriate rate as stated in the theorem statement.
        \item The distance between any optimal solution $\boldsymbol{w}^{\ast}$ and any initial solution $\boldsymbol{w}^{(0)}$ is bounded from above. That is $||\boldsymbol{w}^{\ast} - \boldsymbol{w}^{(0)}|| \leq C$, where $C\in \mathbb{R}$ need not be known.
    \end{enumerate}
    Note that we verify the first two conditions in Lemmas \ref{lem:sub_con} and \ref{lem:lip_cont}, whereas the third condition can be ensured by selecting an appropriately diminishing step-size sequence, as exemplified in the theorem statement. In examining the fourth condition, we note that it is readily satisfied through the coercivity of the objective function, which we prove in Lemma \ref{lem:coer}. To see this, first note that $f(\boldsymbol{0}) = 1$, and by definition $\inf_{\boldsymbol{w}} f(\boldsymbol{w}) \leq f(\boldsymbol{0})$. Suppose we have a set $\mathcal{W} = \{\boldsymbol{w}:f(\boldsymbol{w}) \leq f(\boldsymbol{0}) \}$. We know that for $\boldsymbol{w}^{\ast}$ to be a minimizer of $f(\boldsymbol{w})$, it must be that $\boldsymbol{w}^{\ast} \in \mathcal{W}$. Suppose further that the set $\mathcal{W}$ contains a sequence $\boldsymbol{w}_i$ such that $||\boldsymbol{w}_i|| \rightarrow \infty$. This results in a contradiction, as
    \begin{equation*}
        ||\boldsymbol{w}_i|| \rightarrow \infty \Rightarrow f(\boldsymbol{w}_i) \rightarrow +\infty \Rightarrow \boldsymbol{w}_i \notin \mathcal{W},
    \end{equation*}
    which follows from the coercivity of $f(\boldsymbol{w})$. Thus, there must exist some constant $R \in \mathbb{R}$ such that 
    \begin{equation*}
        \boldsymbol{w} \in \mathcal{W} \Rightarrow ||\boldsymbol{w}|| \leq R.
    \end{equation*}
    Finally, suppose we choose any finite initializer $\boldsymbol{w}^{(0)}$ for the SM algorithm. Then, by the triangle inequality we have
    \begin{equation*}
        ||\boldsymbol{w}^{\ast} - \boldsymbol{w}^{(0)}|| \leq R + ||\boldsymbol{w}^{(0)}||,
    \end{equation*}
    proving that the distance between any initializer $\boldsymbol{w}^{(0)}$ and any optimizer $\boldsymbol{w}^{\ast}$ is indeed bounded from above.
\end{proof}

\subsubsection{Proof of Theorem \ref{thm:subgrad_conv}} \label{proof:subgrad_conv}
\begin{proof}
    We first examine the time complexity of problem \eqref{eq:subgrad_client_prob} that each client $g$ solves at each iteration $t$. When the $\ell_{\infty}$-norm is used in \eqref{eq:subgrad_client_prob}, the problem becomes a Linear Program (LP) with $4N_gP + 2N_g$ decision variables (including slack variables) and $4N_gP + 7N_g$ constraints, where $N_g$ is the number of training samples at the $g^{th}$ client. Solving the problem via the barrier method with the log barrier function and Newton updates requires $\mathcal{O}(\sqrt{C}\log(\epsilon_2^{-1}))$ iterations to reach an $\epsilon_2$-solution \citep{nn1994}, where $C$ is the number of constraints. Moreover, each iteration has an arithmetic complexity of $\mathcal{O}(CD^2)$, where $D$ is the number of decision variables. Therefore, the theoretical worst-case time complexity of solving the problem in \eqref{eq:subgrad_client_prob} is:
    \begin{equation*}
    \mathcal{O}([4N_gP+7N_g]^{1.5}[4N_gP+2N_g]^{2}\log(\epsilon_2^{-1})).
    \end{equation*}
    By eliminating scalar multipliers and constants, we arrive at the following  simplified expression, 
    \begin{equation*}
    \mathcal{O}([N_gP]^{3.5}\log(\epsilon_2^{-1})).
    \end{equation*}
    
    All clients can solve their local problems in parallel and have the same number of features. Thus, the client with the largest number of samples $N_{g^{\ast}}$ will have the highest time complexity. Furthermore, the central server performs a summation of $G+1$ vectors of dimension $P$ during each iteration $t$, the time complexity of which is $\mathcal{O}(GP)$. We obtain the final result by noting that the subgradient method converges to a solution with tolerance $\epsilon_1$ in $\mathcal{O}(\epsilon_1^{-2})$ iterations \citep{bubeck2015}. Note that we do not explicitly consider the time complexity of computing the local subgradient at each client since it is lower than that of solving the problem in \eqref{eq:subgrad_client_prob}.
\end{proof}

\subsubsection{Proof of Proposition \ref{prop:admm_client_up}} \label{proof:admm_client_up}
In order to obtain updated local model $\boldsymbol{w}_g^{\ast}$, each client $g$ must minimize the global Lagrangian with respect to $\boldsymbol{w}_g$. Thus, the updated local model $\boldsymbol{w}_g^{\ast}$ can be obtained as the minimizer to the following problem.
\begin{subequations}
\allowdisplaybreaks
\begin{align}
    \nonumber J_g (\boldsymbol{w},\boldsymbol{\mu}_g) &=  \inf_{\boldsymbol{w}_g}  \mathcal{L}_{\rho}(\boldsymbol{w}_1, \dots, \boldsymbol{w}_G, \boldsymbol{w}, \boldsymbol{\mu}_1, \dots, \boldsymbol{\mu}_G)\\
    &\label{eq:admm_2a} =\inf_{\boldsymbol{w}_g} \mathcal{L}_{\rho_g}(\boldsymbol{w}_g, \boldsymbol{w}, \boldsymbol{\mu}_g)\\
    &\label{eq:admm_2b} = \inf_{\boldsymbol{w}_g}\sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w}_g;\boldsymbol{\xi})]+ \frac{\rho}{2} ||\boldsymbol{w}_g - \boldsymbol{w} + \boldsymbol{\mu}_g||_2^2\\
    &\label{eq:admm_2c} \begin{aligned}&= \inf_{\boldsymbol{w}_g,\lambda_g \geq 0} \lambda_g \varepsilon_g + \frac{1}{N_g} \sum_{n_g=1}^{N_g} \sup_{\boldsymbol{\xi} \in \Xi} \left \{\ell_H(\boldsymbol{w}_g;\boldsymbol{\xi}) - \lambda_g d(\boldsymbol{\xi},\widehat{\boldsymbol{\xi}}_{n_g}) \right \} + \frac{\rho}{2} ||\boldsymbol{w}_g - \boldsymbol{w} + \boldsymbol{\mu}_g||_2^2 \end{aligned}\\
    &\label{eq:admm_2d_} = \left \{ \begin{aligned}
        & \min_{\boldsymbol{w}_g,\lambda_g,s_{n_g}}&& \lambda_g \varepsilon_g + \frac{1}{N_g} \sum_{n_g=1}^{N_g}s_{n_g} +\frac{\rho}{2} || \boldsymbol{w}_g - \boldsymbol{w} + \boldsymbol{\mu}_g||_2^2&&\\
        & \subjectto && \ell_H(\boldsymbol{w}_g;(\widehat{\boldsymbol{x}}_{n_g},\widehat{y}_{n_g})) \leq s_{n_g}&& \forall n_g \in [N_g] \\
        &&&\ell_H(\boldsymbol{w}_g;(\widehat{\boldsymbol{x}}_{n_g},-\widehat{y}_{n_g})) - \kappa \lambda_g \leq s_{n_g} &&\forall n_g \in [N_g] \\
        &&& \lambda_g \geq ||\boldsymbol{w}_g||_{\ast}&&
    \end{aligned} \right.,
\end{align}
\end{subequations}
where \eqref{eq:admm_2a} follows from the separability of the Augmented Lagrangian, \eqref{eq:admm_2b} follows by definition of the local Lagrangian, \eqref{eq:admm_2c} exploits the notable duality result presented by \cite{datadrivenDRO,kuhn2019wasserstein} to rewrite the inner maximization problem as a minimization problem, and
\eqref{eq:admm_2d_} follows by introducing slack variables $s_{n_g}$, recalling that $\ell_H(\boldsymbol{w};\boldsymbol{\xi})$ is convex and Lipschitz continuous, and utilizing similar arguments to the ones presented in the proof of Theorem 1 in \citep{shafieezadehabadeh2015distributionally}.

\subsubsection{Proof of Proposition \ref{prop:admm_server_up}} \label{proof:admm_server_up}
\begin{proof}
The central server can obtain updated global parameters $\boldsymbol{w}^{\ast}$ by minimizing the global Lagrangian with respect to $\boldsymbol{w}$. This can be done as follows.
\begin{subequations}
\begin{align}
    \boldsymbol{w}^{\ast} &= \argmin_{\boldsymbol{w}} \mathcal{L}_{\rho}(\boldsymbol{w}_1, \dots, \boldsymbol{w}_G, \boldsymbol{w}, \boldsymbol{\mu}_1, \dots, \boldsymbol{\mu}_G)\\
    &\label{eq:admm_3a}= \argmin_{\boldsymbol{w}} \sum_{g=1}^G \alpha_g\mathcal{L}_{\rho_g}(\boldsymbol{w}_g, \boldsymbol{w}, \boldsymbol{\mu}_g) \\
    &\label{eq:admm_3b} = \argmin_{\boldsymbol{w}} \sum_{g=1}^G \alpha_g \frac{\rho}{2}||\boldsymbol{w}_g - \boldsymbol{w} + \boldsymbol{\mu}_g||_2^2,
\end{align}
\end{subequations}
where \eqref{eq:admm_3b} follows by observing that the norm term is the only term involving the variable $\boldsymbol{w}$. Let us define $f(\boldsymbol{w}) = \sum_{g=1}^G \alpha_g \frac{\rho}{2} ||\boldsymbol{w} - (\boldsymbol{w}_g + \boldsymbol{\mu}_g)||_2^2$. We note that $f(\boldsymbol{w})$ is strongly convex as it is a sum of strongly convex terms. Thus, it has a unique minimizer. We obtain this minimizer by setting the gradient of $f$ with respect to $\boldsymbol{w}$ to $\boldsymbol{0}$ as follows.
\begin{subequations}
\begin{align}
    \frac{\partial f}{\partial \boldsymbol{w}} &= \sum_{g=1}^G \alpha_g\frac{\rho}{2} \left [2 \boldsymbol{w} - 2(\boldsymbol{w}_g + \boldsymbol{\mu}_g) \right ]\\
    & = 0
\end{align}
\end{subequations}
Finally, we derive a closed form solution for $\boldsymbol{w}^{\ast}$ as follows:
\begin{subequations}
\begin{align}
    &\sum_{g=1}^G \alpha_g\frac{\rho}{2} \left [2 \boldsymbol{w} - 2(\boldsymbol{w}_g + \boldsymbol{\mu}_g) \right ] = 0\\
     \Leftrightarrow &\sum_{g=1}^G \alpha_g \boldsymbol{w} = \sum_{g=1}^G \alpha_g (\boldsymbol{w}_g + \boldsymbol{\mu}_g)\\
     \Leftrightarrow&\label{eq:admm_4a} \boldsymbol{w} = \sum_{g=1}^G \alpha_g (\boldsymbol{w}_g + \boldsymbol{\mu}_g),
\end{align}
\end{subequations}
where \eqref{eq:admm_4a} follows by recalling that $\sum_{g=1}^G \alpha_g = 1$.
\end{proof}

\subsubsection{Proof of Theorem \ref{thm:admm_conv_cri}} \label{proof:admm_conv_cri}
\begin{proof}
    As mentioned previously, even when the client objective functions are closed proper convex functions as we demonstrate in Lemma~\ref{lem:clo_pro_con}, and strong duality holds as shown by \cite{kuhn2019wasserstein}, multi-block ADMM is not theoretically guaranteed to converge \citep{chen2016}. However, \cite{lin2015} establish the convergence of multi-block ADMM in the setting where the objective functions of $(B-1)$ of the $B$ blocks are strongly convex with strong convexity parameter $\sigma_b$ for each block $b$. They formulate the problem to be solved via ADMM as follows:
    \begin{equation} \label{eq:admm_paper_form}
        \begin{aligned}
            &\min && f_1(\boldsymbol{v}_1) + f_2(\boldsymbol{v}_2) + \dots + f_B(\boldsymbol{v}_B)\\
            & \subjectto && \boldsymbol{A}_1\boldsymbol{v}_1 + \boldsymbol{A}_2\boldsymbol{v}_2 + \dots + \boldsymbol{A}_B\boldsymbol{v}_B = c\\
            &&& \boldsymbol{v}_b \in \mathcal{V}_b \quad \forall b \in [B],
        \end{aligned}
    \end{equation}
    where $f_b(\boldsymbol{v}_b)$ is the objective function term and $\boldsymbol{v}_b$ is the decision variable associated with the $b^{th}$ block. 
    
    Note that if we were to rewrite our problem from \eqref{eq:admm_orig_obj} in a similar form, there would be no distinction between the clients and the central server, and the objective function term associated with the central server would remain $0$. Thus, we add a strongly convex term $\tau_g||\boldsymbol{w}_g||_2^2$ to the objective function term associated with each of the clients to meet the requirement that $B-1$ of the blocks must have a strongly convex objective function. During the server aggregation step, each $\tau_g||\boldsymbol{w}_g||_2^2$ term will be multiplied by its respective weight $\alpha_g$. Therefore, the strong convexity parameter associated with client $g$ would be $2\alpha_g\tau_g$.

   To rewrite problem \eqref{eq:admm_orig_obj} in the form of problem \eqref{eq:admm_paper_form}, the $\boldsymbol{A}$ matrix associated with client $g$ would be a block matrix of $P\times P$ matrices stacked vertically in $G$ blocks. The $g^{th}$ block from the top would be the identity matrix, whereas all the other blocks would be zero. Similarly, the matrix associated with the central server would be a block matrix of similar structure but where all the blocks are the negative of the identity matrix. Incorporating this insight into the condition on $\rho$ described in Theorem 3.3 in \citep{lin2015} allows us to obtain the final result.
\end{proof}

\subsubsection{Proof of Theorem \ref{thm:admm_conv}} \label{proof:admm_conv}
\begin{proof}
This proof follows a very similar strategy to that of Theorem \ref{thm:subgrad_conv}. We begin by noting that the strongly convex variant of the local model problem in \eqref{eq:admm_2d} equipped with the $\ell_1$-norm can be written as a quadratically constrained quadratic program (QCQP) with $N_g+2P+3$ decision variables (including slack variables) and $2N_g+2P+3$ constraints. When solved via the barrier method equipped with the log barrier function and Newton updates \citep{nn1994}, this problem would have the following worst-case time complexity
    \begin{equation*}
        \mathcal{O}([N_g+P]^{3.5}\log(\epsilon_2^{-1})).
    \end{equation*}
    Similar to the previous algorithm, all clients can solve their local problems in parallel and will have the same number of features. Thus the client with the greatest number of samples $N_{g^{\ast}}$ will have the problem with the greatest time complexity. Furthermore, we note that the central server aggregates $2G$ vectors of dimension $P$ in each iteration, the time complexity of which is $\mathcal{O}(GP)$. Therefore, we obtain the final result by noting that ADMM converges to an $\epsilon_1$-solution in $\mathcal{O}(\epsilon_1^{-1})$ iterations assuming the strong convexity of the objective function and that the upper bound on $\rho$ is satisfied \citep{lin2015}. While each client $g$ also performs the update of the local scaled Lagrange multipliers $\boldsymbol{\mu}_g$ during each iteration, this process has a much lower complexity than solving the local problem and is, therefore, not explicitly considered in this analysis.
\end{proof}
