
\section{Proofs and Supplementary Theoretical Results}
\subsection{Preliminary Lemmas} \label{sec:lemmas}
\begin{lemma} \label{lem:diff_max}
    Any two real scalars $a,a^{\prime} \in \mathbb{R}$ obey the following
    \begin{equation*}
        |\max\{0,a\} - \max \{0,a^{\prime} \}| \leq |a - a^{\prime}|.
    \end{equation*}
\end{lemma}
\begin{proof}
    To see this, consider the following cases:
    \begin{enumerate}
        \item $a,a^{\prime} \geq 0$. In this case one can directly see that
        \begin{equation*}
            |\max\{0,a\} - \max \{0,a^{\prime} \}| = |a - a^{\prime}|.
        \end{equation*}

        \item $a \geq 0, \ a^{\prime} < 0$. In this case, we have the following:
        \begin{equation*}
            |\max\{0,a\} - \max \{0,a^{\prime} \}|  = a  < |a| + |a^{\prime}| = |a - a^{\prime}|.
        \end{equation*}

        \item $a < 0, \ a^{\prime} \geq 0$. This is symmetric to the previous case.

        \item $a < 0, \ a^{\prime} < 0$. In this case we have the following:
        \begin{equation*}
            |\max\{0,a\} - \max \{0,a^{\prime} \}|  = 0 \leq | a - a^{\prime}|.
        \end{equation*}
    \end{enumerate}
\end{proof}
\begin{lemma}[SM Objective Function Convexity]\label{lem:sub_con}
    Suppose Assumption \ref{assump:feat_supp} holds and let $f(\boldsymbol{w}) = \sum_{g=1}^{G} \alpha_g \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})]$. Then, $f(\boldsymbol{w})$ is convex in $\boldsymbol{w}$.
\end{lemma}
\begin{proof}
    Since $\ell_H(\boldsymbol{w};\boldsymbol{\xi})$ is a pointwise maximum of affine functions of $\boldsymbol{w}$, it is convex in $\boldsymbol{w}$. Moreover, nonnegative weighted sums, pointwise suprema, and expectations are all operations that preserve convexity \citep{boyd2004convex}. Thus, $f(\boldsymbol{w})$ is convex in $\boldsymbol{w}$.
\end{proof}

\begin{lemma}[SM Objective Function Lipschitz Continuity] \label{lem:lip_cont}
    Suppose Assumption \ref{assump:feat_supp} holds and let $f(\boldsymbol{w}) = \sum_{g=1}^{G} \alpha_g \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})]$. Then, $f(\boldsymbol{w})$ is Lipschitz continuous in $\boldsymbol{w}$.
\end{lemma}
\begin{proof}
    As discussed by \citet{2019regularization}, if Assumption~\ref{assump:feat_supp} holds, then one can obtain the discrete distribution described in \eqref{eq:ext_dist} that attains the worst-case risk. Therefore, we have the following:
    \begin{align*}
    \allowdisplaybreaks
        f(\boldsymbol{w}) &=  \sum_{g=1}^{G} \alpha_g f_g(\boldsymbol{w}) \\
        & \coloneqq \sum_{g=1}^{G} \alpha_g \Bigg (\frac{1}{N_g} \sum_{n_g=1}^{N_g} \bigg [ {\beta_{n_g}^+}^{\ast} \ell_H(\boldsymbol{w};(\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^+)) + {\beta_{n_g}^-}^{\ast} \ell_H(\boldsymbol{w};(-\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^-)) \bigg ] \Bigg ).
    \end{align*}
    Now, suppose we have $\boldsymbol{w}$ and $\boldsymbol{w}^{\prime}$ which correspond to worst-case distributions characterized by $({\beta_{n_g}^{\pm}}^{\ast},\widehat{\boldsymbol{z}}_{n_g}^{\pm})$ and $({\beta_{n_g}^{\pm \prime}}^{\ast},\widehat{\boldsymbol{z}}_{n_g}^{\pm \prime})$, respectively. Then we can write the following:
    \begin{subequations}
    \allowdisplaybreaks
    \begin{align}
    \allowdisplaybreaks
        &|f_g(\boldsymbol{w}) - f_g(\boldsymbol{w}^{\prime})| \nonumber \\
        & \label{eq:lip_1a}=\frac{1}{N_g} \Bigg | \sum_{n_g=1}^{N_g} \bigg [{\beta_{n_g}^{+}}^{\ast} \ell_H(\boldsymbol{w};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+})) + {\beta_{n_g}^{-}}^{\ast} \ell_H(\boldsymbol{w};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{-})) \bigg ] \nonumber \\
        & \qquad \qquad \qquad - \sum_{n_g=1}^{N_g} \bigg [{\beta_{n_g}^{+ \prime}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+ \prime})) + {\beta_{n_g}^{- \prime}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{- \prime})) \bigg] \Bigg |\\
        &\label{eq:lip_1c} \leq \frac{1}{N_g} \Bigg ( \Bigg | \sum_{n_g=1}^{N_g} \bigg [{\beta_{n_g}^{+}}^{\ast} \ell_H(\boldsymbol{w};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+})) - {\beta_{n_g}^{+ \prime}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+ \prime})) \bigg ] \Bigg| \nonumber \\
        & \qquad \qquad \qquad + \Bigg | \sum_{n_g=1}^{N_g} \bigg [{\beta_{n_g}^{-}}^{\ast} \ell_H(\boldsymbol{w};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{-})) - {\beta_{n_g}^{- \prime}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{- \prime})) \bigg] \Bigg | \Bigg )\\
        &\label{eq:lip_1d} \leq \frac{1}{N_g} \sum_{n_g=1}^{N_g} \Bigg [\bigg | {\beta_{n_g}^{+}}^{\ast} \ell_H(\boldsymbol{w};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+})) - {\beta_{n_g}^{+ \prime}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+ \prime}))  \bigg|  \nonumber \\
        & \qquad \qquad \qquad + \bigg|{\beta_{n_g}^{-}}^{\ast} \ell_H(\boldsymbol{w};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{-})) - {\beta_{n_g}^{- \prime}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{- \prime})) \bigg | \Bigg ]\\
        &\label{eq:lip_1e} \leq \frac{1}{N_g} \sum_{n_g=1}^{N_g} \Bigg [\bigg | {\beta_{n_g}^{+}}^{\ast} \ell_H(\boldsymbol{w};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+})) - {\beta_{n_g}^{+}}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( \widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{+}))  \bigg| \nonumber \\
        & \qquad \qquad \qquad + \bigg|{\beta_{n_g}^{-}}^{\ast} \ell_H(\boldsymbol{w};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{-})) - {\beta_{n_g}^{- }}^{\ast} \ell_H(\boldsymbol{w}^{\prime};( -\widehat{y}_{n_g},\widehat{\boldsymbol{z}}_{n_g}^{-})) \bigg | \Bigg ]\\
        &\label{eq:lip_1f} = \frac{1}{N_g} \sum_{n_g=1}^{N_g} \Bigg [\bigg | {\beta_{n_g}^{+}}^{\ast} \max\{0,1-\widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}}\widehat{\boldsymbol{z}}_{n_g}^{+} \} - {\beta_{n_g}^{+}}^{\ast} \max\{0,1-\widehat{y}_{n_g} \cdot \boldsymbol{w}^{\prime{\mathsf{T}}}\widehat{\boldsymbol{z}}_{n_g}^{+} \} \bigg| \nonumber \\
        & \qquad \qquad \qquad +  \bigg|{\beta_{n_g}^{-}}^{\ast} \max\{0,1+\widehat{y}_{n_g} \cdot \boldsymbol{w}^{\mathsf{T}}\widehat{\boldsymbol{z}}_{n_g}^{-} \} - {\beta_{n_g}^{- }}^{\ast} \max\{0,1+\widehat{y}_{n_g} \cdot \boldsymbol{w}^{\prime{\mathsf{T}}}\widehat{\boldsymbol{z}}_{n_g}^{-} \} \bigg | \Bigg ]\\
        &\label{eq:lip_1g} \leq \frac{1}{N_g} \sum_{n_g=1}^{N_g} \Bigg [\bigg | (\boldsymbol{w}^{\prime} - \boldsymbol{w})^{\mathsf{T}}({\beta_{n_g}^{+}}^{\ast}\cdot \widehat{y}_{n_g} \cdot \widehat{\boldsymbol{z}}_{n_g}^{+}) \bigg | + \bigg | (\boldsymbol{w} - \boldsymbol{w}^{\prime})^{\mathsf{T}}({\beta_{n_g}^{-}}^{\ast}\cdot \widehat{y}_{n_g} \cdot \widehat{\boldsymbol{z}}_{n_g}^{-})\bigg | \Bigg ]\\
        &\label{eq:lip_1h} \leq \frac{1}{N_g} \sum_{n_g=1}^{N_g} ||\boldsymbol{w} - \boldsymbol{w}^{\prime}|| \Bigg (\left | \left | {\beta_{n_g}^{+}}^{\ast}\cdot \widehat{y}_{n_g} \cdot \widehat{\boldsymbol{z}}_{n_g}^{+} \right | \right |_{\ast} + \left | \left | {\beta_{n_g}^{-}}^{\ast}\cdot \widehat{y}_{n_g} \cdot \widehat{\boldsymbol{z}}_{n_g}^{-} \right | \right |_{\ast} \Bigg )\\
         &\label{eq:lip_1i} = ||\boldsymbol{w} - \boldsymbol{w}^{\prime}|| \Bigg [ \frac{1}{N_g} \sum_{n_g=1}^{N_g} \bigg ( \left | \left | {\beta_{n_g}^{+}}^{\ast}\cdot \widehat{y}_{n_g} \cdot \widehat{\boldsymbol{z}}_{n_g}^{+} \right | \right |_{\ast} + \left | \left | {\beta_{n_g}^{-}}^{\ast}\cdot \widehat{y}_{n_g} \cdot \widehat{\boldsymbol{z}}_{n_g}^{-} \right | \right |_{\ast} \bigg ) \Bigg],
    \end{align}
    \end{subequations}
    where \eqref{eq:lip_1c} and \eqref{eq:lip_1d} follow from the triangle inequality, and \eqref{eq:lip_1e} follows by noting that the distribution characterized by $({\beta_{n_g}^{\pm \prime}}^{\ast},\widehat{\boldsymbol{z}}_{n_g}^{\pm \prime})$ maximizes the expected risk with respect to $\boldsymbol{w}^{\prime}$, thus the distribution characterized by $({\beta_{n_g}^{\pm }}^{\ast},\widehat{\boldsymbol{z}}_{n_g}^{\pm})$ will at most attain the same risk with respect to $\boldsymbol{w}^{\prime}$. Additionally, \eqref{eq:lip_1f} follows from the definition of the hinge loss function, \eqref{eq:lip_1g} follows from Lemma~\ref{lem:diff_max}, and \eqref{eq:lip_1h} follows from H\"older's inequality (the generalized Cauchy--Schwarz inequality), where $||\cdot||_{\ast}$ is the dual norm of $||\cdot||$ used to measure distances in the space of $\boldsymbol{w}$. Given the previous, we can obtain the final result as follows:
    \begin{subequations}
    \begin{align}
        |f(\boldsymbol{w}) - f(\boldsymbol{w}^{\prime})| & = \left |\sum_{g=1}^{G} \alpha_g f_g(\boldsymbol{w}) - \sum_{g=1}^{G} \alpha_g f_g(\boldsymbol{w}^{\prime}) \right |\\
        &\label{eq:lip_2b} \leq \sum_{g=1}^{G} \alpha_g |f_g(\boldsymbol{w}) - f_g(\boldsymbol{w}^{\prime})| \\ 
        &\label{eq:lip_2c} \leq || \boldsymbol{w} - \boldsymbol{w}^{\prime}|| \sum_{g=1}^G \alpha_g \Lip(f_g(\boldsymbol{w})),
    \end{align}
    \end{subequations}
    where \eqref{eq:lip_2b} follows from the triangle inequality and $\Lip(f_g(\boldsymbol{w}))$ is taken from \eqref{eq:lip_1i}.
\end{proof}

\begin{lemma}[SM Objective Function Coercivity] \label{lem:coer}
    Suppose Assumption \ref{assump:feat_supp} holds and let $f(\boldsymbol{w}) = \sum_{g=1}^{G} \alpha_g \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})]$. Then, $f(\boldsymbol{w})$ is coercive in $\boldsymbol{w}$.
\end{lemma}
\begin{proof}
    We begin our proof by studying each of the individual terms $f_g(\boldsymbol{w})$ as follows:
    \begin{subequations}
    \allowdisplaybreaks
    \begin{align}
        \allowdisplaybreaks
        \nonumber f_g(\boldsymbol{w}) &\coloneqq \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w};\boldsymbol{\xi})]\\
        &\label{eq:coer1} = \inf_{\lambda_g \geq 0} \lambda_g \varepsilon_g +  \frac{1}{N_g} \sum_{n_g=1}^{N_g} \sup_{\boldsymbol{\xi} \in \Xi} \left \{\ell_H(\boldsymbol{w};\boldsymbol{\xi}) - \lambda_g d(\boldsymbol{\xi},\widehat{\boldsymbol{\xi}}_{n_g}) \right \}\\
        &\label{eq:coer2} = \left \{ \begin{aligned}
            &\inf_{\lambda_g \geq 0, s_{n_g}} && \lambda_g \varepsilon_g +  \frac{1}{N_g} \sum_{n_g=1}^{N_g} s_{n_g} &&\\
            & \subjectto && \sup_{\boldsymbol{\xi} \in \Xi} \left \{\ell_H(\boldsymbol{w};\boldsymbol{\xi}) - \lambda_g d(\boldsymbol{\xi},\widehat{\boldsymbol{\xi}}_{n_g}) \right \} \leq s_{n_g} && \forall n_g \in [N_g]\\
        \end{aligned} \right.\\
        &\label{eq:coer3} = \left \{ \begin{aligned}
            &\inf_{\lambda_g \geq 0, s_{n_g}} && \lambda_g \varepsilon_g +  \frac{1}{N_g} \sum_{n_g=1}^{N_g} s_{n_g} &&\\
            & \subjectto && \sup_{\boldsymbol{x} \in \mathcal{X}} \left \{\ell_H(\boldsymbol{w};(\boldsymbol{x},\widehat{y}_{n_g})) - \lambda_g ||\boldsymbol{x} - \widehat{\boldsymbol{x}}_{n_g}|| \right \} \leq s_{n_g} && \forall n_g \in [N_g]\\
            &&& \sup_{\boldsymbol{x} \in \mathcal{X}} \left \{\ell_H(\boldsymbol{w};(\boldsymbol{x},-\widehat{y}_{n_g})) - \lambda_g ||\boldsymbol{x} - \widehat{\boldsymbol{x}}_{n_g}|| \right \} - \kappa \lambda_g \leq s_{n_g} && \forall n_g \in [N_g]\\
        \end{aligned} \right.\\
        &\label{eq:coer4} = \left \{ \begin{aligned}
            &\inf_{\lambda_g, s_{n_g}} && \lambda_g \varepsilon_g +  \frac{1}{N_g} \sum_{n_g=1}^{N_g} s_{n_g} &&\\
            & \subjectto && \ell_H(\boldsymbol{w};(\widehat{\boldsymbol{x}}_{n_g},\widehat{y}_{n_g})) \leq s_{n_g} && \forall n_g \in [N_g]\\
            &&& \ell_H(\boldsymbol{w};(\widehat{\boldsymbol{x}}_{n_g},-\widehat{y}_{n_g})) - \kappa \lambda_g \leq s_{n_g} && \forall n_g \in [N_g]\\
            &&& \lambda_g \geq ||\boldsymbol{w}||_{\ast}
        \end{aligned} \right.
    \end{align}
    \end{subequations}
    where \eqref{eq:coer1} follows from the strong duality result presented by \citet{2019regularization,kuhn2019wasserstein}, \eqref{eq:coer2} is obtained by introducing slack variables and moving the maximization problems to the constraints, \eqref{eq:coer3} is obtained through the definition of the separable transportation cost function \eqref{eq:cost_func} and by noting that $y \in \{ -1, +1\}$, and finally \eqref{eq:coer4} is obtained by recalling that the hinge loss function $\ell_H(\boldsymbol{w};\boldsymbol{\xi})$ is convex and Lipschitz continuous in $\boldsymbol{x}$, so that it follows from Lemma~A.3 of \citet{2019regularization} that
    \begin{equation*}
        \sup_{\boldsymbol{x} \in \mathcal{X}} \left \{\ell_H(\boldsymbol{w};(\boldsymbol{x},y)) - \lambda_g ||\boldsymbol{x} - \widehat{\boldsymbol{x}}|| \right \} = \left \{ \begin{aligned}
            &\ell_H(\boldsymbol{w};(\widehat{\boldsymbol{x}},y)) && \text{if } ||\boldsymbol{w}||_{\ast} \leq \lambda_g\\
            & + \infty && \text{otherwise},
        \end{aligned} \right.
    \end{equation*}
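    where $||\cdot||_{\ast}$ is the dual to the norm utilized in the definition of the transportation cost function \eqref{eq:cost_func}. Therefore, as $||\boldsymbol{w}||_{\ast} \rightarrow \infty$, the constraint $\lambda_g \geq ||\boldsymbol{w}||_{\ast}$ in \eqref{eq:coer4} forces $\lambda_g \rightarrow \infty$. Moreover, the first constraint in \eqref{eq:coer4} together with the nonnegativity of the hinge loss gives $s_{n_g} \geq 0$, so the objective function of \eqref{eq:coer4} is bounded below by $\lambda_g \varepsilon_g$ and hence $f_g(\boldsymbol{w}) \geq \varepsilon_g ||\boldsymbol{w}||_{\ast}$. Consequently, provided $\varepsilon_g > 0$, $f_g(\boldsymbol{w}) \rightarrow +\infty$ as $||\boldsymbol{w}||_{\ast} \rightarrow \infty$. This implies that $f(\boldsymbol{w}) = \sum_{g=1}^{G} \alpha_g f_g(\boldsymbol{w})$ is a coercive function of $\boldsymbol{w}$, since $f(\boldsymbol{w}) \rightarrow +\infty$ as $||\boldsymbol{w}||_{\ast} \rightarrow \infty$.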
\end{proof}

\begin{lemma}[ADMM Objective Properties] \label{lem:clo_pro_con}
     Let $f(\boldsymbol{w}_g) = \sup_{\mathbb{Q}_g \in \mathcal{A}^{(g)}_{\varepsilon_g,1,d}(\Xi)}\mathbb{E}^{\mathbb{Q}_g}[\ell_H(\boldsymbol{w}_g;\boldsymbol{\xi})]$, then $f(\boldsymbol{w}_g)$ is a closed proper convex function in $\boldsymbol{w}_g$.
\end{lemma}

\begin{proof}
    Recall that $\ell_H(\boldsymbol{w}_g,\boldsymbol{\xi})$ is convex in $\boldsymbol{w}_g$, and taking the supremum and expectation are operations that preserve convexity \citep{boyd2004convex}, thus $f(\boldsymbol{w}_g)$ is convex in $\boldsymbol{w}_g$. Now, note that
    \begin{equation*}\ell_H(\boldsymbol{w}_g,\boldsymbol{\xi}) \geq 0  \Rightarrow f(\boldsymbol{w}_g) \geq 0  \quad \forall \boldsymbol{w}_g \in \mathbb{R}^P.
    \end{equation*}
    Now, observe that $f(\boldsymbol{0}) = 1$ since $\ell_H(\boldsymbol{0},\boldsymbol{\xi}) = 1 \ \forall \boldsymbol{\xi} \in \Xi$. Since $f(\boldsymbol{w}_g)>-\infty$ and it has a nonempty effective domain, then it is proper convex \citep{cd2006}. Finally, since $f(\boldsymbol{w}_g):\mathbb{R}^P \rightarrow (-\infty,\infty]$ is proper convex, then it is continuous by Proposition 1.3.11 in \citep{bertsekas2009convex}. This implies the closedness of the function.
\end{proof}
