%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{dirtytalk} % correct English quotation marks
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsmath,amssymb}
\usepackage{dsfont}
\newtheorem{thm}{Theorem}
\newtheorem{prop}{Proposition}
\newtheorem{defi}{Definition}
\newtheorem{rem}{Remark}
\newtheorem{obs}{Observation}
\newtheorem{cor}{Corollary}
\newtheorem{exa}{Example}
\newtheorem{ax}{Axiom}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{jansen_234}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Robust Statistical Comparison of Random Variables\\
with Locally Varying Scale of Measurement\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<christoph.jansen@stat.uni-muenchen.de>?Subject=Your UAI 2023 paper}{Christoph~Jansen}{}}
\author[1]{Georg~Schollmeyer}
\author[1]{Hannah~Blocher}
\author[1]{Julian~Rodemann}
\author[1]{Thomas~Augustin}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    Ludwig-Maximilians-Universität\\
    Munich, Bavaria, Germany
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

In the following, we give supplementary information and material to the main paper. This includes all mathematical proofs of the propositions and corollaries established in the main paper (Part~\ref{proofs}), further details on the implementation and reproducibility (Part~\ref{repro}), further calculations for the robustified test statistics (Part~\ref{calcu}), and further analyses of the applications in the main paper (Part~\ref{furtheranalysis}). If not explicitly stated otherwise, from now on, all references to equations, propositions, etc. refer to the main part of the paper.

\appendix

\section{Proofs of the Results in the Main Paper} \label{proofs}
\subsection{Proofs of Propositions~\ref{obs1}~and~\ref{normalized}: Bounded Preference Systems}
%
We start by proving Propositions~\ref{obs1}~resp.~\ref{normalized} from Sections~\ref{intro}~resp.~\ref{gsd} that state that checking consistency resp. GSD simplifies if the underlying preference system is bounded.
%
\begin{prop}
Let $\mathcal{A}=[A, R_1 , R_2]$ be a bounded preference system. Then $\mathcal{A}$ is consistent iff it is $0$-consistent. 
\end{prop}
%
\textbf{Proof.} If $\mathcal{A}$ is $0$-consistent, then it is obviously also consistent, since every normalized representation is in particular a representation. For the other direction, assume $\mathcal{A}$ to be consistent. Choose $u \in \mathcal{U}_{\mathcal{A}}$ arbitrarily and denote by $a_*,a^*$ the $R_1$-minimal resp. $R_1$-maximal elements satisfying $(a^*,a_*) \in P_{R_1}$. From the latter we know that $u(a^*)>u(a_*)$. Thus, the function
$$\Tilde{u}: A \to [0,1]~~,~~a \mapsto \frac{u(a)- u(a_*)}{u(a^*)- u(a_*)}$$
is well-defined. Moreover, one easily verifies that $\Tilde{u}\in \mathcal{U}_{\mathcal{A}}$, $\Tilde{u}(a_*)=0$, and $\Tilde{u}(a^*)=1$. Thus, we can conclude that $\Tilde{u}\in \mathcal{N}_{\mathcal{A}}$, which -- by definition -- implies $0$-consistency.
\hfill $\square$
%
%
\begin{prop} 
If $\mathcal{A}$ is consistent and bounded with $a_*,a^*$ as before, then $(X,Y) \in R_{(\mathcal{A},\pi)}$ iff
\begin{equation*}
\forall u \in \mathcal{N}_{\mathcal{A}}: \mathbb{E}_{\pi}(u \circ X) \geq \mathbb{E}_{\pi}(u \circ Y).
\end{equation*}
\end{prop}
%
\textbf{Proof.} The direction $\Rightarrow$ follows trivially by observing $\mathcal{N}_{\mathcal{A}} \subseteq \mathcal{U}_{\mathcal{A}}$. For the direction $\Leftarrow$, assume that it holds $\forall u \in \mathcal{N}_{\mathcal{A}}: \mathbb{E}_{\pi}(u \circ X) \geq \mathbb{E}_{\pi}(u \circ Y)$. Choose $u \in \mathcal{U}_{\mathcal{A}}$ arbitrarily. With the same argument as given in the proof of Proposition~\ref{obs1}, we know that then $\Tilde{u}\in \mathcal{N}_{\mathcal{A}}$, where $\Tilde{u}$ is defined as in the proof of Proposition~\ref{obs1}. Since $\Tilde{u}$ is a positive (affine) linear transformation of $u$, we know that $\mathbb{E}_{\pi}(u \circ X) \geq \mathbb{E}_{\pi}(u \circ Y)$ if and only if $\mathbb{E}_{\pi}(\Tilde{u} \circ X) \geq \mathbb{E}_{\pi}(\Tilde{u} \circ Y)$. Since the latter is true by assumption (utilizing $\Tilde{u}\in \mathcal{N}_{\mathcal{A}}$), the first also is true. As $u$ was chosen arbitrarily, this completes the proof.  \hfill $\square$
%
\subsection{Proofs of Propositions~\ref{prop:xi_computation}~and~\ref{prop:test_statitic_reg}: Computations For the Permutation Test}
%
We now give proofs for Propositions~\ref{prop:xi_computation}~resp.~\ref{prop:test_statitic_reg} from Section~\ref{sec:compu_test_statistic} that concern the computation of the maximum regularization strength resp. the computation of the (regularized) test statistic for the permutation-test.
%
\begin{prop}
For samples $\mathbf{x}$ and $\mathbf{y}$ of the form ~(\ref{sample1}) and~(\ref{sample2}) and $\varepsilon \in [0,1]$, we consider the linear program
\begin{equation*}
    \xi \longrightarrow \max_{(v_1 , \dots , v_s,\xi)}
\end{equation*}
with constraints $(v_1 , \dots , v_s,\xi) \in C(\mathbf{x},\mathbf{y})$. Denote by $\xi^*$ its optimal value. It then holds $\delta_{\varepsilon}(\omega_0)=\varepsilon \cdot \xi^*$. 
\end{prop}
%
\textbf{Proof.} The Proposition follows from standard results on linear optimization and the fact that $C(\mathbf{x},\mathbf{y})$ is compact. Set $I:=\bigl\{ \ell: (z_{\ell},a^*) \in I_{R_1}\bigr\}$ and define the vector $\underline{v}:=(0,1, v_3 , \dots ,v_s,0) \in [0,1]^{s+1}$ by $v_{\ell}=1$ if $\ell \in I$ and $v_{\ell}=0$ otherwise. One then easily verifies that $\underline{v}$ is an admissible solution to the above linear program. Since $C(\mathbf{x},\mathbf{y})$ is compact, this implies the existence of an optimal solution. Denote thus by $\underline{v}^*:=(0,1,v_3^*, \dots , v_s^*, \xi^*)$ an arbitrary optimal solution. We have to show that $$\xi^*=\sup \bigl\{\xi:\mathcal{N}^{\xi}_{\mathcal{A}_{\omega_0}} \neq \emptyset\bigr\}=:c.$$
Assume, for contradiction, the above equality does not hold. We distinguish two cases:

\textit{Case 1:} $\xi^* < c$. Then, one easily verifies that for any function $u \in \mathcal{N}^{c}_{\mathcal{A}_{\omega_0}} $ the vector $(u(z_1),\dots , u(z_s), c)$ defines an admissible solution to the above linear program with an objective value of $c$. This contradicts the optimality of $\underline{v}^*$.

\textit{Case 2:} $\xi^* > c$. Then, setting $u:(\mathbf{X} \mathbf{Y})_{\omega_0} \to [0,1]$ with $u(z_{\ell}):= v^*_{\ell}$ defines an element of $\mathcal{N}^{\xi^*}_{\mathcal{A}_{\omega_0}} $, contradicting that $c$ is the largest number for which $\mathcal{A}_{\omega_0}$ is $c$-consistent.

Thus, we have that $c= \xi^*$, implying $\delta_{\varepsilon}(\omega_0)=\varepsilon \cdot \xi^*$.
\hfill $\square$
%
\begin{prop}
For samples $\mathbf{x}$ and $\mathbf{y}$ of the form ~(\ref{sample1}) and~(\ref{sample2}) and $\varepsilon \in [0,1]$, we consider the following linear program
\begin{equation*}
    \sum_{\ell =1}^{s} v_{\ell}\cdot\Bigl(\tfrac{|\{i:x_i=z_{\ell}\}|}{n}-\tfrac{|\{i:y_i=z_{\ell}\}|}{m}\Bigr) \longrightarrow \min_{(v_1 , \dots , v_s)}
\end{equation*}
with constraints $(v_1 , \dots , v_s) \in C_{\varepsilon  \xi^*}(\mathbf{x},\mathbf{y})$, where $\xi^*$ denotes the optimal value of~(\ref{maxdel}).  Denote by opt$_{\varepsilon}(\mathbf{x},\mathbf{y})$ its optimal value. It then holds:
\begin{itemize}
    \item[i)] opt$_{\varepsilon}(\mathbf{x},\mathbf{y})=d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)$. 
    \item[ii)] There is in-sample GSD of $X$ over $Y$ if and only if opt$_0(\mathbf{x},\mathbf{y})\geq 0$.
\end{itemize}
\end{prop}
%
\textbf{Proof.} i) By definition and Proposition~\ref{prop:xi_computation}, we know that $\mathcal{N}^{\xi^*}_{\mathcal{A}_{\omega_0}}\neq \emptyset$. As these sets are nested with decreasing $\xi$-value and we have $\varepsilon  \xi^* \leq \xi^*$, this implies that also $\mathcal{N}^{\varepsilon  \xi^*}_{\mathcal{A}_{\omega_0}}\neq \emptyset$. Hence, we can choose $u \in \mathcal{N}^{\varepsilon  \xi^*}_{\mathcal{A}_{\omega_0}}$. One then easily verifies that the vector $(u(z_1),\dots , u(z_s))$ defines an admissible solution to the above linear program. Since $ C_{\varepsilon  \xi^*}(\mathbf{x},\mathbf{y})$ is compact, this implies the existence of an optimal solution. Thus, denote by $\underline{v}^*:=(v_1^*, \dots , v_s^*)$ an arbitrary such optimal solution. If we then define $u:(\mathbf{X} \mathbf{Y})_{\omega_0} \to [0,1]$ with $u(z_{\ell}):= v^*_{\ell}$, then one easily verifies that $u \in\mathcal{N}^{\varepsilon\xi^*}_{\mathcal{A}_{\omega_0}} $ and that 
%
\begin{equation}\label{fref}
 opt_{\varepsilon}(\mathbf{x},\mathbf{y})=  \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\hat{\pi}^{\omega_0}_X(\{z\})-\hat{\pi}^{\omega_0}_Y(\{z\}))
\end{equation}
%
(to see this, note that the right side of the equation is a simple reformulation of the objective function with $\underline{v}^*$ plugged-in).

We have to show that
$$opt_{\varepsilon}(\mathbf{x},\mathbf{y})=d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0).$$
Assume, for contradiction, the above equality does not hold. We distinguish two cases:

\textit{Case 1:} $opt_{\varepsilon}(\mathbf{x},\mathbf{y})>d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0).$ This would imply that there exists an $u' \in\mathcal{N}^{\varepsilon\xi^*}_{\mathcal{A}_{\omega_0}} $ that -- if it was set in the right-hand side of the above Equation~(\ref{fref}) (in the supplementary material) instead of $u$ -- would produce a value strictly smaller than $opt_{\varepsilon}(\mathbf{x},\mathbf{y})$. This contradicts the optimality of $\underline{v}^*$, since every $u' \in\mathcal{N}^{\varepsilon\xi^*}_{\mathcal{A}_{\omega_0}} $ produces an admissible solution to the linear program with objective value given by the right-hand side of the above Equation~(\ref{fref}).

\textit{Case 2:} $opt_{\varepsilon}(\mathbf{x},\mathbf{y})<d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0).$  This would be an immediate contradiction to the above Equation~(\ref{fref}) (in the supplementary material), since $d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)$  is by definition the infimum over all the expressions on the equation's right-hand side.

This completes the proof of i). To see ii), note that i) implies $opt_{0}(\mathbf{x},\mathbf{y})=d^{0}_{\mathbf{X},\mathbf{Y}}(\omega_0)$. Thus, we have $opt_{0}(\mathbf{x},\mathbf{y})\geq 0$ if and only if $d^{0}_{\mathbf{X},\mathbf{Y}}(\omega_0) \geq 0$, which -- by definition -- is true if and only if there is in-sample GSD of $X$ over $Y$.
\hfill $\square$
%
\subsection{Proofs of Propositions~\ref{its}~and~\ref{prop:test_statistic_ip}: Computations for Robustified Testing}
%
We now give proofs of Propositions~\ref{its}~resp.~\ref{prop:test_statistic_ip}  from Section~\ref{robtest} concerning the computation of the robustified test statistic resp. its simplification under the special case of a $\gamma$-contamination model (with $\gamma \in [0,1]$).
%
\begin{prop} 
For samples $\mathbf{x}$ and $\mathbf{y}$ of the form ~(\ref{sample1}) and~(\ref{sample2}), $\varepsilon \in [0,1]$, and $(\pi_1 , \pi_2) \in \mathcal{E}(\mathcal{M}^{\omega_0}_X) \times\mathcal{E}(\mathcal{M}^{\omega_0}_Y)$, we consider the following linear program:
\begin{equation*}
    \sum_{\ell =1}^{s} v_{\ell}\cdot(\pi_1(\{z_{\ell}\})- \pi_2(\{z_{\ell}\})) \longrightarrow \min_{(v_1 , \dots , v_s)}
\end{equation*}
with constraints $(v_1 , \dots , v_s) \in C_{\varepsilon  \xi^*}(\mathbf{x},\mathbf{y})$, where $\xi^*$ denotes the optimal value of~(\ref{maxdel}).  Denote by opt$_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2)$ its optimal value and by $\underline{opt}_{\varepsilon}(\mathbf{x},\mathbf{y})$ the minimal optimum over all combinations of $(\pi_1 , \pi_2) \in \mathcal{E}(\mathcal{M}^{\omega_0}_X) \times\mathcal{E}(\mathcal{M}^{\omega_0}_Y)$. It then holds:
\begin{itemize}
    \item[i)] $ \underline{opt}_{\varepsilon}(\mathbf{x},\mathbf{y})=\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)$. 
    \item[ii)] There is in-sample GSD of $X$ over $Y$ for any $\pi$ with $\hat{\pi}^{\omega_0}_X \in \mathcal{M}^{\omega_0}_X$ and $\hat{\pi}^{\omega_0}_Y \in \mathcal{M}^{\omega_0}_Y$  if  $\underline{opt}_0(\mathbf{x},\mathbf{y})\geq 0$.
\end{itemize}
\end{prop}
%
\textbf{Proof.} i) Since nothing in the proof of Proposition \ref{prop:test_statitic_reg} hinges on the concrete structure of the involved empirical image measures, Proposition~\ref{prop:test_statitic_reg} is still valid if we replace $\hat{\pi}^{\omega_0}_X$ and $\hat{\pi}^{\omega_0}_Y$ by arbitrary $\pi_1 \in \mathcal{M}^{\omega_0}_X$ and $\pi_2 \in \mathcal{M}^{\omega_0}_Y$, respectively. This specifically implies 
%
%
\begin{equation} \label{helper}
 opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2)=  \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\})).
\end{equation}
%
In order to show i), we now need to verify that
$$ \inf_{(\pi_1 , \pi_2)\in \mathcal{E}(\mathcal{M}^{\omega_0}_X) \times\mathcal{E}(\mathcal{M}^{\omega_0}_Y)} opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2)  =\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0).$$
%
Due to the above Equation~(\ref{helper}) (in the supplementary material) and the fact that iterated infima can be equivalently replaced by one global infimum, we know that
%
\begin{equation} \label{helper2}
    \inf_{(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y} opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2)=\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0).
\end{equation}
%
We then can compute:
%
\begin{align*}
\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)&\stackrel{(\ref{helper2})}{=}\inf_{(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y} opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2)\\[.25cm] 
&= \inf_{(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y} \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\}))\\[.25cm]
&=  \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}} \inf_{(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\}))\\[.25cm]
&\stackrel{(\star)}{=}  \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}} \Biggl(\inf_{\pi_1 \in \mathcal{M}^{\omega_0}_X} \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\pi_1(\{z\})-\sup_{ \pi_2\in \mathcal{M}^{\omega_0}_Y}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\pi_2(\{z\})\Biggr)\\[.15cm]
&\stackrel{(\star\star)}{=}  \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}} \Biggl(\inf_{\pi_1 \in \mathcal{E}(\mathcal{M}^{\omega_0}_X)} \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\pi_1(\{z\})-\sup_{ \pi_2\in \mathcal{E}(\mathcal{M}^{\omega_0}_Y)}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\pi_2(\{z\})\Biggr)\\[.25cm]
&= \inf_{(\pi_1 , \pi_2)\in \mathcal{E}(\mathcal{M}^{\omega_0}_X) \times\mathcal{E}(\mathcal{M}^{\omega_0}_Y)} \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\}))\\[.25cm]
&= \inf_{(\pi_1 , \pi_2)\in \mathcal{E}(\mathcal{M}^{\omega_0}_X) \times\mathcal{E}(\mathcal{M}^{\omega_0}_Y)} opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2).
\end{align*}
%
Here, ($\star$) follows since -- for $u$ fixed -- the infimum of the differences of the two sums is attained if the first sum is smallest possible and the second sum is largest possible (note that all sums involved are finite). Further, ($\star\star$) follows since -- again for $u$ fixed -- the sums are linear functions on the compact sets $\mathcal{M}^{\omega_0}_X $ resp. $\mathcal{M}^{\omega_0}_Y$ and, therefore, attain their optima on $\mathcal{E}(\mathcal{M}^{\omega_0}_X) $ resp. $\mathcal{E}(\mathcal{M}^{\omega_0}_Y)$. The last two equalities are just reversing the computation done in the second, third, and fourth equalities, now with the sets of extreme points in place of the full credal sets.

To see ii), note that i) implies $ \underline{opt}_{0}(\mathbf{x},\mathbf{y})=\underline{d}^{0}_{\mathbf{X},\mathbf{Y}}(\omega_0)$. Thus, $\underline{opt}_{0}(\mathbf{x},\mathbf{y}) \geq 0$ if and only if $\underline{d}^{0}_{\mathbf{X},\mathbf{Y}}(\omega_0) \geq 0$. But -- by definition -- the latter is true if and only if
$$\inf_{u\in \mathcal{N}^{0}_{\mathcal{A}_{\omega_0}}}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\})) \geq 0$$
for all $(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y$. This obviously implies in-sample GSD of $X$ over $Y$ for any $\pi$ with $\hat{\pi}^{\omega_0}_X \in \mathcal{M}^{\omega_0}_X$ and $\hat{\pi}^{\omega_0}_Y \in \mathcal{M}^{\omega_0}_Y$, since $\mathcal{N}^{0}_{\mathcal{A}_{\omega_0}}=\mathcal{N}_{\mathcal{A}_{\omega_0}}$.
%As -- according to Equation~(\ref{helper2}) (in the supplementary material) --  $\underline{d}^{0}_{\mathbf{X},\mathbf{Y}}(\omega_0)$ is smaller or equal than $opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_1, \pi_2)$ for all $(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y$, we can conclude that $opt_{\varepsilon}(\mathbf{x},\mathbf{y})$ from Proposition 2 is  greater or equal than no matter on which candidate empirical measures $(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y$. Thus, 
\hfill $\square$
%
\begin{prop}
Consider again the situation of Proposition~\ref{its} with the additional assumption that $\mathcal{M}^{\omega_0}_X $ and $\mathcal{M}^{\omega_0}_Y$ are of the form~(\ref{lvm}) with extreme points as in~(\ref{elvm}). It then holds:
$$\underline{opt}_{\varepsilon}(\mathbf{x},\mathbf{y})=opt_{\varepsilon}(\mathbf{x},\mathbf{y},\pi_*, \pi^*)$$
where
$$\pi_*=\gamma \delta_{a_*} + (1 - \gamma) \hat{\pi}^{\omega_0}_X$$
and
$$\pi^*=\gamma \delta_{a^*} + (1 - \gamma) \hat{\pi}^{\omega_0}_Y.$$
\end{prop}
%
\textbf{Proof.} By again utilizing Equation~(\ref{helper}) (of the supplementary material), the claim modifies to showing that
$$\underline{opt}_{\varepsilon}(\mathbf{x},\mathbf{y})=\inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_*(\{z\})-\pi^*(\{z\})).
$$
Since, by Proposition~\ref{its}, we know that $\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)=\underline{opt}_{\varepsilon}(\mathbf{x},\mathbf{y})$ and $\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)$ is by definition the infimum over all the expressions on the right-hand side, the direction $\leq$ is immediate. So, it remains to show the direction $\geq$. To do so, choose $(\pi_1 , \pi_2)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y$ arbitrarily. Since both $\mathcal{M}^{\omega_0}_X $ and $\mathcal{M}^{\omega_0}_Y$ are of the form~(\ref{lvm}), we then know that there exist probability measures $\nu_1$ and $\nu_2$ such that
$$\pi_1= \gamma \cdot \nu_1 + (1 - \gamma) \cdot \hat{\pi}^{\omega_0}_X$$
and 
$$\pi_2= \gamma \cdot \nu_2 + (1 - \gamma) \cdot \hat{\pi}^{\omega_0}_Y.$$
%
Here, we utilized the fact that credal sets of the form~(\ref{lvm}) can be equivalently characterized as $$
\mathcal{M}^{\omega}_Z =\Bigl\{ \pi: \pi \geq (1 -\gamma)\cdot \hat{\pi}^{\omega}_Z\Bigr\}=\Bigl\{ \gamma \cdot \nu + (1 - \gamma) \cdot \hat{\pi}^{\omega}_Z: \nu \text{ probability measure} \Bigr\}.
$$
For $u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}$ fixed (but arbitrary), we then can compute:
%
\begin{align*}
 \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\pi_1(\{z\})  &=  \gamma \cdot \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}}u(z)\cdot\nu_1(\{z\}) + (1-\gamma)\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\hat{\pi}^{\omega_0}_X(\{z\})\\[.2cm]
 &\geq \gamma \cdot u(a_*) + (1-\gamma)\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\hat{\pi}^{\omega_0}_X(\{z\})\\[.2cm]
 &=  \gamma \cdot \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}}u(z)\cdot\delta_{a_*}(\{z\}) + (1-\gamma)\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\hat{\pi}^{\omega_0}_X(\{z\})\\[.2cm]
 &=  \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}}u(z)\cdot\pi_*(\{z\}) 
\end{align*}
%
Analogous reasoning yields:
%
\begin{align*}
 \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot\pi_2(\{z\})   &\leq  \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}}u(z)\cdot\pi^*(\{z\}) 
\end{align*}
%
Putting the two together, we arrive at:
%
\begin{align*}
 \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\}))   &\geq  \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_*(\{z\})-\pi^*(\{z\}))
\end{align*}
%
As $\pi_1, \pi_2$, and $u$ were chosen arbitrarily, the inequality remains valid for the infimum, i.e.
%
%
\begin{align*}
  \inf_{(\pi_1 , \pi_2,u)\in \mathcal{M}^{\omega_0}_X \times\mathcal{M}^{\omega_0}_Y \times \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}}\sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_1(\{z\})-\pi_2(\{z\}))   &\geq \inf_{u\in \mathcal{N}^{\delta_{\varepsilon}(\omega_0)}_{\mathcal{A}_{\omega_0}}}  \sum_{z \in (\mathbf{X} \mathbf{Y})_{\omega_0}} u(z)\cdot(\pi_*(\{z\})-\pi^*(\{z\}))
\end{align*}
%
Observing that the left side of this inequality by definition  equals $\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)$ and, therefore, by Proposition~\ref{its}, also $\underline{opt}_{\varepsilon}(\mathbf{x},\mathbf{y})$ completes the direction $\geq$ and thus the proof.
%
\hfill $\square$
%
\subsection{Proofs of Propositions~\ref{somecharacterisitcs}~and~\ref{simplification}: Multi-dimensional spaces}
%
Finally, we give proofs of Propositions~\ref{somecharacterisitcs}~and~\ref{simplification} from Section~\ref{multi} concerning several different characterizing properties of the GSD-order for the special case of preference systems arising from multi-dimensional spaces with differently scaled dimensions. For this, recall that in Section~\ref{gsd} for a preference system $\mathcal{A}$ and a probability measure $\pi$ we defined
$$\mathcal{F}_{(\mathcal{A},\pi)}:=
    \Bigl\{X \in A^{\Omega}: u \circ X \in  \mathcal{L}^1(\Omega ,\mathcal{S}_1, \pi)~\forall u \in \mathcal{U}_{\mathcal{A}}\Bigr\}.
$$ 
This definition is needed for stating the next proposition.
%
\begin{prop} 
Let $\pi$ be a probability measure on $(\Omega,\mathcal{S}_1)$, and  $X=(\Delta_1 , \dots ,\Delta_r),Y=(\Lambda_1 , \dots ,\Lambda_r) \in  \mathcal{F}_{(\textsf{pref}(\mathbb{R}^r),\pi)}$, where the first $0 \leq z \leq r$ dimensions of $\textsf{pref}(\mathbb{R}^r)$ are of cardinal scale.  Then, the following holds:
\begin{itemize}
    \item[i)] $\textsf{pref}(\mathbb{R}^r)$ is consistent.
     \item[ii)] If $z=0$, then $R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$ coincides with (first-order) stochastic dominance w.r.t.~$\pi$ and~$R_1^*$ (short: FSD$(R_1^*,\pi)$).
      \item[iii)] If $(X,Y) \in R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$ and $\Delta_j, \Lambda_j \in \mathcal{L}^1(\Omega ,\mathcal{S}_1, \pi)$ for all $j=1, \dots, r$, then
      \begin{itemize}
          \item[I.] $\mathbb{E}_{\pi}(\Delta_j) \geq \mathbb{E}_{\pi}(\Lambda_j)$ for all $j=1, \dots , r$, and
          \item[II.] $(\Delta_j,\Lambda_j) \in$FSD$(\geq , \pi)$ for all $j=z+1, \dots , r$.
      \end{itemize}
      Additionally, in the special case where all components of $X$ are jointly independent and all components of $Y$ are jointly independent, properties I. and II. imply $(X,Y) \in R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$ (i.e. also the converse implication holds). 
\end{itemize}
\end{prop}
%
\textbf{Proof.} i) Let $\alpha_1 ,\dots , \alpha_r \in \mathbb{R}^+$ and $\phi_{z+1} , \dots, \phi_r:\mathbb{R} \to \mathbb{R}$ strictly isotone functions. Define $u:\mathbb{R}^r \to \mathbb{R}$ by setting 
\begin{equation*}
u(x):=\sum_{s=1}^{z} \alpha_s \cdot x_s + \sum_{s=z+1}^{r} \alpha_s \cdot \phi_s(x_s) .    
\end{equation*}
%
Then one easily verifies that $u$ defines a representation of $\textsf{pref}(\mathbb{R}^r)$, proving its consistency. 

ii) Assume $z=0$, i.e. all considered dimensions are purely ordinal. We claim that for $\mathcal{A}_0:=[\mathbb{R}^r, R_1^*, \emptyset]$ it holds $\mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}=\mathcal{U}_{\mathcal{A}_0}$. The direction $\subseteq$ is trivial, so assume $u \in \mathcal{U}_{\mathcal{A}_0}$ arbitrary. It suffices to show that $u$ represents arbitrary pairs of pairs in $R_2^*$. As $R_2^*$ is antisymmetric for $z=0$, this reduces to show that $u$ strictly represents arbitrary pairs of pairs in $P_{R_2^*}$. So, let $((v,w),(x,y)) \in P_{R_2^*}$. This means that for all $j \in \{1 , \dots , r\}$ we have $v_j \geq x_j \geq y_j \geq w_j$ and that there is $j_0 \in \{1 , \dots , r\}$ such that either $v_{j_0} > x_{j_0}$ or $y_{j_0} > w_{j_0}$. Together, this implies $u(v) > u(x) \geq u(y) \geq u(w)$ or $u(v) \geq u(x) \geq u(y) > u(w)$, either way implying $u(v)-u(w) > u(x)-u(y)$. Thus $u \in \mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}$. As $R_{(\mathcal{A}_0, \pi)}$ coincides with (first-order) stochastic dominance by definition and we have $\mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}=\mathcal{U}_{\mathcal{A}_0}$ also $R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$ coincides with (first-order) stochastic dominance.

iii) Let $(X,Y) \in R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$. We start by showing I, so choose $j \in \{1 , \dots , r\}$ arbitrary. By part i) of the proof, for every $n \in \mathbb{N}$, the function $u_n: \mathbb{R}^r \to \mathbb{R}$ defined by
$$u_n(x):= x_j + \frac{1}{n} \cdot\sum_{s \neq j}  x_s$$
is a representation of $\textsf{pref}(\mathbb{R}^r)$, that is $u_n \in \mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}$. Thus, by our assumption $(X,Y) \in R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$, we know that we have $\mathbb{E}_{\pi}(u_n \circ X) \geq \mathbb{E}_{\pi}(u_n \circ Y)$. This implies (by the linearity of the expectation operator)
$$\mathbb{E}_{\pi}(\Delta_j) + \frac{1}{n} \cdot\sum_{s \neq j} \mathbb{E}_{\pi}(\Delta_s) \geq \mathbb{E}_{\pi}(\Lambda_j) + \frac{1}{n} \cdot\sum_{s \neq j} \mathbb{E}_{\pi}(\Lambda_s).$$
Letting $n \to \infty$ on both sides gives $\mathbb{E}_{\pi}(\Delta_j) \geq \mathbb{E}_{\pi}(\Lambda_j)$.\\[.1cm]
We use a very similar argument to see II: Choose $j \in \{z+1 , \dots , r\}$ arbitrarily and let $\phi: \mathbb{R} \to \mathbb{R}$ be strictly isotone. By part i) of the proof, for every $n \in \mathbb{N}$, the function $u'_n: \mathbb{R}^r \to \mathbb{R}$ defined by
$$u'_n(x):= \phi(x_j) + \frac{1}{n} \cdot\sum_{s \neq j}  x_s$$
is a representation of $\textsf{pref}(\mathbb{R}^r)$, that is $u'_n \in \mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}$. Thus, by our assumption $(X,Y) \in R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$, we know that we have $\mathbb{E}_{\pi}(u'_n \circ X) \geq \mathbb{E}_{\pi}(u'_n \circ Y)$. This implies (by the linearity of the expectation operator)
$$\mathbb{E}_{\pi}(\phi \circ\Delta_j) + \frac{1}{n} \cdot\sum_{s \neq j} \mathbb{E}_{\pi}(\Delta_s) \geq \mathbb{E}_{\pi}(\phi \circ\Lambda_j) + \frac{1}{n} \cdot\sum_{s \neq j} \mathbb{E}_{\pi}(\Lambda_s).$$
Letting $n \to \infty$  gives $\mathbb{E}_{\pi}(\phi\circ\Delta_j) \geq \mathbb{E}_{\pi}(\phi \circ\Lambda_j)$. As $\phi$ was chosen arbitrarily, this implies $(\Delta_j,\Lambda_j) \in$FSD$(\geq , \pi)$. 

To see the addition to part iii), 
let $ X=(\Delta_1,\ldots \Delta_r)$ and $Y=(\Lambda_1,\ldots ,\Lambda_r)$ have both jointly independent components, respectively, and let I. and II. of iii) be true.
Let furthermore $u \in \mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}$ be an arbitrary utility function that represents the preference system $\textsf{pref}(\mathbb{R}^r)$. We now show that $ \mathbb{E}_\pi (u \circ X) \geq \mathbb{E}_\pi(u\circ Y)$ holds: Because of independence we can compute the expectations of $u\circ X$ and $u \circ Y$ by using Fubini's theorem. To prove the inequality, we first integrate over the ordinal part and use isotonicity of $u$ in every integration. Then we integrate over the cardinal parts and iteratively use the fact that the corresponding functions are representing the corresponding cardinal subsystem built by the components we did not integrate over before. Formally, we arrive at:

\begin{align*}
\mathbb{E}_\pi(u\circ X) &~~=~~~ \int_{\Omega} u \circ X d\pi \\
 &\stackrel{(ind.)}{=} \int_{\Delta_1 (\Omega)} \cdots \int_{\Delta_r (\Omega)} u(\delta_1,\ldots, \delta_z , \delta_{z+1}, \ldots \delta_r)
 d\pi_{\Delta_r}  \ldots d\pi_{\Delta_{z+1}}d\pi_{\Delta_z}\ldots d \pi_{\Delta_1}\\
&~~\stackrel{\left(\star \right)}{\geq}~~  \int_{\Delta_1 (\Omega)} \cdots \int_{\Lambda_r (\Omega)} u(\delta_1,\ldots, \delta_z , \lambda_{z+1}, \ldots \lambda_r) d\pi_{\Lambda_r} \ldots d\pi_{\Lambda_{z+1}}d\pi_{\Delta_z}\ldots d \pi_{\Delta_1}\\
&~\stackrel{{\left(\star\star\right)} }{\geq} ~~ \int_{\Lambda_1 (\Omega)} \cdots \int_{\Lambda_r (\Omega)} u(\lambda_1,\ldots, \lambda_z , \lambda_{z+1}, \ldots \lambda_r)d\pi_{\Lambda_r} \ldots d\pi_{\Lambda_{z+1}}d\pi_{\Lambda_z}\ldots d \pi_{\Lambda_1}\\
&\stackrel{(ind.)}{=}\mathbb{E}_{\pi}(u\circ Y)
\end{align*}

Here, $\left(\star\right)$ is valid because, for fixed cardinal components, $u$ is isotone in every ordinal component and we have first order stochastic dominance, which means that the iterated integrals get smaller if one switches from $\pi_{\Delta_k}$ to $\pi_{\Lambda_k}$.

Similarly, $\left(\star\star\right)$ is valid because, e.g., the mapping $$\psi: \mathbb{R}^{z-1}\to \mathbb{R}~~~,~~~ (\delta_1,\ldots , \delta_{z-1})\mapsto \int_{\Delta_z(\Omega)} u(\delta_1,\ldots, \delta_r) d\pi_{\Delta_z}$$ is a positive (affine) linear transformation w.r.t.\ the corresponding subsystem.
\hfill $\square$
%
\begin{cor}
If $\mathcal{C}=[C,R_1^c,R^c_2]$ is a bounded subsystem of $\textsf{pref}(\mathbb{R}^r)$ and $X,Y \in \mathcal{F}_{(\mathcal{C},\pi)}$, then $\mathcal{C}$ is $0$-consistent and ii) and iii) from Prop.~\ref{somecharacterisitcs} hold, if we replace $R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$ by $R_{(\mathcal{C},\pi)}$, FSD$(R_1^*,\pi)$ by FSD$(R_1^c,\pi)$, and $(X,Y) \in R_{(\textsf{pref}(\mathbb{R}^r),\pi)}$ by $\forall u \in \mathcal{N}_{\mathcal{C}}: \mathbb{E}_{\pi}(u \circ X) \geq \mathbb{E}_{\pi}(u \circ Y)$.
\end{cor}

\textbf{Proof.} As, according to Proposition~\ref{somecharacterisitcs} i), we know that $\textsf{pref}(\mathbb{R}^r)$ is consistent, the same holds true for all of its subsystems. Hence, $\mathcal{C}$ is consistent. Since $\mathcal{C}$ is assumed to be bounded, it then is $0$-consistent by Proposition 1. The rest of the Corollary follows, since -- by Proposition 2 -- for bounded preference systems it suffices to check for dominance only over all normalized representations. \hfill $\square$
%
\begin{prop} 
Let $z=1$ and denote by $\mathcal{U}_{sep}$ the set of all $u: \mathbb{R}^r \to \mathbb{R}$ such that, for $(x_2 , \dots , x_r) \in \mathbb{R}^{r-1}$ fixed, the function $u(\cdot , x_2 , \dots , x_r)$ is strictly increasing and (affine) linear and such that, for  $x_1 \in \mathbb{R}$ fixed, the function $u(x_1, \cdot, \dots , \cdot)$ is strictly isotone w.r.t.~the componentwise partial order on $\mathbb{R}^{r-1}$. Then $\mathcal{U}_{sep}=\mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}$.
\end{prop}

\textbf{Proof.} First, let $ u \in \mathcal{U}_{\textsf{pref}(\mathbb{R}^r)}$. One easily verifies that, for $x_{-}:=(x_2 , \dots , x_r) \in \mathbb{R}^{r-1}$ fixed, the preference system $Z:=[\mathbb{R},R_1^{x_{-}},R_2^{x_{-}}]$, where $R_1^{x_{-}}:= \geq$ and $R_2^{x_{-}} $ is defined by
$$\Biggl\{((t,u),(v,w)): \Biggl(\Biggl(\begin{pmatrix}
t\\  x_{-}  
\end{pmatrix},\begin{pmatrix}
u\\  x_{-}  
\end{pmatrix}\Biggl),\Biggl(\begin{pmatrix}
v\\  x_{-}  
\end{pmatrix},\begin{pmatrix}
w\\  x_{-}  
\end{pmatrix}\Biggl)\Biggl)\in R_2^*\Biggr\}$$
%
is a complete positive-difference structure in the sense of~\citet[Definition 1, p.~147]{k1971}. According to~\citet[Theorem 1, p.~147]{k1971}, this implies that any two representations of $Z$ are positive (affine) linear transformations of each other. But it is immediate that both $u(\cdot , x_2 , \dots , x_r)$ and $id_{\mathbb{R}}(\cdot)$ are representations of $Z$. Thus, $u(\cdot , x_2 , \dots , x_r)= \alpha\cdot id_{\mathbb{R}}(\cdot)+ \beta$ for some $\alpha \in \mathbb{R}^+$ and $\beta  \in \mathbb{R}$, proving the first claim of this direction. The second claim -- i.e., the strict isotony of the function $u(x_1, \cdot, \dots , \cdot)$ w.r.t.~the componentwise partial order on $\mathbb{R}^{r-1}$ for fixed $x_1 \in \mathbb{R}$ -- is also immediate. Thus, $u \in \mathcal{U}_{sep}$.

For the other direction, assume that $u \in \mathcal{U}_{sep}$. It follows directly from the assumptions that $u$ is strictly isotone w.r.t.~$R_1^*$. To see that $u$ also strictly represents $R_2^*$, choose $((x,y),(x',y')) \in R_2^*$ arbitrary. We have two cases:
\\[.1cm]
\textit{Case 1:} $((x,y),(x',y')) \in I_{R_2^*}$. This implies that $x_1 - y_1=x'_1 - y'_1$
and therefore also
$x_1-x'_1=y_1-y'_1.$
Moreover, one easily verifies that the restriction of $R_2^*$ to the ordinal dimensions is antisymmetric. Since we have that $x_{-}$ componentwise dominates $x'_{-}$ and vice versa and that $y_{-}$ componentwise dominates $y'_{-}$ and vice versa, this antisymmetry then implies that $x_{-}=x'_{-}$ and $y_{-}=y'_{-}$. Therefore, there are common $\alpha_1, \alpha_2 \in \mathbb{R}^+$ and $\beta_1 , \beta_2  \in \mathbb{R}$ such that
$$u(x)=\alpha_1\cdot x_1 + \beta_1~~~,~~~u(x')=\alpha_1\cdot x'_1 + \beta_1$$
$$u(y)=\alpha_2\cdot y_1 + \beta_2~~~,~~~u(y')=\alpha_2\cdot y'_1 + \beta_2$$
 Moreover, observe that $\alpha_1 = \alpha_2$, since otherwise there would be $x^* \in \mathbb{R}$ with $u(x^* , x_{-})<u(x^* , y_{-})$, which is not possible, since $u$ is strictly isotone w.r.t.~$R_1^*$.
Define
$$D:= (u(x)-u(y))-(u(x')-u(y')).$$
Simple computations then yield
$$D=\alpha_1 \cdot(x_1 - x'_1)-\alpha_2 \cdot(y_1 - y'_1)=(x_1 - x'_1)\cdot(\alpha_1 - \alpha_2)$$
%
which, as $\alpha_1 = \alpha_2$, implies $D=0$.
\\[.1cm]
\textit{Case 2:} $((x,y),(x',y')) \in P_{R_2^*}$. This implies $x_- \geq x'_- \geq y'_- \geq y_-$, where $\geq$ is to be understood componentwise. Using the same argument as seen before, this implies that there exist an $\alpha \in \mathbb{R}^+$ and $\beta_1,\beta_2,\beta_3,\beta_4 \in \mathbb{R}$ such that
$$u(x)=\alpha\cdot x_1 + \beta_1~~~,~~~u(x')=\alpha\cdot x'_1 + \beta_3$$
$$u(y)=\alpha\cdot y_1 + \beta_2~~~,~~~u(y')=\alpha\cdot y'_1 + \beta_4$$
Thus, computing $D$ defined as above yields:
$$D=\alpha \cdot((x_1 - y_1)-(x'_1 - y'_1))+\beta_1-\beta_2-\beta_3+\beta_4$$
\textit{Sub-Case 2.1:} $x_1- y_1 > x'_1 - y'_1$. Observe that, as $u$ is isotone w.r.t.~$R_1^*$, we have that $u(y'_1,y'_-) \geq u(y'_1,y_-)$. However, this implies $\beta_4 \geq \beta_2$. Analogous reasoning yields $\beta_1 \geq \beta_3$. Using the assumptions of the sub-case, this implies $D>0$.
\\[.1cm]
\textit{Sub-Case 2.2:} $x_1- y_1 = x'_1 - y'_1$. Using the case assumption, this implies that either $x_- > x'_-$ or $y'_- > y_-$, where the $>$ is to be understood as the strict part of the componentwise $\geq$. As $u$ is strictly isotone w.r.t.~$R_1^*$, this implies  that either $u(y'_1,y'_-) > u(y'_1,y_-)$ or $u(x'_1,x_-) > u(x'_1,x'_-)$, which itself implies either $\beta_4 > \beta_2$ or $\beta_1 > \beta_3$. As we know $\beta_4 \geq \beta_2$ and $\beta_1 \geq \beta_3$, this, together with the sub-case assumption, implies $D>0$. \hfill $\square$

%
\section{Details on Implementation and reproducibility} \label{repro}

In Section \ref{implementation} we stated that the implementation of the constraint matrix has worst-case complexity $\mathcal{O}(s^4)$.  This worst case occurs when everything in $R_1^*$ and $R_2^*$ is comparable and then $$s \cdot (s-1) + \left(s \cdot (s-1)\right) \cdot \left(\left(s \cdot (s-1)\right)  - 1 \right ) = s^4 - 2s^3 + s^2$$  many pairwise comparisons have to be considered. Note that we omit the reflexive part of the pre-orders $R_1^*$ and $R_2^*$. 

%In implementing the constraint matrix, we exploit the fact that sorting the data set allows some comparisons to be skipped immediately by considering only the ordinal components. In particular, if the ordinal variables have a small number of categories compared to the sample size $s$, this can lead to a large proportion of comparisons being skipped. In the most cases, this reduces the computational cost of computing the constraint matrix compared to a naive implementation. Of course, in the worst case, if the observations grouped by their ordinal components are highly skewed and the largest ordinal components correspond to the largest group, the computation time cannot be drastically reduced in this way.

 We are interested in the non-regularized test statistic as well as the regularized test statistic with $\varepsilon \in \{0.25, 0.5, 0.75, 1\}$, see Section~\ref{application}. For all these cases, we compute the test statistics based on the sample, as well as 1000 times on a permuted version of that sample. Note that the linear programs for computing the test statistics based on the permuted data are identical to those for the non-permuted data except for the objective function, see 
 Section~\ref{sec:permutation_test}. In Section~C (in the supplementary material), we prove that the robustified test statistics are a shift of the non-robustified test statistic. Thus, the robustified test statistics are immediately given.
 
The simulation is based on a random sample of the data set. Two of the data sets and the corresponding R-code can be found here: 
\begin{center}
\url{https://github.com/hannahblo/Robust_GSD_Tests}
\end{center}The data set used for the poverty analysis (ALLBUS) is freely accessible,  but registration in the corresponding online portal is needed.\footnote{Further information on the survey and the data set itself can be found here: \url{https://search.gesis.org/research_data/ZA5240} (accessed: Febr 16, 2023)}

For the computation of the linear programs, we used the R interface of Gurobi optimizer, which is documented in \cite{gurobi_pdf}. This is a commercial solver that offers free academic licenses\footnote{Further details can be found here: \url{https://www.gurobi.com/academia/academic-program-and-licenses/} (accessed: Febr 16, 2023)}. In particular, the computation of linear programs is faster than using the free and open source solvers known to us, see \cite{meindl12}. We also used the R-packages \textit{purrr}, \textit{dplyr}, \textit{slam}, \textit{readr}, \textit{tidyr, forcast, ggplot2, reshape2, tidyverse, ggridges, latex2exp, RColorBrewer}, \textit{rcartocolor} and \textit{foreign} for our implementation, see \cite{mailund22, yarberry21, readr_package, slam_package, tidyr_package, forcast_package, ggplot2_package, reshape_package, tidyverse_package, ggridges_package, latex2exp_package, RColorBrewer_package, rcartocolor_package, foreign_package}.

The computation was done for
\begin{itemize}
    \item ALLBUS data set, see \cite{allbus-2014}, on a commodity desktop laptop with an 8-core Intel(R) Core(TM) i7-8665U CPU @ 1.90GHz processor and 16 GB RAM in R version 4.2.2.
    \item dermatology data set, see \cite{demiroz1998learning} accessed via \cite{Dua:2019}, on a commodity desktop computer with a 32-core Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz processor and 64 GB RAM in R version 4.2.1.
    \item German credit data set, see \cite{Dua:2019}, on a commodity desktop laptop with an 8-core Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz processor and 16 GB RAM in R version 4.2.2.
\end{itemize}


\section{Calculations for robustified test statistics} \label{calcu}
%
In Section~\ref{application} we show a graph visualizing the fraction of resamples in favor of \textbf{non}-rejection of $H_0$ (i.e., the p-values) as a function of the size of the contamination $\gamma$ of the underlying linear-vacuous model (see Figure~\ref{fig:res-share-rej}). We will briefly show here how the exact function is calculated. For general (polyhedral) credal sets, a resample $I$ is in favor of rejection of $H_0$ under the robustified resampling scheme, if $\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)> \overline{d}_I^{\varepsilon}$. Hence, the fraction of resamples in favor of rejection of $H_0$ is given by
$$\frac{1}{N}\cdot \sum_{I \in \mathcal{I}_N} \mathds{1}_{\bigl\{\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)> \overline{d}_I^{\varepsilon}\bigr\}}$$
where $N$ denotes the number of resamples and $\mathcal{I}_N$ is the corresponding set of resamples. In the special case that the credal sets involved are $\gamma$-contamination models, we can use Proposition~\ref{prop:test_statistic_ip} (and a slight variation of it with $\pi_*$ and $\pi^*$ in reversed roles) to obtain
%
$$\underline{d}^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0)= (1- \gamma) \cdot d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0) - \gamma$$
and 
$$\overline{d}_I^{\varepsilon}= (1- \gamma) \cdot d_I^{\varepsilon} + \gamma $$
%
and, therefore, the condition in the indicator above is satisfied if and only if
%
$$d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0) - d_I^{\varepsilon} > \frac{2\gamma}{(1- \gamma)}.$$
%
Finally, if we interpret $\varepsilon$ as a function parameter, then we can write the fraction of resamples in favor of \textbf{non}-rejection of $H_0$ (i.e., the observed p-values) as a function of the size $\gamma$ of the contamination of the underlying linear-vacuous model:
%
$$f_{\varepsilon}(\gamma):=1-\frac{1}{N}\cdot \sum_{I \in \mathcal{I}_N} \mathds{1}_{\bigl\{d^{\varepsilon}_{\mathbf{X},\mathbf{Y}}(\omega_0) - d_I^{\varepsilon} > \frac{2\gamma}{(1- \gamma)}\bigr\}}.$$
%
\section{Further details on the applications} \label{furtheranalysis}
%
\subsection{Data Sets}
We applied our analysis to three different data sets:
\begin{itemize}
    \item For the poverty analysis, see Section~\ref{application}, we used the ALLBUS data set. The data set is described by \cite{allbus-2014} and \cite{breyer2015skala}. As mentioned already in the previous section, the data set is freely accessible, but only after registration in the corresponding online portal: \url{https://search.gesis.org/research_data/ZA5240} (accessed: 08.02.2023). Please download the file ZA5240\_v2-2-0.sav (5.31MB) there.

    The analysis was done on a sample consisting of 100 female and 100 male observations.

    \item We analyzed the dermatology data set, see \cite{demiroz1998learning} accessed via \cite{Dua:2019}.

    The analysis was performed on a sample of $46$ individuals with a family history of erythemato-squamous disease and $100$ individuals without.

    \item We analyzed the German credit data set, see \cite{Dua:2019}.

    The analysis was performed on a sample of $100$ credit risks classified as good and 100 credit risks classified as poor individuals.
\end{itemize}


\subsection{Application on Credit Data}

We focus on three variables (features) in the German credit data set \cite{Dua:2019}: credit amount (numeric), credit history (ordinal, $5$ levels ranging from \say{delay in paying off in the past} to \say{all credits paid back duly}) and employment status (ordinal, $5$ levels ranging from \say{unemployed} to \say{present employment longer than $7$ years}). We use a subsample with $n = m = 100$ high-risk applicants and low-risk applicants each. We are interested in the hypothesis that high-risk applicants are dominated by low-risk applicants w.r.t. GSD. The test results (see Figures \ref{fig:credit-1} and \ref{fig:credit-2} in the supplementary material) can be interpreted analogously to Section~\ref{application}: For~$\varepsilon \in \{0.75,1\}$ we reject for the common significance level of $\alpha \approx 0.05$. This time, we do not reject in case of $\varepsilon = 0.5$.

Similar to the example of poverty analysis in Section~\ref{application}, rejecting $H_0$ does not necessarily mean that high-risk applicants are dominated by low-risk applicants. They could also be incomparable, see also Section~\ref{testdom}. However, our tests with reversed variables give no evidence of incomparability: The observed p-values for all these reversed tests are all $1$.

\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.45]{plots/distr-credit.pdf}
    \caption{Distributions of ${d}^{\varepsilon}_{I}$ with $\varepsilon \in \{0,0.25,0.5,0.75,1\}$ 
     obtained from $N = 1000$ resamples of Credit data. Black stripes show exact positions of ${d}^{\varepsilon}_{I}$ values. Vertical black line marks median. Red line shows value of the respective observed test statistics ${d}^{\varepsilon}_{\mathbf X, \mathbf Y}(\omega) $.  }    \label{fig:credit-1}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.45]{plots/Rplot39.pdf}
    \caption{P-values as function of the contamination~$\gamma$ (see Supp.~C) for tests with different regularization strength~$\varepsilon$ performed on credit data set. Dotted red line marks significance level $\alpha = 0.05$.}
    \label{fig:credit-2}
\end{figure}



\newpage

\subsection{Application on Dermatological Data}

We focus on three variables (features) in the dermatology data set \cite{demiroz1998learning, Dua:2019}: age of skin (numeric), the intensity of itching (ordinal, 4 levels ranging from \say{no itching} to \say{strong itching}) and erythema (redness of skin) (ordinal, 4 levels again ranging from no to highest intensity). We use a subsample with $n = 46$ patients with a family history of erythemato-squamous disease and $m = 100$ without. We are interested in the hypothesis that patients with a family history of the disease are dominated by patients without a family history with respect to GSD. The test results (see Figures~\ref{fig:derma-1} and \ref{fig:derma-2} in the supplementary material) can be interpreted analogously to Section~\ref{application}: For~$\varepsilon \in \{0.75,1\}$ we again reject for the common significance level of $\alpha \approx 0.05$. However, the p-values are much higher than in the other two applications, see also Figure~\ref{fig:derma-2} (in the supplementary material). 

Similar to the example of poverty analysis in Section~\ref{application}, rejecting $H_0$ does not necessarily mean that patients with a family history of erythemato-squamous disease are dominated by patients without. They could also be incomparable; see also Section~\ref{testdom}. However, our tests with reversed variables give no evidence of incomparability: The observed p-values for all these reversed tests are all 1.



\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.45]{plots/distr-derma.pdf}
    \caption{Distributions of ${d}^{\varepsilon}_{I}$ with $\varepsilon \in \{0,0.25,0.5,0.75,1\}$ 
     obtained from $N = 1000$ resamples of dermatology data. Black stripes show exact positions of ${d}^{\varepsilon}_{I}$ values. Vertical black line marks median. Red line shows value of the respective observed test statistics ${d}^{\varepsilon}_{\mathbf X, \mathbf Y}(\omega) $.  }    \label{fig:derma-1}
\end{figure}


\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.45]{plots/Rplot43.pdf}
    \caption{P-values as function of the contamination~$\gamma$ (see Supp.~C) for tests with different regularization strength~$\varepsilon$ performed on Dermatology data set. The dotted red line marks significance level $\alpha = 0.05$.}
    \label{fig:derma-2}
\end{figure}




\newpage
\bibliography{jansen_234}

\end{document}


The proof is a straightforward generalization of the one in~\citet[Proposition~3]{jnsa2022} to general preference systems. Note that nothing in the proof given there hinges on the specific structure of the considered preference system.

First, let $opt_{ij} \geq 0$. Choose $ u \in \mathcal{N}^{\delta}_{\mathbb{C}}$ arbitrarily and let $g:\mathbb{R}^d \to \mathbb{R}$ denote the objective function of the linear program. We then have
\begin{equation}
D(u):=\mathbb{E}_{\pi}(u \circ \phi(C_i,\cdot)) - \mathbb{E}_{\pi}(u \circ \phi(C_j,\cdot))=  g(u(q_1), \dots , u(q_d)) \geq 0
\end{equation}
where the equation follows by simple manipulations of the expected values and the lower bound of $0$ follows since, by definition, $(u(q_1), \dots , u(q_d))\in \nabla^{\delta}_{\mathbb{C}}$. Since $u \in \mathcal{N}^{\delta}_{\mathbb{C}}$ was chosen arbitrarily, this implies $C_i \succsim_{\delta} C_j$.
\\[.15cm]
Conversely, let $opt_{ij} < 0$. Choose $(u^*_1 , \dots , u^*_d) \in \nabla^{\delta}_{\mathbb{C}}$ to be an optimal solution yielding $opt_{ij}$ and define $u: \mathcal{Q} \to [0,1]$ by setting $u(q_i):=u_i^*$ for all $i = 1 , \dots , d$. We then have to distinguish two different cases:
\\[.15cm]
\textit{Case 1:} $\delta >0.$ One then easily verifies that $u \in \mathcal{N}^{\delta}_{\mathbb{C}}$ and
\begin{equation}
D(u)=  g(u_1^*, \dots , u_d^*)=opt_{ij} < 0
\end{equation}
Thus, $u$ is a function from $\mathcal{N}^{\delta}_{\mathbb{C}}$ with $\mathbb{E}_{\pi}(u \circ \phi(C_i,\cdot)) < \mathbb{E}_{\pi}(u \circ \phi(C_j,\cdot))$. Thus $\neg(C_i \succsim_{\delta} C_j)$.
\\[.15cm]
\textit{Case 2:} $\delta =0.$ If $u \in \mathcal{N}^{0}_{\mathbb{C}}$, then the same argument as in the first case applies. Thus, assume that $u \notin \mathcal{N}^{0}_{\mathbb{C}}$. Then, since $(u^*_1 , \dots , u^*_d) \in \nabla^{0}_{\mathbb{C}}$, we still know that $u$ is monotone but we no longer have \textit{strict} monotonicity with respect to the relations $R_1$ and $R_2$ of $\mathbb{C}$  (meaning that properties i) and ii) from Definition~\ref{consistency} are still valid but without the \textit{iff} condition). Now, choose $u^+ \in \mathcal{N}^{0}_{\mathbb{C}}$ arbitrarily (this is always possible, since we assume $0$-consistency). If $D(u^+)< 0$, then $\mathbb{E}_{\pi}(u^+ \circ \phi(C_i,\cdot)) < \mathbb{E}_{\pi}(u^+ \circ \phi(C_j,\cdot))$. This yields $\neg(C_i \succsim_{\delta} C_j)$. If $D(u^+)\geq 0$, then we have
$$0 \leq \xi:=\frac{D(u^+)}{D(u^+)-D(u)} <1$$
and we can choose $\alpha \in (\xi,1)$. One then easily verifies that $u_{\alpha}:= \alpha u + (1- \alpha) u^+ \in  \mathcal{N}^{0}_{\mathbb{C}}$ and that $\mathbb{E}_{\pi}(u_{\alpha} \circ \phi(C_i,\cdot)) < \mathbb{E}_{\pi}(u_{\alpha} \circ \phi(C_j,\cdot))$. This again yields that $\neg(C_i \succsim_{\delta} C_j)$, thereby completing the proof.
