% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
% \usepackage{natbib} % has a nice set of citation styles and commands
%     \bibliographystyle{abbrvnat}

\usepackage{csquotes}
\usepackage[style=authoryear, natbib, giveninits, uniquename=init, maxbibnames=99]{biblatex}
\setlength\bibitemsep{2\itemsep}

\addbibresource{references.bib}
    % \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% \usepackage[textsize=tiny]{todonotes}
\usepackage{microtype}
\usepackage{graphicx} 
\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{tabularx}

% Theorem-like environments (amsthm).
% `theorem` is numbered within each section; every other environment below is
% declared with [theorem] and therefore shares that counter, so all results
% are numbered consecutively within a section regardless of their kind.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definitions and assumptions use amsthm's `definition` style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remarks use amsthm's `remark` style.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


% ---------------------------------------------------------------------------
% Notation macros
% ---------------------------------------------------------------------------

% Shorthand for \operatorname; defined first because \Var, \Exp and \hyp
% below are written in terms of it.
\newcommand{\on}{\operatorname}

% Variance and expectation symbols, typeset as operators.
\newcommand{\Var}{\on{\mathbb{V}}}
\newcommand{\Exp}{\on{\mathbb{E}}}

% Number sets and calligraphic sets. \IR/\R and \IN/\N are aliases.
\newcommand{\IR}{\mathbb{R}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\IS}{\mathcal{S}}
\newcommand{\IC}{\mathcal{C}}
\newcommand{\IN}{\mathbb{N}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\eps}{\varepsilon}
\newcommand{\hyp}{\on{hyp}}

% Punctuation-like helpers.
\newcommand*{\sothat}{:}
% NOTE(review): \xspace is provided by the xspace package, which is not loaded
% in the visible preamble -- confirm the document class loads it before
% relying on \thatis.
\newcommand{\thatis}{i.\,e.,\xspace}
\newcommand*{\from}{\colon}
\newcommand{\data}{\mathcal{D}}

% arg min / arg max / arg sort as operators whose subscripts are set as limits
% in display style. The name is given as plain text ("arg\,min") instead of
% nesting \arg and \min inside \operatorname*: those are operators themselves,
% so nesting them adds TeX's automatic inter-operator space on top of the
% explicit \, and widens the gap between the two words.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argsort}{arg\,sort}

% Calligraphic shorthands.
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cF}{\mathcal{F}}

% Conditioning bar with a thin space on either side.
\newcommand{\given}{\, | \,}
% Highlights its argument in green. Relies on \textcolor being available
% (presumably via the xcolor package pulled in by tikz -- TODO confirm).
\newcommand{\grammarcheck}[1]{\textcolor{green}{#1}}

% Long arrow for function signatures. \to is overridden as well, so every
% arrow written with \to in the document is typeset as \longrightarrow.
\newcommand{\fromto}{\longrightarrow}
\renewcommand{\to}{\longrightarrow}
% \vec is overridden to set its argument in bold via \boldsymbol.
\renewcommand{\vec}[1]{\boldsymbol{#1}}

% Probability measure P and density/mass function p, declared as operators.
\DeclareMathOperator{\Prob}{\mathit{P}}
\DeclareMathOperator{\prob}{\mathit{p}}

% \blfootnote{<text>}: places <text> as a footnote without a footnote mark.
% \thefootnote is emptied only inside the brace group, so the override does
% not leak to later footnotes; \footnotetext is used so that no mark is
% placed in the running text.
\newcommand\blfootnote[1]{%
  {%
    \renewcommand\thefootnote{}%
    \footnotetext{#1}%
  }%
}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Template example macro: \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>,
% i.e. the two mandatory arguments in reverse order with the optional
% separator (default: a hyphen) between them.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Conformal Prediction without
Nonconformity Scores}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2,*]{\href{mailto:<jonas.hanselle@ifi.lmu.de>?Subject=Your UAI 2025 paper}{Jonas Hanselle}{}}
\author[1,2,*]{Alireza Javanmardi}
\author[1,2]{Tobias Oberkofler}
\author[1,2]{Yusuf Sale}
\author[1,2,3]{Eyke Hüllermeier}
% Add affiliations after the authors
\affil[1]{%
    Institute of Informatics\\
    LMU Munich\\
    Germany
}
\affil[2]{%
    Munich Center for Machine Learning (MCML)\\
    Germany
}
\affil[3]{%
    German Centre for Artificial Intelligence (DFKI), Kaiserslautern, Germany
 }
  \begin{document}
\maketitle
\blfootnote{* indicates equal contribution.}
\begin{abstract}
  Conformal prediction (CP) is an uncertainty quantification framework that allows for constructing statistically valid prediction sets. Key to the construction of these sets is the notion of a nonconformity function, which assigns a real-valued score to individual data points: only those (hypothetical) data points contribute to a prediction set that sufficiently conform to the data. 
  The point of departure of this work is the observation that CP predictions are invariant against (strictly) monotone transformations of the nonconformity function. In other words, it is only the ordering of the scores that matters, not their quantitative values. Consequently, instead of scoring individual data points, a conformal predictor only needs to be able to compare pairs of data points, deciding which of them is the more conforming one. This suggests an interesting connection between CP and preference learning, in particular learning-to-rank methods, and makes CP amenable to training data in the form of (qualitative) preferences. Elaborating on this connection, we propose methods for preference-based CP and show their usefulness in real-world classification tasks.  

  % learning (latent) nonconformity functions from data of that kind 
  %while conventional CP methods rely on a quantitative notion of conformity to decide which observations are included in said sets (typically referred to as \textit{nonconformity scores}), only a qualitative understanding is necessary: Being able to tell which of two observations is more conformal than another is adequate for conformal classification. This opens up opportunities for learning nonconformity relations from preferences, a type of data that is ubiquitious in many application domains such as LLM finetuning or information retrieval. In this work we show that for each possible nonconformity score there is an equivalent preference relation that can be inferred form pairwise data. We demonstrate this equivalence on synthetic data and use aforementioned preference relations in real-world classification tasks, showcasing the usefulness of our method.
  
  \end{abstract}

\section{Introduction}
\label{sec:introduction}
% 
Conformal prediction (CP) \citep{vovk_algorithmic_2022} has recently emerged as a prominent tool for uncertainty quantification in machine learning, offering formal statistical guarantees \citep{angelopoulos_gentle_2021}. Instead of point predictions, which do not account for potential uncertainty, conformal prediction provides set-valued predictions (e.g., subsets of class labels in classification or intervals in regression). A key advantage over other uncertainty quantification methods is its ability to provide coverage guarantees without relying on distributional assumptions, making it applicable to any predictive model. 
% 
At the core of conformal prediction is the notion of a nonconformity score, a real-valued function that quantifies how atypical a (hypothetical) data point is compared to previously observed data. During inference, given a new input as a query, each possible outcome is evaluated by comparing its nonconformity to those in a reference set, determining how well it aligns with past observations. Specifically, if the nonconformity score of the new input with a given outcome ranks before a certain quantile, that outcome is included in the prediction set. 


An important observation here is that the exact values of the scores do not matter; rather, it is the relative ordering of the data points in terms of their nonconformities that ultimately determines the prediction set. Indeed, any monotonic transformation of the nonconformity function that preserves the \emph{ranking} of the scores will leave the prediction set unchanged \citep{vovk_algorithmic_2022}.
% 
% Unlike \emph{full} conformal prediction, the more computationally efficient \emph{split} conformal approach \citep{papadopoulos_inductive_2002, lei2018distribution} partitions the available data into two separate sets, one for model training, and the other for calibration. More specifically, assume that a pre-trained model $\widehat{\mu}: \mathbb{R}^d \rightarrow \mathcal{Y}$ is available, where $\mathcal{Y}$ denotes the target space. Further, denote by $\{(X_i, Y_i)\}_{i = 1}^n$ the calibration set. Having seen the current feature $X_{n+1} \in \mathbb{R}^d$ the goal is to construct a prediction set $\mathcal{C}(X_{n+1})$ for the unseen $Y_{n+1}$ such that $\mathcal{C}(X_{n+1})$ contains $Y_{n+1}$. More formally, we require that 
% \begin{align}\label{eq:cov}
%     \mathbb{P}\{ Y_{n+1} \in \mathcal{C}(X_{n+1}) \} \geq 1 - \alpha,
% \end{align}
% where the probability is taken over $\{(X_i, Y_i)\}_{i=1}^{n+1}$. 
% We call  \eqref{eq:cov} \emph{marginal} coverage guarantee. 
%
% The workflow of (split) conformal prediction is quite straightforward: given a new input $X_{n+1}$ the goal is to assess how \emph{conforming} (or non-conforming) a candidate output $y \in \mathcal{Y}$ is with respect to the calibration data. To achieve this, a nonconformity score is first defined, which quantifies the extent to which a new observation deviates from previously seen data. For continuous targets, a common choice is the absolute residual, $s(X_{n+1},y) = |\widehat{\mu}(X_{n+1}) - y|$, though more sophisticated scores can be used depending on the application. Once a suitable non-conformity function is selected, the next step is to compute non-conformity scores for all calibration examples, $s(X_i, Y_i)$ for $i \in \{1,\dots,n\}$. These scores are then used to determine a threshold, $\widehat{q}_{\alpha}$, which is typically chosen as the $\lceil (1 - \alpha)(n +1) \rceil$-smallest value of the residuals in the calibration set. Consequently, the final prediction set is given by:
% \begin{align}\label{eq:pred-set}
%     \mathcal{C}(X_{n+1}) = \left\{y \in \mathcal{Y}\,:\, s(X_{n+1},y) \leq \widehat{q}_{\alpha} \right\}.
% \end{align}
% An obvious, yet important, observation is that the final prediction set \eqref{eq:pred-set} depends solely on the relative ordering of the nonconformity scores, rather than on their actual numerical values. Consequently, any monotonic transformation of the nonconformity function that preserves the \emph{ranking} of the scores will leave the prediction set unchanged.
% 
In this paper, we leverage this observation to make CP amenable to preference learning \citep{furnkranz_preference_2011}. Rather than assigning a numerical score to each data point independently, one can instead focus on pairwise comparisons to determine which data points are more conforming and which are less. 
In other words, a conformal predictor essentially requires the ability to learn a ranking over the data, thereby establishing a natural connection to preference learning and learning-to-rank frameworks. 

More concretely, we propose novel methods for learning latent nonconformity functions directly from qualitative preference data. 
Our approach replaces the conventional pointwise scoring mechanism with a strategy that utilizes pairwise comparisons, making it well-suited for scenarios where training data are available in the form of relative judgments.
This is particularly useful in fields where human judgment is the primary source of information, often collected through comparative assessments \citep{yannakakis2015ratings}.
The analysis of data of this type has a long-standing tradition in economics and psychology, where preference information is used to analyze consumer behavior and decision making.
% More recently, pairwise comparisons are also collected in the field of machine learning, most prominently for fine-tuning large language models to align with human preferences \cite{ouyang_training_2022}.
More recently, pairwise comparisons have become integral to machine learning, too, especially for fine-tuning large language models and aligning them with human preferences \citep{ouyang_training_2022}.
% 
This type of feedback is also promising for eliciting the human perception of nonconformity.
Indeed, humans often struggle with quantitative assessments, such as assigning precise probabilities to events \citep{tversky_judgment_1974}. Comparing two events and deciding which one is more probable, or likewise comparing two data points and saying which of them is more conforming to the data, is arguably easier. 
%the subsequent task of assigning numeric nonconformity values to observations is likely to be even more difficult.
% Therefore, the connection between CP and preference learning facilitates the exploration of human implicit understanding of nonconformity and makes it accessible for constructing prediction sets.
Thus, the connection between CP and preference learning provides a means of harnessing human implicit understanding of nonconformity, making it accessible for the construction of prediction sets.

%\emph{Paper organization.} 
In Section \ref{sec:problem}, we first review the (theoretical) foundations that underpin the relationship between conformal prediction and preference learning. In Section \ref{sec:pl}, we then introduce our method for learning nonconformity relations represented by latent nonconformity functions from preference data. Afterward, we discuss the experiments we conducted to validate the proposed method, showing that it indeed manages to induce meaningful nonconformity scores and is applicable in the context of conformal classification. Through extensive evaluations, we confirm that our approach not only preserves the formal statistical guarantees of conformal prediction but also enhances its applicability in settings where qualitative preference data are more readily available. 

% By construction, this approach ensures that the marginal coverage property in \eqref{eq:cov} holds under the assumption of exchangeability.


% Rooted in classical frequentist statistics, this framework enables the construction of reliable prediction sets on top of any model without requiring distributional assumptions. At its core, the nonconformity function quantifies the strangeness of a pair $(x,y)$, where higher nonconformity indicates a less expected data point. This measure is then used to rank data points. During inference, each possible label for a given instance is treated as a candidate, and its nonconformity—along with its ranking among the augmented dataset—determines whether it should be excluded from the prediction set.

\section{Reinterpreting CP via Preference Relations}\label{sec:problem}
% \subsection{conformal prediction}
% Consider a set of data points $\mathcal{D} = \{(X_i, Y_i)\}_{i = 1}^{n+1} \subset \mathcal{X} \times \mathcal{Y}$ drawn exchangeably from an unknown distribution 
% $P$. 
Consider a set of data points $\mathcal{D} = \{(X_i, Y_i)\}_{i = 1}^{n} \subset \mathcal{X} \times \mathcal{Y}$ drawn from an unknown distribution $P$. Let $(X_{n+1}, Y_{n+1})$ be a future test point, also drawn from $P$, such that the combined collection $\mathcal{D} \cup \{(X_{n+1}, Y_{n+1})\}$ is exchangeable.
Assuming that the outcome $Y_{n+1}$ for the instance 
$X_{n+1}$ is unobserved, conformal prediction seeks to construct a prediction set 
$\mathcal{C}(X_{n+1}) \subseteq \mathcal{Y}$ such that 
\begin{equation}\label{eq:coverage}
    \mathbb{P}(Y_{n+1} \in \mathcal{C}(X_{n+1})) \geq 1-\alpha,
\end{equation}
where $\alpha \in (0,1)$ is a user-specified error rate and the probability is taken over all samples $\mathcal{D} \cup \{(X_{n+1}, Y_{n+1})\}$. 
%assuming that  $(X_{n+1},Y_{n+1})$ is also drawn exchangeably from  $P$. 

In essence, conformal prediction tests the hypothesis $Y_{n+1} = y$ for each possible label $y \in \mathcal{Y}$. To compute p-values for this hypothesis test, CP relies on a nonconformity score $s:\mathcal{X} \times \mathcal{Y} \fromto \mathbb{R}$ that quantifies how atypical a new pair $(x,y)$ is relative to the previously observed data, with higher values indicating less conformity. This score can be predefined, but is normally derived from a model fitted on $\mathcal{D}$. For example, in classification, a common choice for the nonconformity score is $s_\text{LAC} (x,y) :=1-\hat{p}(y\mid x)$ \citep{sadinle_least_2019}, where 
$\hat{p}(y\mid x)$ is the predicted probability assigned to the class $y$.

There are two main variants of CP: full and split conformal prediction \citep{papadopoulos_inductive_2002,lei2018distribution}. In full CP, the entire dataset 
$\mathcal{D}$ is used for model training. Specifically, for each candidate 
$y \in \mathcal{Y}$, a model $M_y$ is trained on the augmented dataset 
$\mathcal{D} \cup \{(X_{n+1}, y)\}$, and the nonconformity scores for all $n+1$ data points are computed based on the fitted model $M_y$. Finally, the p-value for the hypothesis $Y_{n+1}=y$ is calculated as
\begin{equation}
\label{eq:p_value:full}
p_\text{full}^y = \frac{1 + \sum_{i=1}^n \mathds{1} \{s_{M_y}(X_i, Y_i) \geq s_{M_y}(X_{n+1}, y) \} }{n+1} \, ,   
\end{equation}
where $s_{M_y}$ is the nonconformity score defined based on $M_y$. 
One immediate drawback of full conformal prediction is the need to train $|\mathcal{Y}|$ models, which makes it computationally expensive. In contrast, split conformal prediction requires only a single model training step, at the cost of partitioning the data $\mathcal{D}$ into a training set $\mathcal{D}_\text{train}$ and a separate calibration set $\mathcal{D}_\text{calib}$. Specifically, split CP trains a single model $M$ on $\mathcal{D}_\text{train}$ and then uses this model to compute nonconformity scores on the calibration set. The p-value for the hypothesis $Y_{n+1}=y$ is then calculated as
\begin{equation}
\label{eq:p_value:split}
p_\text{split}^y = \frac{1 + \sum_{i\in\mathcal{D}_\text{calib}} \mathds{1} \{s_{M}(X_i, Y_i) \geq s_{M}(X_{n+1}, y) \} }{|\mathcal{D}_\text{calib}|+1},   
\end{equation} 
where $s_{M}$ is the score defined based on $M$.

Given the p-values for each label in $\mathcal{Y}$ calculated using \eqref{eq:p_value:full} or \eqref{eq:p_value:split}, the conformal prediction set is constructed as:
\begin{equation}\label{eq:CPset}
    \cC(X_{n+1}) = \{y \in \cY \colon p^y \geq \alpha\}.
\end{equation}

Regardless of the variant of conformal prediction---full or split---and the choice of the nonconformity score, an important observation is the one we already highlighted in the introduction: what ultimately matters in the calculation of p-values and the construction of the prediction set is the relative ranking of the data points, rather than the exact values of the scores. This invariance under strictly increasing transformations of the score is formally stated in the following lemma.

% \begin{proposition}
% Let $s$ be a nonconformity score and 
% $\mathcal{T}(s)$ be any strictly monotonically increasing transformation of $s$. The conformal prediction set constructed using $s$ is equivalent to the one constructed using $\mathcal{T}(s)$.
% \end{proposition}
\begin{lemma}[\citealp{vovk_algorithmic_2022}]
Let $s: \mathcal{X} \times \mathcal{Y} \to \mathbb{R}$ be a nonconformity score and let $\mathcal{T}: \mathbb{R} \to \mathbb{R}$ be a strictly increasing function. For any $\alpha \in (0,1)$, the conformal prediction set constructed from $s$  is identical to that constructed from $\mathcal{T}(s)$.
\end{lemma}
% \begin{proof}
% Since $\mathcal{T}$ is strictly increasing, for any $(X_1, Y_1)$ and $(X_2, Y_2)$ we have
%     \begin{align*}
%         s(X_1,Y_1) \geq & s(X_2,Y_2) \\ & \quad \iff 
%         \mathcal{T}(s(X_1,Y_1)) \geq \mathcal{T}(s(X_2,Y_2))
%     \end{align*}
% Hence, we have
%     \begin{align*}
%         \mathds{1} \{ s(X_1,Y_1) \geq &s(X_2,Y_2)\} \\& \iff \mathds{1} \{\mathcal{T}(s(X_1,Y_1)) \geq \mathcal{T}(s(X_2,Y_2)) \},
%     \end{align*}
% and the p-values in \eqref{eq:p_value:full} and \eqref{eq:p_value:split} remain unchanged. 
% \end{proof}
%
According to this observation, one may ask whether, instead of defining a nonconformity score based on a model (e.g., a probabilistic classifier) that is ultimately used to rank data points, it would be sufficient to directly learn a relation over pairs that ranks them without explicitly defining a score. 
% 
As already said, this is closely related to the concept of preference learning in machine learning, where the objective is to learn a ranking over data points \citep{furnkranz_preference_2011}. More specifically, learning-to-rank methods construct a (weak) preference relation 
$\succsim$ that, given a pair of objects $Z_i$ and $Z_j$, determines which one is preferred over (or ranked before) the other. For example, $Z_i \succsim Z_j$ means that object $Z_i$ is preferred to, or considered as good as, $Z_j$. In our setting, the objects are data points $Z = (X,Y)$, so $(X_{i}, Y_{i}) \succsim (X_{j}, Y_{j})$ means that instance $X_i$ with label $Y_i$ is (weakly) preferred to instance $X_j$ with label $Y_j$. 
% An additional advantage of preference learning methods is their ability to learn such a ranking relation using weaker supervision, in particular, pairwise comparisons of the form
% $\{(X_{i_1}, Y_{i_1}) \succsim (X_{i_2}, Y_{i_2})\}_{i=1}^n$.
We will return to the learning of preferences later in Section \ref{sec:pl}.

Having access to such a relation, we can modify the conformal prediction framework as follows: in the case of full CP, for each candidate 
$y \in \mathcal{Y}$, we use the augmented dataset $\mathcal{D} \cup \{(X_{n+1}, y)\}$ to infer a preference relation $\succsim_{s_y}$. This allows us to redefine the notion of a p-value in \eqref{eq:p_value:full} as follows:
\begin{equation}
\label{eq:p_value:full:preference}
p_\text{full}^y = \frac{1 + \sum_{i=1}^n \mathds{1} \{(X_{n+1},y) \succsim_{s_y} (X_i, Y_i) \} }{n+1}.   
\end{equation}

Accordingly, for the split CP, we use the training data $\mathcal{D}_\text{train}$ to learn a single relation $\succsim_s$ and modify \eqref{eq:p_value:split} to have
\begin{equation}
\label{eq:p_value:split:preference}
p_\text{split}^y = \frac{1 + \sum_{i\in\mathcal{D}_\text{calib}} \mathds{1} \{(X_{n+1},y) \succsim_{s} (X_i, Y_i) \} }{|\mathcal{D}_\text{calib}|+1}.   
\end{equation} 
  
The pseudo-code of the proposed methods, which we refer to as \textit{preference-based} conformal prediction, is presented in Algorithms~\ref{alg:cp_without_ncs:full} and~\ref{alg:cp_without_ncs:split}, respectively.

% \begin{algorithm}[tb]
%      \caption{Full conformal prediction without non-conformity score}
%      \label{alg:cp_without_ncs:full}
%   \begin{algorithmic}
%      \State {\bfseries Input:} data $\mathcal{D}$, error rate $\alpha$, test instance $X_{n+1}$
%      \State For each $y\in \mathcal{Y}$: 
%     \begin{itemize}
%         \item use data $\mathcal{D} \cup (X_{n+1},y)$ to infer preference relation $\succsim_{s_y}$ 
%         \item calculate $p^y$ according to \eqref{eq:p_value:full:preference} given $\succsim_{s_y}$
%     \end{itemize}
%      \State Return prediction set $\mathcal{C}(X_{n+1}) = \{ y \in \cY \colon p^y \geq \alpha \}$ 
%   \end{algorithmic}
%   \end{algorithm}
  
\begin{algorithm}[tb]
     \caption{Full Preference-based Conformal Prediction}
     \label{alg:cp_without_ncs:full}
  \begin{algorithmic}
     \State {\bfseries Input:} data $\mathcal{D}$, error rate $\alpha$, test instance $X_{n+1}$
     \For{each $y \in \mathcal{Y}$}
        \State Use data $\mathcal{D} \cup \{(X_{n+1},y)\}$ to infer relation $\succsim_{s_y}$
        \State Calculate $p^y$ according to \eqref{eq:p_value:full:preference} given $\succsim_{s_y}$
     \EndFor
     \State Return prediction set $\mathcal{C}(X_{n+1}) = \{ y \in \mathcal{Y} \colon p^y \geq \alpha \}$ 
  \end{algorithmic}
  \end{algorithm}

  \begin{algorithm}[tb]
     \caption{Split Preference-based Conformal Prediction}
     \label{alg:cp_without_ncs:split}
  \begin{algorithmic}
     \State {\bfseries Input:} data $\mathcal{D}$, error rate $\alpha$, test instance $X_{n+1}$
     \State Partition $\mathcal{D}$ into $\mathcal{D}_\text{train}$ and $\mathcal{D}_\text{calib}$
     \State Use $\mathcal{D}_\text{train}$ to infer preference relation $\succsim_s$ 
     % \State Sort $\mathcal{D}_\text{calib}$ according to $\succsim_s$ 
     \State For each $y\in \mathcal{Y}$, calculate $p^y$ according to \eqref{eq:p_value:split:preference} given $\mathcal{D}_\text{calib}$
     \State Return prediction set $\mathcal{C}(X_{n+1}) = \{ y \in \cY \colon p^y \geq \alpha \}$ 
     % \State Let $(X_q,Y_q)$ be the $\lceil (1-\alpha) (n+1)  \rceil$ -th element in the sorted list 
     % \State Return prediction set $\mathcal{C}(X_{n+1}) = \{ y \in \cY \colon (X_{n+1},y) \succsim (X_q,Y_q) \}$ \\
  \end{algorithmic}
  \end{algorithm}
Beyond the natural connection to the notion of nonconformity, an additional advantage of this approach is its ability to learn such a ranking relation using weaker supervision, in particular, pairwise comparisons of the form 
$ \{(X_{i_1}, Y_{i_1}) \succsim (X_{i_2}, Y_{i_2})\}_{i=1}^n$.
This highlights the benefit of our approach compared to standard CP in scenarios where a large number of pairwise comparisons is available, but supervision in the form of labeled data is limited, as shown empirically in Section~\ref{subsec:mixed_setup}.
\subsection{Validity of the proposed method}
As mentioned earlier, a key property of CP is its marginal coverage guarantee, as stated in \eqref{eq:coverage}. Hence, it is necessary to provide sufficient conditions under which the method proposed in the previous section preserves this guarantee. We begin by defining the following properties of a relation $\succsim$.
% \begin{definition}[Transitivity]
%     A relation $\succsim$ is said to be transitive if and only if given any $(X_1,Y_1), (X_2,Y_2), (X_3,Y_3) \in \mathcal{X} \times \mathcal{Y}$, 
%     \begin{equation}\label{eq:transitivity}
%     \begin{cases}
%         (X_1,Y_1)\succsim(X_2,Y_2) & \\
%         \text{ and } \\
%         (X_2,Y_2) \succsim (X_3,Y_3) & 
%     \end{cases}
%            \Rightarrow (X_1,Y_1)\succsim(X_3,Y_3).
%     \end{equation}
% \end{definition}

% \begin{definition}[Completeness]
%     A relation $\succsim$ is said to be complete if and only if given any $(X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$, either $(X_1,Y_1)\succsim(X_2,Y_2)$ or $(X_2,Y_2)\succsim(X_1,Y_1)$ holds.
% \end{definition}

% \begin{definition}[Continuity]
%     A relation $\succsim$ is said to be continuous if and only if the sets $\{ (W,Z)| (X,Y)\succsim(W,Z)\}$ and $\{ (W,Z)| (W,Z)\succsim(X,Y)\}$ are closed for all $(X,Y) \in \mathcal{X} \times \mathcal{Y}$.
% \end{definition}


\begin{definition}\label{def:properties}
    Consider a relation $\succsim$ defined on $\mathcal{X} \times \mathcal{Y}$.
    \begin{enumerate}
        \item \textbf{Transitivity:} $\succsim$ is transitive if and only if for all $(X_1,Y_1), (X_2,Y_2), (X_3,Y_3) \in \mathcal{X} \times \mathcal{Y}$,  
        \begin{equation*}\label{eq:transitivity}
        \begin{cases}
            (X_1,Y_1)\succsim(X_2,Y_2) & \\
            \text{ and } \\
            (X_2,Y_2) \succsim (X_3,Y_3) & 
        \end{cases}
               \Rightarrow (X_1,Y_1)\succsim(X_3,Y_3).
        \end{equation*}
        
        \item \textbf{Completeness:} $\succsim$ is complete if and only if for all $(X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$, either 
        \begin{equation*}
            (X_1,Y_1)\succsim(X_2,Y_2) \quad \text{or} \quad (X_2,Y_2)\succsim(X_1,Y_1)
        \end{equation*}
        holds.
        
        \item \textbf{Continuity:} $\succsim$ is continuous if and only if for every $(X,Y) \in \mathcal{X} \times \mathcal{Y}$, the sets
        \begin{equation*}
            \{ (W,V)\in \mathcal{X} \times \mathcal{Y}\mid  (X,Y)\succsim(W,V)\}
        \end{equation*}
        and 
        \begin{equation*}
            \{ (W,V)\in \mathcal{X} \times \mathcal{Y}\mid  (W,V)\succsim(X,Y)\}
        \end{equation*}
        are closed.
    \end{enumerate}
\end{definition}

The following theorem states that a relation, under certain assumptions, can be represented by a real-valued (unknown) utility function, where the preference of one point over another can be translated into a numerical comparison of their latent utilities.
% \begin{theorem}[\cite{debreu_representation_1954}]\label{thm:utility}
%     A relation $\succsim$ that is transitive, complete, and continuous can be represented by a utility function $u:\mathcal{X} \times \mathcal{Y} \fromto \mathbb{R}$, such that 
%     \begin{equation}\label{eq:utility}
%         (X_1,Y_1)\succsim(X_2,Y_2) \iff u(X_1,Y_1)\geq u(X_2,Y_2),
%     \end{equation}
%     for every $ (X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$.
% \end{theorem}
\begin{theorem}[\citealp{debreu_representation_1954}]\label{thm:utility}
Let $\succsim$ be a binary relation on $\mathcal{X} \times \mathcal{Y}$ that is transitive, complete, and continuous. Then there exists a utility function $u:\mathcal{X} \times \mathcal{Y} \fromto \mathbb{R}$ such that, for all $(X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$,
\begin{equation}\label{eq:utility}
        (X_1,Y_1)\succsim(X_2,Y_2) \iff u(X_1,Y_1)\geq u(X_2,Y_2).
\end{equation}
% for every $ (X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$.
\end{theorem}
%
Next, we show that if there exists more than one utility function that represents a relation, they must all be comonotonic, meaning that they must agree on the relative comparisons. 
%
\begin{lemma}[Comonotonicity]\label{prop:comonotonicity}
Let $\mathcal{U}$ be the set of utility functions representing a transitive, complete, and continuous relation $\succsim$. Then, for every $u, u' \in \mathcal{U}$ and every pair $(X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$,
\begin{equation*}
% \label{eq:comonotonicity}
    u(X_1,Y_1)\geq u(X_2,Y_2) \iff u'(X_1,Y_1)\geq u'(X_2,Y_2). 
\end{equation*}
\end{lemma}
% In other words, all $u \in \mathcal{U}$ induce the same ordering on $\mathcal{X} \times \mathcal{Y}$.

\begin{proof}
% Using \eqref{eq:utility} two times for $u$ and $u'$ we have
By \eqref{eq:utility}, we have for each $u, u^{\prime} \in \mathcal{U}$ and all $ (X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$
\begin{align*}
    u(X_1,Y_1)\geq u(X_2,Y_2) &\iff
    (X_1,Y_1)\succsim (X_2,Y_2) \\ &\iff u'(X_1,Y_1)\geq u'(X_2,Y_2).
\end{align*}
    % for all $ (X_1,Y_1), (X_2,Y_2) \in \mathcal{X} \times \mathcal{Y}$ and $u, u' \in \mathcal{U}$.
\end{proof}
Now, we are ready to present our main theorems. 

% \begin{lemma}
%     Consider a relation $\succsim$ and let $\mathcal{U}$ be the set of all utility functions representing $\succsim$. The relation $\succsim$  defines a unique ranking over datapoints $\mathcal{U}$  if all the utility functions in $\mathcal{U}$  are comontonic. 
% \end{lemma}

\begin{theorem}\label{thm:validity:full}
    Let $\mathcal{D}\cup \{ (X_{n+1}, Y_{n+1}) \}$ be a set of exchangeable data points, and let $\{\succsim_{s_y}:y\in \mathcal{Y}\}$ be the set of relations as in Algorithm \ref{alg:cp_without_ncs:full}. If every $\succsim_{s_y}$ is transitive, complete, and continuous, and the algorithm used to derive them is invariant to data permutation, then the conformal prediction sets constructed using Algorithm \ref{alg:cp_without_ncs:full} are valid.  
\end{theorem}
\begin{proof}
Consider $y \in \mathcal{Y}$. According to Theorem \ref{thm:utility}, let $u_y$ be a utility function that represents $\succsim_{s_y}$. We can therefore rewrite \eqref{eq:p_value:full:preference} as follows: 
\begin{equation*}
p_\text{full}^y = \frac{1 + \sum_{i=1}^n \mathds{1} \{u_y(X_{n+1},y) \geq u_y(X_i, Y_i) \} }{n+1}.   
\end{equation*}
Hence, the conformal prediction procedure provided in Algorithm \ref{alg:cp_without_ncs:full} can be seen as the standard full CP with the conformity function $u_y$ for each $y \in \mathcal{Y}$, which is a valid set predictor as long as $\succsim_{s_y}$ does not depend on the ordering of the data points (see \cite[Proposition $3.8$]{angelopoulos2024theoretical} for more details). Also, note that in the case where more than one utility function can represent the relation $\succsim_{s_y}$, Lemma \ref{prop:comonotonicity} ensures that p-values remain the same. 
\end{proof}

\begin{theorem}\label{thm:validity:split}
    Let $\mathcal{D}_\text{calib}\cup \{ (X_{n+1}, Y_{n+1}) \}$ be a set of exchangeable data points, and let $\succsim_{s}$ be the relation as in Algorithm \ref{alg:cp_without_ncs:split} that is transitive, complete, and continuous. The conformal prediction sets constructed using Algorithm \ref{alg:cp_without_ncs:split} are valid.  
\end{theorem}
We omit the proof for this case, as it follows the same reasoning as in the previous theorem.
Our results show that the preference-based CP is valid when the three conditions in Definition \ref{def:properties} are met for the preference relation. What our theorem does not account for is the case where the relation does not satisfy some of these properties. Indeed, although such a relation is easy to obtain from a preference learning perspective, it is not clear whether one can achieve valid prediction sets with it.
% We omit the proof for this case, as it follows the same line of argumentation as in the previous theorem.
% Typically, the nonconformity measure is not predefined but is constructed based on the fitted model. For example, in regression, a common choice is the absolute residual $|y-\hat{y}|$, while in classification, it is the negative conditional probability assigned to the true label.



% More specifically, when building a prediction set for a test instance $x_{n+1}$ the conformal p-value is defined as 

% for all $y \in \cY$.
% The resulting prediction set is then given by $$\cC(x_{n+1}) = \{y \in \cY \colon p^y > \alpha\} $$
% for a specified error rate $\alpha$.
% By inspecting Equation \ref{eq:p_value}, we observe that the resulting p-value is only determined by the ranking of the nonconformity scores, not by their absolute values.
% This straightforward observation let's us reinterpret the nonconformity scores:
% Instead of real-valued functions, we can equivalently define a preference relation $\succsim_s$ over $\cX \times \cY$, yielding the following Proposition.
% \begin{proposition}\label{thm:equivalence}
% % [equivalence]
% For any nonconformity function $s \colon \cX \times \cY \fromto \R$ in conformal prediction, there exists a rank equivalent preference relation $\succsim_s$ on $\cX \times \cY$.
% \end{proposition}
% A direct consequence of the rank-equivalence are coinciding p-values, which also imply coinciding prediction sets.
% \begin{proposition}\label{thm:validity}
% % [validity]
%   If the data points in $\mathcal{D}_\text{calib} \cup (x_\text{new}, y_\text{new})$ are exchangeable, then 
%       \begin{align*}
%           \mathbb{P}(y_\text{new} \in \mathcal{C}(x)) \geq 1-\alpha.
%       \end{align*}
% \end{proposition}

% It comes at no surprise that valid prediction sets can be constructed without considering the absolute values of nonconformity scores.
% However, the use of a preference relation $\succsim_s$ in lieu of the nonconformity function makes CP amenable to methods from the field of preference learning.
% Having access to training data that is only present in terms of qualitative comparisons, we can learn a latent nonconformity relation $\succsim_s$ via learning-to-rank methods and use it for afterwards for building conformal predicion sets.
% % This is particularly interesting for applications where qualitative comparisons are the standard form of supervision, which is naturally the case when human judgment is collected 

% Building upon the standard procedure for split conformal prediction, a general procedure for conformal prediction without nonconformity scores is depicted in Algorithm \ref{alg:split_cp_without_ncs}.



%\clearpage
% \textit{Conformal prediction} (CP) is a distribution-free and model-agnostic uncertainty quantification method that has attracted considerable attention in the recent past \cite{vovk_algorithmic_2022}.
% CP leverages calibration data in order to construct prediction sets that achieve (guaranteed marginal) coverage at a specified error rate $\alpha$. 
% Conventional CP methods build these sets by computing \textit{nonconformity scores}, quantifying how atypical a data point is and comparing its nonconformity to the percentile of calibration data that corresponds to the desired $\alpha$.
% While a quantitative notion of nonconformity in terms of a score is a sufficient condition for constructing valid prediction sets, only a qualitative concept is necessary:
% Being able to tell that one data point is less conformal than another is all we need to compute the percentile and build prediction sets. 
% This observation gives rise to the use of \textit{preference learning} (PL) methods, that naturally deal with information that is present in terms of qualitative comparisons \cite{furnkranz_preference_2011}.

% In this work, we examine the use of PL to learn nonconformity functions for conformal classification from preference data.
% We show, that for each possible nonconformity score there is an exists an equivalent and provide a proof of validity for the prediction sets constructed by our method.
% We further verify our method in synthetic experiments, showcasing that established nonconformity scores can be replicated from pairwise preferences.
% Finally, we evaluate the use of nonconformity functions learned from preference feedback on real-world downstream tasks such as text classification with large language models, where preference feedback is the most prevalent and indispensable source of information \cite{ouyang_training_2022}. 


%% Outcommented 
  % \begin{table}[t]
  % \caption{Table of Notations.}
  % \label{sample-table}
  % \vskip 0.15in
  % \begin{center}
  % \begin{small}
  % % \begin{sc}
  % \begin{tabular}{ll}
  % \toprule
  % Notation & Description\\
  % \midrule
  % $x \in \mathcal{X}$ & instance\\
  % $y \in \mathcal{Y}$ & label\\
  % $\mathcal{C}(x)$ & CP set for the instance $x$\\
  % % \midrule
  % $\pi^\star:\mathcal{X} \fromto \mathbb{P}(\mathcal{Y})$    & true label probability function \\
  % $\pi^\star(x)_y$ & probability of class $y$ for instance $x$\\
  % $\hat{\pi}: \mathcal{X} \fromto \mathbb{P}(\mathcal{Y})$    & an estimate of $\pi^\star$\\
  % $s \colon \cX \times \cY \fromto \R$ & conformity score function \\
  % $\succsim_s $ & conformity (score) order relation \\
  % $\mathcal{D}_\text{calib}$ & calibration data\\
  % $\mathcal{D}_\text{train}$ & training data\\
  % $\alpha$ & CP error rate\\
  % \bottomrule
  % \end{tabular}
  % % \end{sc}
  % \end{small}
  % \end{center}
  % \vskip -0.1in
  % \end{table}
  
  % \begin{corollary}[equivalence]\label{thm:equivalence}
  % For any nonconformity function $s \colon \cX \times \cY \fromto \R$ in conformal prediction, there exists a rank equivalent preference relation $\succsim_s$ on $\cX \times \cY$.
  % \end{corollary}
  %First idea still needs work: All in all there are some rather well known adjacent things (DEBREU,1954 or JAFFRAY,1975) so I am not even sure we need to reproduce this in the main body. [Btw it works both ways: Suppose X is countable. Then \succsim is a preference relation if and only if there exists some utility function u : X \to R that represents \succsim.] For the last part maybe we have to show that under ideal conditions (eg given all comparisons) our PL model converges to the true ranking but also that should be (actually not so) trivial. Draft:
  % Let us define a weak preference relation $\succsim$ to be a complete and transitive binary relation on a set $\mathcal{A}$ (which describes a decision makers ranking of all elements).
  % We see that if $u: \mathcal{X} \to \mathbb{R}$ is a utility function representing $\succsim$, $\succsim$ must be complete and transitive: %can/should we write score function instead of utility function. important note without proving coninuity this only is guaranteed if our "feature space" XxY is countable I think (and I actually don't think it is).
  
  % \textbf{Transitivity.}  
  % Suppose $x \succsim y$ and $y \succsim z$.  
  % Since $u$ represents $\succsim$, we have $u(x) \geq u(y)$ and $u(y) \geq u(z)$.  
  % By transitivity of $\geq$, $u(x) \geq u(z)$.  
  % Thus, $x \succsim z$. Hence, $\succsim$ is transitivity.
  
  % \textbf{Completeness.}  
  % For any $x, y \in \mathcal{X} \times \mathcal{Y}$, $u(x)$ and $u(y)$ are real numbers.  
  % Therefore, either $u(x) \geq u(y)$ or $u(y) \geq u(x)$.  
  % Since $u$ represents $\succsim$, this implies $x \succsim y$ or $y \succsim x$.  
  % Hence, $\succsim$ is complete.
  
  % % Furthermore for any strictly increasing function $f: \mathbb{R} \to \mathbb{R}$ we have $U(a) \succsim U(b)$ iff $f(U(a)) \succsim f(U(b))$ iff $V(a) \succsim V(b)$.
  
  % Following, the imposed preference relation can be learned by a ranker, for example by learning a binary predictor for every pair of items (and aggregating the individual predictions to a ranking at test time) or fitting a Placket-Luce model.
  
  % \begin{corollary}[validity]\label{thm:validity}
  % If the data points in $\mathcal{D}_\text{calib} \cup (x_\text{new}, y_\text{new})$ are exchangeable, then 
  %     \begin{align*}
  %         \mathbb{P}(y_\text{new} \in \mathcal{C}(x)) \geq 1-\alpha.
  %     \end{align*}
  % \end{corollary}
  
  % \label{sec:cp_introduction}
  % \begin{itemize}
  %     \item Formally introduce CP
  %     \item The role of nonconformity scores
  %     \item Established nonconformity scores (LAC, APS, ...)
  %     \item Instead of relying on pre-specified nonconformity scores, learn nonconformity understanding of human annotators (Is there related work?)
  %     \item Humans are known to have a limited intuitive understanding of probabilities and give poor probability estimates \cite{tversky_judgment_1974}. Obtaining quantitative feedback for the nonconformity of a datapoint from humans annotators seems not sensible.
  %     \item Elicit the nonconformity understanding of humans via pairwise preferences is reasonable
  %     \end{itemize}
% \clearpage 

  % \newpage 
  
  
\section{Learning relations from preference data}\label{sec:pl}
  
  There are two main approaches to modeling preferences on a set of objects $\mathcal{Z}$, namely in terms of (binary) preference relations $R \subseteq \mathcal{Z} \times \mathcal{Z}$ and in terms of utility functions $u: \mathcal{Z} \fromto \mathbb{R}$\,---\,informally speaking, these approaches correspond, respectively, to comparing pairs of objects and evaluating individual objects \citep{mpub498,mpub499}. Mathematically, the relational approach is more general: while a utility function induces a preference relation in a straightforward way, not every preference relation can be represented in terms of a utility function. This is also the reason why we presented our extension of CP in the previous section in terms of preference relations.

  From a machine learning perspective, the relational approach is closely related to binary classification, as it comes down to learning a binary preference predicate \citep{rigu_sl11}.
%which essentially amounts to reducing preference learning to binary classification. For object ranking, the pairwise approach has first been proposed by \citet{cohen98learning}. 
%Thus, the idea is to approach the object ranking problem by learning a binary preference predicate $h(z,z')$, which predicts whether $z$ is preferred to $z'$ or vice versa. 
For example, this predicate could be realized in the form of a binary classifier $\mathcal{Z} \times \mathcal{Z} \fromto \{ -1, +1 \}$, which accepts a tuple $(Z,Z')$ as input and returns $+1$ if $Z \succsim Z'$ and $-1$ otherwise. As one disadvantage of this approach, note that a (binary) predicate trained in this way does not necessarily guarantee any specific properties (such as transitivity) of the induced preference relation. As explained before, such properties are naturally required in the context of ranking. 
% 
  The alternative of learning a (latent) utility, which in our case corresponds to a \textit{conformity} function $u \colon \cX \times \cY \fromto \R$, is appealing because the preference relation $\succsim$ induced by such a function guarantees desirable properties right away.
  %As we discussed in Section \ref{sec:problem}, any preference relation that fulfills \textit{completeness}, \textit{transitivity} and \textit{continuity} can be represented in terms of such a real-valued function \citep{debreu_representation_1954}.
  %While both approaches are theoretically applicable for representing a nonconformity relation in CP, 
  Therefore, although the relational approach would be even more in line with the idea of CP without nonconformity scores (see the discussion in Section \ref{sec:conc}), we focus on learning latent utility functions in the following.
  %as they have practical advantages from a machine learning point of view:
  %Utility functions provide a more compact representation compared to predicates, as they only need to model one numerical value for each alternative.
  % Additionally, they naturally allow for learning from features and predicting items that were never compared.
  % Although this also holds true for predicates, it is typically more challenging to interpolate between unseen items for them.
    
  Among the various alternatives for learning a latent utility function from preference data, we opt for a generalized Bradley-Terry (BT) model \citep{bradley_rank_1952}.
  Under this model, the probability of a pairwise preference is modeled as 
  %\footnote{For the ease of notation, we refer to the instance-label pairs as $i_1$ and $i_2$ here.}
  \begin{equation}
      \label{eq:bradley_terry}
      \Prob(Z_{i} \succ Z_{j}) = \frac{\exp(u(Z_{i}))}{\exp(u(Z_{i})) + \exp(u(Z_{j}))}.
  \end{equation}
    Having access to training data $\cD_\text{train} = \{(X_{i_1}, Y_{i_1}) \succ (X_{i_2}, Y_{i_2})\}_{i=1}^n$ of pairwise comparisons, model parameters of $u$ can be learned via maximum likelihood estimation, where the negative log-likelihood function is given as
      \begin{equation}
      \label{eq:bradley_terry_nll}
       l({i_1},{i_2}) = \log\big(\exp(u(Z_{i_1})) + \exp(u(Z_{i_2}))\big) - u(Z_{i_1}),  
  \end{equation}
  where $Z_j = (X_j, Y_j )$.
  This can be used as a loss function for training models of $u$ with standard gradient-based methods.
  Due to its probabilistic nature, the BT model deals gracefully with noisy preference labels and is an appropriate choice for the task of learning a preference relation for CP.
It is worth mentioning that one can also incorporate standard labeled data into the BT model training by converting it into preference data. To that end, for a given labeled instance $(X_i, Y_i)$, $|\mathcal{Y}|-1$ pairwise comparisons between the instance with its true label and the same instance with other incorrect labels, i.e., 
$\{(X_i, Y_i) \succ (X_i, y), \, \forall y \neq Y_i\},$
can be added to the training set. For datasets with a large number of classes, this can be done more efficiently, for example, by sampling.
  
  There exist numerous alternatives to BT that are worth discussing.
  Note that the BT model assumes a clear winner for each comparison ($\succ$ instead of $\succsim$).
  There exist extensions that explicitly incorporate the possibility of ties, like the Davidson model \citep{davidson_extending_1970} and the extension by Rao and Kupper \citep{rao_ties_1967}.
  We adhere to the original BT model within this work due to its simplicity and robustness in situations with limited data.
  Another interesting alternative is the Plackett-Luce model \citep{plackett_analysis_1975, luce_individual_1959} for scenarios where preference data can be obtained in the form of full or partial rankings instead of only pairwise comparisons.
  In fact, the BT model is a special case of the Plackett-Luce model for rankings of length $2$.
  While the Plackett-Luce model is more appealing from a data efficiency perspective, pairwise comparisons are typically easier to collect.
  %Thus, we focus on a BT model in this work, although an extension to the Plackett-Luce model is canonically possible. 


  In this work, we focus on conformal classification and model $u$ in terms of a neural network.
  The architecture comprises one output neuron per class label $y \in \cY$.
  % Given a training example $(X_{i_1}, Y_{i_1}) \succ (X_{i_2}, Y_{i_2})$, we conduct two forward passes, one for each instance $X$.
  Given a training example $(X_{i_1}, Y_{i_1}) \succ (X_{i_2}, Y_{i_2})$, we perform two forward passes, one for $X_{i_1}$ and the other for $X_{i_2}$.
  The negative log-likelihood of the BT model is then computed between the output of the neuron corresponding to $Y_{i_1}$ for input $X_{i_1}$ in the first forward pass and the output of the neuron corresponding to $Y_{i_2}$ for input $X_{i_2}$ in the second forward pass.
  An illustration of this procedure is depicted in Figure \ref{fig:neural_network}.
  Note that neither $X_{i_1} \neq X_{i_2}$ nor $Y_{i_1} \neq Y_{i_2}$ is required.
  Consequently, we can learn from preferences with varying class labels for the same instance as well as preferences (for potentially the same class label) across varying instances.
  A description of BT neural network training in terms of pseudocode is given in Algorithm \ref{alg:nn_bt_training}.
  \begin{figure}[t!]
      \centering
      \includegraphics[width=\linewidth]{figures/neural_net_cap.pdf}
      \caption{Illustration of the neural network architecture and loss computation. Only the outputs for the labels of the pair in the training examples (highlighted in magenta) are contributing to the loss.}
      \label{fig:neural_network}
  \end{figure}
  
\begin{algorithm}[tb]
	\caption{Bradley-Terry Neural Network Training}
	\label{alg:nn_bt_training}
	\begin{algorithmic}
		\State \textbf{Input:} Training data $\mathcal{D}_\text{train}$, number of epochs $N$, learning rate $\eta$
		\State Initialize parameters

%\State Load training data $(X_{train}, Y_{train})$
\For{epoch = 1 to $N$}
\For{each $(X_{i_1}, Y_{i_1}) \succ (X_{i_2}, Y_{i_2}) \in \mathcal{D}_\text{train}$}
% \State $u_{i_1} \gets (u(X_{i_1}))_{Y_{i_1}}$ \Comment{Frist forward pass}
% \State $u_{i_2} \gets (u(X_{i_2}))_{Y_{i_2}}$ \Comment{Second forward pass}
% \State $Loss \gets$ $l(u_{i_1},u_{i_2})$ \Comment{Compute loss from \eqref{eq:bradley_terry_nll}}
\State $u(Z_{i_1}) \gets u(X_{i_1},Y_{i_1})$ \Comment{First forward pass}
\State $u(Z_{i_2}) \gets u(X_{i_2},Y_{i_2})$ \Comment{Second forward pass}
\State $\mathit{Loss} \gets$ $l({i_1},{i_2})$ \Comment{Compute loss from \eqref{eq:bradley_terry_nll}}
\State $\mathit{Gradients} \gets$ BackwardPropagation$(\mathit{Loss})$
\State UpdateParameters$(\mathit{Gradients}, \eta)$
\EndFor
\EndFor
	\end{algorithmic}
\end{algorithm}
\section{Experimental Results}\label{sec:exp}
%
In this section, we will experimentally examine the proposed method, focusing on split conformal prediction as outlined in Algorithm \ref{alg:cp_without_ncs:split} for the sake of computational efficiency.
After discussing the experimental setup and baselines, we are specifically interested in answering the following research questions:\looseness=1
% \begin{description}
%     \item[RQ1:] Can existing nonconformity functions be replicated from preference feedback? 
%     \item[RQ2:] How do nonconformity functions learned from preference data perform in real-world classification tasks?
% \end{description}
\begin{itemize}
    \item[(A)] Can existing nonconformity functions be replicated by the BT model from preference feedback? ($\leadsto$ Section \ref{subsec:replicate}.)
    \item[(B)] Since our method is able to exploit preference data, does it bring any benefit in cases where both preference and standard labeled data are available for training? ($\leadsto$ Section \ref{subsec:mixed_setup}.)
    \item[(C)] How does preference-based conformal prediction perform in real-world classification tasks? ($\leadsto$ Section \ref{subsec:learn}.) 
\end{itemize}
%
% \subsection{Setup}
\paragraph{Setup.} 
All of the following experiments are conducted with neural networks as the model class.
The models are implemented in \texttt{PyTorch} and conformalized with \texttt{TorchCP} \citep{wei_torchcp_2024}.
Whenever our method is compared against a classification baseline (i.e., standard CP), both models share the same architecture, optimizer, and learning rate, and are just trained with different loss functions and a different type of data.
%however, with the same learning rate and the same optimizer.
The code that was used for carrying out the following experiments is publicly available.\footnote{\url{https://github.com/JonasHanselle/preference-based-cp}}

% \subsection{RQ1: Replicating Nonconformity Scores from Oracle Feedback}
\subsection{Baseline Nonconformity Scores}

\begin{figure*}[ht!]
      \centering
      \includegraphics[width=\linewidth]{figures/lac_aps.pdf}
      \caption{Illustration of the LAC and APS nonconformity scores.}
      \label{fig:lac_aps_illustration}
  \end{figure*}
% Before using a learned nonconformity function for the task of constructing conformal prediction sets, we will assess our models capability of replicating the discussed nonconformity scores from oracle feedback.
% Two of the most prominent nonconformity functions for classification are often referred to as the \textit{least ambiguous set-valued classifier} (LAC) \citep{sadinle_least_2019} and the \textit{adaptive prediction set} (APS) \citep{romano_classification_2020} scores.
There are many nonconformity scores in the literature designed for the classification problem \citep{sadinle_least_2019, romano_classification_2020, angelopoulos_uncertainty_2020, huang_conformal_2024}. For the sake of comparison, we consider as baselines the two most commonly used ones, namely the \textit{least ambiguous set-valued classifier} (LAC) \citep{sadinle_least_2019} and the \textit{adaptive prediction set} (APS) \citep{romano_classification_2020}.
Let $\hat{p}(\cdot \mid x)$ denote the predicted conditional probability distribution over labels. The LAC score directly relates the nonconformity of a class label $y$ to its negated (estimated) probability:
  \begin{equation}
      \label{eq:lac}
      s_{\text{LAC}}(x,y) = 1 - \hat{p}(y\mid x).
  \end{equation}
The APS score, on the other hand, computes as nonconformity the cumulative probability of class labels that are equally or more likely than the label $y$:
  \begin{equation}
      \label{eq:aps}
      s_{\text{APS}}(x,y) = \sum_{y' \in \cY} \hat{p}(y'\mid x) \cdot \mathds{1} \{\hat{p}(y'\mid x)\geq \hat{p}(y\mid x)\}.
  \end{equation}
% In order to avoid overcoverage, it is common practice to use a randomized (also called smoothed) version of APS, such that the the least probable class labels that fulfill the condition in the indicator function in \ref{eq:aps} are included with uniform probability:
However, this score, in its current form, can result in many ties, causing the distribution of p-values to deviate from uniformity. This, in turn, makes the CP framework more conservative, leading to coverage higher than the desired level. To address this, the randomized version of APS, also known as \textit{smoothed} APS, was introduced by modifying \eqref{eq:aps} as follows:
  \begin{align*}
      % \label{eq:aps_randomized}
      s_{\text{APS}}(x,y, \xi) &= \sum_{y' \in \cY} \hat{p}(y'\mid x) \cdot \mathds{1} \{\hat{p}(y'\mid x) > \hat{p}(y\mid x)\}  \\&\quad {}+ \xi \cdot \hat{p}(y\mid x),
  \end{align*}
with $\xi \sim \text{Unif}[0,1]$.
% This results in prediction sets for which coverage holds exactly at level $1-\alpha$.
% Further extensions of APS have been proposed in the literature, including \textit{regularized adaptive prediction sets} (RAPS) \citep{angelopoulos_uncertainty_2020} and \textit{sorted adaptive prediction sets} (SAPS) \citep{huang_conformal_2024}
% An illustration of these scores for the case of a binary classification problem is shown in Figure \ref{fig:lac_aps_illustration}.
An illustration of the behavior of these scores is given in Figure \ref{fig:lac_aps_illustration} for a simple binary classification problem with a one-dimensional feature. Here, the true probability of each class, as well as the scores for each class given these probabilities, are plotted accordingly.
% However, if a randomization effect is desired, it can potentially be applied in a post-hoc fashion on top of the prediction sets of the ranker, which is a sound way of smoothing prediction sets.
\subsection{Replicating Nonconformity Scores from Oracle Feedback}\label{subsec:replicate}

Before evaluating preference-based conformal prediction, we will assess our method's capability of replicating the discussed nonconformity scores from oracle feedback. By oracle feedback, we mean that a preference relation between two pairs results from comparing their nonconformity scores. 
The nonconformity scores under consideration pose different challenges.
As illustrated in Figure \ref{fig:lac_aps_illustration}, the LAC score corresponds (in the binary case) to the probability of the opposite class.
The APS score in its non-randomized version is simply a cumulative probability.
Consequently, it exhibits plateaus in the regions where the label $y$ is the least probable class. 
This poses a challenge for learning the score from pairwise data, as all comparisons within these regions result in ties (because the scores are all one). 
Additionally, this score has discontinuities at the decision boundaries.
Due to the inherent stochasticity and global lack of smoothness, learning to replicate the randomized version of APS is conceptually intractable. 
Thus, we restrict ourselves to the cases of LAC and non-randomized APS.


In order to assess whether the proposed method can indeed replicate the LAC and APS nonconformity scores, we generate synthetic data with two features and two classes drawn from two multivariate Gaussian distributions for which the ground-truth conditional distributions $p(\cdot \mid x)$ are known.
This is done in a two-stage process: First, the class label $c$ is sampled according to a prior distribution $p(y)$. 
Then, the features are sampled from corresponding multivariate normal distributions $X \sim \mathcal{N}(\vec{m}_c,\vec{\Sigma}_c)$, fully defined by a mean vector $\vec{m}_c$ and a covariance matrix $\vec{\Sigma}_c$.
The overall posterior probability of observing a class label $c$ given a feature vector $X$ is thus given by
\[
    p(y = c \mid X) = \frac{p(X \mid y = c)\, p(y = c)}{\sum_{c'} p(X \mid y = c')\, p(y = c')},
\]
where the denominator serves for normalization. Figure \ref{fig:data_process} depicts an example of data sampled from this data-generating process.
% Exmaple data ist depicted in Figure 
\begin{figure}
    \centering
    \includegraphics[width=0.75\linewidth]{figures/data_generating_process.pdf}
    \caption{Example data drawn from the data-generating process described in Section \ref{subsec:replicate}.}
    \label{fig:data_process}
\end{figure}


We construct preference training data $\mathcal{D}_\text{train}$ by drawing $n$ instances from the data-generating process and pairing them with each of the $k=2$ possible class labels.
We compute nonconformity scores $s(x,y)$ for each of these observations and build all $\binom{n \cdot k}{2}$ ordered pairs in agreement with the corresponding scores.\footnote{Note that we assume a noiseless oracle here, which returns the pair strictly ordered according to the nonconformity scores.}
% 
%Pairwise data of this kind does not hint at the relative win frequencies and corresponds to the case in which each alternative wins against another one either with probability $0$ or $1$ under the BT model.
In case two observations have the same nonconformity score $s(X_i, Y_i) = s(X_j, Y_j)$, we include both possible pairs, that is, $(X_i, Y_i) \succsim (X_j, Y_j)$ and its symmetric counterpart $(X_j, Y_j) \succsim (X_i, Y_i)$ in the training data.
This is particularly relevant for APS, because of the presence of ties.
To assess how well the learned preference relation reflects the original nonconformity score, we generate an additional $100$ data points from the data-generating process and sort them according to $s$ and $\succsim_s$, resulting in rankings $\sigma_s$ and $\sigma_{\succsim_{s}}$. 
We compute the Kendall's Tau-b rank correlation coefficient \citep{kendall_treatment_1945} between said rankings:
\begin{equation}
    \label{eq:kendall_tau_b}
    \tau_b(\sigma_s, \sigma_{\succsim_{s}}) = \frac{C-D}{\sqrt{(C+D+T_1) \cdot (C+D+T_2)}},
\end{equation}
where $C$ is the number of concordant pairs, $D$ is the number of discordant pairs, and $T_1$ and $T_2$ are the numbers of ties in $\sigma_s$ and $\sigma_{\succsim_{s}}$, respectively. Figure \ref{fig:replicating_conformity} shows the rank correlation as a function of the number of training instances $n$ averaged over $5$ runs. % Figure \ref{fig:replicating_conformity} shows the results for a varying amount of instances. 

We observe that the ranker quickly learns to replicate LAC, while it takes more training instances in order to replicate APS.
We attribute this behavior to the presence of ties in the APS training data.
When two data points of the least probable class are compared, it is a tie in APS as both exhibit an APS score of 1 (see the plateaus in Figure~\ref{fig:lac_aps_illustration}).
LAC, in contrast, almost always generates a strict ordering with its scores. As a result, the training data for LAC can be leveraged more effectively as each pairwise comparison contains strict order information.
This structural difference is important, as the BT model assumes a winner for each comparison and does not natively handle ties. The ranking induced by APS, with its inherent ties, is therefore a poor match for the model.
Moreover, because Kendall's Tau-b formula accounts for ties in its denominator (see \eqref{eq:kendall_tau_b}), the score's theoretical maximum for the APS ranker is strictly less than 1, even if all non-tied pairs are ordered perfectly.
%the maximum value achievable in terms of Kendall's Tau-b is strictly less than 1, even if all non-tied elements are ordered concordantly, due to the increased denominator (see \eqref{eq:kendall_tau_b}).

\begin{figure}[t!]
      \centering
      \includegraphics[width=\linewidth]{figures/replicating.pdf}
      \caption{Rank correlation between $\cD_\text{val}$ sorted according to the ground truth conformity score $s$ and the preference relation $\succsim_s$ learned from pairwise annotations averaged over 5 runs. The shaded region indicates the $95\%$ confidence interval.}
      \label{fig:replicating_conformity}
  \end{figure}
\subsection{Mixed Setup: Existence of Classification and Preference Data}\label{subsec:mixed_setup}
In the following, we demonstrate the advantage that preference-based CP has over standard CP in cases where qualitative preference data is also 
available that can be used by the former but not by the latter.
To this end, we conduct a $3$-class classification experiment involving a mixture of $100$ pairwise comparison data points and a varying number of $n$ labeled data points, with $n$ ranging from $10$  to $150$.
The data-generating process follows the setup used in Section~\ref{subsec:replicate}.
The pairwise comparisons are constructed by drawing two samples and the preference is determined based on their conditional probabilities, i.e., $(X_1, Y_1) \succ (X_2, Y_2) \iff p(Y_1 \mid X_1) > p(Y_2 \mid X_2)$.
We also generate $100$ labeled data points for calibration and another $100$ for testing.
While the classifier is trained solely on the $n$ labeled data points, our ranker is trained using both the $100$ pairwise comparisons and the same $n$ labeled data transformed into pairwise comparisons as described in Section~\ref{sec:pl}.
We repeated the experiment with varying $n$ and $20$ different random seeds and report the results in Figure \ref{fig:synth_clf_vs_rnk}.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/synthetic_clf_vs_ranker.pdf}
    \caption{Average prediction set sizes of a classifier using the LAC and randomized APS nonconformity scores and a ranker for a varying number of available classification training data and different $\alpha$. The results are averaged over 20 runs and the shaded region indicates the $95\%$ confidence interval.}
    \label{fig:synth_clf_vs_rnk}
\end{figure}

We observe that the ranker initially has an advantage, which diminishes as more classification data becomes available.
This highlights the usefulness of our approach in scenarios where a large amount of weakly supervised preference data is available, but supervision in the form of labeled classification data is limited.
This, of course, comes at an extra computational cost. More specifically, in the case where we have access to $m$ pairwise comparisons and $n$ labeled instances, we can construct $(k-1) \cdot n$ pairwise comparisons between the correct and incorrect class labels from the labeled instances, where $k$ is the number of classes. The overall number of forward passes required for BT model training is then in $\mathcal{O}(m + k \cdot n)$, compared to classifier training, which is $\mathcal{O}(n)$.

\subsection{Preference-Based Conformal Prediction in Classification}
\label{subsec:learn}
%

\begin{figure*}
    \centering
    \includegraphics[width=\linewidth]{figures/boxplots.pdf}
    \caption{Accuracy, coverage rate, and average set size of the baselines APS and LAC as well as the ranking method.}
    \label{fig:boxplots}
\end{figure*}

In the following, we will examine the applicability of our method for the task of tabular classification.
To this end, we utilize benchmark datasets from the OpenML database \citep{vanschoren_openml_2013}, which are summarized in Table \ref{tab:openml_datasets}.
For all experiments, we learn a feed-forward neural network with two hidden layers of $20$ units each.
For each dataset, we train a classifier model with cross-entropy loss on the original training data as a baseline.
In order to learn a ranking model, we transform the classification data into preference data by deriving all pairs between the true and wrong labels for each observation in the training set, as described in Section~\ref{sec:pl}.
%: $$(X_i, Y_i) \succ (X_i, y), \, \forall y \neq Y_i.$$ 
This yields a preference dataset consisting of all $(k-1) \cdot n$ pairwise comparisons, where $k$ is the number of classes and $n$ the number of instances in the training set.

We employ a $10$-times Monte Carlo cross-validation, in which we reserve a fraction of $\frac{1}{5}$ of the original dataset as test data.
From the remaining data, we reserve $\frac{1}{4}$ of the data points as calibration data for split conformal prediction and use the rest for training the classifier and ranker.
Results for various calibration set sizes are given in Appendix~\ref{app:cal_set_size}.
We employ the LAC and the randomized APS nonconformity scores for building conformal prediction sets with the classifier.

\begin{table}[t]
\centering
\caption{Overview of OpenML Datasets.}
\resizebox{1\columnwidth}{!}{%
\begin{tabular}{|l|l|r|r|r|}
\hline
ID & Name & \# Feat. & \# Classes & \# Inst. \\
\hline
15 & \texttt{breast-w} & 10 & 2 & 699 \\
31 & \texttt{credit-g} & 21 & 2 & 1000 \\
4534 & \texttt{PhishingWebsites} & 31 & 2 & 11055 \\
61 & \texttt{iris} & 5 & 3 & 150 \\
187 & \texttt{wine} & 14 & 3 & 178 \\
54 & \texttt{vehicle} & 19 & 4 & 846 \\
35 & \texttt{dermatology} & 35 & 6 & 366 \\
\hline
\end{tabular}
}
\label{tab:openml_datasets}
\end{table}

The results for the tabular classification are summarized in Figure~\ref{fig:boxplots}, and a detailed tabular representation is given in Appendix~\ref{app:classification_results}.
We report the empirical coverage rate \[\frac{1}{|\mathcal{D}_\text{test}|} \sum_{i\in \mathcal{D}_\text{test}} \mathds{1}\{Y_i \in \cC(X_i)\}\] and average set size \[\frac{1}{|\mathcal{D}_\text{test}|} \sum_{i \in \mathcal{D}_\text{test}} |\cC(X_i)|\] averaged over the 10 cross-validation folds for different error rates $\alpha \in \{0.02,0.05, 0.1, 0.2\}$, where $\mathcal{D}_\text{test}$ denotes the test set.
Additionally, we report the accuracy of the model in a non-conformalized context, where class predictions are given by the argmax of the probabilities returned by the classifier.
For computing the accuracy of the ranker, the class that has the lowest latent nonconformity is predicted.

For the datasets under consideration, we observe a similar performance between the conventional CP methods using a classifier equipped with a nonconformity score and the preference-based CP using the ranker.
Interestingly, even the accuracies coincide for the \texttt{breast-w}, \texttt{credit-g}, and \texttt{PhishingWebsites} datasets.
While this may be surprising at first glance, note that the BT model can be seen as a special case of logistic regression, and in the case of binary classification, its negative log-likelihood corresponds to the cross-entropy loss.
Additionally, both the classifier and ranker share the same neural architecture and optimization procedure.
Although the ranker receives more training data (at least in the multi-class case), the information it contains is identical to that of the classification data:
The decomposition of one classification data point into $k-1$ preferences uniquely identifies the true class label but carries no further information.

For the CP-specific evaluation metrics, we see that the preference-based CP achieves comparable performance to the conventional CP.
The ranker fulfills the specified coverage rates while not producing larger prediction sets than the other approaches within acceptable statistical fluctuations. 
We conclude that for tabular classification tasks, a ranker that infers a nonconformity relation achieves comparable performance to the conventional approach of learning a probabilistic classifier and conformalizing it via a specific nonconformity function.


\section{Discussion}\label{sec:conc}
%
% In this work, we drew a connection between conformal prediction (CP) and preference learning (PL).
In this work, we established a connection between conformal prediction and preference learning. We have shown that nonconformity functions can be equivalently replaced with preference relations, suggesting the use of PL methods to construct valid prediction sets without nonconformity scores.
Building upon this observation, we proposed a concrete method for directly inferring nonconformity functions from preference data based on the Bradley-Terry model.
The experiments carried out empirically validate that
\begin{enumerate}
    \item established nonconformity functions can be replicated by our method, given appropriate data;
    \item when preference data is abundant but classification data is scarce, preference-based CP is advantageous over standard CP;
    \item preference-based CP attains a level of performance on par with standard CP on downstream classification tasks.
\end{enumerate}
These results highlight the potential for deriving valid conformal prediction sets solely from preferential feedback.

\paragraph{Limitations and future work.} While our proposed methods are promising, they are not without limitations. 
First, the calibration data must still consist of observations $(X,Y) \in \mathcal{X} \times \mathcal{Y}$, which restricts the approach to datasets where such structured inputs and labels are available. 
Another (potential) limitation is that our current work has been confined to (multi-class) classification problems, so one immediate extension would be to adapt it to the regression setting. 
Additionally, incorporating dyad-ranking techniques \citep{schafer_dyad_2018} appears to be a promising future direction, as this could enable zero-shot predictions.

%So far, we only derived preference data from existing classification datasets in order to experimentally validate our method. Future research could focus on collecting human nonconformity judgments and curating a dataset for learning and analyzing the nonconformity concept present in the collected data.

%Moreover, the learned conformity function still technically returns a score. 

% \color{blue}
Last but not least, it would be interesting to explore the ``genuinely relational'' approach to preference learning, which is even more in the spirit of doing CP without nonconformity scores. Thus, instead of expressing a preference relation $R$ through a latent utility (nonconformity) function, this relation could be learned more directly, essentially by training a classifier to predict pairwise preferences. There are (at least) two ways in which this problem could be tackled. First, the learning procedure could ensure that $R$ fulfills all desired mathematical properties (completeness, transitivity, continuity), so that Theorems \ref{thm:validity:full} and \ref{thm:validity:split} apply. This is challenging from a (preference) learning point of view, as it means, for example, that individual pairwise comparisons cannot be predicted independently of each other. Second, one may allow the learning procedure to produce relations $R$ of a more general nature, which may violate some of the properties. While this simplifies the learning part, CP cannot be done in the standard way anymore, so this approach requires a generalization of conformal prediction (e.g., making it amenable to partial order relations on data points).
% \color{black}



% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}



\begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.
This work has received funding from the European Union's Horizon Europe Research and Innovation Programme under the Marie Sk\l{}odowska-Curie grant agreement No 101073307. Alireza Javanmardi was supported by the Klaus Tschira Stiftung. Yusuf Sale was supported by the DAAD program Konrad Zuse Schools of Excellence in Artificial Intelligence, sponsored by the Federal Ministry of Education and Research.

\begin{figure}[H]
  %\centering
  \includegraphics[width=0.2\textwidth]{figures/EN_Co-fundedbytheEU_RGB_POS.png}
\end{figure}
%     \emph{All} acknowledgements go in this section.
\end{acknowledgements}

% \clearpage
% References
% \bibliography{references}
\printbibliography{}
% \newpage

% \onecolumn

% \title{Title in Title Case\\(Supplementary Material)}
% \maketitle


% \appendix
%
% \section{Omitted Proofs}
% \textcolor{blue}{YS: Please keep this section for now. I'll add notes here without disturbing the progress in the main paper.}
% \begin{theorem}\label{thm:equivalence-app}
% Let $s: \mathcal{Z} \rightarrow \mathbb{R}$ be any (pointwise) non-conformity score, and let $\rho: \mathcal{Z} \rightarrow \mathbb{R}$ be rank-equivalent to s. Then for every finite sample $\{z_1,\dots,z_n\} \subset \mathcal{Z}$ and every new point $z_{n+1}$ the conformal p-values coincide. Specifically, we have
%     \begin{align*}
%         \pi_{s}(z_{n +t} \given z_1,\dots, z_n ) =  \pi_{\rho}(z_{n +t} \given z_1,\dots, z_n ).
%     \end{align*}
% \end{theorem}
% % \begin{proof}[Proof of Theorem \ref{thm:equivalence-app}]
    
% % \end{proof}
% An immediate consequence of \ref{thm:equivalence-app} is that $\mathcal{C}_{s}(X_{n+1})$ and $\mathcal{C}_{\rho}(X_{n+1})$ coincide. 

% Suggestion for the introduction: 


% \newpage 

% \iffalse 
% \section{Work in progress}

% \begin{theorem}
% For any nonconformity function in conformal prediction, an equivalent ranking problem can be learned directly, provided that the appropriate data are available.
% \end{theorem}

%First idea still needs work: All in all there are some rather well known adjacent things (DEBREU,1954 or JAFFRAY,1975) so I am not even sure we need to reproduce this in the main body. [Btw it works both ways: Suppose X is countable. Then \succsim is a preference relation if and only if there exists some utility function u : X \to R that represents \succsim.] For the last part maybe we have to show that under ideal conditions (eg given all comparisons) our PL model converges to the true ranking but also that should be (actually not so) trivial. Draft:
% Let us define a weak preference relation $\succsim$ to be a complete and transitive binary relation on a set $\mathcal{A}$ (which describes a decision makers ranking of all elements).
% We see that if $u: \mathcal{X} \to \mathbb{R}$ is a utility function representing $\succsim$, $\succsim$ must be complete and transitive: %can/should we write score function instead of utility function. important note without proving coninuity this only is guaranteed if our "feature space" XxY is countable I think (and I actually don't think it is).

% \textbf{Transitivity.}  
% Suppose $x \succsim y$ and $y \succsim z$.  
% Since $u$ represents $\succsim$, we have $u(x) \geq u(y)$ and $u(y) \geq u(z)$.  
% By transitivity of $\geq$, $u(x) \geq u(z)$.  
% Thus, $x \succsim z$. Hence, $\succsim$ is transitivity.

% \textbf{Completeness.}  
% For any $x, y \in \mathcal{X} \times \mathcal{Y}$, $u(x)$ and $u(y)$ are real numbers.  
% Therefore, either $u(x) \geq u(y)$ or $u(y) \geq u(x)$.  
% Since $u$ represents $\succsim$, this implies $x \succsim y$ or $y \succsim x$.  
% Hence, $\succsim$ is complete.

% Furthermore for any strictly increasing function $f: \mathbb{R} \to \mathbb{R}$ we have $U(a) \succsim U(b)$ iff $f(U(a)) \succsim f(U(b))$ iff $V(a) \succsim V(b)$.

% Following, the imposed preference relation can be learned by a ranker, for example by learning a binary predictor for every pair of items (and aggregating the individual predictions to a ranking at test time) or fitting a Placket-Luce model. See after \ref{thm:validity}.

% \begin{definition}[Basic Spaces]
% Let $(X, Y)$ be a measurable space where:
% \begin{itemize}
%     \item $X$ is the feature space
%     \item $Y$ is the label space
%     \item $Z = X \times Y$ is the example space
% \end{itemize}
% \end{definition}

% \begin{definition}[Nonconformity Function]
% Let $\alpha: Z \to \mathbb{R}$ be a nonconformity function where:
% \begin{itemize}
%     \item For any $z \in Z_{test}$, $\alpha(z)$ measures the nonconformity of $z$ with respect to the calibration set $Z^*$
% \end{itemize}
% \end{definition}

% \begin{proof}
% \textbf{Part 1: Construction of the Ranking Function}
% \begin{definition}
% For any nonconformity function $\alpha$, define the ranking function $r_\alpha: Z \times Z  \to \{-1, 1\}$ as:
% \[
% r_\alpha(z_1, z_2) = \text{sign}(\alpha(z_2) - \alpha(z_1))
% \]
% \end{definition}

% \begin{lemma}
% $r_\alpha$ satisfies the properties of a ranking function:
% \begin{enumerate}
%     \item Antisymmetry: $r_\alpha(z_1, z_2 | Z) = -r_\alpha(z_2, z_1 | Z)$
%     \item Transitivity: If $r_\alpha(z_1, z_2 | Z) \geq 0$ and $r_\alpha(z_2, z_3 | Z) \geq 0$, then $r_\alpha(z_1, z_3 | Z) \geq 0$
% \end{enumerate}
% \end{lemma}

% \begin{proof}[Proof of Lemma]
% \begin{enumerate}
%     \item Antisymmetry follows from the properties of sign function:
%     \begin{align*}
%         r_\alpha(z_1, z_2 | Z) &= \text{sign}(\alpha(z_2, Z) - \alpha(z_1, Z)) \\
%         &= -\text{sign}(\alpha(z_1, Z) - \alpha(z_2, Z)) \\
%         &= -r_\alpha(z_2, z_1 | Z)
%     \end{align*}

%     \item Transitivity follows from the transitivity of real numbers:
%     \begin{itemize}
%         \item If $r_\alpha(z_1, z_2 | Z) \geq 0$, then $\alpha(z_2, Z) \geq \alpha(z_1, Z)$
%         \item If $r_\alpha(z_2, z_3 | Z) \geq 0$, then $\alpha(z_3, Z) \geq \alpha(z_2, Z)$
%         \item Therefore, $\alpha(z_3, Z) \geq \alpha(z_1, Z)$, implying $r_\alpha(z_1, z_3 | Z) \geq 0$
%     \end{itemize}
% \end{enumerate}
% \end{proof}

% \textbf{Part 2: Equivalence in conformal prediction}

% \begin{theorem}
% For any $z \in Z$, the p-value computed using $\alpha$ is equivalent to that computed using $r_\alpha$.
% \end{theorem}

% \begin{proof}
% The p-value using $\alpha$ is defined as:
% \[
% p_\alpha(z) = \frac{|\{z' \in Z : \alpha(z') \geq \alpha(z )\}|}{|Z| + 1}
% \]

% Using $r_\alpha$, we can express the same set:
% \[
% \{z' \in Z : \alpha(z) \geq \alpha(z)\} = \{z' \in Z : r_\alpha(z', z) \leq 0\}
% \]

% Therefore:
% \[
% p_\alpha(z) = \frac{|\{z' \in Z : r_\alpha(z', z) \leq 0\}|}{|Z| + 1} = p_r(z)
% \]
% \end{proof}

% \textbf{Part 3: Learnability}
% \end{proof}
% \fi 

% \newpage
%
% \section{Additional Experiments}
% \label{ap:experiments}

% \begin{table}[ht!]   
% \centering
% \caption{Results for tabular classification}
% \resizebox{0.5\linewidth}{!}{%
% \begin{tabular}{lllrrr}
% 	\toprule
% 	Dataset & $\alpha$ & Method & Accuracy & Coverage & Avg. Set Size \\
% \midrule
%  \multirow{12}{*}{PhishingWebsites (2)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & 0.9608 & \textbf{0.9871} & 1.7095 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.9608 & 0.9543 & 1.0457 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.9608 & 0.9542 & 0.9876 \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9609} & 0.9539 & \textbf{0.9872} \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & 0.9608 & \textbf{0.9867} & 1.6897 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.9608 & 0.8989 & 0.9659 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.9608 & 0.9038 & \textbf{0.9167} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9609} & 0.9038 & 0.9168 \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & 0.9608 & \textbf{0.9603} & 1.6634 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.9608 & 0.7933 & 0.8407 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.9608 & 0.8027 & \textbf{0.8054} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9609} & 0.8029 & 0.8056 \\
% \cline{1-6} \multirow{12}{*}{breast-w (2)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9671} & \textbf{0.9693} & 1.2086 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9671} & 0.9571 & 1.0443 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9671} & 0.9529 & \textbf{0.9793} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9671} & 0.9536 & 0.9807 \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9671} & \textbf{0.9143} & 0.9471 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9671} & 0.9107 & 0.9621 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9671} & 0.8957 & \textbf{0.9071} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9671} & 0.8964 & 0.9093 \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9671} & \textbf{0.8271} & 0.8600 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9671} & 0.8057 & 0.8429 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9671} & 0.8100 & 0.8157 \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9671} & 0.8100 & \textbf{0.8157} \\
% \cline{1-6} \multirow{12}{*}{credit-g (2)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & \textbf{0.7340} & \textbf{0.9965} & 1.9780 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.7340} & 0.9710 & 1.7440 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.7340} & 0.9520 & \textbf{1.5990} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.7340} & 0.9525 & 1.6050 \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & \textbf{0.7340} & \textbf{0.9965} & 1.9780 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.7340} & 0.9015 & 1.4915 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.7340} & 0.9040 & \textbf{1.4270} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.7340} & 0.9070 & 1.4470 \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & \textbf{0.7340} & \textbf{0.9965} & 1.9780 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.7340} & 0.8060 & 1.2170 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.7340} & 0.8120 & \textbf{1.1775} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.7340} & 0.8130 & 1.1790 \\
% \cline{1-6} \multirow{12}{*}{dermatology (6)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & 0.9703 & 0.9500 & 1.8311 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.9703 & 0.9500 & 1.0689 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.9703 & 0.9595 & \textbf{0.9838} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9757} & \textbf{0.9730} & 1.0270 \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & 0.9703 & 0.8892 & 1.7581 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.9703 & 0.8986 & 0.9527 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.9703 & 0.9108 & \textbf{0.9203} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9757} & \textbf{0.9270} & 0.9338 \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & 0.9703 & 0.7824 & 1.5122 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.9703 & 0.8162 & 0.8500 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.9703 & \textbf{0.8351} & 0.8392 \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9757} & 0.8311 & \textbf{0.8351} \\
% \cline{1-6} \multirow{12}{*}{iris (3)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9700} & 0.9533 & 1.0967 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9700} & \textbf{0.9667} & 1.1333 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9700} & 0.9267 & \textbf{1.0100} \\
%   &  &  \multirow{1}{*}{Ranker  } & 0.9633 & 0.9300 & 1.1000 \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9700} & 0.8633 & 0.9700 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9700} & \textbf{0.9233} & 0.9967 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9700} & 0.8733 & 0.8933 \\
%   &  &  \multirow{1}{*}{Ranker  } & 0.9633 & 0.8700 & \textbf{0.8733} \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9700} & 0.7500 & 0.8300 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9700} & \textbf{0.8367} & 0.8833 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9700} & 0.8133 & 0.8167 \\
%   &  &  \multirow{1}{*}{Ranker  } & 0.9633 & 0.7433 & \textbf{0.7433} \\
% \cline{1-6} \multirow{12}{*}{vehicle (4)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & 0.5653 & 0.9624 & 3.0394 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.5653 & 0.9647 & 2.6924 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.5653 & 0.9576 & \textbf{2.5400} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.6035} & \textbf{0.9653} & 2.6065 \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & 0.5653 & \textbf{0.9171} & 2.5712 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.5653 & 0.9071 & 2.3329 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.5653 & 0.9118 & 2.1688 \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.6035} & 0.9106 & \textbf{2.1547} \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & 0.5653 & \textbf{0.8094} & 2.0553 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & 0.5653 & 0.7906 & 1.7788 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & 0.5653 & 0.8006 & 1.6382 \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.6035} & 0.7982 & \textbf{1.6071} \\
% \cline{1-6} \multirow{12}{*}{wine (3)} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9833} & \textbf{0.9806} & 1.2972 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9833} & 0.9667 & 1.0361 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9833} & 0.9722 & \textbf{0.9944} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9833} & 0.9528 & 1.0000 \\
% \cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9833} & 0.8861 & 1.0056 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9833} & \textbf{0.9000} & 0.9250 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9833} & 0.8778 & 0.8778 \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9833} & 0.8583 & \textbf{0.8583} \\
% \cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & \textbf{0.9833} & \textbf{0.8139} & 0.9111 \\
%   &  &  \multirow{1}{*}{Classifier APS (rand)} & \textbf{0.9833} & 0.7806 & 0.7972 \\
%   &  &  \multirow{1}{*}{Classifier LAC} & \textbf{0.9833} & 0.7778 & \textbf{0.7778} \\
%   &  &  \multirow{1}{*}{Ranker  } & \textbf{0.9833} & 0.7833 & 0.7833 \\

% \bottomrule
% \end{tabular}
% }
% \label{tab:tabular_classification_results}
% \end{table}

\newpage
\appendix
\newcolumntype{R}{>{\raggedleft\arraybackslash}X}
\onecolumn

\section{Influence of Calibration Set Size}
\label{app:cal_set_size}
In order to assess the influence of the calibration set size on the performance of preference-based conformal prediction, we conducted a small experimental study using the binary classification dataset \texttt{PhishingWebsites} and the multi-class dataset \texttt{vehicle}.
We reserved varying portions of the original training data for calibration, ranging from $10\%$ to $50\%$, and used the remainder for training the ranking model.
Apart from this, the experimental setup is identical to the one used for the classification experiments in Section \ref{subsec:learn}.
We report our results in Table \ref{tab:cal_size}.
Overall, there is no clear tendency that increasing the calibration set size is beneficial.
Using smaller portions of calibration data allows the learner to use more data for training, which typically results in a better classification performance.

\begin{table*}[h]
\centering
\caption{Performance of Preference-based Conformal Prediction on Classification Tasks with a Varying Percentage of Training Data Reserved for Calibration, Ranging from $10\%$ to $50\%$.}
\label{tab:cal_size}
\scriptsize
\begin{tabularx}{0.8 \linewidth}{lllRRR}
\toprule
Dataset & $\alpha$ & Calibration Data & Accuracy & Coverage Rate & Avg. Set Size \\
\midrule
 \multirow{20}{*}{\rotatebox{90}{$\texttt{PhishingWebsites}$ (2)}} &  \multirow{5}{*}{0.02} &  \multirow{1}{*}{10\%} & $\mathbf{0.962 \pm 0.003}$ & $\mathbf{0.983 \pm 0.004}$ & $\mathbf{1.055 \pm 0.010}$ \\
  &  &  \multirow{1}{*}{20\%} & $0.960 \pm 0.007$ & $0.981 \pm 0.006$ & $1.055 \pm 0.010$ \\
  &  &  \multirow{1}{*}{30\%} & $0.958 \pm 0.006$ & $0.982 \pm 0.004$ & $1.062 \pm 0.011$ \\
  &  &  \multirow{1}{*}{40\%} & $0.957 \pm 0.004$ & $0.983 \pm 0.003$ & $1.071 \pm 0.011$ \\
  &  &  \multirow{1}{*}{50\%} & $0.955 \pm 0.007$ & $0.981 \pm 0.004$ & $1.068 \pm 0.009$ \\
\cline{2-6} &  \multirow{5}{*}{0.05} &  \multirow{1}{*}{10\%} & $\mathbf{0.962 \pm 0.003}$ & $0.953 \pm 0.009$ & $\mathbf{0.984 \pm 0.013}$ \\
  &  &  \multirow{1}{*}{20\%} & $0.960 \pm 0.007$ & $0.953 \pm 0.007$ & $0.987 \pm 0.010$ \\
  &  &  \multirow{1}{*}{30\%} & $0.958 \pm 0.006$ & $0.953 \pm 0.006$ & $0.990 \pm 0.008$ \\
  &  &  \multirow{1}{*}{40\%} & $0.957 \pm 0.004$ & $\mathbf{0.954 \pm 0.005}$ & $0.995 \pm 0.009$ \\
  &  &  \multirow{1}{*}{50\%} & $0.955 \pm 0.007$ & $0.952 \pm 0.007$ & $0.995 \pm 0.006$ \\
\cline{2-6} &  \multirow{5}{*}{0.1} &  \multirow{1}{*}{10\%} & $\mathbf{0.962 \pm 0.003}$ & $0.902 \pm 0.013$ & $\mathbf{0.912 \pm 0.016}$ \\
  &  &  \multirow{1}{*}{20\%} & $0.960 \pm 0.007$ & $0.903 \pm 0.010$ & $0.915 \pm 0.012$ \\
  &  &  \multirow{1}{*}{30\%} & $0.958 \pm 0.006$ & $0.903 \pm 0.009$ & $0.916 \pm 0.009$ \\
  &  &  \multirow{1}{*}{40\%} & $0.957 \pm 0.004$ & $\mathbf{0.904 \pm 0.009}$ & $0.919 \pm 0.009$ \\
  &  &  \multirow{1}{*}{50\%} & $0.955 \pm 0.007$ & $0.903 \pm 0.007$ & $0.919 \pm 0.008$ \\
\cline{2-6} &  \multirow{5}{*}{0.2} &  \multirow{1}{*}{10\%} & $\mathbf{0.962 \pm 0.003}$ & $\mathbf{0.808 \pm 0.014}$ & $0.810 \pm 0.015$ \\
  &  &  \multirow{1}{*}{20\%} & $0.960 \pm 0.007$ & $0.802 \pm 0.012$ & $\mathbf{0.804 \pm 0.012}$ \\
  &  &  \multirow{1}{*}{30\%} & $0.958 \pm 0.006$ & $0.802 \pm 0.009$ & $0.805 \pm 0.009$ \\
  &  &  \multirow{1}{*}{40\%} & $0.957 \pm 0.004$ & $0.804 \pm 0.015$ & $0.808 \pm 0.015$ \\
  &  &  \multirow{1}{*}{50\%} & $0.955 \pm 0.007$ & $0.804 \pm 0.010$ & $0.808 \pm 0.010$ \\
\cline{1-6} \multirow{20}{*}{\rotatebox{90}{$\texttt{vehicle}$ (4)}} &  \multirow{5}{*}{0.02} &  \multirow{1}{*}{10\%} & $0.594 \pm 0.045$ & $0.989 \pm 0.007$ & $3.018 \pm 0.159$ \\
  &  &  \multirow{1}{*}{20\%} & $\mathbf{0.599 \pm 0.043}$ & $\mathbf{0.991 \pm 0.008}$ & $3.069 \pm 0.229$ \\
  &  &  \multirow{1}{*}{30\%} & $0.574 \pm 0.032$ & $0.982 \pm 0.013$ & $\mathbf{2.902 \pm 0.143}$ \\
  &  &  \multirow{1}{*}{40\%} & $0.555 \pm 0.047$ & $0.986 \pm 0.013$ & $2.994 \pm 0.172$ \\
  &  &  \multirow{1}{*}{50\%} & $0.571 \pm 0.050$ & $0.987 \pm 0.011$ & $3.045 \pm 0.178$ \\
\cline{2-6} &  \multirow{5}{*}{0.05} &  \multirow{1}{*}{10\%} & $0.594 \pm 0.045$ & $0.958 \pm 0.031$ & $\mathbf{2.497 \pm 0.225}$ \\
  &  &  \multirow{1}{*}{20\%} & $\mathbf{0.599 \pm 0.043}$ & $0.960 \pm 0.021$ & $2.588 \pm 0.236$ \\
  &  &  \multirow{1}{*}{30\%} & $0.574 \pm 0.032$ & $0.943 \pm 0.030$ & $2.542 \pm 0.163$ \\
  &  &  \multirow{1}{*}{40\%} & $0.555 \pm 0.047$ & $0.963 \pm 0.018$ & $2.672 \pm 0.199$ \\
  &  &  \multirow{1}{*}{50\%} & $0.571 \pm 0.050$ & $\mathbf{0.964 \pm 0.018}$ & $2.692 \pm 0.140$ \\
\cline{2-6} &  \multirow{5}{*}{0.1} &  \multirow{1}{*}{10\%} & $0.594 \pm 0.045$ & $0.901 \pm 0.027$ & $2.085 \pm 0.161$ \\
  &  &  \multirow{1}{*}{20\%} & $\mathbf{0.599 \pm 0.043}$ & $0.902 \pm 0.024$ & $\mathbf{2.074 \pm 0.231}$ \\
  &  &  \multirow{1}{*}{30\%} & $0.574 \pm 0.032$ & $0.893 \pm 0.030$ & $2.184 \pm 0.107$ \\
  &  &  \multirow{1}{*}{40\%} & $0.555 \pm 0.047$ & $0.907 \pm 0.023$ & $2.233 \pm 0.179$ \\
  &  &  \multirow{1}{*}{50\%} & $0.571 \pm 0.050$ & $\mathbf{0.911 \pm 0.019}$ & $2.266 \pm 0.135$ \\
\cline{2-6} &  \multirow{5}{*}{0.2} &  \multirow{1}{*}{10\%} & $0.594 \pm 0.045$ & $0.786 \pm 0.042$ & $1.605 \pm 0.113$ \\
  &  &  \multirow{1}{*}{20\%} & $\mathbf{0.599 \pm 0.043}$ & $0.804 \pm 0.047$ & $\mathbf{1.595 \pm 0.163}$ \\
  &  &  \multirow{1}{*}{30\%} & $0.574 \pm 0.032$ & $0.776 \pm 0.052$ & $1.680 \pm 0.102$ \\
  &  &  \multirow{1}{*}{40\%} & $0.555 \pm 0.047$ & $0.805 \pm 0.036$ & $1.793 \pm 0.186$ \\
  &  &  \multirow{1}{*}{50\%} & $0.571 \pm 0.050$ & $\mathbf{0.819 \pm 0.027}$ & $1.779 \pm 0.102$ \\

\bottomrule
\end{tabularx}
\end{table*}

% \newpage
\section{Detailed Classification Results}
\label{app:classification_results}
In Tables~\ref{tab:app_clf_results1} and~\ref{tab:app_clf_results2}, we present in detail the classification results of Section~\ref{subsec:learn} that were summarized in Figure~\ref{fig:boxplots}.
\begin{table*}[htbp]
\scriptsize
\centering
\caption{Detailed classification results for the \texttt{dermatology}, \texttt{iris}, and \texttt{vehicle} datasets.}
\label{tab:app_clf_results1}
\begin{tabularx}{0.8\linewidth}{lllRRR}
\toprule
Dataset & $\alpha$ & Method & Accuracy & Coverage Rate & Avg. Set Size \\
\midrule
 \multirow{16}{*}{\rotatebox{90}{$\texttt{dermatology}$ (6)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.968 \pm 0.023}$ & $0.985 \pm 0.025$ & $2.383 \pm 1.356$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.968 \pm 0.023}$ & $\mathbf{0.993 \pm 0.010}$ & $1.889 \pm 1.352$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.968 \pm 0.023}$ & $0.992 \pm 0.022$ & $\mathbf{1.811 \pm 1.388}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.965 \pm 0.026$ & $0.986 \pm 0.030$ & $2.055 \pm 1.476$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.968 \pm 0.023}$ & $0.967 \pm 0.033$ & $1.845 \pm 0.243$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.968 \pm 0.023}$ & $0.970 \pm 0.026$ & $1.068 \pm 0.081$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.968 \pm 0.023}$ & $0.969 \pm 0.031$ & $\mathbf{1.016 \pm 0.059}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.965 \pm 0.026$ & $\mathbf{0.974 \pm 0.032}$ & $1.197 \pm 0.486$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.968 \pm 0.023}$ & $0.910 \pm 0.059$ & $1.778 \pm 0.253$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.968 \pm 0.023}$ & $0.915 \pm 0.038$ & $0.968 \pm 0.057$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.968 \pm 0.023}$ & $0.919 \pm 0.047$ & $\mathbf{0.928 \pm 0.048}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.965 \pm 0.026$ & $\mathbf{0.925 \pm 0.044}$ & $0.931 \pm 0.046$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.968 \pm 0.023}$ & $0.816 \pm 0.092$ & $1.624 \pm 0.333$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.968 \pm 0.023}$ & $0.824 \pm 0.078$ & $0.862 \pm 0.088$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.968 \pm 0.023}$ & $\mathbf{0.832 \pm 0.067}$ & $\mathbf{0.832 \pm 0.067}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.965 \pm 0.026$ & $0.831 \pm 0.071$ & $0.833 \pm 0.071$ \\
\cline{1-6} \multirow{16}{*}{\rotatebox{90}{$\texttt{iris}$ (3)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.948 \pm 0.034}$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.948 \pm 0.034}$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.948 \pm 0.034}$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.940 \pm 0.037$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.948 \pm 0.034}$ & $0.945 \pm 0.052$ & $1.160 \pm 0.113$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.948 \pm 0.034}$ & $\mathbf{0.957 \pm 0.067}$ & $1.133 \pm 0.189$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.948 \pm 0.034}$ & $0.950 \pm 0.041$ & $\mathbf{1.043 \pm 0.128}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.940 \pm 0.037$ & $0.955 \pm 0.046$ & $1.169 \pm 0.253$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.948 \pm 0.034}$ & $0.848 \pm 0.076$ & $0.967 \pm 0.137$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.948 \pm 0.034}$ & $\mathbf{0.914 \pm 0.080}$ & $1.007 \pm 0.106$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.948 \pm 0.034}$ & $0.888 \pm 0.066$ & $0.905 \pm 0.086$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.940 \pm 0.037$ & $0.857 \pm 0.110$ & $\mathbf{0.881 \pm 0.123}$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.948 \pm 0.034}$ & $0.764 \pm 0.097$ & $0.845 \pm 0.132$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.948 \pm 0.034}$ & $\mathbf{0.793 \pm 0.119}$ & $0.855 \pm 0.141$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.948 \pm 0.034}$ & $0.781 \pm 0.111$ & $\mathbf{0.783 \pm 0.115}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.940 \pm 0.037$ & $0.781 \pm 0.134$ & $0.788 \pm 0.138$ \\
\cline{1-6} \multirow{16}{*}{\rotatebox{90}{$\texttt{vehicle}$ (4)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.579 \pm 0.054}$ & $\mathbf{0.995 \pm 0.009}$ & $3.784 \pm 0.314$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.579 \pm 0.054}$ & $0.978 \pm 0.017$ & $2.989 \pm 0.185$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.579 \pm 0.054}$ & $0.982 \pm 0.019$ & $\mathbf{2.910 \pm 0.199}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.565 \pm 0.027$ & $0.990 \pm 0.010$ & $3.030 \pm 0.167$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.579 \pm 0.054}$ & $\mathbf{0.971 \pm 0.022}$ & $3.097 \pm 0.468$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.579 \pm 0.054}$ & $0.951 \pm 0.031$ & $2.608 \pm 0.229$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.579 \pm 0.054}$ & $0.957 \pm 0.023$ & $\mathbf{2.484 \pm 0.215}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.565 \pm 0.027$ & $0.957 \pm 0.021$ & $2.608 \pm 0.226$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.579 \pm 0.054}$ & $0.909 \pm 0.027$ & $2.464 \pm 0.122$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.579 \pm 0.054}$ & $\mathbf{0.911 \pm 0.025}$ & $2.296 \pm 0.233$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.579 \pm 0.054}$ & $0.908 \pm 0.024$ & $\mathbf{2.072 \pm 0.201}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.565 \pm 0.027$ & $0.900 \pm 0.027$ & $2.125 \pm 0.149$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.579 \pm 0.054}$ & $\mathbf{0.816 \pm 0.042}$ & $2.027 \pm 0.121$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.579 \pm 0.054}$ & $0.810 \pm 0.045$ & $1.803 \pm 0.172$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.579 \pm 0.054}$ & $0.800 \pm 0.037$ & $\mathbf{1.596 \pm 0.169}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.565 \pm 0.027$ & $0.800 \pm 0.051$ & $1.684 \pm 0.145$ \\

\bottomrule
\end{tabularx}
\end{table*}



\begin{table*}[htbp]
\scriptsize
\centering
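\caption{Detailed classification results for the \texttt{PhishingWebsites}, \texttt{breast-w}, \texttt{credit-g}, and \texttt{wine} datasets.}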
\label{tab:app_clf_results2}
% \footnotesize % or \footnotesize, \scriptsize

\begin{tabularx}{0.8\linewidth}{lllRRR}
\toprule
Dataset & $\alpha$ & Method & Accuracy & Coverage Rate & Avg. Set Size \\
\midrule
 \multirow{16}{*}{\rotatebox{90}{$\texttt{PhishingWebsites}$ (2)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.959 \pm 0.004}$ & $\mathbf{0.999 \pm 0.001}$ & $1.987 \pm 0.002$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.959 \pm 0.004}$ & $0.980 \pm 0.005$ & $1.109 \pm 0.014$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.959 \pm 0.004}$ & $0.980 \pm 0.003$ & $\mathbf{1.057 \pm 0.012}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.959 \pm 0.004}$ & $0.981 \pm 0.003$ & $1.058 \pm 0.011$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.959 \pm 0.004}$ & $\mathbf{0.967 \pm 0.018}$ & $1.212 \pm 0.421$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.959 \pm 0.004}$ & $0.948 \pm 0.006$ & $1.042 \pm 0.013$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.959 \pm 0.004}$ & $0.951 \pm 0.005$ & $\mathbf{0.985 \pm 0.006}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.959 \pm 0.004}$ & $0.951 \pm 0.005$ & $0.985 \pm 0.006$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.959 \pm 0.004}$ & $\mathbf{0.951 \pm 0.033}$ & $1.196 \pm 0.430$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.959 \pm 0.004}$ & $0.900 \pm 0.011$ & $0.974 \pm 0.013$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.959 \pm 0.004}$ & $0.899 \pm 0.008$ & $\mathbf{0.912 \pm 0.008}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.959 \pm 0.004}$ & $0.900 \pm 0.008$ & $0.912 \pm 0.008$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.959 \pm 0.004}$ & $\mathbf{0.895 \pm 0.082}$ & $1.140 \pm 0.463$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.959 \pm 0.004}$ & $0.801 \pm 0.011$ & $0.854 \pm 0.012$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.959 \pm 0.004}$ & $0.796 \pm 0.015$ & $\mathbf{0.799 \pm 0.015}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.959 \pm 0.004}$ & $0.796 \pm 0.014$ & $0.799 \pm 0.015$ \\
\cline{1-6} \multirow{16}{*}{\rotatebox{90}{$\texttt{breast-w}$ (2)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.967 \pm 0.014}$ & $\mathbf{0.999 \pm 0.003}$ & $1.947 \pm 0.026$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.967 \pm 0.014}$ & $0.990 \pm 0.012$ & $1.245 \pm 0.108$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.967 \pm 0.014}$ & $0.991 \pm 0.010$ & $\mathbf{1.113 \pm 0.077}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.967 \pm 0.014}$ & $0.991 \pm 0.010$ & $1.116 \pm 0.082$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.967 \pm 0.014}$ & $\mathbf{0.966 \pm 0.029}$ & $1.216 \pm 0.407$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.967 \pm 0.014}$ & $0.953 \pm 0.027$ & $1.027 \pm 0.043$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.967 \pm 0.014}$ & $0.956 \pm 0.027$ & $\mathbf{0.984 \pm 0.030}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.967 \pm 0.014}$ & $0.956 \pm 0.028$ & $0.984 \pm 0.031$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.967 \pm 0.014}$ & $\mathbf{0.922 \pm 0.048}$ & $0.956 \pm 0.040$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.967 \pm 0.014}$ & $0.887 \pm 0.034$ & $0.935 \pm 0.031$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.967 \pm 0.014}$ & $0.914 \pm 0.038$ & $\mathbf{0.927 \pm 0.043}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.967 \pm 0.014}$ & $0.915 \pm 0.037$ & $0.929 \pm 0.042$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.967 \pm 0.014}$ & $\mathbf{0.818 \pm 0.071}$ & $0.851 \pm 0.064$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.967 \pm 0.014}$ & $0.814 \pm 0.052$ & $0.851 \pm 0.048$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.967 \pm 0.014}$ & $0.808 \pm 0.056$ & $\mathbf{0.811 \pm 0.056}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.967 \pm 0.014}$ & $0.808 \pm 0.057$ & $\mathbf{0.811 \pm 0.057}$ \\
\cline{1-6} \multirow{16}{*}{\rotatebox{90}{$\texttt{credit-g}$ (2)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.726 \pm 0.029}$ & $\mathbf{0.994 \pm 0.005}$ & $1.969 \pm 0.012$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.726 \pm 0.029}$ & $0.978 \pm 0.014$ & $1.852 \pm 0.084$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.726 \pm 0.029}$ & $0.985 \pm 0.010$ & $1.800 \pm 0.060$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.726 \pm 0.029}$ & $0.985 \pm 0.010$ & $\mathbf{1.799 \pm 0.060}$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.726 \pm 0.029}$ & $\mathbf{0.994 \pm 0.005}$ & $1.969 \pm 0.012$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.726 \pm 0.029}$ & $0.952 \pm 0.012$ & $1.685 \pm 0.063$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.726 \pm 0.029}$ & $0.955 \pm 0.017$ & $\mathbf{1.611 \pm 0.054}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.726 \pm 0.029}$ & $0.955 \pm 0.017$ & $1.615 \pm 0.054$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.726 \pm 0.029}$ & $\mathbf{0.994 \pm 0.005}$ & $1.969 \pm 0.012$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.726 \pm 0.029}$ & $0.901 \pm 0.021$ & $1.487 \pm 0.071$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.726 \pm 0.029}$ & $0.906 \pm 0.018$ & $\mathbf{1.424 \pm 0.045}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.726 \pm 0.029}$ & $0.906 \pm 0.021$ & $1.432 \pm 0.051$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.726 \pm 0.029}$ & $\mathbf{0.974 \pm 0.073}$ & $1.909 \pm 0.220$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.726 \pm 0.029}$ & $0.800 \pm 0.044$ & $1.210 \pm 0.059$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.726 \pm 0.029}$ & $0.801 \pm 0.032$ & $\mathbf{1.159 \pm 0.052}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $\mathbf{0.726 \pm 0.029}$ & $0.801 \pm 0.034$ & $1.161 \pm 0.058$ \\
\cline{1-6} \multirow{16}{*}{\rotatebox{90}{$\texttt{wine}$ (3)}} &  \multirow{4}{*}{0.02} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.982 \pm 0.021}$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.982 \pm 0.021}$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.982 \pm 0.021}$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.974 \pm 0.023$ & $\mathbf{1.000 \pm 0.000}$ & $\mathbf{3.000 \pm 0.000}$ \\
\cline{2-6} &  \multirow{4}{*}{0.05} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.982 \pm 0.021}$ & $0.954 \pm 0.059$ & $1.379 \pm 0.494$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.982 \pm 0.021}$ & $\mathbf{0.982 \pm 0.026}$ & $1.040 \pm 0.056$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.982 \pm 0.021}$ & $0.966 \pm 0.045$ & $\mathbf{0.988 \pm 0.067}$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.974 \pm 0.023$ & $0.974 \pm 0.035$ & $1.000 \pm 0.052$ \\
\cline{2-6} &  \multirow{4}{*}{0.1} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.982 \pm 0.021}$ & $0.895 \pm 0.077$ & $1.159 \pm 0.245$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.982 \pm 0.021}$ & $0.895 \pm 0.038$ & $0.923 \pm 0.045$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.982 \pm 0.021}$ & $\mathbf{0.915 \pm 0.068}$ & $0.917 \pm 0.069$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.974 \pm 0.023$ & $0.907 \pm 0.075$ & $\mathbf{0.911 \pm 0.079}$ \\
\cline{2-6} &  \multirow{4}{*}{0.2} &  \multirow{1}{*}{Classifier APS} & $\mathbf{0.982 \pm 0.021}$ & $0.817 \pm 0.074$ & $1.038 \pm 0.233$ \\
  &  &  \multirow{1}{*}{Classifier APS (rand)} & $\mathbf{0.982 \pm 0.021}$ & $\mathbf{0.819 \pm 0.080}$ & $0.839 \pm 0.086$ \\
  &  &  \multirow{1}{*}{Classifier LAC} & $\mathbf{0.982 \pm 0.021}$ & $0.817 \pm 0.097$ & $0.817 \pm 0.097$ \\
  &  &  \multirow{1}{*}{Ranker  } & $0.974 \pm 0.023$ & $0.812 \pm 0.106$ & $\mathbf{0.812 \pm 0.106}$ \\

\bottomrule
\end{tabularx}
\end{table*}


\end{document}
