\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{xcolor}
\usepackage{graphicx, setspace, latexsym,amsmath,amssymb,amsthm,color}


% Package-style boilerplate kept inline in the main file.  It declares a
% `draft' option and creates the `isdraft' toggle that the annotation macros
% (\todo, \propchange, \propdelete) test.
% NOTE(review): \ProvidesPackage, \DeclareOption and \ProcessOptions are
% normally used inside a .sty file; confirm they behave as intended when
% executed directly in a document preamble.
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{commands}[2017/10/25 Math macros]

\RequirePackage{etoolbox}
% The toggle is switched on only by the `draft' option declared below.
\newtoggle{isdraft}
\DeclareOption{draft}{\toggletrue{isdraft}}
\ProcessOptions\relax


% Draft-only annotation macros.  Each one tests the `isdraft' toggle.
% Line ends inside the definitions are protected with `%' so that the macros
% do not inject spurious spaces into the surrounding text.

% \todo{<note>}: in draft mode, a framed typewriter-font note that is
% 0.95\columnwidth wide, with a ``ToDo'' marker in the margin; otherwise
% it produces nothing.
\newcommand{\todo}[1]{%
  \iftoggle{isdraft}{%
    \vspace{5 mm}\par \noindent \marginpar{\textsc{ToDo}}%
    \framebox{\begin{minipage}[c]{0.95 \columnwidth} \ttfamily #1
    \end{minipage}}\vspace{5 mm}\par
  }{}%
}

% \propchange{<text>}: prints <text> in blue in draft mode and unchanged
% otherwise.
\newcommand{\propchange}[1]{%
  \iftoggle{isdraft}{\textcolor{blue}{#1}}{#1}%
}

% \propdelete{<old>}{<text>}: in draft mode prints <old> struck out followed
% by <text>, all in blue; outside draft mode it prints nothing at all.
% NOTE(review): \sout is normally provided by the ulem package, which is not
% loaded explicitly in this file; confirm it is available before using
% \propdelete in draft mode.
\newcommand{\propdelete}[2]{%
  \iftoggle{isdraft}{\textcolor{blue}{\sout{#1} {#2}}}{}%
}

% \qed: a filled 5pt x 5pt square followed by a paragraph break and \medskip.
% NOTE(review): amsthm (loaded above) also defines \qed; this \def replaces
% that definition without any warning.
\def\qed{\rule[0pt]{5pt}{5pt}\par\medskip}

% Upright keywords for optimization problems, set in an \mbox so they can be
% used inside math mode.
\newcommand{\minimize}{\mbox{minimize}}
\newcommand{\maximize}{\mbox{maximize}}
\newcommand{\st}{\mbox{subject to}}

% \statespace{a}{b}{c}{d}: bracketed 2x2 array with a vertical rule between
% the columns and a horizontal rule between the rows.
\newcommand{\statespace}[4]{\left[ \begin{array}{c|c} #1 & #2 \\ \hline\rule{0pt}{2.6ex} #3 & #4 \end{array} \right]}
% \tf{x}: bold math symbol.
\newcommand{\tf}[1]{\boldsymbol{#1}}
% Hatted letters.  \Ahat and \Bhat are aliases of \Ah and \Bh; \Bhat is
% declared before \Bh, which is fine because the body is expanded only when
% the macro is used.
\newcommand{\Ah}{\widehat{A}}
\newcommand{\Ahat}{\Ah}
\newcommand{\Bhat}{\Bh}
\newcommand{\Bh}{\widehat{B}}
\newcommand{\Kh}{\widehat{K}}
\newcommand{\Jh}{\widehat{J}}
\newcommand{\Phixh}{\hat{\tf \Phi}_x}
\newcommand{\Phiuh}{\hat{\tf \Phi}_u}
\newcommand{\Dh}{\hat{\tf{\Delta}}}
% Plain A and B, and K with a star subscript.
\newcommand{\trueA}{A}
\newcommand{\trueB}{B}
\newcommand{\trueK}{K_\star}
% Calligraphic letters and hatted / subscripted variants built from them.
\newcommand{\A}{\mathcal{A}}
\newcommand{\B}{\mathcal{B}}
\newcommand{\Qq}{\mathcal{Q}}
\newcommand{\Ss}{\mathcal{S}}
\newcommand{\Ahh}{\hat{\A}}
\newcommand{\Bhh}{\hat{\B}}
\newcommand{\DA}{\mathcal{D}_{\A}}
\newcommand{\DB}{\mathcal{D}_{\B}}
% The word ``avg'' in text font, for use inside math.
\newcommand{\avg}{\text{avg}}


% Named math operators.  Every operator below is declared with the starred
% form, so in display style sub/superscripts are placed as limits (below and
% above the operator name).
% NOTE(review): for operators such as \rank, \diag or \Tr the unstarred form
% is the usual choice; confirm the limits placement is intended.
\DeclareMathOperator*{\argmin}{arg\!\min}
\DeclareMathOperator*{\argmax}{arg\!\max}
\DeclareMathOperator*{\sgn}{sgn}
\DeclareMathOperator*{\supp}{supp}
\DeclareMathOperator*{\rank}{rank}
\DeclareMathOperator*{\diag}{diag}
\DeclareMathOperator*{\Tr}{{tr}}
\DeclareMathOperator*{\image}{Im}
\DeclareMathOperator*{\nullspace}{Kern}
\DeclareMathOperator*{\rowspace}{RS}
\DeclareMathOperator*{\colspace}{CS}
\DeclareMathOperator*{\dom}{dom}
\DeclareMathOperator*{\closure}{cl}
\DeclareMathOperator*{\vol}{vol}
\DeclareMathOperator*{\Span}{span}
\DeclareMathOperator*{\polylog}{polylog}
\DeclareMathOperator*{\Band}{Band}
% Upright words set with \mathrm; unlike the operators above they carry no
% operator spacing.
\newcommand{\grad}{\mathrm{grad}}
\newcommand{\bias}{\mathrm{Bias}}
\newcommand{\var}{\mathrm{Var}}

% Calligraphic and blackboard-bold letter shorthands, wrapped in \ensuremath
% so they also work outside math mode.
% NOTE: \X, \Y, \R, \G, \F (and \E further down) are redefined later in this
% preamble with \def; those later definitions are the ones in effect in the
% document body.
\newcommand{\Acal}{\ensuremath{\mathcal{A}}}
\newcommand{\Cset}{\ensuremath{\mathcal{C}}}
\newcommand{\X}{\ensuremath{\mathcal{X}}}
\newcommand{\Y}{\ensuremath{\mathcal{Y}}}
\newcommand{\Z}{\ensuremath{\mathcal{Z}}}
\newcommand{\R}{\ensuremath{\mathbb{R}}}
\newcommand{\C}{\ensuremath{\mathbb{C}}}
\newcommand{\Bcal}{\ensuremath{\mathcal{B}}}
\newcommand{\G}{\ensuremath{\mathcal{G}}}
\newcommand{\Q}{\ensuremath{\mathbb{Q}}}
\newcommand{\N}{\ensuremath{\mathbb{N}}}
\newcommand{\F}{\ensuremath{\mathcal{F}}}
\newcommand{\I}{\ensuremath{\mathcal{I}}}
\newcommand{\Set}{\ensuremath{\mathcal{S}}}
\newcommand{\Hyp}{\ensuremath{\mathcal{H}}}
% \Loss and \Lagrange both typeset a calligraphic L.
\newcommand{\Loss}{\ensuremath{\mathcal{L}}}
\newcommand{\Lagrange}{\ensuremath{\mathcal{L}}}
% Norms.  The plain versions use fixed-size double bars; the big* versions
% and \onenorm use \left/\right so the bars scale with the argument.
\newcommand{\norm}[1]{\lVert #1 \rVert}
\newcommand{\bignorm}[1]{\left\lVert #1 \right\rVert}
\newcommand{\twonorm}[1]{\lVert #1 \rVert_{2}}
\newcommand{\bigtwonorm}[1]{\left\lVert #1 \right\rVert_{2}}
\newcommand{\spectralnorm}[1]{\twonorm{#1}}
\newcommand{\bigspectralnorm}[1]{\bigtwonorm{#1}}
\newcommand{\maxnorm}[1]{\lVert #1 \rVert_{\infty}}
\newcommand{\onenorm}[1]{\left\lVert #1 \right\rVert_{1}}
% \mb{x}: upright bold.
\newcommand{\mb}[1]{\mathbf{#1}}
% Angle-bracket pairs, partial-derivative fraction, absolute values,
% floor and ceiling.
\newcommand{\ip}[2]{\ensuremath{\langle #1, #2 \rangle}}
\newcommand{\bigip}[2]{\left\langle #1, #2 \right\rangle}
\newcommand{\PD}[2]{\ensuremath{\frac{\partial #1}{\partial #2}}}
% \Var produces the same output as \var defined earlier.
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\abs}[1]{\ensuremath{| #1 |}}
\newcommand{\bigabs}[1]{\ensuremath{\left| #1 \right|}}
\newcommand{\floor}[1]{\lfloor #1 \rfloor}
\newcommand{\ceil}[1]{\lceil #1 \rceil}
\newcommand{\Normal}{\mathcal{N}}
% Left arrow with a dollar sign above it.
\newcommand{\rdraw}{\xleftarrow{\$}}
% NOTE(review): \mathbbm is normally provided by the bbm package, which is
% not loaded explicitly in this file; confirm it is available before use.
\newcommand{\ind}{\mathbbm{1}}
% Replaces the standard arrow accent \vec with the upright word ``vec''.
\renewcommand{\vec}{\mathrm{vec}}
\newcommand{\Sym}{\mathbf{S}}
\newcommand{\Toep}{\mathrm{Toep}}
% The fixed expression E[exp(lambda (X - E X))].
\newcommand{\mgf}{\E\left[e^{\lambda(X-\E X)}\right]}
% \Sp expands to \calS, which is defined further down in the preamble.
\newcommand{\Sp}{\calS}

% \lmin{M}: lambda_min followed by the argument in auto-sized parentheses.
\newcommand{\lmin}[1]{\lambda_{\min}\left(#1\right)}
% \exp is redefined to take one mandatory argument and to wrap it in
% auto-sized parentheses, so it must be written \exp{...} in this document.
\renewcommand{\exp}[1]{\mathrm{exp}\left(#1\right)}
\newcommand{\leb}{\mu}
% \Pr is redefined as a blackboard-bold P; \Prob{E} adds auto-sized brackets.
\renewcommand{\Pr}{\mathbb{P}}
\newcommand{\Prob}[1]{\Pr\left[#1\right]}
% Two transpose marks: \T is the \top symbol, \tp a sans-serif T.
\newcommand{\T}{\top}
\newcommand{\tp}{\mathsf{T}}
\newcommand{\Ncal}{\mathcal{N}}
% \vecx, \vecw, \vecu typeset the plain letters x, w, u.
\newcommand{\vecx}{{x}}
\newcommand{\vecw}{{w}}
\newcommand{\vecu}{{u}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\RR}{\mathbb{R}}
% \Res{i}: Fraktur R with subscript i.
\newcommand{\Res}[1]{\mathfrak{R}_{#1}}
% Bold letters with a tilde (via \tf); \gt is a non-bold g with a tilde.
\newcommand{\Rt}{\tf{\tilde{R}}}
\newcommand{\Mt}{\tf{\tilde{M}}}
\newcommand{\Nt}{\tf{\tilde{N}}}
\newcommand{\Lt}{\tf{\tilde{L}}}
\newcommand{\Kt}{\tf{\tilde{K}}}
\newcommand{\Pt}{\tf{\tilde{P}}}
\newcommand{\Gt}{\tf{\tilde{G}}}
\newcommand{\gt}{\tilde{g}}

% Bold letters with subscript 0.
\newcommand{\Ro}{\tf{{R}}_0}
\newcommand{\Mo}{\tf{{M}}_0}
\newcommand{\No}{\tf{{N}}_0}
\newcommand{\Lo}{\tf{{L}}_0}
\newcommand{\Ko}{\tf{{K}}_0}

% Bold letters with a hat; \wh is a non-bold w with a hat.
\newcommand{\Rh}{\tf{\hat{R}}}
\newcommand{\Mh}{\tf{\hat{M}}}
\newcommand{\Nh}{\tf{\hat{N}}}
\newcommand{\Lh}{\tf{\hat{L}}}
\newcommand{\wh}{\hat{w}}

% Theta with wide tilde / wide hat, and 2x2 bracketed matrices with rows
% (R, N) and (M, L) in plain-bold, tilde and hat variants.
\newcommand{\Thetat}{\widetilde{\Theta}}
\newcommand{\Thetah}{\widehat{\Theta}}
\newcommand{\Thetamat}{\begin{bmatrix} \tf R & \tf N \\ \tf M & \tf L \end{bmatrix}}
\newcommand{\Thetatmat}{\begin{bmatrix} \Rt & \Nt \\ \Mt & \Lt \end{bmatrix}}
\newcommand{\Thetahmat}{\begin{bmatrix} \Rh & \Nh \\ \Mh & \Lh \end{bmatrix}}

% \Ind{A}: \mathbbm 1 with subscript A.
% NOTE(review): \mathbbm is normally provided by the bbm package, which is
% not loaded explicitly in this file; confirm it is available before use.
\newcommand{\Ind}[1]{\mathbbm{1}_{#1}}

% Dimension symbols n_x, n_u and the calligraphic H_infinity, H_2 and
% RH_infinity symbols.
\newcommand{\statedim}{{n_x}}
\newcommand{\inputdim}{{n_u}}
\newcommand{\hinf}{\mathcal{H}_\infty}
\newcommand{\htwo}{\mathcal{H}_2}
\newcommand{\RHinf}{\mathcal{RH}_\infty}

% Norms written with \| ... \| and a subscript (2 or \hinf).
\newcommand{\ltwonorm}[1]{\| #1 \|_2}
\newcommand{\hinfnorm}[1]{\| #1 \|_{\hinf}}
% \sim with a tiny ``i.i.d.'' stacked above it; \mathclap (mathtools) keeps
% the label from widening the relation.
\newcommand{\iid}{\stackrel{\mathclap{\text{\scriptsize{ \tiny i.i.d.}}}}{\sim}}

\newcommand{\opt}{\mathrm{opt}}
% = with a tiny ``dist'' stacked above it.
\newcommand{\dist}{\stackrel{\mathclap{\text{\scriptsize{ \tiny dist}}}}{=}}

% Calligraphic letters with a \cal-prefixed name.
\newcommand{\calF}{\mathcal{F}}
\newcommand{\calN}{\mathcal{N}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calE}{\mathcal{E}}

% Small bracketed matrices: 2x1 column, 1x2 row and 2x2 block.
\newcommand{\cvectwo}[2]{\begin{bmatrix} #1 \\ #2 \end{bmatrix}}
\newcommand{\rvectwo}[2]{\begin{bmatrix} #1 & #2 \end{bmatrix}}
\newcommand{\bmattwo}[4]{\begin{bmatrix} #1 & #2 \\ #3 & #4 \end{bmatrix}}

\newcommand{\dlyap}{\mathsf{dlyap}}
\newcommand{\lambdah}{\widehat{\lambda}}

\newcommand{\Qh}{\widehat{Q}}

% \redtext{...}: argument in red, with the color change confined to a group.
\newcommand{\redtext}[1]{{\color{red}#1}}

\newcommand{\Otilde}{\widetilde{O}}

% Short-named theorem-like environments, each with its own counter.
% NOTE: a second set with longer names (definition, example, proposition,
% coro) is declared further down; the two sets number independently.
\newtheorem{prop}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{defn}{Definition}
\newtheorem{ex}{Example}
\usepackage{float}

% Single-letter shorthands defined with \def, which overwrites any existing
% definition without warning.  Several of these replace macros declared
% earlier with \newcommand (\R, \E, \F, \G, \X, \Y); the versions here have
% no \ensuremath wrapper and therefore require math mode.
% NOTE(review): \H, \S, \P and \v are standard LaTeX text commands (Hungarian
% umlaut accent, section sign, paragraph sign, caron accent).  After these
% lines they produce math symbols instead, which can break accented names
% (e.g. in the bibliography); confirm nothing relies on the originals.
\def\R{\mathbb{R}}
\def\Eps{\mathcal{E}}
\def\E{\mathbb{E}}
\def\V{\mathbb{V}}
% This \F is overwritten by the \mathbf{F} definition a few lines below.
\def\F{\mathcal{F}}
\def\G{\mathcal{G}}
\def\H{\mathcal{H}}
\def\S{\mathcal{S}}
\def\1{\mathbf{1}}
% NOTE(review): \nappa is not defined anywhere in the visible source, so
% using \n would raise an undefined-control-sequence error unless it is
% defined elsewhere; possibly a typo.
\def\n{\nappa}
% \h typesets a bold w (not a bold h).
\def\h{\mathbf{w}}
\def\v{\mathbf{v}}
\def\x{\mathbf{x}}
\def\X{\mathcal{X}}
\def\Y{\mathcal{Y}}
\def\eps{\epsilon}
\def\y{\mathbf{y}}
\def\e{\mathbf{e}}
\def\M{\mathbf{M}}
% Final definition of \F: upright bold F.
\def\F{\mathbf{F}}
\def\P{\mathbf{P}}
\def\calP{\mathcal{P}}
\def\calM{\mathcal{M}}
% Blackboard-bold D with a ``KL'' subscript.
\def\kl{\mathbb{D}_{\text{KL}}}

\newcommand{\D}{\mathcal{D}}




\usepackage{subcaption}
\usepackage{algorithm,algorithmic}
\usepackage{pdflscape}
\usepackage{afterpage}
\usepackage{import}
\usepackage[toc,title,page]{appendix}
\usepackage{cancel}
\usepackage{url}
% Restrict the url package's break characters to `/' and `-'.
\def\UrlBreaks{\do\/\do-}

% \newreptheorem{<env>}{<Title>} defines an environment rep<env> that takes
% one argument, a label.  It sets \rep@title to ``<Title> \ref{<label>}'' and
% then opens the unnumbered rep@theorem environment, whose heading is
% whatever \rep@title currently holds.  The effect is a restated theorem that
% carries the number of the original statement.
\makeatletter
\newtheorem*{rep@theorem}{\rep@title}
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
 \def\rep@title{#2 \ref{##1}}%
 \begin{rep@theorem}}%
 {\end{rep@theorem}}}
\makeatother


% Main theorem-like environments.  definition and example have their own
% counters.  theorem is numbered within sections, and lemma, proposition,
% coro, remark and hypothesis all share the theorem counter, so they must be
% declared after theorem.
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{example}{Example}
\numberwithin{theorem}{section}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{coro}[theorem]{Corollary}

\newtheorem{remark}[theorem]{Remark}
\newtheorem{hypothesis}[theorem]{Hypothesis}

\usepackage{thm-restate}

% Restatement environments reptheorem, replemma and repcoro, built with
% \newreptheorem; each takes the label of the statement being repeated.
\newreptheorem{theorem}{Theorem}
\newreptheorem{lemma}{Lemma}
\newreptheorem{coro}{Corollary}

\usepackage{listings}

\title{Efficient and Accurate Top-$K$ Recovery from Choice Data}

\author[1]{\href{mailto:<mdnguyen@seas.upenn.edu>?Subject=Your UAI 2022 paper}{Duc Nguyen}{}}

% Add affiliations after the authors
\affil[1]{%
    Department of Computer and Information Science\\
    University of Pennsylvania.
}
  
\begin{document}
\maketitle


\begin{abstract}
The intersection of learning to rank and choice modeling is an active area of research with applications in e-commerce, information retrieval and the social sciences. In some applications such as recommendation systems, the statistician is primarily interested in recovering the set of the top ranked items from a large pool of items as efficiently as possible using passively collected \emph{discrete choice data}, i.e., the user picks one item from a set of multiple items. Motivated by this practical consideration, we propose \emph{the choice-based Borda count algorithm} as a fast and accurate ranking algorithm for \emph{top-$K$ recovery}, i.e., correctly identifying all of the top $K$ items. We show that the choice-based Borda count algorithm has optimal sample complexity for top-$K$ recovery under a broad class of \emph{random utility models}. We prove that in the limit, the choice-based Borda count algorithm produces the same top-$K$ estimate as the commonly used Maximum Likelihood Estimate method but the former's speed and simplicity bring considerable advantages in practice. Experiments on both synthetic and real datasets show that the counting algorithm is competitive with commonly used ranking algorithms in terms of accuracy while being several orders of magnitude faster.
\end{abstract}

%%%%%%%%%%% Introduction
\section{Introduction}\label{sect:intro}

The research on discrete choice modeling and learning to rank has received a lot of interest in recent years thanks to the growing availability of discrete choice data generated by e-commerce platforms, search engines and the social sciences. In the discrete choice setting, when presented with a set of items, also referred to as \emph{menu}, the user picks the most preferred item. Discrete choice data is an intermediate between pairwise comparison data and full ranking data. In many settings such as e-commerce and political surveys, a large quantity of passively collected data is in the form of discrete choice data, e.g., consumers choosing to buy a product when presented with a catalogue of items, voters picking a favorite candidate from a pool of candidates.

In this paper, we focus on the problem of learning to rank using choice data. Specifically, we are interested in the top-$K$ recovery problem, i.e.,  identifying the set of the top $K$ items out of a universe of $n$ items, using \emph{passively collected choice data}. This problem has many useful applications. For example, in e-commerce applications, marketers are interested in finding the set of the best items based on how consumers make purchasing decisions. In the social sciences, political scientists are interested in determining the most preferred candidates among a pool of candidates using survey questionnaires.

To ground our theoretical discussions, we posit that the choice data is generated according to a probabilistic choice model: when presented with a menu of items $S$, the user makes a non-deterministic decision, picking a single item $i$ from $S$ with some probability $p_{i|S}$.
More specifically, we assume our choice model falls within the class of Random Utility Models with Independently and Identically Distributed noise (IID-RUMs), described in detail in Section~\ref{sect:preliminaries}. IID-RUMs are an expressive and flexible framework that can be used to model pairwise comparison data, discrete choice data as well as full ranking data. For example, the Multinomial Logit (MNL) model is one of the most commonly used IID-RUMs to model discrete choice data \citep{train2009discrete}.

\underline{\textbf{Our motivation:}} While expressive, random utility models also pose hard computational problems.
For example, many models within the class of IID-RUMs with the few exceptions such as the MNL model do not admit analytical expression for the choice probabilities (while the pairwise comparison probabilities can be evaluated easily), limiting inference to MCMC-based algorithms. However, sampling-based algorithms can be time inefficient when running on large choice datasets with many items and menus. Furthermore, most classical inference algorithms assume a parametric model generating the choice data. In practice, it is often hard to verify if the data comes from a specific parametric model. Therefore, developing efficient ranking algorithms that are robust to model misspecification is of timely interest. 

Motivated by these considerations, we study the generalization of a simple yet powerful counting algorithm for ranking---Borda count---to the discrete choice setting. The Borda count algorithm itself has a long history, dating back to the 18th century and its analysis has been instantiated in various contexts such as ranking from pairwise comparisons in \cite{rajkumar2014statistical,shah2017simple}. Our work, however, is the first to study the theoretical guarantees of Borda count in the \emph{discrete choice setting} under a broad class of discrete choice models.

\underline{\textbf{Our contributions:}}
\begin{itemize}
\item In Section \ref{sect:upper-bound} and Section \ref{sect:lower-bound}, we show that the choice-based Borda count algorithm needs $\Theta(n\log n)$ samples in order to exactly recover all of the top $K$ items using choice data. We further show that this sample complexity is optimal for a broad class of IID-RUMs. This hinges on a fundamental property shared by many IID-RUMs which we term \emph{Borda consistency}.
\item In Section \ref{sect:menu-size}, we study the effect of the menu size $m$ on the sample complexity for top $K$ recovery. For the special case of the MNL model, which is a common assumption in the ranking literature, we present an asymptotic characterization of the optimal sample complexity for top $K$ recovery in terms of $m$. This bound monotonically decreases, but at a decreasing rate, with $m$. This suggests that there is a benefit to increasing the menu size but such benefit comes with diminishing returns. To the best of our knowledge, this result is the first of its kind in the choice modeling and ranking literature.
\item In Section \ref{sect:connection-mle-borda}, we study the connections between the choice-based Borda count algorithm and two commonly used top-$K$ recovery algorithms: Maximum Likelihood Estimate under MNL assumption (MNL-MLE) and Spectral Ranking \citep{negahban2017rank,maystre2015fast,agarwal2018accelerated}. We prove that the choice-based Borda count algorithm and MNL-MLE produce the same top-$K$ estimate in the limit of infinite data, even if the data has not been generated by an IID-RUM. On the other hand, Spectral Ranking does not in general give the same estimate as the choice-based Borda count algorithm/MNL-MLE even with infinite data.
\item In Section \ref{sect:experiment-compact}, we show through empirical experiments that the choice-based Borda count algorithm is competitive in terms of accuracy with both MNL-MLE and Spectral Ranking while being several orders of magnitude faster. This highlights the advantage of the choice-based Borda count algorithm in applications where the statistician is primarily interested in efficiently and accurately identifying the top items.
\end{itemize}

\subsection{Related works}
Our work falls within the literature on learning to rank under Random Utility Models (RUMs). 
There has been a substantial amount of work on learning to rank under Random Utility Models and mixtures of Random Utility Models using \textit{full ranking data} \citep{parkes2012random, azari2013generalized, azari2013mom, soufiani2014computing, zhao2016learning,zhao2019learning}. Furthermore, most classical ranking methods assume that the data is generated by a well specified RUM. To the best of our knowledge, our paper is the first to propose a method for top-$K$ ranking under a broad class of RUMs using \textit{passively collected choice data alone}.

The related literature on ranking from pairwise comparisons is vast and we can only refer the interested reader to adjacent problems such as \emph{active top-$K$ recovery} from pairwise comparisons \citep{busa2013top,agarwal2017learning,mohajer2017active,falahatgar2017maxing,falahatgar2018limits,heckel2019active}; top-$K$ recovery from pairwise comparisons \citep{chen2015spectral,shah2017simple,chen2019spectral}; top-$K$ recovery from \emph{$m$-wise sorted data} (full rankings among some $m$ items) \citep{jang2017optimal,chen2020top}.

Closest to our work is the analysis of Borda count by \cite{shah2017simple} who showed that it is optimal for top-$K$ recovery from \textit{pairwise comparisons}. Our work complements theirs by showing that the choice-based Borda count is optimal even in the \emph{general choice setting}. To this end, we obtain in Section \ref{sect:upper-bound} sample complexity upper and lower bounds that are \emph{both more general and refined} than those given by \cite{shah2017simple}. We also study in Section \ref{sect:menu-size} the effect of the menu size on the sample complexity. To the best of our knowledge, our paper presents the first asymptotic characterization of the sample complexity for ranking from choice data in terms of the menu size under the very commonly used MNL model. Operating on \emph{$m$-wise sorted data}, \cite{jang2017optimal} showed that the optimal sample complexity for top-$K$ recovery under the Plackett-Luce model\footnote{Within the ranking literature, Plackett-Luce (PL) is a class of distributions over permutations, induced by the IID-RUM with standard Gumbel noise.} scales with $O(\frac{1}{m})$. Our results complement theirs by showing that the sample complexity for top-$K$ recovery from \emph{discrete choice data} scales as $O(1+\frac{1}{m})$. Furthermore, the choice-based Borda count algorithm is different from the Spectral-MLE algorithm studied there which is specialized to the Plackett-Luce model.

\section{Notations and problem formulation}\label{sect:preliminaries}

Let there be $n$ items in the universe. Each item $i$ has a \emph{deterministic and hidden} utility, also referred to as partworth, $U_i$ for $i = 1, \ldots, n$. Let us assume the non-degenerate case where no two items have identical partworths. Without loss of generality, we also assume that $U_{\max} = U_1 > U_2 > \ldots > U_n = U_{\min} > 0$. Let $\Ss_K^* = \{1,\ldots,K\}$ denote the set of $K$ items with the highest partworths.

Items are presented to the consumer in a set $S$, also referred to as menu, of size at least 2. When $S$ is presented to the consumer, the perceived utility of each item $i \in S$ is the sum of its partworth and a random noise term: $X_i = U_i + \epsilon_i$ where the $\epsilon_i$'s are independently and identically distributed according to an \emph{unknown} universal noise distribution $D$. The consumer then picks the item $i$ with the highest perceived utility among all the items in $S$. Such a choice model is referred to as a random utility model with independent and identically distributed noise (IID-RUM). In short, a choice model $\rho$ within the class of IID-RUMs is parametrized by a set of partworths $\{U_1,\ldots, U_n\}$ and noise distribution $D$.

As an overload of notation, we will also use $\rho(i|S)$ to denote the probability that a consumer picks item $i$ from menu $S$ under choice model $\rho$. By definition, $\rho(i|S) = \PP(X_i > X_k\,\forall k \in S\backslash \{i\})\,$. For simplicity, we consider a fixed menu size $m$. However, our analysis can be easily extended to account for a mixture of menu sizes. 

A choice sample is a (menu, item) tuple $(S, y)$ where the consumer chooses item $y$ from menu $S$.
A choice dataset is a set of choice samples.
A top-$K$ recovery algorithm takes in a choice dataset and returns an estimate of the top $K$ items, $\hat\Ss_K$. The goal is to exactly recover the top $K$ items and the performance metric of interest is the 0-1 loss: $L_{01}(\hat \Ss_K, \Ss_K^* ) = \mb 1 [\hat \Ss_K \neq \Ss_K^*]$\footnote{$\mb 1$ is the indicator function and the comparison is between sets, so the loss is $0$ exactly when the estimate equals the true top-$K$ set.}.

We emphasize that as opposed to the \emph{top-$K$ ranking} problem, the objective of the top-$K$ recovery problem is to accurately identify the set of the top $K$ items, while allowing for mis-ranking among these items.

\section{The choice-based Borda count algorithm}\label{sect:algo}

As discussed previously, the general counting approach referred to as Borda count has a long history and has been instantiated in various contexts such as ranking from pairwise comparisons. Here, we instantiate the Borda count approach to the more general discrete choice setting. This is shown in Algorithm \ref{alg:gbc}.

As would be expected, the algorithm essentially tallies the number of observed `wins' by each item and finally ranks the items by their number of wins, returning the top $K$ items. As with other versions of the Borda count approach, the algorithm is simple and easy to implement; and very efficient in practice. This makes the choice-based Borda count algorithm appropriate in settings where the statistician is primarily interested in efficiently and accurately recovering the set of the top items from a large pool of choice data.

\begin{algorithm}[]
\caption{The choice-based Borda count algorithm}
\hspace*{\algorithmicindent}\textbf{Input: }Choice dataset $\B = \{(S_l,y_l)\}_{l=1}^N$\\
\hspace*{\algorithmicindent}\textbf{Output: }Top-$K$ estimate $\hat \Ss_K$\\
\begin{algorithmic}[1]
  \STATE For each item $i = 1,\ldots, n$\\
  \STATE \hspace{1em} Compute the number of times $i$ gets chosen:\\
  \STATE \hspace{2em} $\hat W_i := \sum_{l=1}^N \mathbf{1}[y_l = i]$\\
  \STATE Return the set of $K$ items corresponding to the highest $\hat W_i$'s. Ties are broken arbitrarily.\\
\end{algorithmic}
\label{alg:gbc}
\end{algorithm}

%%%%%% Upper bound
\section{Sample complexity bound}\label{sect:upper-bound}

In this section, we present the sample complexity of the choice-based Borda count algorithm for top-$K$ recovery. We first formalize our sampling model in Section \ref{subsect:sampling-model}. In Section \ref{subsect:generalized-borda-score}, we characterize the class of IID-RUMs under which the choice-based Borda count can successfully identify all of the top $K$ items via a theoretical quantity we term the \emph{generalized Borda score}. The main theorems on the sample complexity of the choice-based Borda count algorithm are presented in Section \ref{subsect:sample-complexity-borda}.

\subsection{The Sampling Model} \label{subsect:sampling-model}
Let $\Cset^{(m)}$ be the set of \textit{all menus} of size $m \geq 2$ (i.e., $\lvert \Cset^{(m)}\rvert = {n\choose m}$). Additionally, let $\Cset_i^{(m)}$ be the set of all menus of size $m$ containing item $i$ (i.e., $\lvert \Cset_i^{(m)}\rvert = {n-1\choose m-1} $). We consider a multiple-round uniform sampling model with $R$ rounds of sampling in total. In each round $r=1,\ldots,R$, each menu $S\in \Cset^{(m)}$ is independently offered with probability $p > 0$. Let $\hat\Cset^{(m,r)}$ denote the set of menus of size $m$ that are offered in round $r$.  If offered menu $S$, the user responds with a random choice $y_S^{(r)}$, where
$$ \Pr(y_S^{(r)} = i) = \rho(i|S)\,.$$
It is easy to check that we have, in expectation, $pR{n\choose m}$ samples over $R$ rounds. 

As a practical example, this sampling procedure can be used to design online political surveys. Suppose that there are $R$ voters willing to take part in answering survey questionnaires to determine support for $n$ political candidates. Fix a ballot size $m$. For each voter, each ballot of size $m$ is independently presented to that voter with probability $p$. For each ballot, the voter picks one favorite candidate.

\subsection{The generalized Borda score}\label{subsect:generalized-borda-score}
For each item $i$, define the following theoretical quantity, which we term the \textit{generalized Borda score}:
\begin{equation*}\label{def:tau}
    \tau^{(m)}_i = \frac{1}{{n-1\choose m-1}} \cdot \sum_{S\in \Cset_i^{(m)}} \rho(i|S) \,.
\end{equation*}
Intuitively, the generalized Borda score is the expected probability that an item $i$ is chosen from a menu $S$ where $S$ is uniformly sampled from $\Cset_i^{(m)}$. Note that $\tau_i^{(m)} \in [0,1]$ for all $i\in [n]$. The generalized Borda score is interesting to us because \emph{for a large class of IID-RUMs}, the order among the items with respect to the generalized Borda scores is the same as that with respect to the partworths. Therefore, it suffices to rank the items by their generalized Borda scores to recover the items with the highest partworths. Formally, we can characterize this class of IID-RUMs using a property we term \emph{Borda consistency}.
\begin{definition}\label{def:borda-consistency} An IID-RUM $\rho$ satisfies \textit{Borda consistency} if for any two items $i, j$ and any menu size $m \geq 2$,
$$\tau^{(m)}_i  > \tau^{(m)}_j \Leftrightarrow U_i > U_j \,.$$
\end{definition}
The following lemma establishes that many commonly used IID-RUMs such as the MNL (Gumbel distributed noise) and the Probit (Normal distributed noise) model satisfy Borda consistency.
\begin{lemma}\label{lem:iid-rum-order} All IID-RUMs whose noise distribution has an absolutely continuous density function and support on the real line satisfy Borda consistency.
\end{lemma}

In the supplementary materials, we will show that Borda consistency is satisfied by an even broader class of IID-RUMs that include other commonly used models such as the IID-RUM with exponentially distributed noise. Intuitively, this stems from the property enjoyed by many IID-RUMs: for any two items $i, j$ where $U_i > U_j$, $\rho(i|S) > \rho(j|S) \,\,\forall S \in \Cset^{(m)}:i, j\in S$; and $\rho(i|S\cup \{i\}) > \rho(j|S\cup\{j\}) \,\,\forall S \in \Cset^{(m-1)}: i, j\notin S$. To the best of our knowledge, this fundamental property that holds across a very broad class of IID-RUMs has not been previously described in the literature and may be useful to future works exploring the intersection of ranking and choice modeling.

\subsection{Exact top-$K$ recovery }\label{subsect:sample-complexity-borda}
Having established Borda consistency as a property enjoyed by many IID-RUMs, we will now present the finite sample guarantees of the choice-based Borda count algorithm for top-$K$ recovery that holds for all choice models in this broad class of IID-RUMs. To show that the choice-based Borda count algorithm accurately identifies all of the top $K$ items with high probability, it suffices to bound the probability that the algorithm mistakenly ranks an item $j \notin \Ss_K^*$ higher than another item $i \in \Ss_K^*$. Specifically, we want to bound the following probabilities.
$$ \PP(\hat W_j > \hat W_i) \quad \forall i \in \Ss_K^*, j\notin \Ss_K^* \,,$$
where $\hat W_i$ is defined in Algorithm~\ref{alg:gbc}.
Considering this, the fundamental hardness of top-$K$ recovery lies in distinguishing between the $K$-th and $(K+1)$-th best item, and therefore depends on the gap between their generalized Borda scores:
\begin{equation*}\label{def:delta-K}
    \Delta_K^{(m)} = \tau^{(m)}_{K} - \tau^{(m)}_{K+1} \,.
\end{equation*}
The smaller this gap, the more data the algorithm requires in order to correctly separate between the top $K$ and the bottom $n-K$ items. Building on this intuition and generalizing to any pair of items $(i,j)$ where $\tau_i > \tau_j$, we obtain the following upper bound on $ \PP(\hat W_j > \hat W_i)$.
\begin{lemma}\label{thm:error-bound-gen-borda}
Consider an IID-RUM that satisfies Borda consistency per Definition \ref{def:borda-consistency}. Assume input choice data with menu size $m$ is generated according to the sampling model described in Section \ref{subsect:sampling-model}. For any two items $i$ and $j$ where $\tau_i^{(m)} > \tau_j^{(m)}$, the choice-based Borda count algorithm satisfies
\begin{multline*}
\PP(\hat W_j > \hat W_i) \leq \exp{\frac{-3pR{n\choose m} {m(\tau_i^{(m)}-\tau_j^{(m)})}^2 }{8n(\tau_i^{(m)} +\tau_j^{(m)})}} \,.
\end{multline*}
\end{lemma}

The proof of Lemma \ref{thm:error-bound-gen-borda} uses a standard concentration inequality argument based on Bernstein's inequality (cf. Theorem 2.8.4 \cite{vershynin2018high}). The lemma itself states that, for each pair $i\in \Ss_K^*, j\notin \Ss_K^*$, if $pR{n\choose m} \geq  {\frac{8n\log n (\tau_i^{(m)} + \tau_j^{(m)} )}{m(\tau^{(m)}_i - \tau^{(m)}_j)^2}}$, then $\Pr(\hat W_j > \hat W_i) = O(\frac{1}{n^3})$. We also have the following lemma which presents an upper bound on the item-dependent term $ \frac{\tau^{(m)}_i + \tau^{(m)}_j}{(\tau^{(m)}_i - \tau^{(m)}_j)^2 }$.

\begin{lemma}\label{lem:simple-inequality} 
Consider an IID-RUM that satisfies Borda consistency per Definition \ref{def:borda-consistency}. For any $K$, we have
$$ \frac{\tau^{(m)}_K + \tau^{(m)}_{K+1}}{{\Delta_K^{(m)}}^2 } = \max_{i\in \Ss^*_K, j\notin \Ss^*_K} \bigg\{ \frac{\tau^{(m)}_i + \tau^{(m)}_j}{(\tau^{(m)}_i - \tau^{(m)}_j)^2 } \bigg\} \, .$$
\end{lemma}

By combining the two lemmas above and applying union bound over all pairs $i\in \Ss_K^*, j\notin \Ss_K^*$, we obtain the following sample complexity bound for exact top-$K$ recovery:
\begin{theorem}\label{cor:sample-complexity-gbs}
    Assume the conditions of Lemma~\ref{thm:error-bound-gen-borda}. Given sufficiently large $p, R$ such that $pR{n \choose m} \geq \frac{8n \log n}{m{\Delta_K^{(m)}}^2}\cdot(\Delta_K^{(m)} +2\tau_{K+1}^{(m)})$, the choice-based Borda count algorithm correctly identifies all of the top $K$ items with probability at least $1-O(\frac{K}{n^2})$.
\end{theorem}
The reader may also recognize that $\Delta^{(m)}_K + 2\tau_{K+1}^{(m)}$ is simply $\tau_K^{(m)}+\tau_{K+1}^{(m)}$. The former presentation is, however, useful in highlighting the main quantities that will also reappear in our matching lower bound. In summary, the choice-based Borda count algorithm has the following sample complexity for exact top-$K$ recovery:
$$O\bigg(\frac{n\log n}{m{\Delta_K^{(m)}}}\cdot(1 + \frac{\tau_{K+1}^{(m)}}{\Delta_K^{(m)}} )\bigg)\,.$$
This shows that overall, we only need $O(n\log n)$ examples to recover the top $K$ items from choice data with high accuracy. Our upper bound (and matching lower bound to be shown) can be seen as both \emph{generalization and refinement} of Theorem 1 of \cite{shah2017simple}. Under the pairwise comparison setting ($m=2$), we can simply upper bound $\tau_{K+1}^{(m)} \leq 1$ and recover the (optimal) sample complexity $O\big(\frac{n\log n}{{\Delta_K^{(2)}}^2}\big)$ of Borda count obtained by \cite{shah2017simple}. The analysis approach there, however, is insufficient to produce an optimal sample complexity bound in the discrete choice setting. Note also that there can be combinatorially many realizations of the data in the discrete choice setting as $\lvert \Cset^{(m)} \rvert = {n\choose m}$. Our proof therefore requires considerably more effort. Our bound also shows that the sample complexity depends not only on the gap $\Delta_K^{(m)}$ between the $K$-th and $K+1$-th item, but also the relative `strength' of the $K+1$-th item, as captured by the $\frac{\tau_{K+1}^{(m)}}{\Delta_K^{(m)}}$ term.

In general, the factors $\tau_{K+1}^{(m)}$ and $\Delta_K^{(m)}$ don't admit closed form expressions because both are sums of ${n-1\choose m-1}$ terms. The reader may also recognize that these parameters also depend on the menu size $m$, the partworth parameters and the noise distribution. In the next section, we will show a \emph{matching lower bound} in terms of the same parameters, establishing the optimality of the choice-based Borda count algorithm, and discuss why the exact relation between $\Delta_K^{(m)}$, $\tau_{K+1}^{(m)}$ and the model parameters remains elusive. 

Often in practice, we may tolerate some error for top-$K$ ranking by allowing the algorithm to misidentify, up to a threshold, some number of items. This is known as \emph{approximate top-$K$ recovery}. We include detailed discussions of this problem in the supplementary materials and show that the choice-based Borda count algorithm also has \emph{optimal sample complexity} for approximate top-$K$ recovery under the broad class of IID-RUMs that satisfy Borda consistency.

%%%%%% Lower bound

\section{Information-theoretic lower bound}\label{sect:lower-bound}
In this section, we will show that the choice-based Borda count algorithm enjoys optimal sample complexity by furnishing a matching lower bound. 
To show a lower bound, we will construct a special subclass of the MNL family where any estimator requires $\Omega(n\log n)$ examples in order to exactly recover the top $K$ items. We defer detailed descriptions of this model to the supplementary materials while stating the main results as follows.
\begin{theorem}\label{thm:lower-bound-all} 
Consider the sampling model described in Section \ref{subsect:sampling-model}.
There exists a class of MNL models such that for $n \geq 20$, if $pR{n\choose m} \leq \frac{n\log n }{8}\cdot\frac{\tau_{K+1}^{(m)} +\Delta_K^{(m)} }{m{\Delta_K^{(m)}}^2}$ then any estimator fails to correctly identify all of the top $K$ items with probability at least $\frac{1}{12}$.
\end{theorem}

The proof of Theorem (\ref{thm:lower-bound-all}) first reduces the problem of exact top-$K$ recovery to a multiple hypothesis testing problem and then applies Fano's lemma \citep{cover1999elements}. Each hypothesis in the testing problem corresponds to an MNL model. Within each model, the set of the top $K$ items always includes items $1,\ldots, K-1$. However, the index of the remaining item in the top-$K$ set is different for each model (i.e., there are $n-K+1$ different models). We make all of the top $K$ items have the same partworths while the bottom $n-K$ items have the same (and lower) partworths. The key challenge then is to obtain a tight upper bound on the KL divergence between any two hypothesis models. 
In summary, Theorem (\ref{thm:lower-bound-all}) implies the following \textit{minimum} sample complexity for any algorithm for top-$K$ recovery:
$$ \Omega\bigg( \frac{n\log n}{m{\Delta_K^{(m)}}} \cdot \big(1 + \frac{\tau_{K+1}^{(m)}}{\Delta_K^{(m)}}\big) \bigg) \,.$$
Comparing with the bound in Theorem \ref{cor:sample-complexity-gbs}, one can see that the sample complexity of Borda Count is optimal in terms of both $m$, $n$ as well as the model dependent parameters $\Delta_K^{(m)}$ and $\tau_{K+1}^{(m)}$.

\section{The role of the menu size $m$}\label{sect:menu-size}
The effect of the menu size on the performance of top-$K$ recovery algorithms is an aspect of both theoretical and practical importance. In real life applications, the menu size could range from 2 to hundreds of items. One may suspect that increasing the menu size means the data carries more information per data point, and thereby reduces the sample complexity for top-$K$ recovery. However, to the best of our knowledge, such a relationship has not been theoretically established in the literature on choice modeling, even for the very commonly used MNL model.

As seen in the matching lower and upper bound for the sample complexity of top-$K$ recovery, the menu size enters in complex ways through the factors $\Delta_K^{(m)}$ and $\tau_{K+1}^{(m)}$. Both factors can vary in subtle ways with $m$, depending on the underlying choice model. Even for the class of MNL models which admit closed form choice probabilities, these factors don't seem to have a closed form expression as each of them is a sum of ${n-1\choose m-1}$ terms. To bypass the difficulty of exactly evaluating $\Delta_K^{(m)}$ and $\tau_{K+1}^{(m)}$, we characterize the asymptotic dependency of $\frac{1}{m{\Delta_K^{(m)}}}$ and $\frac{\tau_{K+1}^{(m)}}{\Delta_K^{(m)}}$ on $m$ \emph{under the MNL class of models} and show that both of these factors monotonically decrease with $m$ but at a \emph{decreasing rate}. This implies that while there is an advantage to using choice data of larger menu sizes, there is a diminishing return to increasing the menu size.
\begin{theorem}\label{thm:mnl-variational-lower-bound} For any MNL model and a fixed $K$,
    \begin{equation*}
     \frac{1}{m\Delta_K^{(m)}} = \theta\bigg( \frac{1}{e^{U_K}-e^{U_{K+1}}}\cdot \big(1 + \frac{1}{m-1}\big)  \bigg)\,,
    \end{equation*}
    \begin{equation*}
    \frac{\tau^{(m)}_{K+1}}{\Delta_K^{(m)}} = \theta\bigg( \frac{e^{U_{K+1}}}{e^{U_K}-e^{U_{K+1}}}\cdot \big( 1+ \frac{1}{m-1}\big) \bigg) \,.
    \end{equation*}
\end{theorem}
It can be seen in both $ \frac{1}{m\Delta_K^{(m)}}$ and $\frac{\tau^{(m)}_{K+1}}{\Delta_K^{(m)}}$ that the term which depends on $m$, $1 + \frac{1}{m-1}$, monotonically decreases with $m$ but at a diminishing rate. Combining the above theorem and the matching sample complexity bounds obtained earlier, one can see that the optimal sample complexity for top-$K$ recovery from choice data scales as $\theta(1+\frac{1}{m})$.

Outside of the MNL family of models, we are not aware of any IID-RUM that admits a closed form expression for the choice probabilities. However, if we know all of the partworths and the noise distribution, we can still approximate the choice probabilities via Monte Carlo sampling. Given these (approximated) choice probabilities, one can then evaluate $ \frac{1}{m\Delta_K^{(m)}}$ and $\frac{\tau^{(m)}_{K+1}}{\Delta_K^{(m)}}$. 
As an example, Figure \ref{fig:model-quantities} shows how these quantities vary with $m$ under a randomly generated MNL and Probit model (IID-RUM with standard normal noise) \citep{train2009discrete} with $n=15$, $K=3$. The partworths were independently generated from a zero-mean normal distribution which is also a commonly chosen prior in the literature \citep{parkes2012random, train2009discrete}. The curves for $\frac{1}{m{\Delta_K^{(m)}}}$ and $\frac{\tau_{K+1}^{(m)}}{\Delta_K^{(m)}}$ decrease at a rate approximately similar to those of the MNL model as stated in Theorem \ref{thm:mnl-variational-lower-bound}.
\begin{figure}[]
    \centering
        \includegraphics[scale=0.5]{figs/mnl_probit_special_quantities.png}
    \caption{$\frac{1}{m{\Delta_K^{(m)}}}$ and $\frac{\tau_{K+1}^{(m)}}{\Delta_K^{(m)}}$ decrease with larger $m$ under a randomly generated MNL and Probit model. This suggests that there is an advantage, albeit with diminishing return, to using larger menu sizes. }
    \label{fig:model-quantities}
\end{figure}
Ranking from choice data under MNL model assumptions remains an active area of research \citep{agarwal2018accelerated,agarwal2020choice} and to the best of our knowledge, our work presents \emph{the first asymptotic characterization} of the optimal sample complexity for top-$K$ recovery in terms of the menu size $m$ under this often used class of choice models.

%%%% connection between MLE and Borda
\section{Connections to commonly used ranking algorithms}\label{sect:connection-mle-borda}

In this section, we establish close connections among choice-based Borda count, the method of maximum likelihood estimate under MNL assumptions (MNL-MLE) \citep{train2009discrete} and Spectral Ranking \citep{negahban2017rank,maystre2015fast,agarwal2018accelerated} which will explain many experimental results we present in later sections.

Firstly, one can prove that choice-based Borda count and MNL-MLE are `equivalent' top-$K$ recovery algorithms in the limit of infinite data. This connection is formalized as follows.
\begin{theorem}\label{thm:borda-mle-connection} Consider the sampling model described in Section \ref{subsect:sampling-model}. For any $p > 0$, in the limit as $R\rightarrow \infty$, MNL-MLE and choice-based Borda count will produce the same top-$K$ estimate. Moreover, this holds even if the data does not come from the MNL model or any IID-RUM.
\end{theorem}

A similar observation was made in \cite{rajkumar2014statistical}: under the \emph{pairwise comparison} setting the Borda count algorithm and MNL-MLE are both consistent for full ranking under a class of pairwise comparison models that is strictly more general than the BTL model.\footnote{The BTL model is the instantiation of the MNL model to the pairwise comparison setting.} Our results generalize the relation between the two algorithms to the choice setting and show that in fact the Borda count algorithm and MNL-MLE produce the same estimate in the limit of infinite data under any choice model. This connection between the two methods is reflected in our experiments where the performance of the choice-based Borda count algorithm is almost identical to that of MLE, when the sample size is large. While performing similarly to MNL-MLE, the choice-based Borda count algorithm is several orders of magnitude faster thanks to its simplicity. This suggests that if the statistician is mostly concerned with recovering a small number of top items, the choice-based Borda count algorithm should be seriously considered due to its speed, simplicity and guaranteed optimal sample complexity.

The above result also means that MNL-MLE is a consistent top-$K$ ranking algorithm under the broad class of IID-RUMs, since the choice-based Borda count algorithm is consistent in recovering the top $K$ items. This shows that MNL-MLE may be used for ranking applications even when the data does not satisfy the MNL assumption. Consistency of MLE under model misspecification is an underexplored question and we leave the careful characterization of the sample complexity of MNL-MLE when the data comes from a non-MNL distribution as a subject of future studies.

On the other hand, Spectral Ranking does not in general produce the same top-$K$ estimate as MNL-MLE/choice-based Borda count. However, when the underlying choice model falls within a broad class of IID-RUMs which include many commonly used choice models such as the MNL and Probit model, all three algorithms produce the same estimate given infinite data.

\begin{theorem}\label{thm:borda-mle-asr-connection} Consider the sampling model described in Section \ref{subsect:sampling-model}. Assume that the underlying choice model generating the data is in the class of IID-RUMs whose noise distribution has an absolutely continuous density function with support on the real line. For any $p > 0$, in the limit as $R\rightarrow \infty$, Spectral Ranking, MNL-MLE and choice-based Borda count produce the same top-$K$ estimate.

On the other hand, there exists a choice model where in the limit as $R\rightarrow \infty$, the Spectral Ranking algorithm produces a different top-$K$ estimate from MNL-MLE/Borda count.
\end{theorem}

%%%% Experiment
\section{Experiments}\label{sect:experiment-compact}
In this section, we present experimental results on both synthetic and real datasets. The main performance metric is top-$K$ accuracy. More specifically, we measure top-$K$ accuracy as the frequency at which the respective algorithm correctly identifies \emph{all} of the true top $K$ items, \emph{over 100 trials}.

\subsection{Synthetic data}\label{subsect:experiments-synthetic}
We verify, via synthetic experiments, the efficacy of the choice-based Borda count algorithm and the effect of the menu size $m$ on its performance.
Let there be $n=50$ items in the universe. We experiment with 3 different noise distributions: standard Normal noise (Probit), standard Gumbel noise (MNL) and standard Exponential noise. We vary the menu size $m=2,4,6,8$ and $K = 1,3,5$. Figure \ref{fig:synthetic-borda-compact} shows top-$K$ accuracy against the sample size. In all experiments, choice-based Borda count successfully identifies the top $K$ items with high probability given sufficiently large sample size. Furthermore, using larger menu sizes improves the performance of Borda Count. However, it can be seen that there is a diminishing return in performance gains from using larger menu sizes, agreeing with our theoretical analysis in Section \ref{sect:menu-size}.

\begin{figure}[h]
\centering
\hbox{
    \hspace{-.5cm}
    \includegraphics[scale=0.35]{./figs/synthetic_BC.png}
}
\caption{\textbf{Synthetic data:} Exact top $K$ accuracy of choice-based Borda count against sample size for different menu sizes 2 ({\color{blue}blue}), 4 ({\color{orange}orange}), 6 ({\color{green}green}), 8 ({\color{red}red}) with $K=1,3,5$ and $n=50$. Increasing the menu size improves performance but with a diminishing return.}
\label{fig:synthetic-borda-compact}
\end{figure}

\subsection{Real data}\label{subsect:experiments-real}

\textbf{Baseline algorithms:} We compare choice-based Borda count against Accelerated Spectral Ranking (ASR) \citep{agarwal2018accelerated} and Maximum Likelihood Estimate (MLE) under MNL assumptions \citep{train2009discrete} in terms of top $K$ accuracy. We implement MLE using Scipy's L-BFGS optimizer \citep{2020SciPy}.

\textbf{Data description:} We follow standard procedures commonly used in previous works such as \cite{rajkumar2014statistical,agarwal2020choice}. Operating on full ranking datasets, we can estimate the choice probabilities for any menu $S$, i.e., choice probability $\rho(i|S)$ is the proportion of rankings that ranks item $i$ highest among all the items in $S$. Given these probabilities, we can simulate the sampling model as described in Section \ref{subsect:sampling-model}. Our datasets include SUSHI \citep{kamishima2003nantonac}, APA election dataset \citep{diaconis1989generalization}, 3 Irish election datasets and the F1 race dataset included in the library PrefLib \citep{mattei2013preflib}. Notably, the induced pairwise choice probabilities of these datasets all satisfy stochastic transitivity. Therefore, there exists a universal ordering of the items which we can use as a true global ranking over the items. Due to space constraints, we can only present a few representative experimental findings and leave additional results with detailed descriptions of data processing in the supplementary materials. 

\textbf{Speed advantage:} Across all experiments, choice-based Borda count is several orders of magnitude faster than ASR and MLE. This difference is especially pronounced in datasets with more items such as the F1 dataset, as shown in Figure \ref{fig:f1-time-compact}.
\begin{figure}[]
\centering
\hbox{
    \includegraphics[scale=0.40]{figs/f1_time_all.png}
}
\caption{\textbf{F1 dataset ($n=22$):} Average training time (seconds) against sample size for $m=2,4,8$. Choice-based Borda count ({\color{blue} blue}) is several orders of magnitude faster than its competitors.}
\label{fig:f1-time-compact}
\end{figure}

\textbf{Competitive accuracy:} Figure \ref{fig:irish-meath-experiment-compact} shows the performance of the algorithms under the Irish-Meath dataset and Figure \ref{fig:irish-west-experiment-compact} shows the results for the Irish-West dataset. Our theoretical analysis in Section \ref{sect:connection-mle-borda} is reflected in our experimental findings: the performance of MNL-MLE and the Borda count algorithm are very similar given sufficiently large sample size. Spectral Ranking, on the other hand, may perform better or worse than MLE/Borda count depending on the dataset and the choice of $m$ and $K$. For many combinations of $m$ and $K$, we observe that the choice-based Borda count algorithm accurately recovers the top $K$ items and is highly competitive with MNL-MLE and Spectral Ranking. Notably, in most datasets, for smaller $K$ and large $m$, the Borda count algorithm has considerable advantages thanks to its accuracy and faster running time. 
In practice, this means choice-based Borda count is appropriate for applications where the statistician is interested in quickly determining a single (or a few) top candidate(s) from a large amount of data such as aggregating political surveys.

\begin{figure}[]
\centering
    \includegraphics[scale=0.4,height=80mm]{figs/irish_meath_all.png}
\caption{\textbf{Irish-Meath dataset ($n=14$):} Exact top-$K$ accuracy against sample size. Choice-based Borda count ({\color{blue}blue}) is competitive with baseline algorithms. Using larger menu sizes generally improves the performance of the algorithms.}
\label{fig:irish-meath-experiment-compact}
\end{figure}

\begin{figure}[]
\centering
    \includegraphics[scale=0.4,height=80mm]{figs/irish_west_all.png}
\caption{\textbf{Irish-West dataset ($n=9$):} %Exact top-$K$ accuracy against sample size. 
Choice-based Borda count ({\color{blue}blue}) performs very similarly to MNL-MLE ({\color{gray}gray}) while the performance of ASR ({\color{orange}orange}) may diverge from MNL-MLE/choice-based Borda count.}
\label{fig:irish-west-experiment-compact}
\end{figure}

\section{Conclusion}
Ranking under Random Utility Models is a promising area of research with many practical applications. Our work shows how an efficient algorithm can perform very well under a broad family of RUMs. That being said, the class of IID-RUMs constitutes only a subset of models within the class of general RUMs. Beyond IID-RUMs, not much is known in terms of efficient inference and ranking algorithms. In the future, we hope to see more ranking methods developed for more expressive RUMs which have non-identical noise distributions or dependent noise distributions. 

\begin{acknowledgements} The author thanks Shivani Agarwal for suggesting the idea of generalizing the Borda count algorithm to the choice setting, proofreading earlier versions of this paper and for helpful discussions. The author also thanks Prathamesh Patil and William Zhang for proofreading this paper; and the anonymous reviewers for their comments. This work is supported in part by the National Science Foundation (NSF) under grant number 1717290 (awarded to Shivani Agarwal). Any opinions expressed in this paper are those of the author and do not necessarily reflect the views of the National Science Foundation.
\end{acknowledgements}

%%% Bibliography
\bibliography{nguyen_28}


\end{document}
