\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{xcolor}
\usepackage{graphicx, setspace, latexsym,amsmath,amssymb,amsthm,color}

% Package-style boilerplate: the identification line below names a package
% called `commands`.
% NOTE(review): \ProvidesPackage appears here inside a document preamble rather
% than in a .sty file; this may produce a name-mismatch warning -- confirm
% against the build log.
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{commands}[2017/10/25 Math macros]

% etoolbox provides the toggle commands used below and in the draft macros.
\RequirePackage{etoolbox}
% `isdraft` gates the draft-only annotation macros (\todo, \propchange,
% \propdelete).
\newtoggle{isdraft}
% The `draft` option switches the toggle on.
\DeclareOption{draft}{\toggletrue{isdraft}}
\ProcessOptions\relax


% \todo{<text>}: draft-only reminder. When the `isdraft` toggle is true it
% prints a framed box holding <text> in typewriter font, with a "ToDo" margin
% note; otherwise it expands to nothing.
% Line ends inside the definition are commented out so the macro does not
% inject stray spaces into the surrounding text.
\newcommand{\todo}[1]{%
  \iftoggle{isdraft}{%
    \vspace{5 mm}\par \noindent \marginpar{\textsc{ToDo}}%
    \framebox{\begin{minipage}[c]{0.95 \columnwidth} \ttfamily #1
    \end{minipage}}\vspace{5 mm}\par}{}}

% \propchange{<text>}: marks a proposed change. With the `isdraft` toggle true
% it prints <text> in blue; otherwise it prints <text> unchanged.
% The trailing % signs stop the definition from adding a leading space, which
% would otherwise show up as extra space when the macro is used inline.
\newcommand{\propchange}[1]{%
  \iftoggle{isdraft}{%
    \textcolor{blue}{#1}}{#1}}

% \propdelete{<old>}{<new>}: marks a proposed deletion. With the `isdraft`
% toggle true it prints <old> struck through followed by <new>, both in blue;
% otherwise it prints nothing.
% The trailing % signs stop the definition from emitting a stray space.
% NOTE(review): \sout is not defined by any package loaded in the visible
% preamble (it is normally provided by `ulem`) -- confirm before enabling draft
% mode.
\newcommand{\propdelete}[2]{%
  \iftoggle{isdraft}{%
    \textcolor{blue}{\sout{#1} {#2}}}{}}

% End-of-proof mark: a 5pt solid square, then a paragraph break and a medium
% vertical skip.
% NOTE(review): \def silently replaces any earlier definition of \qed; amsthm is
% loaded above and its proof environment presumably calls \qed -- confirm the
% override is intended.
\def\qed{\rule[0pt]{5pt}{5pt}\par\medskip}

% --- Optimization keywords, typeset upright inside math ---
\newcommand*{\minimize}{\mbox{minimize}}
\newcommand*{\maximize}{\mbox{maximize}}
\newcommand*{\st}{\mbox{subject to}}

% --- Systems notation ---
% \statespace{a}{b}{c}{d}: bracketed 2x2 block array with a vertical rule
% between the columns and a horizontal rule between the rows.
\newcommand{\statespace}[4]{%
  \left[ \begin{array}{c|c} #1 & #2 \\ \hline\rule{0pt}{2.6ex} #3 & #4 \end{array} \right]}
% \tf{x}: bold version of a math symbol.
\newcommand{\tf}[1]{\boldsymbol{#1}}

% --- Hatted quantities (\Ahat and \Bhat are aliases of \Ah and \Bh) ---
\newcommand*{\Ah}{\widehat{A}}
\newcommand*{\Bh}{\widehat{B}}
\newcommand*{\Ahat}{\Ah}
\newcommand*{\Bhat}{\Bh}
\newcommand*{\Kh}{\widehat{K}}
\newcommand*{\Jh}{\widehat{J}}
\newcommand*{\Phixh}{\hat{\tf \Phi}_x}
\newcommand*{\Phiuh}{\hat{\tf \Phi}_u}
\newcommand*{\Dh}{\hat{\tf{\Delta}}}

% --- Named reference quantities ---
\newcommand*{\trueA}{A}
\newcommand*{\trueB}{B}
\newcommand*{\trueK}{K_\star}

% --- Calligraphic symbols and symbols derived from them ---
\newcommand*{\A}{\mathcal{A}}
\newcommand*{\B}{\mathcal{B}}
\newcommand*{\Qq}{\mathcal{Q}}
\newcommand*{\Ss}{\mathcal{S}}
\newcommand*{\Ahh}{\hat{\A}}
\newcommand*{\Bhh}{\hat{\B}}
\newcommand*{\DA}{\mathcal{D}_{\A}}
\newcommand*{\DB}{\mathcal{D}_{\B}}
\newcommand*{\avg}{\text{avg}}


% Named math operators. Every declaration uses the starred form of
% \DeclareMathOperator.
% NOTE(review): the starred form places sub/superscripts as limits in display
% style; that is customary for arg min / arg max but unusual for rank, diag,
% tr, etc. -- confirm it is intended.
\DeclareMathOperator*{\argmin}{arg\!\min}
\DeclareMathOperator*{\argmax}{arg\!\max}
\DeclareMathOperator*{\sgn}{sgn}
\DeclareMathOperator*{\supp}{supp}
\DeclareMathOperator*{\rank}{rank}
\DeclareMathOperator*{\diag}{diag}
\DeclareMathOperator*{\Tr}{{tr}}
\DeclareMathOperator*{\image}{Im}
\DeclareMathOperator*{\nullspace}{Kern}
\DeclareMathOperator*{\rowspace}{RS}
\DeclareMathOperator*{\colspace}{CS}
\DeclareMathOperator*{\dom}{dom}
\DeclareMathOperator*{\closure}{cl}
\DeclareMathOperator*{\vol}{vol}
\DeclareMathOperator*{\Span}{span}
\DeclareMathOperator*{\polylog}{polylog}
\DeclareMathOperator*{\Band}{Band}
% Upright labels defined as plain \mathrm text (no operator spacing).
\newcommand{\grad}{\mathrm{grad}}
\newcommand{\bias}{\mathrm{Bias}}
\newcommand{\var}{\mathrm{Var}}

% --- Calligraphic letters (wrapped in \ensuremath, so usable in text too) ---
\newcommand*{\Acal}{\ensuremath{\mathcal{A}}}
\newcommand*{\Bcal}{\ensuremath{\mathcal{B}}}
\newcommand*{\Cset}{\ensuremath{\mathcal{C}}}
\newcommand*{\F}{\ensuremath{\mathcal{F}}}
\newcommand*{\G}{\ensuremath{\mathcal{G}}}
\newcommand*{\Hyp}{\ensuremath{\mathcal{H}}}
\newcommand*{\I}{\ensuremath{\mathcal{I}}}
\newcommand*{\Loss}{\ensuremath{\mathcal{L}}}
\newcommand*{\Lagrange}{\ensuremath{\mathcal{L}}}
\newcommand*{\Set}{\ensuremath{\mathcal{S}}}
\newcommand*{\X}{\ensuremath{\mathcal{X}}}
\newcommand*{\Y}{\ensuremath{\mathcal{Y}}}
\newcommand*{\Z}{\ensuremath{\mathcal{Z}}}

% --- Blackboard-bold number sets ---
\newcommand*{\R}{\ensuremath{\mathbb{R}}}
\newcommand*{\C}{\ensuremath{\mathbb{C}}}
\newcommand*{\Q}{\ensuremath{\mathbb{Q}}}
\newcommand*{\N}{\ensuremath{\mathbb{N}}}

% --- Norms; the "big" variants use \left/\right delimiters ---
\newcommand{\norm}[1]{\lVert #1 \rVert}
\newcommand{\bignorm}[1]{\left\lVert #1 \right\rVert}
\newcommand{\twonorm}[1]{\lVert #1 \rVert_{2}}
\newcommand{\bigtwonorm}[1]{\left\lVert #1 \right\rVert_{2}}
\newcommand{\spectralnorm}[1]{\twonorm{#1}}
\newcommand{\bigspectralnorm}[1]{\bigtwonorm{#1}}
\newcommand{\maxnorm}[1]{\lVert #1 \rVert_{\infty}}
\newcommand{\onenorm}[1]{\left\lVert #1 \right\rVert_{1}}

% --- Brackets, inner products, derivatives ---
\newcommand{\mb}[1]{\mathbf{#1}}
\newcommand{\ip}[2]{\ensuremath{\langle #1, #2 \rangle}}
\newcommand{\bigip}[2]{\left\langle #1, #2 \right\rangle}
\newcommand{\PD}[2]{\ensuremath{\frac{\partial #1}{\partial #2}}}
\newcommand{\abs}[1]{\ensuremath{| #1 |}}
\newcommand{\bigabs}[1]{\ensuremath{\left| #1 \right|}}
\newcommand{\floor}[1]{\lfloor #1 \rfloor}
\newcommand{\ceil}[1]{\lceil #1 \rceil}

% --- Probability and miscellaneous symbols ---
\newcommand*{\Var}{\mathrm{Var}}
\newcommand*{\E}{\mathbb{E}}
\newcommand*{\Normal}{\mathcal{N}}
\newcommand*{\rdraw}{\xleftarrow{\$}}
\newcommand*{\ind}{\mathbbm{1}}
% \vec is redefined from its default meaning to the upright label "vec".
\renewcommand*{\vec}{\mathrm{vec}}
\newcommand*{\Sym}{\mathbf{S}}
\newcommand*{\Toep}{\mathrm{Toep}}
\newcommand*{\mgf}{\E\left[e^{\lambda(X-\E X)}\right]}
\newcommand*{\Sp}{\calS}

% --- One-argument wrappers with auto-sized delimiters ---
% \lmin{M}: lambda_min followed by its parenthesized argument.
\newcommand{\lmin}[1]{\lambda_{\min}\left(#1\right)}
% \exp is redefined to take one mandatory argument, set in parentheses.
\renewcommand{\exp}[1]{\mathrm{exp}\left(#1\right)}
% \Prob{event}: \Pr followed by its bracketed argument.
\newcommand{\Prob}[1]{\Pr\left[#1\right]}
% \Res{k}: Fraktur R with subscript k.
\newcommand{\Res}[1]{\mathfrak{R}_{#1}}

% --- Argument-free symbols ---
\newcommand*{\leb}{\mu}
% \Pr is redefined as a blackboard-bold P.
\renewcommand*{\Pr}{\mathbb{P}}
\newcommand*{\PP}{\mathbb{P}}
\newcommand*{\RR}{\mathbb{R}}
\newcommand*{\Ncal}{\mathcal{N}}
\newcommand*{\T}{\top}
\newcommand*{\tp}{\mathsf{T}}
\newcommand*{\vecx}{{x}}
\newcommand*{\vecw}{{w}}
\newcommand*{\vecu}{{u}}

% --- Tilde family (bold via \tf, except \gt) ---
\newcommand*{\Rt}{\tf{\tilde{R}}}
\newcommand*{\Mt}{\tf{\tilde{M}}}
\newcommand*{\Nt}{\tf{\tilde{N}}}
\newcommand*{\Lt}{\tf{\tilde{L}}}
\newcommand*{\Kt}{\tf{\tilde{K}}}
\newcommand*{\Pt}{\tf{\tilde{P}}}
\newcommand*{\Gt}{\tf{\tilde{G}}}
\newcommand*{\gt}{\tilde{g}}

% --- Subscript-zero family ---
\newcommand*{\Ro}{\tf{{R}}_0}
\newcommand*{\Mo}{\tf{{M}}_0}
\newcommand*{\No}{\tf{{N}}_0}
\newcommand*{\Lo}{\tf{{L}}_0}
\newcommand*{\Ko}{\tf{{K}}_0}

% --- Hat family (bold via \tf, except \wh) ---
\newcommand*{\Rh}{\tf{\hat{R}}}
\newcommand*{\Mh}{\tf{\hat{M}}}
\newcommand*{\Nh}{\tf{\hat{N}}}
\newcommand*{\Lh}{\tf{\hat{L}}}
\newcommand*{\wh}{\hat{w}}

% --- Theta symbols and the 2x2 block matrices built from the families ---
\newcommand*{\Thetat}{\widetilde{\Theta}}
\newcommand*{\Thetah}{\widehat{\Theta}}
\newcommand*{\Thetamat}{\begin{bmatrix} \tf R & \tf N \\ \tf M & \tf L \end{bmatrix}}
\newcommand*{\Thetatmat}{\begin{bmatrix} \Rt & \Nt \\ \Mt & \Lt \end{bmatrix}}
\newcommand*{\Thetahmat}{\begin{bmatrix} \Rh & \Nh \\ \Mh & \Lh \end{bmatrix}}

% \Ind{A}: blackboard-style 1 with subscript A.
\newcommand{\Ind}[1]{\mathbbm{1}_{#1}}

% --- Dimensions and function-space symbols ---
\newcommand*{\statedim}{{n_x}}
\newcommand*{\inputdim}{{n_u}}
\newcommand*{\hinf}{\mathcal{H}_\infty}
\newcommand*{\htwo}{\mathcal{H}_2}
\newcommand*{\RHinf}{\mathcal{RH}_\infty}

% --- Norms with a fixed subscript ---
\newcommand{\ltwonorm}[1]{\| #1 \|_2}
\newcommand{\hinfnorm}[1]{\| #1 \|_{\hinf}}

% --- Relations carrying a small label on top ---
\newcommand*{\iid}{\stackrel{\mathclap{\text{\scriptsize{ \tiny i.i.d.}}}}{\sim}}
\newcommand*{\dist}{\stackrel{\mathclap{\text{\scriptsize{ \tiny dist}}}}{=}}
\newcommand*{\opt}{\mathrm{opt}}

% --- Calligraphic shorthands ---
\newcommand*{\calF}{\mathcal{F}}
\newcommand*{\calN}{\mathcal{N}}
\newcommand*{\calS}{\mathcal{S}}
\newcommand*{\calE}{\mathcal{E}}

% --- Small bracketed matrices: column pair, row pair, 2x2 ---
\newcommand{\cvectwo}[2]{\begin{bmatrix} #1 \\ #2 \end{bmatrix}}
\newcommand{\rvectwo}[2]{\begin{bmatrix} #1 & #2 \end{bmatrix}}
\newcommand{\bmattwo}[4]{\begin{bmatrix} #1 & #2 \\ #3 & #4 \end{bmatrix}}

% --- Remaining symbols ---
\newcommand*{\dlyap}{\mathsf{dlyap}}
\newcommand*{\lambdah}{\widehat{\lambda}}
\newcommand*{\Qh}{\widehat{Q}}
\newcommand*{\Otilde}{\widetilde{O}}

% \redtext{<text>}: sets <text> in red; the colour change is confined by the
% group.
\newcommand{\redtext}[1]{{\color{red}#1}}

% Theorem-like environments, each declared with its own counter.
% NOTE(review): the preamble later also declares `proposition`, `coro`,
% `definition` and `example` with the same printed names -- confirm which set
% the document actually uses.
\newtheorem{prop}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{defn}{Definition}
\newtheorem{ex}{Example}
\usepackage{float}

% Single-letter shorthands defined with \def, which overwrites an existing
% definition without any warning.
% NOTE(review): \R, \E, \F, \G, \X and \Y were already declared with \newcommand
% earlier in this preamble; the definitions below replace them from here on.
% NOTE(review): \H, \S, \P and \v are standard LaTeX text commands (accents and
% symbols); redefining them may break accented names, e.g. in the bibliography
% -- confirm.
\def\R{\mathbb{R}}
\def\Eps{\mathcal{E}}
\def\E{\mathbb{E}}
\def\V{\mathbb{V}}
\def\F{\mathcal{F}}
\def\G{\mathcal{G}}
\def\H{\mathcal{H}}
\def\S{\mathcal{S}}
\def\1{\mathbf{1}}
% NOTE(review): \nappa is not defined anywhere in the visible preamble --
% possibly a typo; confirm before using \n.
\def\n{\nappa}
\def\h{\mathbf{w}}
\def\v{\mathbf{v}}
\def\x{\mathbf{x}}
\def\X{\mathcal{X}}
\def\Y{\mathcal{Y}}
\def\eps{\epsilon}
\def\y{\mathbf{y}}
\def\e{\mathbf{e}}
\def\M{\mathbf{M}}
% This second definition of \F replaces the \mathcal{F} version given a few
% lines above.
\def\F{\mathbf{F}}
\def\P{\mathbf{P}}
\def\calP{\mathcal{P}}
\def\calM{\mathcal{M}}
\def\kl{\mathbb{D}_{\text{KL}}}

\newcommand{\D}{\mathcal{D}}


\usepackage{subcaption}
\usepackage{algorithm,algorithmic}
\usepackage{pdflscape}
\usepackage{afterpage}
\usepackage{import}
\usepackage[toc,title,page]{appendix}
\usepackage{cancel}
\usepackage{url}
\def\UrlBreaks{\do\/\do-}

\makeatletter
% Unnumbered theorem environment whose heading is whatever \rep@title holds at
% the time it is used.
\newtheorem*{rep@theorem}{\rep@title}
% \newreptheorem{<env>}{<Name>}: defines the environment rep<env>, which takes
% one argument (a label). It sets the heading to "<Name> \ref{<label>}" and
% wraps the body in rep@theorem, so a result can be restated under the number
% of the original.
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
 \def\rep@title{#2 \ref{##1}}%
 \begin{rep@theorem}}%
 {\end{rep@theorem}}}
\makeatother


% Numbered theorem-like environments. `definition`, `theorem` and `example`
% each have their own counter; `theorem` is numbered within sections.
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{example}{Example}
\numberwithin{theorem}{section}
% The environments below share the `theorem` counter.
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{coro}[theorem]{Corollary}

\newtheorem{remark}[theorem]{Remark}
\newtheorem{hypothesis}[theorem]{Hypothesis}

\usepackage{thm-restate}

% Restatement environments reptheorem, replemma and repcoro, created with
% \newreptheorem; each takes the label of the result being restated.
\newreptheorem{theorem}{Theorem}
\newreptheorem{lemma}{Lemma}
\newreptheorem{coro}{Corollary}

\usepackage{listings}

\allowdisplaybreaks

\usepackage{xr}
\externaldocument{nguyen_28}


% Front matter: title, a single author (name wrapped in a mailto link) and the
% affiliation keyed to the author's index [1].
\title{Efficient and Accurate Top-$K$ Recovery from Choice Data (Supplementary Materials)}

\author[1]{\href{mailto:<mdnguyen@seas.upenn.edu>?Subject=Your UAI 2022 paper}{Duc Nguyen}{}}

% Add affiliations after the authors
\affil[1]{%
    Department of Computer and Information Science\\
    University of Pennsylvania.
}
  
\begin{document}
\onecolumn
\maketitle

\section{Proofs}\label{sect:proofs}
In this section, we will prove all the theoretical results of the paper. The first Section \ref{sect:IID-rum-properties}, however, details some of the important properties of IID-RUMs that are useful in proving the main theorems and lemmas. Many of these results can be of independent interest such as the \emph{Borda consistency} property which is shared by a very broad class of IID-RUMs. The reader may also skip to Section \ref{sect:error-bounds-top-K} for the proofs of the main results stated earlier in the paper.

\subsection{A closer look at IID-RUMs}
\label{sect:IID-rum-properties}
All IID-RUMs are in a family of choice models known as Fechnerian models \cite{becker1963stochastic}.
\begin{definition} A choice model $\rho$ is a Fechnerian model if there exists a function $F: \R \times \R \rightarrow [0,1]$ that is increasing in the first argument and decreasing in the second argument such that
$$ \rho(i|\{i, j\}) = F(U_i, U_j)\,. $$
\end{definition}
It is well known that Fechnerian models satisfy strong stochastic transitivity (SST). That is, if $\rho(i|\{i, j\}) > \frac{1}{2}$ then $\rho(i|\{i, k\}) \geq \rho(j|\{j, k\})$ for all $k \neq i, j$. A weaker notion of stochastic transitivity is weak stochastic transitivity (WST): if $\rho(i|\{i, j\}) > \frac{1}{2}$ and $\rho(j|\{j,k\}) > \frac{1}{2}$ then $\rho(i|\{i, k\}) > \frac{1}{2}$. The following lemma establishes that all IID-RUMs are Fechnerian models.
\begin{lemma} All IID-RUMs are Fechnerian models.
\end{lemma}

\begin{proof} The proof is simply showing the existence of the function $F$ in the definition of Fechnerian models. Consider any two items $i$ and $j$
\begin{equation*}
\begin{aligned}
\rho(i|\{i, j\}) &= \PP(X_i > X_j) = \PP(U_i + \epsilon_i > U_j + \epsilon_j)\\
&= \PP(\epsilon_j - \epsilon_i < U_i - U_j)\\
&= F_{ij}(U_i - U_j)\,.
\end{aligned}
\end{equation*}
where $F_{ij}$ is the CDF of the random variable $\epsilon_j - \epsilon_i$. Note that all the $\epsilon_i$'s are identically distributed and thus $F = F_{ij}$ for all pairs $(i,j)$. This completes the proof.
\end{proof}

As the class of SST models is a special subclass of WST models, there exists a universal ordering among the items, induced by pairwise preference. Under IID-RUMs, this ordering is the same as that of the partworth parameters. It is also notable that while IID-RUMs satisfy SST, RUMs with independent but not identically distributed noise (Independent RUMs) do not in general. In fact, Independent RUMs may not even satisfy WST. Next, we introduce a property of random utility models which we term the \emph{order preserving} property.

\begin{definition} A choice model $\rho$ satisfies the order preserving property if
$$ \rho(i|\{i, j\}) > \frac{1}{2} \,\Rightarrow\, \rho(i|S) > \rho(j|S) \,\forall\, S: i, j \in S \,.$$
\end{definition}

The following lemma states that a broad class of IID-RUMs which include the MNL and Probit model satisfy the order preserving property.

\begin{lemma}\label{lem:iid-rum-order} All IID-RUMs whose noise distribution has absolutely continuous density function and support on the real line satisfy the order preserving property.
\end{lemma}

\begin{proof} Consider a pair of items $i, j$ such that $U_i > U_j$. The proof for the case when $S = \{i, j\}$ is trivial. We immediately have $\rho(i|\{i,j\}) > \frac{1}{2}$. Now consider any choice set $S$ containing both $i$ and $j$ where $|S| \geq 3$. Let $f_{\epsilon_i}$ denote the density function of the item-specific noise distribution of item $i$. Of course, under IID-RUM, $f_{\epsilon_i} = f_{\epsilon_j} = f$ for all $i\neq j$. We have
\begin{equation*}
    \begin{aligned}
    \rho(i | S) &= \PP(X_i > X_k , \forall k \in S\backslash \{i \})\\
    &= \PP(U_i + \epsilon_i > U_k + \epsilon_k, \forall k\in S\backslash\{i\})\\
    &= \PP(\epsilon_k < U_i - U_k + \epsilon_i, \forall k\in S\backslash\{i\})\\
    &= \int_\epsilon \PP(\epsilon_k < U_i - U_k + \epsilon_i, \forall k\in S\backslash\{i\}, \epsilon_i = \epsilon) d\epsilon\\
    &= \int_\epsilon \PP(\epsilon_k < U_i - U_k + \epsilon_i, \forall k\in S\backslash\{i\}| \epsilon_i = \epsilon) f_{\epsilon_i}(\epsilon) d\epsilon\\
    &= \int_\epsilon \bigg( \prod_{k\in S\backslash \{i\}} \PP(\epsilon_k < U_i - U_k  + \epsilon)  \bigg) f(\epsilon)d\epsilon \,.\\
    \end{aligned}
\end{equation*}
In the last equality, we have used the independence assumption of IID-RUMs. Further breaking down the product gives

\begin{equation}
\label{proof:order-preserving-eq1}
\begin{aligned}
&\int_\epsilon \bigg( \prod_{k\in S\backslash \{i\}} \PP(\epsilon_k < U_i - U_k  + \epsilon) \bigg) f(\epsilon)d\epsilon \\
&= \int_\epsilon \bigg( \prod_{k\in S\backslash \{i, j\}} \PP(\epsilon_k < U_i - U_k  + \epsilon)  \bigg) \PP(\epsilon_j < U_i - U_j + \epsilon) f(\epsilon)d\epsilon \,.\\
\end{aligned}
\end{equation}

Since the noise terms are identically distributed and the noise distribution has a continuous density with support on the real line, and since $U_i > U_j$, we have $\PP(\epsilon_k < U_i - U_k + \epsilon) > \PP(\epsilon_k < U_j - U_k + \epsilon)$ for all $k\in S\backslash\{i, j\}$.
On the other hand, for a random variable $\epsilon'$ distributed identically to the $\epsilon_i$'s,
$$ \PP(\epsilon_j < U_i - U_j + \epsilon) > \PP(\epsilon' < U_j - U_i + \epsilon) \quad \forall \epsilon\,. $$
Putting these two inequalities back into (\ref{proof:order-preserving-eq1}) gives
\begin{equation*}
\begin{aligned}
\rho(i | S) &= \int_\epsilon \bigg( \prod_{k\in S\backslash \{i, j\}} \PP(\epsilon_k < U_i - U_k  + \epsilon)  \bigg) \PP(\epsilon_j < U_i - U_j + \epsilon) f(\epsilon) d\epsilon\\
&> \int_\epsilon \bigg( \prod_{k\in S\backslash \{i, j\}} \PP(\epsilon_k < U_j - U_k  + \epsilon) \bigg) \PP(\epsilon' < U_j - U_i + \epsilon) f(\epsilon) d\epsilon\\
&= \rho(j | S)\,.
\end{aligned}
\end{equation*}
This completes the proof.\end{proof}

While the class of IID-RUMs whose noise distribution has continuous density function and support on the real line includes many commonly used IID-RUMs such as the MNL and Probit model, there are other IID-RUMs outside this class that also satisfy the order preserving property. An example is IID-Exponential RUM.

\begin{lemma}\label{lem:iid-exp-order-preserving} IID-Exponential RUM satisfies the order preserving property.
\end{lemma}

\begin{proof} Since the pairwise case ($m=2$) is trivial, we will focus on proving Lemma \ref{lem:iid-exp-order-preserving} for $m \geq 3$. Consider a menu $S\in \Cset^{(m)}: i, j \in S$, let $S' = S \backslash \{i, j\}$. For this proof, let us consider a `copy' of $i$ and call it $i'$. That is, $X_{i'}$ is distributed identically to $X_i$. We have
\begin{equation*}
\begin{aligned}
\rho(i | S) &= \Pr(X_i > \max\{ \{X_k\}_{k\in S'} \cup \{X_j\}\})\\
&= \int_{0}^{\infty} \Pr(X_k < U_i + \epsilon \,\forall k \in S', X_j < U_i + \epsilon) \cdot f(\epsilon) d\epsilon\\
&= \int_{0}^{\infty} \Pr(X_k < U_i + \epsilon \,\forall k\in S') \cdot \Pr(X_j < U_i + \epsilon) \cdot f(\epsilon)d\epsilon\\
&> \int_{0}^{\infty} \Pr(X_k < U_i + \epsilon \,\forall k\in S') \cdot \Pr(X_{i'} < U_i + \epsilon) \cdot f(\epsilon)d\epsilon\\
&> \int_{0}^{\infty} \Pr(X_k < U_j + \epsilon \,\forall k\in S') \cdot \Pr(X_{i'} < U_j + \epsilon) \cdot f(\epsilon)d\epsilon\\
&= \int_{0}^{\infty} \Pr(X_k < U_j + \epsilon \,\forall k \in S', X_{i'} < U_j + \epsilon) \cdot f(\epsilon)d\epsilon\\
&= \Pr(X_j > \max\{ \{X_k\}_{k\in S'} \cup \{X_{i'}\}\})\\
&= \rho(j|S)\,.
\end{aligned}
\end{equation*}
The first inequality holds because for any $\epsilon > U_j - U_i$, $\Pr(X_j < U_i + \epsilon) > \Pr(X_{i'} < U_i + \epsilon)$. The second inequality holds because for any $\epsilon > \max\{0,\max\{U_k - U_j\}_{k\in S' \cup\{j\}}\}$, $\Pr(X_k < U_i + \epsilon) > \Pr(X_k < U_j + \epsilon)$ for all $k \in S'$ and $\Pr(X_{i'} < U_i + \epsilon) > \Pr(X_{i'} < U_j + \epsilon)$. This completes the proof.
\end{proof}

Recall the Borda Consistency property introduced in Section \ref{subsect:generalized-borda-score}. One can show that a broad class of IID-RUMs which contains many commonly used RUMs such as the MNL and the Probit model satisfies Borda Consistency, the key property that ensures correctness of Generalized Borda Count.

\begin{theorem} Any IID-RUM $\rho$ whose noise distribution has continuous density function and support on the real line satisfies Borda consistency.
\end{theorem}

\begin{proof} Note that $|\Cset_i^{(m)}| =  |\Cset_j^{(m)}| = {n-1\choose m-1}$ for any $i, j$ and $m$. The proof for $m = 2$ is trivial so we will focus on $m \geq 3$. By definition,
\begin{equation*}
\begin{aligned}
\tau^{(m)}_i &= \frac{1}{|\Cset_i^{(m)}|} \sum_{S \in \Cset_i^{(m)}} \rho(i | S)\\
&= \frac{1}{|\Cset_i^{(m)}|} \bigg( \sum_{S: |S|=m, i, j \in S }\rho(i | S) + \sum_{S: |S|=m, i\in S, j\notin S} \rho(i|S) \bigg)\,.
\end{aligned}
\end{equation*}
By the order preserving property of IID-RUMs, for any $S: i, j \in S$, $\rho(i|S) > \rho(j|S)$. To prove that $\tau^{(m)}_i > \tau^{(m)}_j$, we only need to prove that for any $S': |S'| = m-1; i, j \notin S'$,
$$ \rho(i|S' \cup \{i\}) > \rho(j|S'\cup \{j\})\,. $$
Let $f$ denote the density function of the noise distribution. We have
\begin{equation*}
\begin{aligned}
\rho(i | S'\cup \{i\}) &= \PP(X_i > X_k \forall k \in S')\\
&= \PP(U_i + \epsilon_i > U_k + \epsilon_k \forall k \in S')\\
&= \PP(\epsilon_k < U_i - U_k + \epsilon_i \forall k \in S')\\
&= \int_{-\infty}^{\infty} \PP(\epsilon_k < U_i - U_k + \epsilon_i \,\forall k \in S', \epsilon_i = \epsilon) d\epsilon\\
&= \int_{-\infty}^{\infty} \PP(\epsilon_k < U_i - U_k + \epsilon \,\forall k \in S') f(\epsilon) d\epsilon\\
&> \int_{-\infty}^{\infty} \PP(\epsilon_k < U_j - U_k + \epsilon \,\forall k \in S') f(\epsilon) d\epsilon\\
&= \rho(j|S'\cup \{j\}) \,. 
\end{aligned}
\end{equation*}
This completes the proof.\end{proof}

We also have a more general theorem that characterizes the class of IID-RUMs, beyond those whose noise distribution has continuous density function and support on the real line, that satisfy Borda consistency.
\begin{theorem}\label{thm:consistency-borda-score-general} Consider an IID-RUM $\rho$. Fix a menu size $m$. If, for any pair of items $i, j$ where $U_i > U_j$,
\begin{itemize} 
    \item There exists a menu $S \in \Cset^{(m)}: i, j\in S$ such that $\rho(i|S) \neq \rho(j|S)$, or
    \item $m \geq 3$ and there exists a menu $S'\in \Cset^{(m-1)}: i, j \notin S'$ such that $\rho(i|S'\cup\{i\}) \neq \rho(j|S'\cup\{j\})$,
\end{itemize}
then $\rho$ satisfies Borda consistency.
\end{theorem}

\begin{proof} Consider an IID-RUM $\rho$ and a fixed menu size $m$. For any pair of items $i, j$ where $U_i > U_j$, one can easily check that
$$ \rho(i|S) \geq \rho(j|S) \,\forall S \in \Cset^{(m)}: i, j\in S $$
and
$$ \rho(i|S'\cup\{i\}) \geq \rho(j|S'\cup \{j\}) \,\forall S'\in \Cset^{(m-1)}: i, j\notin S'\,.$$
If there exists a menu $S\in\Cset^{(m)}:i, j\in S$ where $\rho(i|S) \neq \rho(j|S)$, then $\rho(i|S) > \rho(j|S)$. Similarly, if there exists a menu $S'\in\Cset^{(m-1)}:i, j\notin S'$ where $\rho(i|S'\cup \{i\}) \neq \rho(j|S'\cup\{j\})$ then $\rho(i|S'\cup\{i\}) > \rho(j|S'\cup\{j\})$. If either of these cases holds, we have
\begin{equation*}
\begin{aligned}
\tau^{(m)}_i &= \frac{1}{|\Cset_i^{(m)}|} \sum_{S \in \Cset_i^{(m)}} \rho(i | S)\\
&= \frac{1}{|\Cset_i^{(m)}|} \bigg( \sum_{S: |S|=m, i, j \in S }\rho(i | S) + \sum_{S: |S|=m, i\in S, j\notin S} \rho(i|S) \bigg)\\
&> \frac{1}{|\Cset_j^{(m)}|} \bigg( \sum_{S: |S|=m, i, j \in S }\rho(j | S) + \sum_{S: |S|=m, j\in S, i\notin S} \rho(j|S) \bigg) = \tau^{(m)}_j\,.
\end{aligned}
\end{equation*}
This completes the proof.\end{proof}

\subsection{Sample complexity for exact top-$K$ recovery}
\label{sect:error-bounds-top-K}
In this section we prove all theorems stated in Section \ref{sect:upper-bound} as well as the sample complexity for approximate top-$K$ ranking of the choice based Borda count algorithm.

\begin{replemma}{thm:error-bound-gen-borda}
Consider an IID-RUM that satisfies Borda consistency per Definition \ref{def:borda-consistency}. Assume input choice data with menu size $m$ is generated according to the sampling model described in Section \ref{subsect:sampling-model}. For any two items $i$ and $j$ where $\tau_i^{(m)} > \tau_j^{(m)}$, the choice-based Borda count algorithm satisfies
$$\PP(\hat W_j > \hat W_i) \leq \exp{\frac{-3pR{n\choose m} {m(\tau_i^{(m)}-\tau_j^{(m)})}^2 }{8n(\tau_i^{(m)} +\tau_j^{(m)})}} \,.$$
\end{replemma}

\begin{proof} 
Let $\calE$ denote the event $\{i \in \Ss^*_K, j\notin \Ss^*_K\}$. We wish to prove that if $\tau_i - \tau_j > {\Delta_K^{(m)}}$, then with sufficiently large $p$ and $R$, $\hat W_i$ is smaller than $\hat W_j$ with very small probability. The main external concentration inequality used in this proof is Bernstein's inequality (cf. Theorem 2.8.4 \cite{vershynin2018high}). We start our proof by expanding on the probability of misranking $i$ and $j$.
\begin{equation}\label{eqn:bound-easy-expand}
\begin{aligned}
&\PP\bigg(\hat W_j > \hat W_i \,\lvert\, \calE\bigg)\\
&= \PP\bigg(\sum_{r=1}^R \sum_{S\in \Cset_j^{(m)}} \underbrace{\mathbf{1}[y_S^{(r)} = j]}_{X_S^{j,r}} - \sum_{r=1}^R \sum_{S\in \Cset_i^{(m)}} \underbrace{\mathbf{1}[y_S^{(r)} = i]}_{X_S^{i,r}} > 0 \,|\, \calE   \bigg)\\
&= \PP\bigg(\sum_{r=1}^R\sum_{S\in\Cset_j^{(m)}} X_S^{j,r} - \E[X_S^{j,r}] + \E[X_S^{j,r}] - \sum_{r=1}^{R}\sum_{S\in\Cset_i^{(m)}} X_S^{i,r} - \E[X_S^{i,r}] + \E[X_S^{i,r}] > 0 \,|\,\calE \bigg)\\
&= \PP\bigg(\sum_{r=1}^R\sum_{S\in\Cset_j^{(m)}} X_S^{j,r} - \E[X_S^{j,r}] - \sum_{r=1}^{R}\sum_{S\in\Cset_i^{(m)}} X_S^{i,r} - \E[X_S^{i,r}]> \sum_{r=1}^R\sum_{S\in\Cset_i^{(m)}} \E[X_S^{i,r}]  - \sum_{r=1}^{R}\sum_{S\in\Cset_j^{(m)}}  \E[X_S^{j,r}] \,|\,\calE \bigg)\\
&= \PP\bigg(\sum_{r=1}^R\sum_{S\in\Cset_j^{(m)}} X_S^{j,r} - \E[X_S^{j,r}] - \sum_{r=1}^{R}\sum_{S\in\Cset_i^{(m)}} X_S^{i,r} - \E[X_S^{i,r}] > pR{n-1\choose m-1}(\tau^{(m)}_i - \tau^{(m)}_j) \bigg)\,.
\end{aligned}
\end{equation}
In the last equality, we have used the fact that $\sum_{r=1}^R\sum_{S\in\Cset_i^{(m)}}\E[X_S^{i,r}] - \sum_{r=1}^R\sum_{S\in\Cset_j^{(m)}}\E[X_S^{j,r}] = pR{n-1 \choose m-1}(\tau_i^{(m)} - \tau_j^{(m)}) \geq pR{n-1\choose m-1} (\tau^{(m)}_i - \tau^{(m)}_j)$. Now, we will expand the two terms on the LHS of the inequality as
\begin{equation}
\label{eqn:to-apply-bernstein}
\begin{aligned}
&\PP\bigg(\hat W_j > \hat W_i \,\lvert\, \calE\bigg)\\
&= \PP\bigg( \sum_{S\in \Cset_j^{(m)}: i\notin S} \sum_{r=1}^R \big[X_S^{j,r} - \E[X_S^{j,r}] \big] - \sum_{S\in \Cset_i^{(m)}: j\notin S} \sum_{r=1}^R \big[X_S^{i,r} - \E[X_S^{i,r}] \big]\\
& + \sum_{S \in \Cset^{(m)}: i, j\in S} \sum_{r=1}^R  \big[(X_S^{j,r} - X_S^{i,r}) - (\E[X_S^{j,r} -X_S^{i,r}]) \big] > pR{n-1\choose m-1}(\tau^{(m)}_i - \tau^{(m)}_j)  \bigg)\,.
\end{aligned}
\end{equation}

Note that with the above decomposition, the terms $\{X_S^{j,r}\}_{S\in \Cset^{(m)}_j: i\notin S}, \{X_S^{i,r}\}_{S\in \Cset^{(m)}_i: j\notin S}, \{X_S^{j,r}-X_S^{i,r}\}_{S\in \Cset^{(m)}: i, j \in S}$ are all mutually independent. To apply Bernstein's inequality, we need to evaluate (or upper bound) the following.
$$\sum_{S\in \Cset_j^{(m)}: i\notin S} \sum_{r=1}^R \E[(X_S^{j,r})^2]+\sum_{S\in \Cset_i^{(m)}: j\notin S} \sum_{r=1}^R \E[(X_S^{i,r})^2] +\sum_{S \in \Cset^{(m)}: i, j\in S} \sum_{r=1}^R  \E[(X_S^{j,r} - X_S^{i,r})^2] \,. $$
We can easily see that for $S \in \Cset^{(m)}_j: i\notin S$,
$$ X_S^{j, r} = \begin{cases} 1 &\text{ with probability } p\cdot \rho(j|S)\\
    0 &\text{ with probability } 1 - p\cdot \rho(j|S)\\
\end{cases} $$
and for $S\in \Cset^{(m)}: i, j \in S$,
$$ X_S^{j, r} - X_S^{i,r} = \begin{cases} 1 &\text{ with probability } p\cdot \rho(j|S)\\
    -1 &\text{ with probability } p\cdot \rho(i|S)\\
    0 &\text{ with probability } 1 - p\cdot \rho(j|S) -p\cdot \rho(i|S)\\
\end{cases}\,. $$
As such $\E[(X_S^{j, r})^2] =  p\cdot\rho(j|S)$ and $\E[(X_S^{j,r} - X_S^{i,r})^2] = p\cdot(\rho(j|S) + \rho(i|S)) $. We have
\begin{equation*}
\begin{aligned}
&\sum_{S\in \Cset_j^{(m)}: i\notin S} \sum_{r=1}^R \E[(X_S^{j,r})^2]+\sum_{S\in \Cset_i^{(m)}: j\notin S} \sum_{r=1}^R \E[(X_S^{i, r})^2] +\sum_{S \in \Cset^{(m)}: i, j\in S} \sum_{r=1}^R  \E[(X_S^{j,r} - X_S^{i,r})^2] \\
&= \sum_{S\in \Cset_j^{(m)}: i\notin S} \sum_{r=1}^R p\cdot\rho(j|S) + \sum_{S\in \Cset_i^{(m)}: j\notin S} \sum_{r=1}^R p\cdot\rho(i|S) + \sum_{S\in \Cset^{(m)}: i, j \in S} \sum_{r=1}^R p\cdot(\rho(j|S) + \rho(i|S))\\
&= \sum_{S\in \Cset_j^{(m)}} \sum_{r=1}^R p\cdot\rho(j|S) + \sum_{S\in \Cset_i^{(m)}} \sum_{r=1}^R p\cdot\rho(i|S)\\
&= pR{n-1\choose m-1}\tau^{(m)}_i  + pR{n-1\choose m-1}\tau^{(m)}_j\\
&= pR{n-1\choose m-1}(\tau^{(m)}_i + \tau^{(m)}_j) \,. \\
\end{aligned}
\end{equation*}
Now, applying Bernstein's inequality to (\ref{eqn:to-apply-bernstein}) directly yields:
\begin{equation*}
\begin{aligned}
&\PP\bigg(\hat W_j > \hat W_i \,\lvert\, \calE\bigg)\\
&\leq \exp{-\frac{\big(pR{n-1\choose m-1}{(\tau^{(m)}_i - \tau^{(m)}_j)}\big)^2}{2(pR{n-1\choose m-1}(\tau^{(m)}_i + \tau^{(m)}_j)+ \frac{pR{n-1\choose m-1}{(\tau^{(m)}_i - \tau^{(m)}_j) }}{3} )}}\\
&= \exp{-\frac{pR{n-1\choose m-1}{(\tau^{(m)}_i - \tau^{(m)}_j)}^2}{2(\tau^{(m)}_i + \tau^{(m)}_j)+\frac{2 (\tau^{(m)}_i - \tau^{(m)}_j) }{3}}}\\
&\leq \exp{-\frac{pR{n-1\choose m-1}(\tau^{(m)}_i - \tau^{(m)}_j)^2}{8/3(\tau_i^{(m)} +\tau_j^{(m)} ) }}\,.
\end{aligned}
\end{equation*}
This completes the proof. \end{proof}

Next we present the proof of Lemma \ref{lem:simple-inequality}.
\begin{replemma}{lem:simple-inequality}
Consider an IID-RUM that satisfies Borda consistency per Definition \ref{def:borda-consistency}. For any fixed $K$, we have
$$ \frac{\tau^{(m)}_K + \tau^{(m)}_{K+1}}{{\Delta_K^{(m)}}^2 } = \max_{i\in \Ss^*_K, j\notin \Ss^*_K} \bigg\{ \frac{\tau^{(m)}_i + \tau^{(m)}_j}{(\tau^{(m)}_i - \tau^{(m)}_j)^2 } \bigg\} \, .$$
\end{replemma}

% \begin{replemma}\label{lem:simple-inequality}
% Consider an IID-RUM $\rho$ that satisfies Borda consistency. Consider two items $i\in \Ss_K^*$ and $j\notin \Ss_K^*$. Given choice data with menu size $m$ generated according to the sampling model described in section (\ref{subsect:sampling-model}), the Borda Count algorithm satisfies:
% $$ \PP(\hat W_j > \hat W_i \,\lvert\, i \in \Ss^*_K, j\notin \Ss^*_K) \leq \exp{\frac{-3pR{n-1\choose m-1} (\tau^{(m)}_i - \tau^{(m)}_j)^2 }{8(\tau_i^{(m)} +\tau_j^{(m)} )}}$$
% \end{lemma}

\begin{proof}
Firstly, it is easy to see that, fixing an $i\in \Ss_K^*$, for all $j\notin \Ss_K^*$,
$$ \frac{\tau_i^{(m)} + \tau_{K+1}^{(m)} }{(\tau_i^{(m)} - \tau_{K+1}^{(m)})^2 } \geq \frac{\tau_i^{(m)} + \tau_j^{(m)} }{(\tau_i^{(m)} - \tau_j^{(m)})^2} \,.$$

It remains to prove that for any $i \in \Ss_K^*$,
$$ \frac{\tau_K^{(m)} + \tau_{K+1}^{(m)}}{(\tau_K^{(m)} - \tau_{K+1}^{(m)})^2} \geq \frac{\tau_i^{(m)} + \tau_{K+1}^{(m)} }{(\tau_i^{(m)} - \tau_{K+1}^{(m)})^2 }\,. $$
To declutter the notation, we'll remove the superscript $m$. Let $\Delta_{iK} = \tau_i - \tau_K$ for some $i \in \Ss_K^*$. For any $i \in \Ss_K^* \backslash \{K\}$, we have
\begin{equation*}
\begin{aligned}
\frac{\tau_K + \tau_{K+1}}{\Delta_K^2} &\geq \frac{\tau_i + \tau_{K+1}}{(\tau_i - \tau_{K+1})^2}\\
\Leftrightarrow \frac{\tau_K + \tau_{K+1}}{\Delta_K^2} &\geq \frac{\tau_K + \Delta_{iK} + \tau_{K+1}}{(\Delta_{iK} + \Delta_K)^2}\\
\Leftrightarrow \frac{\tau_K + \tau_{K+1}}{\Delta_K^2} &\geq \frac{\tau_K + \Delta_{iK} + \tau_{K+1}}{\Delta^2_{iK} + \Delta^2_K + 2\Delta_{iK}\Delta_K}\\
\Leftrightarrow \tau_K\Delta_{iK}^2 + \tau_K\Delta_K^2 +2\tau_K\Delta_{iK}\Delta_K +\tau_{K+1}\Delta_{iK}^2 &+\\
\tau_{K+1}\Delta_K^2 +2\tau_{K+1}\Delta_{iK}\Delta_K &\geq \tau_K\Delta_K^2 + \tau_{K+1}\Delta_K^2+\Delta_{iK}\Delta_K^2 \\
\Leftrightarrow \tau_K\Delta_{iK}^2 +2\tau_K\Delta_{iK}\Delta_K +\tau_{K+1}\Delta_{iK}^2 +2\tau_{K+1}\Delta_{iK}\Delta_K &\geq \Delta_{iK}\Delta_K^2\,.
\end{aligned}
\end{equation*}
In deriving the last statement, the first two terms on the RHS get canceled out. Note that $2\tau_K\Delta_{iK}\Delta_K \geq 2\Delta_K\Delta_{iK}\Delta_K \geq \Delta_{iK}\Delta_K^2$. This completes the proof.
\end{proof}

\begin{reptheorem}{cor:sample-complexity-gbs}
Assume the conditions of Lemma \ref{thm:error-bound-gen-borda}. Given sufficiently large $p, R$ such that $pR{n \choose m} \geq \frac{8n \log n}{m{\Delta_K^{(m)}}^2}\cdot(\Delta_K^{(m)} +2\tau_{K+1}^{(m)})$, the choice-based Borda count algorithm correctly identifies all of the top $K$ items with probability at least $1-O(\frac{K}{n^2})$.
\end{reptheorem}

\begin{proof} The exponential bound in Lemma \ref{thm:error-bound-gen-borda} holds simultaneously for all pairs $i \in \Ss_K^*, j\notin \Ss_K^*$ given sufficiently large $p,R$ such that
$$ pR{n-1\choose m-1} \geq \max_{i\in \Ss^*_K, j\notin \Ss^*_K} \frac{8\log n}{(\tau^{(m)}_i - \tau^{(m)}_j)^2} \cdot (\tau_i^{(m)} +\tau_j^{(m)}) \,.$$
From Lemma \ref{lem:simple-inequality}, we have
$$ \max_{i\in \Ss^*_K, j\notin \Ss^*_K} \frac{\tau_i^{(m)} + \tau_j^{(m)} }{(\tau_i^{(m)} - \tau_j^{(m)})^2 } = \frac{\tau_K^{(m)} +\tau_{K+1}^{(m)}}{{\Delta_K^{(m)}}^2} \,.$$
This means that if
$$ pR{n-1\choose m-1} \geq 8\log n \cdot\frac{\tau_K^{(m)}+\tau_{K+1}^{(m)} }{{\Delta_K^{(m)}}^2 } $$
then
$$ \Pr(\hat W_i < \hat W_j) \leq \frac{1}{n^3}  \,.$$
Apply union bound over all pairs $i, j$ such that $i \in \Ss_K^*, j\notin \Ss_K^*$. Then the event
$$ \hat W_i > \hat W_j \,\,\forall i\in \Ss_{K}^*, j\notin \Ss_{K}^* $$
happens with probability at least $1-O(\frac{K}{n^2})$. 
\end{proof}

\subsection{Sample complexity for approximate top-$K$ recovery}

In practice, some error in the top $K$ estimate could be tolerable. This is known as approximate top-$K$ ranking. The metric of interest in approximate top-$K$ ranking is the edit distance between the estimate $\hat\Ss_K$ and the true top $K$ items $\Ss^*_K$:
$$ D_{01}(\hat \Ss_K, \Ss_K^*) = K - \lvert \hat\Ss_K \cap \Ss_K^* \rvert \,. $$
One can see that correctly separating the top $K-h$ items from the bottom $n-K-h$ items, i.e., failing to identify at most $h$ of the top $K$ items, is sufficient to guarantee that the approximate loss is bounded by $h$.
Considering this, the fundamental quantity that determines the hardness of approximate top-$K$ ranking is the gap between the generalized Borda score of the $(K-h)$-th item and that of the $(K+h+1)$-th item. For convenience, let us denote this quantity as 
$$\Delta^{(m)}_{K,h} =  \tau^{(m)}_{K-h} - \tau^{(m)}_{K+h+1}\,. $$
Clearly, this quantity generalizes $\Delta_K^{(m)}$ as $\Delta_{K,0}^{(m)} =\Delta_K^{(m)}$. We are interested in bounding the probability that, given an error threshold $h$, Generalized Borda Count fails to output an estimate with loss bounded by $h$. Namely,
$$ \Pr(D_{01}(\hat \Ss_K, \Ss_K^*) > h)\,. $$
Building on Lemma \ref{thm:error-bound-gen-borda}, we obtain the following sample complexity of Generalized Borda Count for approximate top-$K$ ranking.

\begin{theorem}\label{cor:sample-complexity-borda-approximate}
Assume the conditions of Lemma \ref{thm:error-bound-gen-borda}. Fix an error threshold $h$. Given sufficiently large $p, R$ such that $pR{n \choose m} \geq \frac{8n \log n}{m{\Delta_{K,h}^{(m)}}}\cdot\left(1+\frac{2\tau_{K+h+1}^{(m)}}{\Delta_{K,h}^{(m)} } \right)$, the choice-based Borda count algorithm outputs a set $\hat\Ss_K$ that satisfies
$$ D_{01}(\hat\Ss_K, \Ss^*_K) \leq h $$
with probability at least $1-O(\frac{K}{n^2})$.
\end{theorem}

\begin{proof} Applying Lemma \ref{thm:error-bound-gen-borda}, we have for every pair $i, j$ where $i \in \Ss_{K-h}^*$, $j\notin \Ss_{K+h}^*$,
$$ \Pr(\hat W_j > \hat W_i) \leq \exp\left(-\frac{3pR{n-1\choose m-1}(\tau^{(m)}_{i}-\tau_j^{(m)})^2 }{8(\tau^{(m)}_{i}+\tau_j^{(m)}) }\right)\,.$$

Following the same argument as in the proof of Theorem \ref{cor:sample-complexity-gbs}, one can show that
$$ \frac{\tau_{K-h}^{(m)}+ \tau_{K+h+1}^{(m)}}{{\Delta_{K,h}^{(m)}}^2} = \max_{i\in \Ss_{K-h}^*, j\notin \Ss_{K+h}^*}\frac{\tau_i^{(m)} +\tau_j^{(m)}}{(\tau^{(m)}_{i}-\tau_j^{(m)})^2 }\,. $$
The rest of the proof follows by showing that given the condition on $p, R$ in the theorem statement, the probability that Generalized Borda Count wrongly flips the relative order of any two items $i \in \Ss_{K-h}^*, j \notin \Ss_{K+h}^*$ is upper bounded by $O(\frac{1}{n^3})$. Applying union bound over all such pairs of $i, j$ completes the proof. 
\end{proof}

In short, Generalized Borda Count has the following sample complexity for approximate top-$K$ ranking
$$ O\bigg( \frac{n\log n}{m\Delta_{K,h}^{(m)}} \cdot \Big(1+\frac{\tau_{K+h+1}^{(m)}}{\Delta_{K,h}^{(m)}}\Big) \bigg)\,. $$

\subsection{Lower bound on the sample complexity of top-$K$ recovery}

\subsubsection{Preliminaries and notations}
We first restate a version of Fano's inequality \cite{cover1999elements} which will be useful to the construction of our lower bound:

\begin{lemma} [Fano's inequality] Consider a set of $L$ distributions $\{\Pr^1,\ldots, \Pr^L\}$. Suppose that we observe a random variable (or a set of random variables) $Y$ that was generated by first picking an index $A \in \{1,\ldots,L\}$ uniformly at random and then $Y \sim \Pr^A$. Fano's inequality states that any hypothesis test $\phi$ for this problem has an error probability lower bounded as
$$ \Pr[\phi(Y) \neq A] \geq 1 - \frac{\max_{a,b\in [L], a \neq b} \kl(\Pr^a(Y)\rVert \Pr^b(Y)) + \log 2 }{\log L}\,. $$
\end{lemma}

To obtain a lower bound in the form of Theorem \ref{thm:lower-bound-all}, we need to construct an IID-RUM such that any method requires $\Omega(n\log n)$ examples in order to accurately recover the top $K$ items. For this purpose, we look for a model within the Multinomial Logit family. It is well known that the MNL model is an instance of IID-RUM where the noise distribution is the standard Gumbel distribution. While the MNL model has a random utility characterization, it is often more convenient to describe the MNL model in terms of weighted probabilities. Specifically, an MNL model over $n$ items can be parameterized by a set of $n$ positive real numbers $w_1, \ldots, w_n$ called weights. There is also a well defined relation between the weighted probabilities representation and the utility partworth representations. Specifically, the resulting choice probability is defined to be:
$$\rho(i|S) = \frac{w_i}{\sum_{k\in S} w_k} = \frac{e^{U_i}}{\sum_{k\in S}e^{U_k} } $$
Now consider a special sub-class of MNL models that is defined by 3 parameters $(v, \delta, \Ss^*_K)$ for some $0 < \delta < v$ and a set $\Ss^*_K$ of size $K$. The item-specific weights are then defined as follows.
$$ w_i = \begin{cases}v + \delta &\text{if } i \in \Ss^*_K\\
v &\text{otherwise}
\end{cases} \,.$$
That is, all the top items have the same weights and all the bottom items have the same (but smaller) weight. For any $A \in [K, \ldots, n]$ let $\mathcal{M}^A$ be a special MNL model parametrized by $(v, \delta, \{1,\ldots, K-1, A\})$ as described above. One can see that Fano's inequality can be used to lower bound the error probability of any hypothesis test for the identity of $A$. That is, to lower bound the probability that any estimator fails to correctly identify the $K$-th item.

Recall our uniform sampling model described in Section \ref{subsect:sampling-model}, let us use $\{Y^r_S\}_{S\in \Cset^{(m)}, r=1:R}$ to denote the observed choice data where:
$$ Y_S^r = \begin{cases} y &\text{for some } y \in S \text{ if menu $S$ is offered in round }r\\
    0 &\text{if menu $S$ is not offered in round }r
\end{cases}\,. $$
As a short hand, denote $\rho^a$ as the choice rule corresponding to $\calM^a$ and $\rho^b$ corresponding to $\calM^b$. Let $\Pr^a(\{Y_S^r\})$ denote the likelihood of choice data $\{Y_S^r\}$ under choice model $\calM^a$ (define analogously for $\calM^b$).

\subsubsection{Exact top-$K$ recovery}\label{subsub:proof-exact-top-k}
The following pair of lemma and theorem establish the lower bound for exact top-$K$ recovery via showing an upper bound on the KL divergence between any two models $\calM^a, \calM^b$ and then applying Fano's inequality.
    
\begin{lemma}\label{lem:kl-div}
    Assuming the same sampling model as in Lemma \ref{thm:error-bound-gen-borda}, we have, for any $a, b \in [K, \ldots, n]$,
    $$ \kl(\Pr^a(\{Y_S^r\})\rVert \Pr^b(\{Y_S^r\})) \leq 4pR{n-1\choose m-1}\frac{{\Delta_K^{(m)}}^2}{\tau_{K}^{(m)}} \,. $$
\end{lemma}

\begin{reptheorem}{thm:lower-bound-all}
Consider the sampling model described in Section \ref{subsect:sampling-model}.
There exists a choice model $\calM^*$ within the class of Multinomial Logit Models (MNLs) such that for $n \geq 20$, if $pR{n\choose m} \leq \frac{n\log n }{8}\cdot\frac{\tau_{K}^{(m)}}{m{\Delta_K^{(m)}}^2}$ then any statistical estimator fails to correctly identify all of the top $K$ items with probability at least $\frac{1}{12}$.
\end{reptheorem}

Once Lemma \ref{lem:kl-div} has been established, we can prove Theorem \ref{thm:lower-bound-all} as follows. For any $n \geq 20$, if ${\Delta_K^{(m)}}^2 \leq \frac{\tau_{K}^{(m)}\cdot \log n}{8 pR{n-1\choose m-1}}$, Fano's inequality gives us a lower bound on the error of any estimator for finding the top $K$ items under $\calM(v, \delta, \Ss^*_K)$:
$$ \Pr(\hat \Ss_K \neq \Ss^*_K) \geq 1- \frac{0.5\log n +\log 2}{\log (n-K+1)} \geq 1- \frac{0.5\log n +\log 2}{\log (n/2+1)} \geq 0.086 \,. $$
The rest of the proof of Theorem \ref{thm:lower-bound-all} involves simple algebra, noting that ${n\choose m} = {n-1\choose m-1}\cdot \frac{n}{m}$. We now move on to the proof of Lemma \ref{lem:kl-div}.

\begin{proof} (Proof of Lemma \ref{lem:kl-div}). Let $\hat\Cset^{(m,r)}$ denote the set of menus of size $m$ being picked in round $r$ under the sampling model described in Section \ref{subsect:sampling-model}. Let us first decompose $\kl(\Pr^a(\{Y_S^r\})\rVert\Pr^b(\{Y_S^r\}))$ into a more manageable form.
\begin{equation*}
    \begin{aligned}
        &\kl(\Pr^a(\{Y_S^r\})\rVert\Pr^b(\{Y_S^r\}))\\
        &[\text{thanks to mutual independence among } \{Y_S^r\}]\\
        &=\sum_{r=1}^R \sum_{S\in\Cset^{(m)}} \kl(\Pr^a(Y_S^r) \rVert \Pr^b(Y_S^r))\\
        &=\sum_{r=1}^R \sum_{S\in\Cset^{(m)}} \kl(\Pr^a(Y_S^r|S\in \hat\Cset^{(m,r)}) \Pr^a(S\in\hat\Cset^{(m,r)}) \rVert \Pr^b(Y_S^r|S\in \hat\Cset^{(m,r)}) \Pr^b(S\in\hat\Cset^{(m,r)}))\\
        &[\text{Using the fact that } \Pr^a(S\in\hat\Cset^{(m,r)}) = \Pr^b(S\in \hat\Cset^{(m,r)}) = p]\\
        &= pR\sum_{S\in \Cset^{(m)}}\kl(\Pr^a(Y_S^1|S\in\hat\Cset^{(m,1)})\rVert \Pr^b(Y_S^1|S\in\hat\Cset^{(m,1)}))\,.
    \end{aligned}
\end{equation*}
To handle the above sum, partition $\Cset^{(m)}$ into 4 subsets $\{S \in \Cset^{(m)}: a, b\notin S \}, \{S \in \Cset^{(m)}: a\in S, b\notin S \}, \{S \in \Cset^{(m)}: a\notin S, b \in S \}, \{S \in \Cset^{(m)}: a, b\in S \}$. It is easy to see that for $S : a, b\notin S$, the corresponding KL divergence terms are all 0. Hence, we only have to work with other 3 subsets. Before going into the details, let us define some shorthand notations: let $w^a(S)$ denote the sum of the weights of the items in menu $S$ under $\calM^a$ and $S + a$ denote set union. 

\textbf{Subset 1:} $\{S\in\Cset^{(m)}: a, b\in S \}$
\begin{equation*}
    \begin{aligned}
        &\sum_{S\in \Cset^{(m)}: a, b\in S}\kl(\Pr^a(Y_S^1|S\in\hat\Cset^{(m,1)})\rVert \Pr^b(Y_S^1|S\in\hat\Cset^{(m,1)}))\\
        &[\text{Note that $\rho^a(x|S) = \rho^b(x|S) \quad\forall x\neq a, b$ and $ w^a(S) = w^b(S)$ for all $S: a,b\in S$}]\\
        &=\sum_{S\in \Cset^{(m)}: a, b \in S} \rho^a(a|S)\log\frac{\rho^a(a|S)}{\rho^b(a|S)} + \rho^a(b|S)\log\frac{\rho^a(b|S)}{\rho^b(b|S)}\\
        &=\sum_{S\in \Cset^{(m)}: a, b \in S} \frac{v+\delta}{w^a(S)}\log\frac{v+\delta}{v} + \frac{v}{w^a(S)}\log\frac{v}{v+\delta}\\
        &= \sum_{S\in \Cset^{(m)}: a, b \in S} \frac{\delta}{w^a(S)}\log\frac{v+\delta}{v}\,.
    \end{aligned}
\end{equation*}

\textbf{Subset 2:} $\{S\in\Cset^{(m)}: a \in S, b\notin S \}$. Firstly, it is easy to verify the following useful identities: $w^a(S' + a) = w^b(S' + b), w^a(S'+b) = w^b(S'+a) \forall S': a, b\notin S'$. Furthermore, $w^a(x) = w^b(x) \forall x\neq a, b$. Now we have
\begin{equation*}
    \begin{aligned}
        &\sum_{S\in \Cset^{(m)}: a\in S, b\notin S}\kl(\Pr^a(Y_S^1|S\in\hat\Cset^{(m,1)})\rVert \Pr^b(Y_S^1|S\in\hat\Cset^{(m,1)}))\\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[ \bigg(\sum_{x\in S'} \rho^a(x|S'+a) \log\frac{\rho^a(x|S'+a)}{\rho^b(x|S'+a)}\bigg) + \rho^a(a|S'+a)\log \frac{\rho^a(a|S'+a)}{\rho^b(a|S'+a)} \bigg] \\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg(\sum_{x\in S'} \frac{w^a(x)}{w^a(S'+a)} \log\frac{w^b(S'+a)}{w^a(S'+a)} \bigg)+ \frac{v+\delta}{w^a(S'+a)}\log \frac{v+\delta}{v}\cdot\frac{w^b(S'+a)}{w^a(S'+a)} \bigg] \\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg( \sum_{x\in S'} \frac{w^a(x)}{w^a(S'+a)} \log\frac{w^a(S'+b)}{w^a(S'+a)}\bigg) + \frac{v+\delta}{w^a(S'+a)}\log \frac{v+\delta}{v}\cdot\frac{w^a(S'+b)}{w^a(S'+a)} \bigg] \,.
    \end{aligned}
\end{equation*}

\textbf{Subset 3:} $\{S\in\Cset^{(m)}: a \notin S, b\in S \}$. Following a very similar procedure to the 2nd subset, we obtain:
\begin{equation*}
    \begin{aligned}
        &\sum_{S\in \Cset^{(m)}: a\notin S, b\in S}\kl(\Pr^a(Y_S^1|S\in\hat\Cset^{(m,1)})\rVert \Pr^b(Y_S^1|S\in\hat\Cset^{(m,1)}))\\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg( \sum_{x\in S'} \frac{w^a(x)}{w^a(S'+b)} \log\frac{w^a(S'+a)}{w^a(S'+b)}\bigg) + \frac{v}{w^a(S'+b)}\log \frac{v}{v+\delta}\cdot\frac{w^a(S'+a)}{w^a(S'+b)} \bigg] \,.
    \end{aligned}
\end{equation*}

Focusing on the terms from the 2nd and 3rd subset and grouping them in an intelligent way gives us
\begin{equation*}
    \begin{aligned}
        &\sum_{S\in \Cset^{(m)}: a\notin S, b\in S}\kl(\Pr^a(Y_S^1|S\in\hat\Cset^{(m,1)})\rVert \Pr^b(Y_S^1|S\in\hat\Cset^{(m,1)}))\\ &+ \sum_{S\in \Cset^{(m)}: a\in S, b\notin S}\kl(\Pr^a(Y_S^1|S\in\hat\Cset^{(m,1)})\rVert \Pr^b(Y_S^1|S\in\hat\Cset^{(m,1)}))\\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg( \sum_{x\in S'} \frac{w^a(x)}{w^a(S'+a)} \log\frac{w^a(S'+b)}{w^a(S'+a)} \bigg)+ \frac{v+\delta}{w^a(S'+a)}\log \frac{v+\delta}{v}\cdot\frac{w^a(S'+b)}{w^a(S'+a)} \bigg]\\
        &+\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg( \sum_{x\in S'} \frac{w^a(x)}{w^a(S'+b)} \log\frac{w^a(S'+a)}{w^a(S'+b)}\bigg) + \frac{v}{w^a(S'+b)}\log \frac{v}{v+\delta}\cdot\frac{w^a(S'+a)}{w^a(S'+b)} \bigg] \\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg(\frac{v+\delta}{w^a(S'+a)} - \frac{v}{w^a(S'+b)}\bigg)\log \frac{v+\delta}{v} \bigg] \\
        &+\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \log\frac{w^a(S'+a)}{w^a(S'+b)}\cdot \bigg[\sum_{x\in S'} \bigg(\frac{w^a(x)}{w^a(S'+b)} - \frac{w^a(x)}{w^a(S'+a)}\bigg) +  \frac{v}{w^a(S'+b)} - \frac{v+\delta}{w^a(S'+a)}  \bigg] \\
        &=\sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\bigg(\frac{v+\delta}{w^a(S'+a)} - \frac{v}{w^a(S'+b)}\bigg)\log \frac{v+\delta}{v} \bigg]\,.
    \end{aligned}
\end{equation*}
The last equality comes from recognizing that $\sum_{x\in S'} \frac{w^a(x)}{w^a(S'+b)} +\frac{v}{w^a(S'+b)} = \frac{w^a(S'+b)}{w^a(S'+b)} = 1$ and similarly $\sum_{x\in S'} \frac{w^a(x)}{w^a(S'+a)} +\frac{v+\delta}{w^a(S'+a)} = \frac{w^a(S'+a)}{w^a(S'+a)} = 1$ so they cancel out and the second term becomes $0$. We now have the following much more compact identity.
$$ \kl(\Pr^a(\{Y_S^r\})\rVert\Pr^b(\{Y_S^r\})) $$
$$=pR\cdot \log\frac{v+\delta}{v} \cdot \bigg(\sum_{S\in \Cset^{(m)}: a, b \in S} \frac{\delta}{w^a(S)} + \sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\frac{v+\delta}{w^a(S'+a)} - \frac{v}{w^a(S'+b)}\bigg] \bigg) \,.$$
To make a connection between this and $\Delta_K^{(m)}$, recognize that under $\calM^a$, $\Delta_K^{(m)} = \tau_a^{(m)} - \tau_b^{(m)}$. We therefore have
\begin{equation*}
\begin{aligned}
\Delta_K^{(m)} &=\tau_a^{(m)} - \tau_b^{(m)}\\
&= \frac{1}{{n-1\choose m-1}} \cdot \bigg( \sum_{S\in \Cset^{(m)}: a\in S} \rho^a(a|S) - \sum_{S\in\Cset^{(m)}:b\in S}\rho^a(b|S)  \bigg)\\
&= \frac{1}{{n-1\choose m-1}} \cdot \bigg( \sum_{S\in \Cset^{(m)}: a\in S} \frac{v+\delta}{w^a(S)}- \sum_{S\in\Cset^{(m)}:b\in S}\frac{v}{w^a(S)}  \bigg)\\
&= \frac{1}{{n-1\choose m-1}} \cdot \bigg( \sum_{S\in \Cset^{(m)}: a, b\in S} \frac{\delta}{w^a(S)} + \sum_{S\in\Cset^{(m)}:a \in S, b\notin S}\frac{v+\delta}{w^a(S)} - \sum_{S\in\Cset^{(m)}:a \notin S, b\in S}\frac{v}{w^a(S)}  \bigg)\\
&= \frac{1}{{n-1\choose m-1}} \cdot \bigg(\sum_{S\in \Cset^{(m)}: a, b \in S} \frac{\delta}{w^a(S)} + \sum_{S'\in \Cset^{(m-1)}: a, b\notin S'} \bigg[\frac{v+\delta}{w^a(S'+a)} - \frac{v}{w^a(S'+b)} \bigg]\bigg)\,.
\end{aligned}
\end{equation*}
We thus have
\begin{equation}\label{eqn:kl-compact}
    \kl(\Pr^a(\{Y_S^r\})\rVert\Pr^b(\{Y_S^r\})) = pR\log\frac{v+\delta}{v} {n-1\choose m-1} \Delta_K^{(m)} \,.
\end{equation}

We will now bound $\log \frac{v+\delta}{v}$ in terms of $\frac{\Delta^{(m)}_K}{\tau^{(m)}_{K}}$.
\begin{equation*}
\begin{aligned}
\frac{\Delta^{(m)}_K}{\tau_{K}^{(m)}} &= \frac{\sum_{S\in\Cset^{(m)}:a,b\in S} \frac{w^a(a) - w^a(b)}{w^a(S) + 2v+\delta} + \sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{w^a(a)}{w^a(S)+v+\delta}-\frac{w^a(b)}{w^a(S)+v}}{\sum_{S\in\Cset^{(m)}:a,b\in S} \frac{w^a(a)}{w^a(S) + 2v+\delta} + \sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{w^a(a)}{w^a(S)+v+\delta} }\\
&= \frac{\sum_{S\in\Cset^{(m)}:a,b\in S} \frac{\delta}{w^a(S) + 2v+\delta} + \sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{\delta(w^a(S))}{(w^a(S)+v+\delta)(w^a(S)+v)} }{\sum_{S\in\Cset^{(m)}:a,b\in S} \frac{v +\delta}{w^a(S) + 2v+\delta} + \sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{v+\delta}{w^a(S)+v+\delta} }\\
&= \frac{\sum_{S\in\Cset^{(m)}:a,b\in S} \frac{1}{w^a(S) + 2v+\delta} + \sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{(w^a(S))}{(w^a(S)+v+\delta)(w^a(S)+v)} }{\sum_{S\in\Cset^{(m)}:a,b\in S} \frac{1}{w^a(S) + 2v+\delta} + \sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{1}{w^a(S)+v+\delta} } \cdot \frac{\delta}{v+\delta} \\
&\geq \frac{\sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{1}{w^a(S)+v+\delta}\cdot \frac{w^a(S)}{w^a(S)+v} }{\sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{1}{w^a(S)+v+\delta} } \cdot \frac{\delta}{v+\delta}\\
&\geq \frac{\sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{1}{w^a(S)+v+\delta}\cdot \frac{(m-1)v}{(m-1)v + v } }{\sum_{S\in\Cset^{(m-1)}:a,b\notin S} \frac{1}{w^a(S)+v+\delta} } \cdot \frac{\delta}{v+\delta}\\
&= \frac{(m-1)v}{mv }  \cdot \frac{\delta}{v+\delta}\\
&= \frac{m-1}{m} \frac{\delta}{v+\delta} \geq \frac{1}{2}\frac{\delta}{v+\delta}\,.
\end{aligned}
\end{equation*}
We can now bound $\log \frac{v+\delta}{v}$ as
$$ \log \frac{v+\delta}{v} \leq \frac{\delta}{v} \leq \frac{2\delta}{v+\delta} \leq \frac{4\Delta^{(m)}_K}{\tau^{(m)}_{K}} \,.$$
Combining with equation (\ref{eqn:kl-compact}) completes the proof.
\end{proof}


\subsubsection{Approximate top-$K$ recovery}

To obtain a lower bound on the sample complexity for approximate top-$K$ recovery, we require a more nuanced construction than that in the proof of Theorem \ref{thm:lower-bound-all}. At a high level, we need to construct a multiset of $K$-size subsets $\{\Ss^1,\ldots, \Ss^L\}$. Each $K$-size subset $\Ss^l$ corresponds to an MNL model whose top $K$ items are exactly $\Ss^l$. We need to carefully design this multiset such that the pairwise edit distance between any two subsets is larger than $2h$. This automatically ensures that any top-$K$ estimate is close in edit distance to at most one of the $L$ subsets. At the same time, we also want the distance between the distributions induced by any two models to be small in KL divergence sense. This would then allow us to invoke Fano's lemma to lower bound the probability that any statistical estimator outputs a top-$K$ estimate set with small edit distance error. Lastly, we also want $L$ to be sufficiently large in order to obtain a good lower bound using Fano's lemma.

We first present a reworded version of Lemma 9 of \cite{shah2017simple}. The original lemma, in turn, is based on a result due to \cite{levenshtein1971upper} for fixed-weight binary codes.
\begin{lemma}\label{lem:packing-result-shah}\cite[Lemma 9]{shah2017simple} Consider the regime where $h <\frac{2}{3}\min\{K, \sqrt n, n-K \}$. For sufficiently large $n$, there exists a multiset $\{s^1,\ldots, s^L\}$ with cardinality $L \geq \exp\left(\frac{9}{20}h\log n\right)$ and $s^l \subseteq [\frac{n}{2}, n]$ for $l \in [L]$ such that
$$ |s^l| = \frac{3h}{2}\,\, \forall l\in [L] \quad\text{and}\quad D_{01}(s^j, s^l) = 2h+1 \,\, \text{for all $j\neq l \in [L]$} \,.$$
\end{lemma}

By the above lemma, there exists a multiset of cardinality at least $\exp\left(\frac{9}{20}h\log n\right)$, consisting of sets of size exactly $\frac{3h}{2}$, with elements from $[\frac{n}{2}, n]$. For each $s^l$, let us construct an MNL model as follows: let $u = \{1,\ldots, K-\frac{3h}{2}\}$ and $\Ss^l_K = u \cup s^l$ be the set of the top $K$ items for model $l\in [L]$. Note that this construction is valid since $h < \frac{2K}{3}$. Following closely the description of the special MNL model used in subsection (\ref{subsub:proof-exact-top-k}), we assign $v$ as weight for all items in $[n]\backslash \Ss^l_K$, $v+\delta/2$ for all items in $u$ and $v+\delta$ for all items in $s^l$ for some $v > \delta > 0$. In other words, $s^l$ is the set of the top $\frac{3h}{2}$ items for the $l$-th model while $u$ is the set consisting of the $(\frac{3h}{2}+1)$-th, $\ldots, K$-th best items for model $l$.

We emphasize that by design, $D_{01}(\Ss^l_K, \Ss^j_K) > 2h$ for any $l\neq j\in[L]$. This also means that for any estimate $\hat\Ss_K$, there exists at most one index $a \in [L]$ such that $D_{01}(\hat\Ss_K, \Ss_K^a) \leq h$. In other words, outputting an estimate with low edit distance error is equivalent to exactly identifying the set of the top $K$ items. All that remains is to prove that it is hard to distinguish between any two models as their distributions over observed choice data have small KL divergence. Such a result is summarized in the lemma below:

\begin{lemma}\label{lem:kl-approximate-recover} Consider the construction described above, for any two models indexed by $a,b \in [L]$ with $a\neq b$. Let $\Pr^a$ and $\Pr^b$ be the distributions parametrized by $\Ss^a_K$ and $\Ss_K^b$ as described above, respectively. Under the sampling model described in Section \ref{subsect:sampling-model}, we have
$$ \kl(\Pr^a(\{Y\}) \rVert \Pr^b(\{Y\})) \leq 12hpR{n-1\choose m-1}\frac{{\Delta_{K,h}^{(m)}}^2}{\tau^{(m)}_{K-h}} $$
\end{lemma}

Before proving Lemma \ref{lem:kl-approximate-recover}, let us state and prove the sample complexity lower bound for approximate top-$K$ ranking, the proof of which directly makes use of the lemma.

\begin{theorem}\label{thm:lower-bound-approximate-recovery}
Consider the sampling model described in Section \ref{subsect:sampling-model}.
There exists a choice model $\calM^*$ within the class of Multinomial Logit Models (MNLs) such that for $n \geq 20$, if $pR{n\choose m} \leq \frac{1}{96}\frac{n\log n}{m{\Delta_{K,h}^{(m)}}} \cdot \frac{\tau^{(m)}_{K-h}}{\Delta^{(m)}_{K,h}}$ then any statistical estimator outputs an estimate $\hat\Ss_K$ satisfying
$$ D_{01}(\hat\Ss_K, \Ss^*_K) > h $$
with probability at least $\frac{1}{5}$.
\end{theorem}

\begin{proof} Consider the regime when $p,R$ are small enough such that
$$ pR{n-1\choose m-1} \leq \frac{\log n}{96 {\Delta_{K,h}^{(m)}}^2  }\cdot \tau^{(m)}_{K-h}= \frac{\log n}{96 {\Delta_{K,h}^{(m)}}^2  }\cdot (\tau^{(m)}_{K+h+1}+\Delta_{K,h}^{(m)}) $$
and the multiset construction as described above. Invoking Fano's inequality, the probability that any statistical estimator fails to output an estimate $\hat\Ss_K$ such that $D_{01}(\hat\Ss_K, \Ss^*_K) \leq h$ is lower bounded as
$$ 1-\frac{ \frac{1}{8}h\log n +\log 2 }{\frac{9}{20}h\log n} \geq \frac{1}{5} \quad\text{for } n \geq 20 \,.$$
This finishes the proof.
\end{proof}

We will now proceed to prove Lemma \ref{lem:kl-approximate-recover}.
\begin{proof}
Following the same notation and argument as in the proof of Lemma \ref{lem:kl-div}, we have
$$ \kl(\Pr^a(\{Y\})\rVert \Pr^b(\{Y\})) = pR\sum_{S\in \Cset^{(m)}} \kl(\Pr^a(Y_S^1|S\in \hat\Cset^{(m,1)} ) \rVert \Pr^b(Y_S^1|S\in \hat\Cset^{(m,1)}))\,. $$
Similarly to the proof of Lemma \ref{lem:kl-div}, let us define some shorthand notations: let $w^a(S)$ denote the sum of the weights of the items in menu $S$ and $w^a(i)$ the weight of item $i$ under the special MNL model whose top $K$ items is $\Ss^a_K$ (similarly defined for $b\neq a$). Expanding on the sum of the KL divergence terms and rearranging the summation order gives us

\begin{equation*}
\begin{aligned}
&\frac{1}{pR}\kl(\Pr^a(\{Y\}) \rVert \Pr^b(\{Y\})) \\
&= \sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log \frac{w^a(i)\cdot w^b(S)}{w^b(i)\cdot w^a(S) } + \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^a(i)\cdot w^b(S)}{w^b(i) \cdot w^a(S)} \\
&+ \sum_{i\in \Ss^a_K \cap \Ss_K^b } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^a(i)w^b(S)}{w^b(i)w^a(S)} +   \sum_{i\in [n] \backslash (\Ss^a_K \cup \Ss_K^b) } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^a(i)w^b(S)}{w^b(i)w^a(S)}\\
&= \sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{v+\delta}{w^a(S)} \log \frac{(v+\delta)\cdot w^b(S)}{v\cdot w^a(S) } + \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{v}{w^a(S)} \log\frac{v\cdot w^b(S)}{(v+\delta) \cdot w^a(S)} \\
&+ \sum_{i\in \Ss^a_K \cap \Ss_K^b } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^b(S)}{w^a(S)} +   \sum_{i\in [n] \backslash (\Ss^a_K \cup \Ss_K^b) } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^b(S)}{w^a(S)}\\
&= \sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{v+\delta}{w^a(S)} \log \frac{v+\delta}{v} + \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{v}{w^a(S)} \log\frac{v}{v+\delta} \\
&+\sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{v+\delta}{w^a(S)} \log \frac{w^b(S)}{w^a(S) } + \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{v}{w^a(S)} \log\frac{w^b(S)}{w^a(S)}\quad\quad(*) \\
&+ \sum_{i\in \Ss^a_K \cap \Ss_K^b } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^b(S)}{w^a(S)} +   \sum_{i\in [n] \backslash (\Ss^a_K \cup \Ss_K^b) } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^b(S)}{w^a(S)} \,.
\end{aligned}
\end{equation*}
In the second equality, we have made use of the fact that for any item $i$ in $[n] \backslash (\Ss_K^a\cup \Ss_K^b)$ or in $\Ss_K^a \cap \Ss_K^b$, $w^a(i) = w^b(i)$.
Focusing on the first two summations,
\begin{equation*}
\begin{aligned}
&\sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{v+\delta}{w^a(S)} \log \frac{v+\delta}{v} + \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{v}{w^a(S)} \log\frac{v}{v+\delta} \\
&= \bigg(\sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{v+\delta}{w^a(S)} - \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{v}{w^a(S)} \bigg) \log \frac{v+\delta}{v} \\
&= (2h+1){n-1\choose m-1} \Delta_{K,h}^{(m)} \log \frac{\delta+v}{v} \leq 3h{n-1\choose m-1}\Delta_{K,h}^{(m)} \log\frac{\delta+v}{v}\,.
\end{aligned}
\end{equation*}
The last equality comes from recognizing that $|\Ss_K^a \backslash \Ss_K^b| = |\Ss_K^b\backslash \Ss_K^a| = D_{01}(\Ss_K^a, \Ss_K^b) = 2h+1$. We now simplify the remaining four summation terms of $(*)$.
\begin{equation*}
\begin{aligned}
&\sum_{i\in \Ss_K^{a}\backslash \Ss_K^b} \sum_{S\in C_i^{(m)}} \frac{v+\delta}{w^a(S)} \log \frac{w^b(S)}{w^a(S) } + \sum_{i\in \Ss_K^b \backslash \Ss_K^a }\sum_{S\in C_i^{(m)}} \frac{v}{w^a(S)} \log\frac{w^b(S)}{w^a(S)} \\
&+ \sum_{i\in \Ss^a_K \cap \Ss_K^b } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^b(S)}{w^a(S)} +   \sum_{i\in [n] \backslash (\Ss^a_K \cup \Ss_K^b) } \sum_{S\in C_i^{(m)}} \frac{w^a(i)}{w^a(S)} \log\frac{w^b(S)}{w^a(S)}\\
&= \sum_{i\in [n]} \sum_{S\in \Cset_i^{(m)}} \frac{w^a(i)}{w^a(S)}\log \frac{w^b(S)}{w^a(S)}\\
&= \sum_{S\in \Cset^{(m)}}\sum_{i\in S}\frac{w^a(i)}{w^a(S)}\log \frac{w^b(S)}{w^a(S)}\\
&= \sum_{S\in\Cset^{(m)}} \log \frac{w^b(S)}{w^a(S)}\\
&= \log \frac{\prod_{S\in \Cset^{(m)}} w^b(S)  }{ \prod_{S\in\Cset^{(m)}} w^a(S)  }\\
&= \log 1 = 0\,.
\end{aligned}
\end{equation*}
The last equality comes from recognizing that the two models $a$ and $b$ only differ by the identities of the items while the weights are the same. Therefore, the two products in the log term are identical. We thus obtain
$$\kl(\Pr^a(\{Y\}) \rVert \Pr^b(\{Y\})) \leq 3hpR{n-1\choose m-1} \Delta_{K,h}^{(m)} \log\frac{\delta+v}{v}\,. \quad(**)$$
Following the same argument as in the proof of Lemma \ref{lem:kl-div}, we can bound
$$\log \frac{\delta+v}{v} \leq 4\frac{\Delta_{K,h}^{(m)}}{\tau_{K-h}^{(m)}} \,.$$
Substituting this into $(**)$ completes the proof.
\end{proof}

In short, the (matching) lower bound on the sample complexity for approximate top-$K$ ranking is
$$ \Omega\bigg(\frac{n\log n}{m{\Delta_{K,h}^{(m)}}} \cdot \big( 1+ \frac{\tau^{(m)}_{K+h+1}}{\Delta^{(m)}_{K,h}}\big) \bigg) \,.$$


\subsection{Effect of the menu size on the sample complexity}

\begin{reptheorem}{thm:mnl-variational-lower-bound} Consider an MNL model with $n\geq 2$ items and fix a $K$, we have
    \begin{equation*}
     \frac{1}{m\Delta_K^{(m)}} = \Theta\bigg( \frac{1}{e^{U_K}-e^{U_{K+1}}}\cdot \big(1 + \frac{1}{m-1}\big)^2  \bigg)\,,
    \end{equation*}
    \begin{equation*}
    \frac{\tau^{(m)}_{K+1}}{\Delta_K^{(m)}} = \Theta\bigg( \frac{e^{U_{K+1}}}{e^{U_K}-e^{U_{K+1}}}\cdot \big( 1+ \frac{1}{m-1}\big) \bigg) \,.
    \end{equation*}
\end{reptheorem}

\begin{proof}

\textbf{Part 1}. We will prove that there exists a positive valued function $l(m)$ such that $m\Delta_K^{(m)} < l(m)$ and that $l(m)$ monotonically increases with $m$, and at diminishing rate. Once we have obtained $l(m)$, we can set $f_1(m) = \frac{1}{l(m)}$. In fact, we will prove a slightly more general result concerning any pair of items $i, j$ where $w_i := e^{U_i} > e^{U_j} =: w_j$. Let $\Delta^{(m)}_{ij} = \tau_i^{(m)}- \tau_j^{(m)}$. Additionally, let $w_n := e^{U_n} = e^{U_{\min}}$ and $w_1 := e^{U_1} = e^{U_{\max}}$ be two model-dependent constants. We have
\begin{equation*}
\begin{aligned}
m\Delta^{(m)}_{ij} &= \frac{m}{{n-1\choose m-1}} \cdot \bigg(\sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{w_i}{w_i + w_j + \sum_{k\in S} w_k} -\frac{w_j}{w_i + w_j + \sum_{k\in S}w_k} \\
&+ \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{w_i}{w_i + \sum_{k\in S} w_k } - \frac{w_j}{w_j + \sum_{k\in S} w_k}   \bigg)\\
&= \frac{m(w_i -w_j)}{{n-1\choose m-1}} \bigg( \sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{1}{w_i + w_j + \sum_{k\in S} w_k} + \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{1}{w_j + \sum_{k\in S} w_k}\cdot  \frac{\sum_{k\in S} w_k}{w_i + \sum_{k\in S} w_k}  \bigg)\\
&\leq \frac{m(w_i -w_j)}{{n-1\choose m-1}} \bigg( {n-2\choose m-2} \cdot \frac{1}{w_i + w_j + (m-2)w_n} + {n-2\choose m-1} \frac{1}{w_j + (m-1)w_n} \cdot  \frac{(m-1)w_1}{w_i + (m-1)w_1}  \bigg)\\
&=  \frac{m(w_i -w_j)}{n-1} \bigg( (m-1) \cdot \frac{1}{w_i + w_j + (m-2)w_n} + (n-m)\frac{1}{w_j + (m-1)w_n} \cdot  \frac{(m-1)w_1}{w_i + (m-1)w_1}  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg(n\frac{m}{[w_j + (m-1)w_n]}\frac{(m-1)w_1}{[w_i + (m-1)w_1]} \\
&+ m(m-1)\cdot \bigg[ \frac{1}{w_i + w_j + (m-2)w_n} - \frac{mw_1}{[w_i + (m-1)w_1][w_j + (m-1)w_n]} \bigg]  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg(n\frac{m}{[mw_n + w_j -w_n]}\frac{(m-1)w_1}{[w_i + (m-1)w_1]} \\
&+ m(m-1)\cdot \frac{(w_1-w_i)(w_n - w_j)}{[(m-2)w_n+w_i+w_j][(m-1)w_n + w_j][(m-1)w_1+w_i]}  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg(n\frac{m}{[mw_n + w_j -w_n]}\frac{(m-1)w_1}{[w_i + (m-1)w_1]} \\
&+ \underbrace{\frac{(w_1-w_i)(w_n - w_j)}{(m-2)w_n+w_i+w_j}\cdot \frac{m}{mw_n + w_j -w_n} \cdot\frac{(m-1)}{[(m-1)w_1+w_i]}}_{\leq 0}  \bigg)\\
&\leq (w_i -w_j)\cdot \frac{n}{n-1} \cdot \frac{m}{[mw_n + w_j -w_n]}\frac{(m-1)w_1}{[w_i + (m-1)w_1]}\\
&=(w_i -w_j)\cdot \frac{n}{n-1} \cdot (1 - \frac{w_j/w_n-1}{m-1 + w_j/w_n}) \cdot (1-\frac{w_i/w_1}{m-1+w_i/w_1}) \,.
\end{aligned} 
\end{equation*}

It is easy to see that all of the terms containing $m$ are positive and \emph{increase} with $m$. Additionally, it is easy to show that all of these terms increase with \emph{decreasing} rate with respect to $m$.
Though complicated, the above upper bound can be somewhat simplified using big-$O$ notation.
$$ m\Delta^{(m)}_{ij} = O\bigg( (w_i-w_j) \cdot (1-\frac{1}{m})^2 \bigg)\,. $$
By modifying the above argument, we can also show a variational lower bound on $m\Delta^{(m)}_{ij}$.

\begin{equation*}
\allowdisplaybreaks
\begin{aligned}
m\Delta^{(m)}_{ij} &= \frac{m}{{n-1\choose m-1}} \cdot \bigg(\sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{w_i}{w_i + w_j + \sum_{k\in S} w_k} -\frac{w_j}{w_i + w_j + \sum_{k\in S}w_k} \\
&+ \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{w_i}{w_i + \sum_{k\in S} w_k } - \frac{w_j}{w_j + \sum_{k\in S} w_k}   \bigg)\\
&= \frac{m}{{n-1\choose m-1}} \cdot \bigg(\sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{w_i - w_j}{w_i + w_j + \sum_{k\in S} w_k} + \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{w_i}{w_i + \sum_{k\in S} w_k } - \frac{w_j}{w_j + \sum_{k\in S} w_k}   \bigg)\\
&= \frac{m}{{n-1\choose m-1}} \cdot \bigg(\sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{w_i - w_j}{w_i + w_j + \sum_{k\in S} w_k} + \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{(w_i-w_j)(\sum_{k\in S} w_k)}{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k)}   \bigg)\\
&= \frac{m(w_i -w_j)}{{n-1\choose m-1}} \bigg( \sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{1}{w_i + w_j + \sum_{k\in S} w_k} + \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{1}{w_i + \sum_{k\in S} w_k}\cdot  \frac{\sum_{k\in S} w_k}{w_j + \sum_{k\in S} w_k}  \bigg)\\
&[\text{Using the fact that } \frac{\sum_{k\in S} w_k }{w_j + \sum_{k\in S} w_k} \geq \frac{\sum_{k\in S} w_n }{w_j + \sum_{k\in S} w_n} \,\forall S: j \notin S] \\
&\geq \frac{m(w_i -w_j)}{{n-1\choose m-1}} \bigg( \sum_{S\in \Cset^{(m)}_i \cap \Cset^{(m)}_j} \frac{1}{w_i + w_j + \sum_{k\in S} w_k} + \sum_{S' \in \Cset^{(m-1)}: i, j\notin S} \frac{1}{w_i + \sum_{k\in S} w_k}\cdot  \frac{(m-1) w_n}{w_j + (m-1) w_n}  \bigg)\\
&\geq \frac{m(w_i -w_j)}{{n-1\choose m-1}} \bigg( {n-2\choose m-2} \frac{1}{w_i + w_j + (m-2) \bar w} + {n-2\choose m-1} \frac{1}{w_i + (m-1)\bar w}\cdot  \frac{(m-1) w_n}{w_j + (m-1) w_n}  \bigg)\,.
\end{aligned}
\end{equation*}

where $\bar w = \frac{1}{n-2}\sum_{k\neq i, j} w_k$. The last inequality comes from applying Jensen's inequality. More precisely, one could treat the sum $\sum_{k\in S} w_k$ as a random variable and the summation as the unnormalized `expectation' over the uniform distribution of the menus. As $\frac{1}{x}$ is a convex function in $x$ for positive $x$, Jensen's inequality applies. Furthermore, without loss of generality, one can assume that $\bar w = 1$ (via scaling of the weights). Consequently, $w_n \leq 1$. Continuing with the expansion gives us

\begin{equation*}
\allowdisplaybreaks
\begin{aligned}
m\Delta^{(m)}_{ij} &\geq \frac{m(w_i -w_j)}{{n-1\choose m-1}} \bigg( {n-2\choose m-2} \frac{1}{w_i + w_j + (m-2) \bar w} + {n-2\choose m-1} \frac{1}{w_i + (m-1)\bar w}\cdot  \frac{(m-1) w_n}{w_j + (m-1) w_n}  \bigg)\\
&=  \frac{m(w_i -w_j)}{n-1} \bigg(  \frac{m-1}{w_i + w_j + (m-2)} + \frac{n-m}{w_i + (m-1)}\cdot  \frac{(m-1) w_n}{w_j + (m-1) w_n}  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg(  \frac{m(m-1)}{w_i + w_j + (m-2) } + \frac{m(n-m)}{w_i + (m-1)}\cdot  \frac{(m-1) w_n}{w_j + (m-1) w_n}  \bigg)\\
&\geq \frac{(w_i -w_j)}{n-1} \bigg(  \frac{m(m-1)}{w_i + w_j + (m-2)} + \frac{(m-1)(n-m)}{w_i + (m-1)}\cdot  \frac{(m-1) w_n}{w_j + (m-1) w_n}  \bigg)\\
&\geq \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} + \frac{m(m-1)}{w_i + w_j + (m-2)} - \frac{m(m-1)(m-1)w_n}{(w_i + m-1)(w_j + (m-1)w_n)}   \bigg)\\
&\geq \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} \\
&+m(m-1)\bigg( \frac{1}{w_i + w_j + (m-2)} - \frac{(m-1)w_n}{(w_i + m-1)(w_j + (m-1)w_n)} \bigg)  \bigg)\\
&[\text{Expanding on the difference term}]\\
&\geq \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} \\
&+m(m-1)\bigg( \frac{1}{w_i + w_j + (m-1)} - \frac{(m-1)w_n}{(w_i + m-1)(w_j + (m-1)w_n)} \bigg)  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} \\
&+m(m-1)\cdot (w_iw_j + w_i(m-1)w_n + mw_j +m(m-1)w_n - w_j - (m-1)w_n - w_i(m-1)w_n - w_j(m-1)w_n - \\
&m(m-1)w_n+(m-1)w_n)/\big((w_i + w_j + (m-1))(w_i + m-1)(w_j + (m-1)w_n)\big) \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} \\
&+m(m-1)\cdot \frac{mw_j + w_iw_j - w_j - w_j(m-1)w_n}{(w_i + w_j + (m-1))(w_i + m-1)(w_j + (m-1)w_n)}  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} \\
&+m(m-1)w_j\cdot \frac{m + w_i - 1 - (m-1)w_n}{(w_i + w_j + (m-1))(w_i + m-1)(w_j + (m-1)w_n)}  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} + \frac{m(m-1)w_j((m-1)(1-w_n) + w_i)}{(w_i + w_j + (m-1))(w_i + m-1)(w_j + (m-1)w_n)}  \bigg)\\
&\geq  \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} + \frac{m(m-1)w_j((m-1)(1-w_n) + w_i)}{(w_i + w_j + m)(w_i + m-1)(w_j + (m-1)w_n)}  \bigg)\\
&= \frac{(w_i -w_j)}{n-1} \bigg( n\cdot \frac{m-1}{m-1+w_i} \cdot \frac{(m-1)w_n}{(m-1)w_n+w_j} + \underbrace{w_j\cdot \frac{m}{w_i+w_j+m}\cdot \frac{m-1}{w_j + (m-1)w_n}\cdot \frac{w_i + (m-1)(1-w_n)}{w_i+(m-1)}}_{\geq 0} \bigg)\\
&\geq \frac{(w_i -w_j)}{n-1} \bigg( n\cdot (1-\frac{w_i}{m-1+w_i}) \cdot (1-\frac{w_j}{(m-1)w_n+w_j}) \bigg) \\
&=(w_i -w_j)\cdot \frac{n}{n-1} \cdot (1-\frac{w_i}{m-1+w_i}) \cdot (1-\frac{w_j/w_n}{m-1+w_j/w_n})  \,.
\end{aligned}
\end{equation*}

We thus have $m\Delta^{(m)}_{ij} = \Omega\big((w_i - w_j) \cdot (1-\frac{1}{m})^2\big)$. Combining with the upper bound shown earlier, we get
$$m\Delta^{(m)}_{ij} = \Theta\bigg((w_i - w_j) \cdot (1-\frac{1}{m})^2\bigg) \,.$$

\textbf{Part 2.} We will prove that there exists a positive-valued function $l(m)$ such that $\frac{\Delta^{(m)}_{ij}}{\tau^{(m)}_{j}} \geq l(m)$ and that $l(m_2) > l(m_1)$ for $m_2 > m_1 \geq 2$. Consider any two items $i, j$ such that $w_i > w_j$. By definition,
\begin{equation*}
\begin{aligned}
\frac{\Delta^{(m)}_{ij}}{\tau_{j}^{(m)}} &= \frac{\sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{w_i - w_j}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_i}{w_i + \sum_{k\in S} w_k } -  \frac{w_j}{w_j + \sum_{k\in S} w_k }  }{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{w_j}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_j}{w_j + \sum_{k\in S} w_k } }\\
&= \frac{\sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{w_i - w_j}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{(w_i-w_j)(\sum_{k\in S}w_k )}{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k) }  }{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{w_j}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_j}{w_j + \sum_{k\in S} w_k } }\\
&= \frac{\sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{1}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{\sum_{k\in S}w_k }{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k) }  }{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{1}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{1}{w_j + \sum_{k\in S} w_k } } \cdot \frac{w_i - w_j}{w_j}\\
&= (1- \frac{\sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_j }{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k) }}{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{1}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{1}{w_j + \sum_{k\in S} w_k }}) \cdot \frac{w_i - w_j}{w_j}\,.
\end{aligned}
\end{equation*}

The goal is to upper (and lower) bound the quantity
$$ \frac{\sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_j }{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k) }}{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{1}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{1}{w_j + \sum_{k\in S} w_k }} \,.$$
For a lower bound, we can show that
\begin{equation*}
\begin{aligned}
&\frac{\sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_j }{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k) }}{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{1}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{1}{w_j + \sum_{k\in S} w_k }}\\
&\geq \frac{{n-2\choose m-1} \frac{1}{(m-1)w_1+w_i} \frac{w_j}{w_j + (m-1)w_1} }{{n-2\choose m-2}\cdot \frac{1}{(m-2)w_n + w_i + w_j}  + {n-2\choose m-1}\cdot \frac{1}{(m-1)w_n + w_j}   }\\
&= \frac{ (n-m)\cdot \frac{1}{(m-1)w_1+w_i} \frac{w_j}{w_j + (m-1)w_1} }{(m-1)\cdot \frac{1}{(m-2)w_n + w_i + w_j}  + (n-m)\cdot \frac{1}{(m-1)w_n + w_j}   }\\
&[\text{Noting that } (m-2)w_n + w_i + w_j = (m-1)w_n + w_j + (w_i - w_n) \geq (m-1)w_n + w_j ]\\
&\geq \frac{ (n-m)\cdot \frac{1}{(m-1)w_1+w_i} \frac{w_j}{w_j + (m-1)w_1} }{(m-1)\cdot \frac{1}{(m-1)w_n + w_j}  + (n-m)\cdot \frac{1}{(m-1)w_n + w_j}   }\\
&= \frac{n-m}{n-1}\cdot \frac{1}{(m-1)w_1+w_i} \cdot \frac{(m-1)w_n + w_j}{(m-1)w_1+w_j}\\
&\geq \frac{n-m}{n-1}\cdot \frac{1/w_1}{m-1+w_i/w_1} \cdot \frac{w_n}{w_1}\,.
\end{aligned}
\end{equation*}
For an upper bound, we can show that
\begin{equation*}
\begin{aligned}
&\frac{\sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{w_j }{(w_i + \sum_{k\in S} w_k)(w_j + \sum_{k\in S} w_k) }}{ \sum_{S\in \Cset_i^{(m)} \cap \Cset_j^{(m)}} \frac{1}{\sum_{k\in S} w_k } +  \sum_{S\in \Cset^{(m-1)}: i,j \notin S} \frac{1}{w_j + \sum_{k\in S} w_k }}\\
&\leq \frac{{n-2\choose m-1} \frac{1}{(m-1)w_n+w_i} \frac{w_j}{w_j + (m-1)w_n} }{{n-2\choose m-2}\cdot \frac{1}{(m-2)w_1 + w_i + w_j}  + {n-2\choose m-1}\cdot \frac{1}{(m-1)w_1 + w_j}   }\\
&= \frac{ (n-m)\cdot \frac{1}{(m-1)w_n+w_i} \frac{w_j}{w_j + (m-1)w_n} }{(m-1)\cdot \frac{1}{(m-2)w_1 + w_i + w_j}  + (n-m)\cdot \frac{1}{(m-1)w_1 + w_j}   }\\
&[\text{Noting that } (m-2)w_1 + w_i + w_j = (m-1)w_1 + w_j + (w_i - w_1) \leq (m-1)w_1 + w_j ]\\
&\leq \frac{ (n-m)\cdot \frac{1}{(m-1)w_n+w_i} \frac{w_j}{w_j + (m-1)w_n} }{(m-1)\cdot \frac{1}{(m-1)w_1 + w_j}  + (n-m)\cdot \frac{1}{(m-1)w_1 + w_j}   }\\
&= \frac{n-m}{n-1}\cdot \frac{1/w_n}{m-1+w_i/w_n} \cdot \frac{w_1}{w_n}\,.
\end{aligned}
\end{equation*}
It is straightforward to check that both the upper and the lower bound decrease with increasing $m$, and at a decreasing rate. Furthermore, they have the same asymptotic dependency on $m$. Putting both bounds together, we have
$$ \frac{\Delta^{(m)}_{ij}}{\tau^{(m)}_{j}} = \Theta\bigg( (1-\frac{n-m}{n-1}\cdot \frac{1}{m}) \cdot \frac{w_i - w_j}{w_j} \bigg) = \Theta\bigg( \frac{n}{n-1}\cdot (1-\frac{1}{m}) \cdot \frac{w_i - w_j}{w_j} \bigg) \,. $$
Simplifying the above expression by removing the dependency on $n$ gives
$$\frac{\Delta^{(m)}_{ij}}{\tau^{(m)}_{j}} = \Theta\bigg( (1-\frac{1}{m}) \cdot \frac{w_i - w_j}{w_j} \bigg)\,. $$
This completes the proof.
\end{proof}

\subsection{Connection among MNL-MLE, Generalized Borda Count and Spectral Ranking}

In this section, we will prove the theorems about the connections among the three algorithms. Here, we provide more general results pertaining to ranking output (as opposed to just top $K$ estimate).

\begin{reptheorem}{thm:borda-mle-connection} Consider the sampling model described in Section \ref{subsect:sampling-model}, for any $p > 0$, in the limit as $R\rightarrow \infty$, MNL-MLE and choice-based Borda count will produce the same top $K$ estimate. Moreover, this holds even if the data does not come from the MNL model or any IID-RUM.
\end{reptheorem}

\begin{proof} 
\textbf{Notation:} As a shorthand notation, we will use $p(i|S)$ to denote the probability of item $i$ being chosen from menu $S$. Note that this probability does not necessarily follow any parametric RUM. Let $\text{ord}$ denote the ordering function whose input is a set of real numbers and whose output is the full ordering of the indices of those numbers in increasing order.
Under our sampling model, in the limit as $R\rightarrow \infty$, all the possible menus in $\Cset^{(m)}$ are observed given that $p > 0$. Furthermore, given infinite data, the observed probability becomes exact. Consider the log likelihood function
$$ \Loss(\mb U) =\sum_{i=1}^n \bigg(\sum_{S\in \Cset_i^{(m)}} p(i|S) \cdot \log \frac{e^{U_i}}{\sum_{j\in S} e^{U_j}}\bigg) \,.$$

The derivative of the log likelihood with respect to individual partworth parameter is given as
\begin{equation*}
\begin{aligned}
\nabla_{U_i}\Loss(\mb U) &=  \sum_{S\in \Cset_i^{(m)}} \bigg( -\sum_{j\in S, j\neq i} p(j|S) \frac{e^{U_i}}{\sum_{k\in S} e^{U_k}} -  p(i|S) \frac{e^{U_i}}{\sum_{k\in S} e^{U_k}} + p(i|S) \bigg) \\
&= \sum_{S\in \Cset_i^{(m)}} \bigg( p(i|S) - \frac{e^{U_i}}{\sum_{k\in S} e^{U_k} }   \bigg)\,.
\end{aligned}
\end{equation*}
As the log likelihood function is concave in $U$, the MLE estimate $\hat {\mb U}$ must satisfy:
$$ \sum_{S\in \Cset_i^{(m)}} \frac{e^{\hat U_i}}{\sum_{j\in S} e^{\hat U_j}} = \sum_{S\in \Cset_i^{(m)}}  p(i|S) \quad \forall i \,.$$
On the other hand it can easily be shown that for any two items $i\neq j$,
$$ \hat U_i > \hat U_j \Rightarrow \sum_{S\in \Cset_i^{(m)}} \frac{e^{\hat U_i}}{\sum_{k\in S}e^{\hat U_k}} > \sum_{S\in \Cset_j^{(m)}} \frac{e^{\hat U_j}}{\sum_{k\in S}e^{\hat U_k}} \,.$$
Hence, 
$$ \text{ord}(\{\hat U_i\}_{i=1}^n) = \text{ord}(\{\sum_{S\in \Cset_i^{(m)}}\frac{e^{\hat U_i}}{\sum_{k\in S} e^{\hat U_k} } \}_{i=1}^n ) = \text{ord}(\{\sum_{S\in \Cset_i^{(m)}} p(i|S)\}_{i=1}^n ) \,. $$
Observing that the ordering given by the Borda Count algorithm is consistent with the ordering induced by $\{\sum_{S\in \Cset_i^{(m)}} p(i|S)\}_{i=1}^n$ completes the proof.
\end{proof} 

\begin{reptheorem}{thm:borda-mle-asr-connection} Consider the sampling model described in Subsection \ref{subsect:sampling-model}. Assume that the underlying choice model generating the data is in the class of IID-RUMs whose noise distribution has an absolutely continuous density function with support on the real line. For any $p > 0$, in the limit as $R\rightarrow \infty$, Spectral Ranking, MNL-MLE and choice-based Borda count produce the same top $K$ estimate.

On the other hand, there exists a choice model where in the limit as $R\rightarrow \infty$, the spectral ranking algorithm produces a different top $K$ estimate from MNL-MLE/Borda count.
\end{reptheorem}

\begin{proof} Following the argument in the proof of Theorem \ref{thm:borda-mle-connection}, given $p > 0$ as $R\rightarrow \infty$, all possible menus of size $m$ are observed and all choice probabilities are exact. For the first part of the theorem, it suffices to prove that under IID-RUMs, the Spectral Ranking algorithm is consistent in recovering the true ordering among the items, as this will be the same ordering as given by Generalized Borda Count/MLE.

The Spectral Ranking algorithm ranks the items by the stationary distribution of a Markov Chain constructed using choice data. For analysis purposes, we follow the construction due to \cite{negahban2017rank,maystre2015fast}. Fix a menu size $m \geq 2$ and consider the following Markov Chain where for any two items $i, j$,
$$ \M_{ij} =\begin{cases}\frac{1}{{n-1\choose m-1}} \sum_{S\in\Cset^{(m)}: i, j\in S} \rho(j|S)&\text{if } j\neq i \\
1 - \sum_{k\neq i} \M_{ik} &\text{if } j = i
\end{cases} \,.$$
Consider any two items $i, j$ such that $U_i > U_j$. Under the IID-RUM assumptions in the theorem statement, we have
$$ \rho(i|S) > \rho(j|S) \quad\forall S \in\Cset^{(m)}: i, j \in S $$
and
$$ \rho(i|S' \cup \{i\}) > \rho(j | S' \cup \{j\}) \quad \forall S' \in \Cset^{(m-1)}: i, j\notin S' \,.$$
It is easy to show that
$$ \M_{ki} > \M_{kj} \quad\forall k\neq i, j $$
and
$$ \M_{ii} > \M_{jj}\,. $$
Let $\pi$ denote the stationary distribution of the Markov Chain constructed above.
By definition, the stationary distribution of the Markov Chain satisfies
\begin{equation*}
\begin{aligned}
\pi_i &= \sum_{k\in [n]} \pi_k\cdot \M_{ki} > \sum_{k\in [n]} \pi_k \cdot \M_{kj} = \pi_j \,.
\end{aligned}
\end{equation*}

For the second half of the theorem, let us consider the pairwise comparison setting ($m=2$). We will construct a pairwise choice model such that MNL-MLE/Generalized Borda Count give a different ordering among the items from the Spectral Ranking algorithm in the limit as $R\rightarrow \infty$. Consider a universe of 4 items with the following pairwise choice probabilities. Note that $P_{ij} = P(j|\{i, j\})$. Define
$$P = \begin{bmatrix}
0.5 &0.6 &0.55 &0.55\\
0.2 &0.5 &0.85 &0.60\\
0.45 &0.40 &0.5 &0.95\\
0.45 &0.45 &0.15 &0.5\\
\end{bmatrix} \,.$$
It is easy to check that $\tau_4 > \tau_3 > \tau_2 > \tau_1$. In the limit of infinite data, MLE/Borda Count will output the ordering $4,3,2,1$ (best item first). However, the Spectral Ranking algorithm will give the ordering $4,2,3,1$. This completes the proof.\end{proof}


\newpage
\section{Additional experiments}\label{sect:experiment}

\subsection{Dataset descriptions}

Table~\ref{tab:dataset} shows the characteristics of the datasets used in our experiments: number of items, number of rankings available and whether the data contains partial rankings.
\begin{table}[H]
\centering
\begin{tabular}{|l|l|l|l|}
\hline
\multicolumn{1}{|c|}{Dataset} & \multicolumn{1}{c|}{$n$} & \multicolumn{1}{c|}{Num rankings} & \multicolumn{1}{c|}{Partial ranking?} \\ \hline
APA  &5      &$ 6000$  &No                                       \\ \hline
Sushi  &11      &$ 5000$   &No                                       \\ \hline
Irish-North   &12   &$ 44k$   &Yes   \\ \hline
Irish-West &9 &$ 30k$ & Yes \\ \hline
Irish-Meath &14 &$ 64k$  & Yes  \\ \hline
F1 &22 &18 &Yes                                       \\ \hline
\end{tabular}
\caption{\label{tab:dataset}Characteristics of the datasets used in our experiments.}
\end{table}

\textbf{Data preparation:} Given menu size $m$, to estimate choice probability from rankings and partial rankings, we first enumerate all the possible menus of size $m$. For each menu $S$, and for each $i\in S$, we count the number of rankings where $i$ is ranked ahead of all the other items in $S$, a `win' for $i$. For a partial ranking where there are $l$ items ranked as `equal', we count $\frac{1}{l}$-`win' for each item. Choice probability is obtained by normalizing the number of wins by the number of rankings.

\textbf{Experiments: } For each independent trial, we first generate all the data as specified by the maximum sample size. We then feed increasing portions of this data to the algorithm and check if the algorithm correctly identifies \emph{all} of the top $K$ items. For all experiments, we keep $R=100$ and adjust $p$ to obtain the appropriate expected sample size.

\subsection{Additional experimental results}

\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/apa_time_all.png}
\caption{
    \textbf{APA election dataset:} Average training time (seconds) against sample size for $m=2,3,4$.}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[scale=0.5]{figs/irish_north_time_all.png}
\caption{\textbf{Irish North dataset:} Average training time (seconds) against sample size for $m=2,4,8$.}
\end{figure}

\begin{figure}[H]
\centering
\includegraphics[scale=0.5]{figs/irish_west_time_all.png}
\caption{\textbf{Irish West dataset:} Average training time (seconds) against sample size for $m=2,4,6$.}
\end{figure}

\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/irish_meath_time_all.png}
\caption{\textbf{Irish Meath dataset:} Average training time (seconds) against sample size for $m=2,4,8$.}
\end{figure}
            
\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/sushi_time_all.png}
\caption{\textbf{SUSHI dataset:} Average training time (seconds) against sample size for $m=2,4,8$.}
\end{figure}

\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/f1_time_all.png}
\caption{\textbf{F1 dataset:} Average training time (seconds) against sample size for $m=2,4,8$. }
\end{figure}

\newpage
\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/apa_all_full.png}
\caption{
    \textbf{APA election dataset:} Exact top $K$ accuracy against sample size}

\label{fig:apa-experiment}
\end{figure}

\newpage
\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/sushi_all_full.png}
\caption{\textbf{SUSHI dataset:} Exact top $K$ accuracy against sample size}
\label{fig:sushi-experiment}
\end{figure}

\newpage
\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/irish_north_all_full.png}
\caption{\textbf{Irish North dataset:} Exact top $K$ accuracy against sample size}
\label{fig:irish-north-experiment}
\end{figure}

\newpage
\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/irish_west_all_full.png}
\caption{\textbf{Irish West dataset:} Exact top $K$ accuracy against sample size}
\label{fig:irish-west-experiment}
\end{figure}

\newpage
\begin{figure}[H]
\centering
    \includegraphics[scale=0.5]{figs/irish_meath_all_full.png}
\caption{\textbf{Irish Meath dataset:} Exact top $K$ accuracy against sample size}
\label{fig:irish-meath-experiment}
\end{figure}

\newpage
\begin{figure}[H]
\centering
    \includegraphics[scale=0.35]{figs/f1_all_full.png}
\caption{\textbf{F1 dataset:} Exact top $K$ accuracy against sample size. }
\label{fig:f1-experiment}
\end{figure}

\bibliography{nguyen_28-supp}

\end{document}
