\vspace{-10pt}
\section{Preliminaries and Setting}
\label{sec:prb_setup}


%\vspace{-5pt}
\begin{figure}[H]
%\begin{widepage}
%\hspace{-18pt}
\begin{subfigure}{0.23\textwidth}
%\includegraphics[trim={0cm 0cm 0cm 0cm},clip,scale=0.2,width=0.25\textwidth]{./figs/inp.JPG}
\includegraphics[width=1\linewidth]{./Plots/rg.png}
\caption{\label{fig:rg}Relation graph $G([n],E)$\\ associated to $\bU$ (Sec. \ref{sec:prb_set})}
\end{subfigure}
\hspace*{-1pt}
\begin{subfigure}{0.23\textwidth}
%\includegraphics[trim={0cm 0cm 0cm 0cm},clip,scale=0.2,width=0.25\textwidth]{./figs/inp.JPG}
\includegraphics[width=1\linewidth]{./Plots/bi_grph2.png}
\caption{\label{fig:bi_grph}The bipartite graph $C_M = (\cI(G) \cup M, \Delta_{M})$ (Thm. \ref{thm:halls-equations})}
\end{subfigure}
%\hspace{-33pt}
\vspace*{-5pt}
\caption{\label{fig:algo_dem}A few graphical demonstrations}
%\hspace{-40pt}
\vspace{-2pt}
%\end{widepage}
\end{figure}
\vspace{-10pt}

\textbf{Notations.} We use lowercase boldface letters for vectors, uppercase boldface letters for matrices, lowercase letters for scalars and uppercase letters for constants. $\|\cdot\|_2$ denotes the $\ell_2$ norm for vectors and spectral norm for matrices. $\|\cdot\|_F$ denotes the Frobenius norm for matrices. We denote the set $\{1,\ldots,n\}$ by $[n]$. For any matrix $\mathbf A \in \mathbb R^{m \times n}$, we abbreviate $A_{ij} = A(i,j)$.
%We abbreviate `without loss of generality' as w.l.o.g.
%For a square matrix $\mathbf A \in \mathbf R^{n \times n}$, we denote by $\lambda_{\max}(\mathbf A)$ the magnitude of the largest eigenvalue of $\mathbf A$.

\textbf{Bradley-Terry-Luce (BTL) model.} (\cite{bradleyTe1952}, \cite{luce05})
A standard probabilistic model for pairwise comparisons is the Bradley-Terry-Luce (BTL) model where the probability of preferring item $i$ over $j$ is given by: 
$
    P_{ij} = \frac{\exp(\theta_i)}{\exp(\theta_i) + \exp(\theta_j)},
$
%$
%    P_{ij} = \frac{\theta_i}{\theta_i + \theta_j},
%$
$\btheta \in \bR^n$ being the `\emph{score vector}' of the $n$ items.

\vspace{-10pt}
\subsection{Problem Setting}
\label{sec:prb_set}
\vspace*{-5pt}
Let $[n] = \{1, \ldots, n\}$ be the set of items to be ranked, with feature vectors $\mathbf{U} = \{\bu_1, \ldots, \bu_n\} \subset \bR^d$, where $\bU$ respects a \emph{relation graph} $G([n],E)$ as follows:

\textbf{Feature ($\bU$) vs Relation graph ($G$).} 
We assume the feature set $\bU$ of the $n$ items is associated with an underlying relation graph $G([n],E)$ by a natural assumption: $G([n],E)$ is such that there exists an independent set of $G$, say $\cI(G)$, such that the set of item features $\bU = \{\bu_i\}_{i \in [n]}$ lies in the linear span of the features of $\cI(G)$ alone, i.e., of $\{\bu_i\}_{i \in \cI(G)}$. More formally,

\begin{equation}
\label{eqn:basis}
    \bu_i = \displaystyle \sum_{j \in \mathcal{I}(G) \cap \bar N_G(i)} B_{ij}\bu_j ~~\forall i \in [n],
\end{equation}

where $N_G(i) = \{j \in [n] \mid (i,j) \in E\}$ denotes the set of neighboring nodes of $i$ in $G$, and $\bar N_G(i) = N_G(i) \cup \{i\}$. Here $\bB \in \bR^{n \times \alpha}$ is a \emph{coefficient matrix} that expresses $\bU$ in terms of the basis features $\{\bu_i\}_{i \in \cI(G)}$.
Note, we also assume $\bB$ is such that any $\alpha \times \alpha$ submatrix of $\bB$ is of rank $\alpha$, which ensures none of the dependent features can be represented as a linear combination of the other dependent features, or precisely $\cI(G)$ is a maximal independent set of the independent nodes and all the dependent items $[n]\setminus \cI(G)$, can only be represented as a unique linear combination of the independent nodes $\cI(G)$. 
Thus we assume $B_{ij} = 0$ whenever $j \notin \bar N_G(i)$, and the subset of vectors in $\mathbf{U}$ corresponding to the items in the independent set $\mathcal{I}(G)$ is \emph{linearly independent}. Thus $d \ge \alpha$, and $\{\bu_i\}_{i \in \cI(G)}$ form a basis for $\mathrm{span}(\bU)$. Hence, for every $i \in [\alpha]$, $B_{ij} = 1$ if $j = i$ and $B_{ij} = 0$ for all $j \neq i$.
We denote $\alpha = |\mathcal{I}(G)|$; clearly, it becomes the independence number of $G$ if $\cI(G)$ corresponds to a maximum independent set. We will henceforth assume $\cI(G) = [\alpha]$, w.l.o.g., unless specified otherwise (see Fig.~\ref{fig:rg} for further illustration).

%\vspace*{-8pt}
%%\hspace*{-20pt}
%\begin{figure}[ht]
%\begin{center}
%\includegraphics[scale=0.3]{./rg.png} 
%\vspace*{-10pt}
%\caption{Relation graph $G([n],E)$ associated to $\bU$}
%\label{fig:matrix}
%\end{center}
%\end{figure}
%%\hspace*{-20pt}
%\vspace*{-8pt}



%We will assume (without loss of generality) the set $[\alpha]$ to be $\cI(G)$. When there are multiple independent sets of $G$, we arbitrarily fix one. 

\textbf{Preference Model.} We introduce the \emph{feature Bradley--Terry--Luce model} (f-BTL) where the probability of preferring item $i$ over $j$ is given by:
%$P_{ij} = \frac{{\textbf{w}^{T}\bu_i}}{ {\textbf{w}^{T}\bu_i} + {\textbf{w}^{T}\bu_j}},$
$P_{ij} = \dfrac{e^{(\textbf{w}^{T}\bu_i)}}{ e^{(\textbf{w}^{T}\bu_i)} + e^{(\textbf{w}^{T}\bu_j)}},$
 where $\textbf{w} \in \mathbb{R}^d$. Note that the f-BTL model reduces to the standard BTL model when $\alpha = n$ and $\mathbf{U}$ is the standard basis. Clearly the `\emph{score vector}' $\btheta \in \bR^{n}$ for the f-BTL model turns out to be $\theta_i = \bw^T\bu_i$.

\textbf{Sampling Model.} We assume that a set $M$ of $m \in [\binom{n}{2}]$ pairs is generated where each pair is chosen with some probability $p \in [0,1]$. Each pair in $M$ is compared $K$ times independently according to the f-BTL model.

\begin{rem}
Equation~\eqref{eqn:basis} shows that two items with similar sets of neighbours in $\cI(G)$ are similar in terms of their features. This, along with the f-BTL model, ensures that two similar items are also similar in their scores $\theta_i$, and hence in their rankings.
\end{rem}

\textbf{Problem.} Under the f-BTL model and given $\bU$, for what values of $m$ and $K$ can one find an estimated score vector $\hat{\btheta}$ such that
$ P\Big(\frac{\|\btheta - \hat{\btheta}\|_2}{\|\btheta\|_2} \le \epsilon \Big) > 1 - \delta ~?$ Here $\epsilon > 0$ and $\delta \in [0,1]$ are two given problem parameters, which respectively denote the allowable error limit and performance confidence. 

\textbf{Performance Error.}
%{\color{red} Add PD discussion.} 
The error above is measured in terms of the \emph{normalized 
$\ell_2$ error} $\frac{\|\btheta - \hat{\btheta}\|_2}{\|\btheta\|_2}$, which is a natural performance measure for score based probability models (e.g., BTL, Thurstone, etc.) \cite{negahban+12,borkarNi16}. 
Moreover, it is actually suitable for measuring ranking performances as it upper bounds the pairwise disagreement error -- the weighted Kendall-Tau loss \cite{negahban+12}:
\begin{align*}
\hspace{-3pt} pd(\btheta,\bhtheta) \hspace{-2pt} = \hspace{-2pt} \Bigg(\hspace{-3pt}\frac{1}{n\|\btheta\|_2^2} &\sum_{i < j} \hspace{-3pt} \big( \theta_i - \theta_j \big)^2\hspace{-3pt}\mathbf{1}\big((\theta_i - \theta_j)(\hat \theta_i - \hat \theta_j) \hspace{-3pt} < \hspace{-3pt}0 \big) \hspace{-3pt}\Bigg)^{\hspace{-4pt} \frac{1}{2}}
\end{align*} 
Thus giving an $(\epsilon,\delta)$ guarantee for the normalized $\ell_2$-error also ensures the same for $pd(\btheta,\bhtheta)$. We use both of the losses in our experimental evaluations (Section \ref{sec:expts}).

\subsection{Related Works}
\label{sec:relworks}

Ranking from pairwise comparisons has been studied extensively in various disciplines owing to its huge practical importance; reviewing all of this literature lies beyond the scope of this work. We review only the works most relevant to our setting. The most related work is 
\cite{niranjanRa17}; however, they assume the features to lie in some low dimensional space and use a matrix completion-based approach to predict the ranking. Note that the low-rank assumption is a \emph{global} assumption on the features that might miss out completely on the exact dependencies among the items. 
\cite{chiang+17} also consider a feature preference information model, but do not analyze the graph theoretic aspects of feature dependencies.% and their guarantees are again w.r.t the feature dimension $d$, whereas ours is in terms of number of 'independent items' $\alpha$ and thus much tighter when $d >> \alpha$. %Furthermore, our graph theoretic analysis also captures the interaction between the I(G) and non-I(G) nodes explicitly (through Thm 3.2), which they cannot.
%As we will see, the set of features in a low dimensional space might give rise to very different type of relation graphs which may lead to very different sample complexity bounds that our analysis will capture while theirs does not.   
\cite{gleichLek11}, \cite{borkarNi16} also use a least squares-based approach, but without any feature information. \cite{negahban+12,wauthier+13,busaHu14,rajkumarAg14,shahWa15} \cite{chenJo16}, \cite{rajkumarAg16}, \cite{shah+16} work in the pairwise ranking setting under different probabilistic models (including BTL model), but again none of them use features explicitly and hence are sub-optimal for our setting (as we will see in the experiments).
\cite{jamiesonNo11} work in a setting where the probabilities come from some unknown low-dimensional feature embedding of the items. However, they require the pairs to be queried actively, whereas our work focuses on random (passive) selection of pairs.  There is also a rich ranking literature on noisy sorting \cite{bravermanMo08}, approximation algorithms \cite{ailon08}, dueling bandits \cite{yue12} etc., which are fundamentally different from the passive setting under the BTL model considered here.
Table \ref{tab:sum_con} summarizes the sample complexities of a few related works.

%Table \ref{tab:sum_con} summarizes the sample complexities of a few related works.



Previous results show that under the standard BTL model, the \emph{Rank Centrality} \cite{negahban+12} \cite{rajkumarAg16}, \emph{MLE} under the BTL model \cite{shahWa15} and the \emph{Least Squares} \cite{borkarNi16} algorithms need $O(n\log n)$ comparisons to achieve a small error with probability at least $1 - O(\text{poly}(1/n))$. However, these algorithms do not consider the features explicitly. The \emph{Feature Low Rank} model of \cite{niranjanRa17}  uses features but requires $O(d^2\log(n))$ pairs to be compared. 
Another related work is \cite{kadioglu2021sample}, which proposes an estimator for the parameters of a generalized linear parametric model, which includes classical preference models like Bradley-Terry and Thurstone. By addressing the violation of independence, they prove a sample complexity guarantee, showing that with Gaussian-distributed features, the estimator converges to a rescaled version of the model parameters based on the ambient dimension, number of samples, and comparisons. Their results indicate that achieving an accuracy \(\epsilon > 0\) in model parameters requires \(\Omega(dn \log^3 n / \epsilon^2)\) comparisons when the number of samples is \(\Omega(d / \epsilon^2)\), which they validate through experiments on synthetic data.
%
We show our proposed fBTL-LS algorithm requires only $O(\alpha\log \alpha)$ samples. 

%\vspace*{-2pt}
%\iffalse
\begin{table}[h]
\vspace*{-4pt}
%\hspace*{-50pt}
\begin{center}
\scalebox{0.75}{
\begin{tabular}{|c|c|c|}
\hline
\textbf{Ranking} & \textbf{Sampling}  & \textbf{Sample} \\
\textbf{Model} & \textbf{Technique}  & \textbf{Complexity} \\
\hline
 Noisy permutation \cite{bravermanMo08} & Active  & $O(n \log n)$ \\
\hline
 Low $d$-dimensional embedding \cite{jamiesonNo11} & Active  & $O(d \log^2 n)$ \\
\hline
 Deterministic tournament \cite{ailon08} & Active  & $O(n \text{poly}(\log n))$\\
\hline
 Rank-$r$ preference with $\nu$ incoherence \cite{gleichLek11} & Passive  & $O(n\nu r(\log n)^2)$ \\
\hline
 Bradley Terry Luce (BTL) \cite{negahban+12} & Passive  & $O(n \log n)$\\
\hline
 Noisy permutation \cite{wauthier+13} & Passive & $O(n \log n)$\\
\hline
 Low $r$-rank pairwise preference \cite{rajkumarAg16} & Passive & $O(nr \log n)$\\
\hline
 Low $d$-rank feature with BTL \cite{niranjanRa17} & Passive & $O(d^2 \log n)$\\
\hline
Rank aggregation balancing features \cite{chiang+17} & Passive & $O(n)$\\
\hline
\textbf{f-BTL} ($\alpha$ `independent items') [This work] & Passive  & $O(\alpha \log \alpha)$\\
\hline
\end{tabular}}
\vspace*{-10pt}
    \caption{State-of-the-art vs Our work}
\label{tab:sum_con}
\end{center}
\end{table}
\vspace*{-10pt}