\vspace*{-10pt}

\vspace*{-5pt}
\begin{figure*}[t]
\vspace{-5pt}
\hspace*{-25pt}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-D-Clique(10).png}
\hspace{-0pt}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-D-Clique(10)_pd.png}
\hspace{-0pt}
\includegraphics[trim={0.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-D-Clique(10)_sc.png}
\includegraphics[trim={0.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-d-Regular(10).png}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-d-Regular(10)_pd.png}
\hspace{-0pt}
\includegraphics[trim={0.5cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-d-Regular(10)_sc.png}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-Tree(2).png}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-Tree(2)_pd.png}
\includegraphics[trim={0.7cm 0 3.5cm 0},clip,scale=0.3,width=0.2\textwidth]{./Plots/N-Plots/Vary-N-on-Tree(2)_sc.png}
\vspace{-8pt}
\caption{Performance vs $n$ on (1) $10$-disconnected clique, (2) $10$-regular graph, and (3) full binary tree}
%\setlength{\textfloatsep}{-25pt}
\label{fig:vs_n}
\hspace{-10pt}
\end{figure*}
\vspace*{-0pt}

\section{Experiments}
\label{sec:expts}
\label{SEC:EXPTS}
\vspace{-10pt}
%We now describe the experiments we ran with the fBTL-LS algorithm on various synthetic and real-world datasets. 
We compared our algorithm \fbtl\, with three state-of-the-art methods: (i) \emph{Ordinary Least Squares (\ols)} \cite{borkarNi16}, (ii) \emph{Rank Centrality (\rc)} \cite{negahban+12} and (iii) \emph{Inductive Pairwise Ranking} based on inductive matrix completion (\ipr) \cite{niranjanRa17}. The first two algorithms do not use any feature information while the third algorithm does. %Both the OLS and RC algorithms are guaranteed to work well for the standard BTL model with $O(n\log(n))$ pairs compared while the IPR algorithm requires $O(d^2\log(n))$ pairs to be compared for its guarantees to hold good (for a generalized BTL model that they define), $d$ being the feature dimension.   

\textbf{Performance Measures:}
%We measure the performance of algorithms using the following metrics
%\begin{enumerate}
\textbf{1. Normalized $\ell_2$-error:} For experiments where there is a true score vector, we use the normalized $\ell_2$ error between the estimated score vector and the true score vector $\Big(\frac{\|\bhtheta - \btheta\|}{\|\btheta\|}\Big)$.

\textbf{2. Pairwise disagreement (pd) error:}
Let $\bP^* \in [0,1]^{n \times n}$ denote the pairwise preference matrix corresponding to the true (and unknown) score vector $\btheta$, given by $P^*_{ij} = \frac{e^{\theta_i}}{e^{\theta_i}+e^{\theta_j}}$, and let $\hat \bP \in [0,1]^{n \times n}$ be the estimated preference matrix returned by the algorithm (note that if the algorithm returns a score vector estimate $\bhtheta$, we compute $\hat{P}_{ij} = \frac{e^{\hat \theta_i}}{e^{\hat \theta_i}+e^{\hat \theta_j}} ~\forall i,j \in [n]$). The pd-error is then defined as:
$%\begin{align*}
\mbox{pd}(\hat \bP,\bP^{*}) = \frac{2}{n(n-1)} \sum_{i < j} \big( \mathbb{I}(\hat P_{ij}  \ge 0.5 \wedge P^*_{ij} < 0.5)
 +\mathbb{I}(\hat P_{ij} < 0.5 \wedge P^*_{ij} > 0.5) \big)
$
%\vspace*{-20pt}
\begin{rem} 
This counts the
fraction of pairs where $\bP^*$ and $\hat \bP$ disagree -- the \emph{Kendall's Tau ranking loss} \cite{KumarVa10,negahban+12} between true and estimated ranking ($\btheta$ and $\bhtheta$).
\end{rem}
\vspace*{-2pt}
%& + \frac{1}{2n^2}\sum_{i \neq j} \mathbb{I}(\hat P_{ij} = 0.5 \wedge P^*_{ij} \neq 0.5) 
\textbf{3. Sample complexity (sc$(\epsilon)$):} Minimum number of pairwise comparisons required to be observed to obtain normalized $\ell_2$-error $\Big(\frac{\|\bhtheta - \btheta\|}{\|\btheta\|}\Big) < \epsilon$.
%\end{enumerate}

\vspace*{-2pt}
\subsection{Experiments on Synthetic Datasets}
\label{sec:expt_synth}
\vspace*{-3pt}
%We evaluate the four algorithms on three different type of graphs with respect to the above performance measures. 
We consider three different settings---\textit{Type-I plots}: with increasing node size $(n)$, \textit{Type-II plots}: with increasing sampling rate $(p)$ but fixed node size $(n)$ and independence number $(\alpha)$, and \textit{Type-III plots}: with increasing independence number $(\alpha)$ and fixed node size $(n)$.% Experimental setups and results are provided below.

\textbf{Graphs used.} We use $3$ different graphs for synthetic experiments: $(1)$ $r$-disconnected cliques: union of $r$ disconnected cliques, $(2)$ $d$-regular graphs: graphs with each node having degree $d$, and $(3)$ $k$-ary trees: trees with every node having $k$ children (except the leaf nodes).

\textbf{Data generation.} For each of the above types of graphs $G$, we first fix a maximum independent set $\cI(G)$ of $G$, and embed the $i^{th}$ node of $\cI(G)$ with the $i^{th}$ canonical
%\vspace*{-20pt}
basis vector of $\bR^{\alpha}$, i.e. $\bu_i = \mathbf e_i, \, \forall i \in [\alpha]$. %, denoted by $\mathbf e_i \in \{0,1\}^{\alpha}$ with $e_i(j) = 1$, if $i = j$ and $0$ otherwise $\forall i,j \in [\alpha]$, $\alpha = |\cI(G)|$.
 Thus our feature dimension is $d = \alpha$. We next generate a random coefficient matrix $\bB \in \bR^{n \times \alpha}$ and obtain the feature embedding $\{ \bu_i \}_{i =1}^{n} \subset \bR^\alpha$ of the rest of the items using \eqref{eqn:basis}. We then choose a random vector $\bw \in \bR^{\alpha}$ and assign a BTL score $\theta_i = \bw^T \bu_i$ to every node $i \in [n]$ as defined in \eqref{eqn:scores}. Finally, $\btheta = (\theta_1,\theta_2,\ldots,\theta_n)$ is normalized to unit $\ell_2$-norm, i.e. $\|\btheta\|_2 = 1$, by setting $\theta_i \leftarrow \frac{\theta_i}{\|\btheta\|_2}, \, \forall i \in [n]$.

\textbf{Parameter setting.} As follows from the data generation, the feature dimension $d$ is equal to the independence number $\alpha = |\cI(G)|$ of $G$ in each case. We also fix $K = 1000$ (unless performance is reported against $K$), and report the average performances over $50$ runs.% along with the standard deviations. 

\vspace{-12pt}

\subsection*{Type-I plots: increasing node size $(n)$}
\vspace{-3pt}
We compare the algorithms, with varying node size $(n)$, on three different graphs: $(1)$ Union of $10$ disconnected cliques on $n$ nodes, $(2)$ $d$-Regular graph of $n$ nodes with fixed degree $d = 10$ and $(3)$ Full binary tree of $n$ nodes. The results are reported in Figure \ref{fig:vs_n}. They clearly reflect the superior performance of \fbtl~ for each of the three performance measures. 

\textbf{Results.}
For $10$-disconnected cliques, $\alpha = 10$ is fixed for all $n$, unlike graphs $(2)$ and $(3)$ where $\alpha$ scales with $n$. The sample complexity sc$(0.5)$ of \fbtl~ for achieving a target error $\epsilon = 0.5$ for $10$-disconnected clique is almost constant for $n$ from $20$ to $500$, unlike the rest of the algorithms where it scales with $n$---which supports our claim that the required sample complexity is $O(\alpha \log \alpha)$, as also remarked in Thm.~\ref{thm:fbtlls}. This also explains why, for the $10$-regular graph and the full binary tree, the sample complexity of \fbtl~ monotonically increases with $n$, as $\alpha$ itself scales with $n$ for them.

\vspace{-1pt}
\begin{rem}
The above reflects how our algorithm can find the position of a newly added item in an already estimated ranking without collecting extra pairwise preferences: as long as the new item lies in the span of $\cI(G)$ (i.e. $\alpha$ remains fixed), the sample complexity remains unaffected (e.g. $10$-disconnected cliques). This is a significant advantage of our method over the rest, which cannot exploit the underlying item dependencies and thus need to observe preference information of the newly added nodes, leading to increased sample complexity with increasing $n$.
\end{rem}

%We also have reported more experiments with varying $p$ and $\alpha$, as well as compared ourself on real-datasets. The details are presented in Appendix \ref{app:expts}.

\vspace*{-5pt}
\begin{figure}[H]
\vspace*{-10pt}
\hspace*{-5pt}
\includegraphics[trim={0.8cm 0 3.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/P-Plots/Vary-p-on-D-Clique(500,100).png}
\hspace{-0pt}
\includegraphics[trim={0.8cm 0 3.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/P-Plots/Vary-p-on-d-Regular(500,50).png}

\hspace{-5pt}
\includegraphics[trim={0.8cm 0 3.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/P-Plots/Vary-p-on-Tree(8,2).png}
\hspace{-0pt}
\includegraphics[trim={0.8cm 0 3.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/P-Plots/Vary-p-on-D-Clique(500,100)_pd.png}

\hspace{-5pt}
\includegraphics[trim={0.8cm 0 3.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/P-Plots/Vary-p-on-d-Regular(500,50)_pd.png}
\hspace{-0pt}
\includegraphics[trim={0.8cm 0 3.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/P-Plots/Vary-p-on-Tree(8,2)_pd.png}
\vspace{-7pt}
\caption{Performance vs $p$, where $p = \frac{C\alpha\log\alpha}{{n\choose 2}}$ on $(1)$ $100$-disconnected clique, $(2)$ $50$-regular graph and $(3)$ full binary tree, with $\sim 500$ nodes in each}
\label{fig:vs_p}
\hspace{0pt}
\end{figure}
\vspace*{-30pt}


\subsection*{Type-II plots: increasing $p$, fixed $n,\alpha$}
\vspace*{-10pt}
Here we compare the algorithms with varying sampling rate $p$ for two estimation error metrics, normalized $\ell_2$-error and pairwise disagreement pd$(\hat \bP, \bP^*)$, 
on the following three different graphs: $(1)$ Union of $100$ disconnected cliques on $500$ nodes, i.e. each clique has $5$ nodes, $(2)$ $50$-Regular graph on $500$ nodes, i.e. each node has degree $d = 50$, and $(3)$ Full binary tree of height $8$ ($511$ nodes). Thus in each case, $n$ and $\alpha$ are kept fixed, and $p$ is set as $p = \frac{C\alpha\log\alpha}{{n\choose 2}}$, with $C$ varying from $0.5$ to $32$.

\vspace*{-5pt}
\begin{figure}[H]
\vspace{-0pt}
\hspace{-5pt}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.24,width=0.24\textwidth]{./Plots/A-Plots/Vary-I-on-D-Clique(500).png}
\hspace{-2pt}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.24,width=0.24\textwidth]{./Plots/A-Plots/Vary-I-on-D-Clique(500)_pd.png}

\hspace{-5pt}
\includegraphics[trim={0.5cm 0 3.5cm 0},clip,scale=0.24,width=0.24\textwidth]{./Plots/A-Plots/Vary-I-on-D-Clique(500)_sc.png}
\hspace{0pt}
\includegraphics[trim={0.5cm 0 3.5cm 0},clip,scale=0.24,width=0.24\textwidth]{./Plots/A-Plots/Vary-I-on-d-Regular(500).png}

\hspace{-5pt}
\includegraphics[trim={1.4cm 0 3.5cm 0},clip,scale=0.24,width=0.24\textwidth]{./Plots/A-Plots/Vary-I-on-d-Regular(500)_pd.png}
\hspace{-0pt}
\includegraphics[trim={0.5cm 0 3.5cm 0},clip,scale=0.24,width=0.24\textwidth]{./Plots/A-Plots/Vary-I-on-d-Regular(500)_sc.png}
\vspace{-10pt}
\caption{Performance vs $\alpha$ on disconnected cliques and $d$-regular graph ($n=500$ nodes in each)}
\label{fig:vs_a}
%\vspace{-12pt}
\end{figure}
\vspace*{-12pt}

%\begin{rem}
\textbf{Results.}
Fig.~\ref{fig:vs_p} shows that, as expected, the performance of all the algorithms improves with a higher sampling rate $p$.
%as it can exploit the feature correlation faster.
However, the performance improvement rate is far more drastic for \fbtl\, compared to the rest, due to its inherent ability to exploit the feature correlation, which allows it to attain accurate score estimates faster.


\subsection*{Type-III plots: increasing $\alpha$ and fixed $n$}
\vspace*{-5pt}
In the third setup, we compare the four algorithms with varying independence set size (or independence number) $\alpha$ for a fixed set of $n = 500$ nodes on the following two graphs: $(1)$ Union of $r$-disconnected cliques over $500$ nodes with varying $r$ and $(2)$ $d$-Regular graph of $500$ nodes with varying degree $d$ (Figure \ref{fig:vs_a}). 

\textbf{Results.}
The results show that, with $p$ set as $p = \frac{10 (\alpha \log \alpha)}{{n \choose 2}}$, the normalized $\ell_2$-error and pd$(\hat \bP,\bP^*)$ remain almost constant, validating the claim that the required sample complexity of \fbtl\, is $O(\alpha\log \alpha)$, as follows from Theorem~\ref{thm:fbtlls}. The sample complexity curves, on the other hand, validate the dependency of sc$(0.5)$ on $\alpha$, which increases with higher values of $\alpha$, as expected.


\vspace*{-10pt}
\subsection{Real Data Experiments}
\label{sec:expt_real}
\label{SEC:EXPT_REAL}

We finally evaluate the algorithms on two benchmark real-world preference learning datasets: {\it car} and {\it sushi}. %Due the space constants the datasets and the experimental setup is detailed in Appendix \ref{app:real_expts}.

\textbf{1. Car Dataset.} (\cite{car}) It contains pairwise preferences of $20$ cars given by $60$ users, where
each car is represented by a $6$-dimensional feature vector. 
\textbf{2. Sushi Dataset.} (\cite{sushi}) This dataset contains over $100$ sushis rated according to users' preferences; each sushi is represented by a $7$-dimensional feature vector.

%\vspace*{-10pt}
\begin{figure}[H]
\hspace{-5pt}
%\vspace{-15pt}
\includegraphics[trim={2.cm 0 0.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/Real/Vary-p-on-car.png}
%\hspace{7pt}
\includegraphics[trim={2.cm 0 0.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/Real/Vary-p-on-sushi.png}

\hspace{-5pt}
\includegraphics[trim={2.cm 0 0.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/Real/Vary-K-on-car.png}
%\hspace{7pt}
\includegraphics[trim={2.cm 0 0.5cm 0},clip,scale=0.3,width=0.24\textwidth]{./Plots/Real/Vary-K-on-sushi.png}
\vspace{-0pt}
\caption{Pairwise disagreement error pd$(\hat \bP, \bP^*)$ vs sampling rate $\bigg(p = \frac{C\alpha\log\alpha}{{n\choose 2}}\bigg)$, and number of repeated samples $(K)$ on {\it Car} and {\it Sushi}}
\label{fig:real}
\hspace{0pt}
%\vspace*{-10pt}
\end{figure}
\vspace*{-10pt}

\textbf{Setup.} 
Note that the real-world datasets do not satisfy any preference modeling assumption, e.g. the BTL assumption, and hence there is no true score vector $\btheta$ associated with the item preferences.
From the user preferences, we first compute the underlying pairwise preference matrix $\bP^*$, where $P^*_{ij}$ is computed by taking the empirical average of the number of times item $i$ is preferred over item $j$.
Further, to construct the feature matrix $\bU$, we use the item feature information provided in each dataset. Specifically, if each item is represented by a $d$-dimensional feature vector (as described before, $d = 6$ for {\it Car} and $d=7$ for {\it Sushi}), we find a set of $d$ items whose corresponding features are linearly independent, and hence form a basis of $\bR^d$, and use these $d$ items as the independent set $\cI$. The coefficient matrix $\bB$ is then constructed by representing the rest of the items as linear combinations of the items in $\cI$, such that it satisfies \eqref{eqn:basis} (see Sec.~\ref{sec:prb_set}).


\textbf{Performance Measure.} As noted above, the real-world datasets do not satisfy the BTL assumption, so there is no true score vector $\btheta$ associated with the item preferences. We however measure the performances of the algorithms with respect to the true preference matrix $\bP^*$, using pairwise disagreement error pd$(\hat \bP, \bP^*)$.

In both cases, our algorithm outperforms the rest. We also evaluate the algorithms with an increasing number of repeated samples per pair $(K)$. As expected, a higher $K$ leads to improved performance (Fig.~\ref{fig:real}).
