%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
%\usepackage{ulem} % for strikethrough




\usepackage{amsmath,amsfonts,amssymb,amsthm,amsxtra,graphicx,verbatim,epsfig,color,enumerate,array,mathtools,dsfont,mathrsfs,hyperref,url,bookmark, subcaption, wrapfig,thmtools,thm-restate,float, bm}

\usepackage[ruled,linesnumbered,vlined]{algorithm2e}
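\usepackage{algpseudocode} % algorithmicx pseudocode layout; NOTE(review): this is a different package family from algorithm2e and the two define overlapping command names -- confirm both are needed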

\usepackage{mathabx}

% Theorem-like environments (amsthm).
% All environments below share the counter of `theorem', which is reset at
% every \section, so theorems, lemmas, remarks, ... are numbered in one
% common sequence of the form <section>.<n>.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{corollary}[theorem]{Corollary}
% Everything from here on is declared under the `definition' style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% The `remark' style is disabled, so `remark' is declared while the
% `definition' style is still active.
%\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%\newtheorem[theorem][Remark]

% Calligraphic capitals: \cX is shorthand for \mathcal{X}, X = A..Z.
% Starred \newcommand* is used because these macros take no arguments.
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cB}{\mathcal{B}}
\newcommand*{\cC}{\mathcal{C}}
\newcommand*{\cD}{\mathcal{D}}
\newcommand*{\cE}{\mathcal{E}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cG}{\mathcal{G}}
\newcommand*{\cH}{\mathcal{H}}
\newcommand*{\cI}{\mathcal{I}}
\newcommand*{\cJ}{\mathcal{J}}
\newcommand*{\cK}{\mathcal{K}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cO}{\mathcal{O}}
\newcommand*{\cP}{\mathcal{P}}
\newcommand*{\cQ}{\mathcal{Q}}
\newcommand*{\cR}{\mathcal{R}}
\newcommand*{\cS}{\mathcal{S}}
\newcommand*{\cT}{\mathcal{T}}
\newcommand*{\cU}{\mathcal{U}}
\newcommand*{\cV}{\mathcal{V}}
\newcommand*{\cW}{\mathcal{W}}
\newcommand*{\cX}{\mathcal{X}}
\newcommand*{\cY}{\mathcal{Y}}
\newcommand*{\cZ}{\mathcal{Z}}

% Fraktur capitals: \kX is shorthand for \mathfrak{X}, X = A..Z.
% Starred \newcommand* is used because these macros take no arguments.
\newcommand*{\kA}{\mathfrak{A}}
\newcommand*{\kB}{\mathfrak{B}}
\newcommand*{\kC}{\mathfrak{C}}
\newcommand*{\kD}{\mathfrak{D}}
\newcommand*{\kE}{\mathfrak{E}}
\newcommand*{\kF}{\mathfrak{F}}
\newcommand*{\kG}{\mathfrak{G}}
\newcommand*{\kH}{\mathfrak{H}}
\newcommand*{\kI}{\mathfrak{I}}
\newcommand*{\kJ}{\mathfrak{J}}
\newcommand*{\kK}{\mathfrak{K}}
\newcommand*{\kL}{\mathfrak{L}}
\newcommand*{\kM}{\mathfrak{M}}
\newcommand*{\kN}{\mathfrak{N}}
\newcommand*{\kO}{\mathfrak{O}}
\newcommand*{\kP}{\mathfrak{P}}
\newcommand*{\kQ}{\mathfrak{Q}}
\newcommand*{\kR}{\mathfrak{R}}
\newcommand*{\kS}{\mathfrak{S}}
\newcommand*{\kT}{\mathfrak{T}}
\newcommand*{\kU}{\mathfrak{U}}
\newcommand*{\kV}{\mathfrak{V}}
\newcommand*{\kW}{\mathfrak{W}}
\newcommand*{\kX}{\mathfrak{X}}
\newcommand*{\kY}{\mathfrak{Y}}
\newcommand*{\kZ}{\mathfrak{Z}}

% Number sets and function/measure spaces.
\newcommand{\Real}{\mathbb{R}}
\newcommand{\Nat}{\mathbb{N}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Hilbert}{\mathcal{H}}
\newcommand{\Pmeas}{\mathfrak{M}_1^+}
\newcommand{\Meas}{\mathcal{M}}

% Probability symbols. \Pr overrides the standard "Pr" operator with a
% blackboard-bold P.
\newcommand{\Esp}{\mathbb{E}}
\newcommand{\Var}{\mathbb{V}}
\renewcommand{\Pr}{\mathbb{P}}
% \mathcal replaces the obsolete plain-TeX font switch {\cal P}.
\newcommand{\PP}{\mathcal{P}}
\newcommand{\Pn}{\mathcal{P}_n}

% Named distributions / complexities; \Normal reuses the \cN shorthand
% defined with the calligraphic letters.
\newcommand{\Rad}{\mathcal{R}}
\newcommand{\Normal}{\cN}
\newcommand{\Bern}{\mathcal{B}}



% Upright "supp" (support of a distribution).
\newcommand{\supp}{\mathrm{supp}}
% \newcommand{\kl}{\texttt{KL}}
% \newcommand{\KL}{\texttt{KL}}
% \newcommand{\klber}{\texttt{kl}}
% \newcommand{\TV}{\texttt{TV}}
% \newcommand{\Span}{\mathbb{S}}


% Indicator symbol and indicator of an event: \indic{A} -> I{A}.
\newcommand{\ind}{\mathbb{I}}
\newcommand{\indic}[1]{\mathbb{I}\{#1\}}
% \mod is redefined as a two-argument bracketed form: \mod{a}{b} -> [a mod b].
\renewcommand{\mod}[2]{[#1 \,\, \mathrm{mod} \,\, #2]}
% Partial derivative of #1 with respect to #2.
\newcommand{\fracpartial}[2]{\frac{\partial #1}{\partial  #2}}
% Auto-sized absolute value, norm, expectation and probability brackets.
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\expect}[1]{\mathbb{E}\left[{#1}\right]}
\newcommand{\prob}[1]{\mathbb{P}\left[{#1}\right]}
% Conditioning bar with surrounding thick spaces.
\newcommand{\given}{\; \big\vert \;} 
\newcommand{\bydef}{:=}
\newcommand{\inner}[2]{\langle #1, #2 \rangle}
% Restriction/evaluation bar: \at[f]{x} -> f|_{x}; the optional argument
% defaults to empty.
\newcommand{\at}[2][]{#1|_{#2}}
\newcommand*{\as}{\mathrm{(a.s.)}}
% \iid relies on \xspace. No \usepackage{xspace} appears among the visible
% package loads, so the package is requested here explicitly (loading it a
% second time is harmless).
\usepackage{xspace}
\newcommand*{\iid}{i.i.d.\xspace}

% Optimisation operators. Declared with the starred \DeclareMathOperator*
% (amsmath) so that sub/superscripts are set as limits in display style,
% and for consistency with the other operators declared in this preamble.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\Argmax}{Argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argsup}{argsup}
\DeclareMathOperator*{\arginf}{arginf}

% Shorthands that open/close display-math environments:
%   \beq   ... \eeq    -> equation   (numbered)
%   \beqn  ... \eeqn   -> equation*  (unnumbered)
%   \beqa  ... \eeqa   -> eqnarray   (numbered)
%   \beqan ... \eeqan  -> eqnarray*  (unnumbered)
% NOTE(review): eqnarray is generally discouraged in favour of amsmath's
% align; the definitions are left untouched because any use site would
% depend on eqnarray's column syntax.
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\beqn}{\begin{equation*}}
\newcommand{\eeqn}{\end{equation*}}
\newcommand{\beqa}{\begin{eqnarray}}
\newcommand{\eeqa}{\end{eqnarray}}
\newcommand{\beqan}{\begin{eqnarray*}}
\newcommand{\eeqan}{\end{eqnarray*}}

% Global symbol overrides: every \phi, \epsilon, \leq, \geq and \hat in the
% paper is typeset with the variant form given here.
\renewcommand{\phi}{\varphi}
\renewcommand{\epsilon}{\varepsilon}
\renewcommand{\leq}{\leqslant}
\renewcommand{\geq}{\geqslant}
\renewcommand{\hat}{\widehat}
% Short names for wide accents; \mt is a tilde over \mu specifically.
\newcommand{\wh}{\widehat}
\newcommand{\ol}{\overline}
\newcommand{\mt}{\widetilde{\mu}}
\newcommand{\wt}{\widetilde}
% Upright "d" (differential).
% NOTE(review): \d is LaTeX's text-mode dot-under accent; this redefinition
% replaces it everywhere, including in the bibliography -- confirm that no
% reference relies on the accent.
\renewcommand{\d}{\mbox{d}}
\newcommand{\nup}{\kappa}
\newcommand{\eps}{\varepsilon}
\newcommand{\ra}{\rightarrow}
% "Equal by definition": `def' stacked over `='. \mathrm replaces the
% obsolete two-letter font switch \rm.
\newcommand{\eqdef}{\stackrel{\mathrm{def}}{=}}
% \Otilde{x} -> \tilde{O}(x) with auto-sized parentheses.
\newcommand{\Otilde}[1]{\tilde{O}\left(#1\right)}
%\newcommand{\qed}{\hfill$\square$}

% Named math operators (amsmath). Each macro prints the upright text given
% in its second argument with operator spacing.
\DeclareMathOperator{\Tr}{tr}
\DeclareMathOperator{\T}{T}
\DeclareMathOperator{\Rk}{rank}
\DeclareMathOperator{\Dg}{diag}
\DeclareMathOperator{\F}{F}
\DeclareMathOperator{\HS}{HS}
\DeclareMathOperator{\op}{op}
% \V and \C print "Var" / "Cov" with a blackboard-bold first letter.
\DeclareMathOperator{\V}{\mathbb{V}ar}
\DeclareMathOperator{\C}{\mathbb{C}ov}
\DeclareMathOperator{\parent}{par}
% \diag prints the same text as \Dg above.
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\MI}{I}
\DeclareMathOperator{\KL}{KL}
\DeclareMathOperator{\kl}{kl}
\DeclareMathOperator{\TV}{TV}
\DeclareMathOperator{\diff}{d}
\DeclareMathOperator{\bC}{\mathbb{C}}

% Upper-case and lower-case variants of the regret / risk operators.
\DeclareMathOperator{\Reg}{Reg}
\DeclareMathOperator{\Risk}{Risk}

\DeclareMathOperator{\reg}{reg}
\DeclareMathOperator{\risk}{risk}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} prints b, then sep (default "-"), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Per-author markup wrappers. Both currently set their argument in black,
% so text wrapped in \naga{...} or \am{...} looks like ordinary text.
\newcommand{\naga}[1]{{\color{black}#1}}
\newcommand{\am}[1]{{\color{black}#1}}
% \newcommand{\todo}[1]{{\color{red}N: #1}}
% Typewriter names for the two problem settings studied in the paper.
\newcommand{\CombMAB}{\texttt{Comb-MAB}}
\newcommand{\KBAI}{\texttt{K-BAI}}


\title{Combinatorial Categorized Bandits with Expert Rankings}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<t-sayakr@microsoft.com>?Subject=Your UAI 2023 paper}{Sayak Ray Chowdhury$^*$}{}}
\author[1]{Gaurav Sinha$^*$}
\author[1]{Nagarajan Natarajan}
\author[1]{Amit Sharma}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Microsoft Research\\
    Bengaluru, India
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }


  
\begin{document}

 
\maketitle
% Unnumbered "*" footnote matching the $^*$ marks in the author block.
% \thefootnote is redefined inside a group only, so that \footnote calls
% later in the paper keep their regular marks instead of all printing "*".
\begingroup
\renewcommand{\thefootnote}{*}%
\footnotetext{Equal contribution}%
\endgroup

% \setlength{\belowdisplayskip}{4pt} \setlength{\belowdisplayshortskip}{4pt}
% \setlength{\abovedisplayskip}{4pt} \setlength{\abovedisplayshortskip}{4pt}

\begin{abstract}
  Many real-world systems such as e-commerce websites and content-serving platforms employ two-stage recommendation --- in the first stage, multiple nominators (experts) provide ranked lists of items (one nominator per category, e.g., sports and political news articles), and in the second stage, an aggregator filters across the lists and outputs a single (short) list of $K$ items to the users. The aggregation stage can be posed as a combinatorial multi-armed bandit problem, with the additional structure that the arms are grouped into categories (disjoint sets of items) and the ranking of arms within each category is known. We propose algorithms for selecting top $K$ items in this setting under two learning objectives, namely minimizing regret over rounds and identifying the top $K$ items within a fixed number of rounds. For each of the objectives, we provide a sharp regret/error analysis using a carefully defined notion of ``gap'' that exploits our problem structure. The resulting regret/error bounds strictly improve over prior work in the combinatorial bandits literature. We also provide supporting evidence from simulations on synthetic and semi-synthetic problems. 
\end{abstract}


\section{Introduction}\label{sec:intro}

Multi-Armed Bandits (MAB) is a popular approach to model sequential decision making problems~\citep{bouneffouf2020survey} and has been applied to real-world situations such as recommendation systems~\citep{glowacka2019bandit} and online advertising~\citep{tyler2010showing, avadhanula2021stochastic}. In many of these applications, however, the decision maker (called the agent) needs to identify a \textit{combination} of arms which, when pulled together, could yield high rewards. For instance, recommender systems often recommend a subset of relevant items to their users. The decision making problem is much more challenging in this setting, as the search space is combinatorially large. This is typically formulated as a ``combinatorial multi-armed bandit problem''~\citep{kveton2015tight} (\CombMAB), when the goal is to optimize cumulative rewards over rounds, or as a ``K-best arm identification problem''~\citep{bubeck2013multiple} (\KBAI), when the goal is to find the best $K$ arms within a fixed number of rounds.

As a motivating example, consider the ``whole page optimization'' problem arising in recommendation systems for e-commerce, news articles, etc. Here, the web page has a real estate for, say, at most $K$ items, which are typically a combination of products or news articles \am{from different categories. }% and display ads. 
Selecting the ``best'' $K$ items can be posed as a standard combinatorial multi-armed bandits problem, with the goal of optimizing for click-based rewards. However, in practice, there is more structure to the problem. In particular, the recommender systems employed in large-scale commercial settings typically comprise two stages~\citep{hron2021component, ma2020off}, wherein there are multiple \textit{nominators} in the first stage, each producing a ranked list of items (e.g., a nominator for ranking news articles \am{from the sports category, and another for ranking from political category, and so on}), and an \textit{aggregator} that selects the top $K$ items from the ranked lists to populate the web page. 

% Two concrete problems capturing this are the Combinatorial Multi Armed Bandit (CombMAB) problem and the $m$ Best Arm Identification ($m$-BAI) problem. In CombMAB, an agent pulls a combination of (say $m$) arms at each time step (called a round) and receives a reward
% for each arm in the combination (known as \emph{semi-bandit feedback}). The agent designs a policy describing which combinations to  pull at each round, given rewards from the previous pulls, and tries to maximize the total reward accumulated till each round. In $m$-BAI, the agent explores one arm per round for a given time horizon and identifies the top $m$ arms. 

In such two-stage settings where nominators provide reliable rankings, the core online learning problem then is to perform optimal filtering, i.e., subset selection in the second stage (across different nominators) in a sample-efficient manner. In such scenarios, directly applying the known algorithms for \CombMAB\ \citep{kveton2015tight} and \KBAI\ \citep{bubeck2013multiple} for selecting subset of arms can be sub-optimal. Designing and analysing algorithms for the second stage given the structure induced by the first stage is the core technical problem we address in this paper. 

Specifically, we study learning algorithms for the setting where arms (i.e., items) are grouped into categories (or disjoint clusters) and the ranking of arms within each category is known to the learning algorithm. Note that only the ranking of arms within each category is assumed to be known, not the actual reward distributions. For instance, in the above web page population scenario, the second-stage recommender system would have access to the correct ranking of all the news articles in \am{sports (first category), politics (second category), % as well as that of all the display ads (second category),
and so on. }

\subsection{Our Contributions}
We design new algorithms for the \CombMAB\ and \KBAI\ problems under the above mentioned structural assumption, i.e., the arms are grouped into different disjoint categories and the true ordering of arms (with respect to their rewards) within each category is known. We summarize our main contributions below.

\textbf{Regret Minimization:} For \CombMAB, our objective is to minimize the expected cumulative regret, over $T$ rounds, of selecting $K$ candidate arms to play at each round. We propose \textit{Ordered Combinatorial UCB}, based on the widely-used upper confidence bound (UCB) algorithm (Section~\ref{sec:regret_min}). We adapt the strategy of~\cite{kveton2015tight} to incorporate the knowledge of the expert rankings in two key respects: (a) designing a computationally efficient and provably correct sub-routine for selecting $K$ items at each round; (b) providing a regret analysis for our algorithm that strictly improves over~\cite{kveton2015tight}, via defining an appropriate notion of sub-optimality gap for our setting (Theorem~\ref{thm:regret}).

\textbf{K-Best Arm Identification:}  For \KBAI, we seek to discover the $K$ best arms at the end of $T$ rounds with high probability. We propose \textit{Ordered SAR} that adapts the Successive-Accept-Reject (SAR) strategy of~\cite{bubeck2013multiple} (Section~\ref{sec:best_arm}) to our setting with the ``prefix structure'', i.e., the optimal ranking of items across different experts must necessarily incorporate prefixes of ranked lists from experts. We give a sharper analysis of the error bound for our algorithm that strictly improves over the bound of~\cite{bubeck2013multiple}, via a novel definition of  ``instance-specific complexity'', for our structured setting (Theorem~\ref{thm:topK}). 

We show findings from simulations, that support our theoretical results, in Section~\ref{sec:sims}. Our work leads to interesting follow-up research questions in two-stage recommender systems that we highlight in Section~\ref{sec:concl}. Before we proceed with formally setting up the problem and metrics in Section~\ref{sec:setup}, we review closely related work next.

\subsection{Related Work}

Combinatorial bandits is a generalization of the well-studied multi-armed bandit problem \citep{auer2002finite,bubeck2012regret}. While the problem has been studied in the adversarial setting~\citep{cesa2012combinatorial,kale2010non}, in this work we focus on stochastic combinatorial bandits.

The cumulative regret minimization problem in stochastic combinatorial bandits under additive rewards was first studied by \citet{gai2012combinatorial}. The theoretical guarantee of the proposed algorithm in the above work was subsequently analyzed by \citet{kveton2015tight}. \citet{wang2017improving} further generalize these results to the combinatorial bandit setting with general reward structure. In our work, departing from the above line of research, we consider a structured arm set, i.e., where the arms are grouped into different disjoint categories.

The K-Best arm identification problem 
was first studied under the fixed budget setting by \citet{bubeck2013multiple}, and under the fixed confidence setting by \citet{kalyanakrishnan2010efficient,kalyanakrishnan2012pac}. Follow-up works either a) generalized the results of the above papers to the general combinatorial bandit setting \citep{chen2014combinatorial,gabillon2012best}; or b) considered specific combinatorial structures like matroids
\citep{chen2017nearly}; or c) improved the algorithms \citep{chen2016pure,jiang2017practical}. \citet{chaudhuri2017online} consider the learning-to-rank problem with feedback from K-best arms. Their goal is to rank the K-best arms in an online fashion, whereas we focus on identifying the K-best arms.


A well-studied special case of K-Best-Arm is the Best-Arm identification problem \citep{even2006action,audibert2010best}, in which we are required to
identify the single arm with the largest mean \am{reward}. Nearly tight sample complexity bounds as well as error probabilities for
Best Arm identification problem were obtained by \cite{jamieson2014lil,kaufmann2013information}. However, completely understanding the exact complexity of Best-Arm identification continues
to attract significant attention. The same is the case for K-Best arm identification problem, which is the focus of this work.


\section{Problem Setup}\label{sec:setup}

We are given $N$ ranked lists of \naga{items corresponding to different categories, from $N$ experts, as introduced in Section~\ref{sec:intro}. We refer to items as actions or arms interchangeably, as in the bandits literature.} Each list $i$ consists of $M$ \naga{items} $a_{i,1},\ldots,a_{i,M}$. In typical scenarios discussed in Section~\ref{sec:intro}, $M \gg N$. We assume that \naga{lists are disjoint, i.e., each list has a unique set of items}.\footnote{When an item appears in multiple lists, we can keep any
one copy of it and throw away the rest. Since our objective is to find the top $K$ (distinct) items in the union of all lists, the
best arm (i.e., top $K$ items) does not change by doing so. 
% Now, we can continue with our algorithms as it is since there is no
% overlap between the lists. We do agree that which copy of the particular item is kept might affect the gaps that appear in the
% regret expression and therefore our regret reflects the worst case regret with respect to this selection
}
%we assume that each list is disjoint, i.e., has different keywords than other lists. 
Each \naga{item} $a_{i,j}, i \in [N],j\in [M]$ is associated with a reward distribution (with a well-defined density) $\nu_{i,j}$, supported on $[0,1]$, with mean $\mu^*_{i,j}:=\mathbb{E}_{r \sim \nu_{i,j}}[r]$. We adopt the setting of combinatorial multi-armed bandits, but with \naga{the following structure}. %twist. 
We assume that each list is sorted with respect to $\mu^*$, i.e., $\mu^*_{i,1}\geq \mu^*_{i,2} \geq \ldots \geq \mu^*_{i,M} $ for all $i \in [N]$. The $N$-by-$M$ matrix of mean rewards $\mu^*:=\big[\mu^*_{i,j}\big]_{i,j}$ is unknown to the learning agent, but she has the \emph{side information} that each list is sorted. 

\naga{The learning agent has a budget of $T$ rounds (or pulls), and at each round, the agent has to return a set of $K$ \naga{items} from the $N$ sorted lists of $M$ \naga{items} each. In practice, $K \ll M$ (see Remark~\ref{rem:MandK}).}

% where $1\!<\!K \!\leq\! M$ \naga{(note that if $K = M' > M$, then we should elicit $M'$ items from every list, given our problem structure; in which case, we simply redefine $M'$ as $M$)}.}
We assume a semi-bandit feedback model, i.e., the agent can observe rewards for each selected \am{item}, consistent with the literature~\citep{kveton2015tight,chen2014combinatorial}. To guide the agent in making the combinatorial selection at each round, we consider two widely-used learning objectives:

\paragraph{Regret minimization.} The first objective we consider is the \emph{optimization goal}, where the agent aims to maximize her expected cumulative reward over time by repeatedly interacting with the unknown environment. The learning protocol is as follows: at each time $t$, (i) the agent chooses \naga{a set of $K$ items from the $N$ sorted lists of $M$ items each} based on the rewards received before time $t$;  equivalently, with the knowledge of the lists being sorted w.r.t. $\mu^*$, she chooses top $z_{t, i}$ \naga{items} from each list $i$ such that $\sum_{i=1}^N z_{t, i}\!=\!K$, where each $z_{t,i}$ can take values in $\lbrace 0,1,\ldots,K\rbrace$, and (ii) observes rewards of all the $K$ chosen \naga{items} $r_t(a_{i,j}), i \!\leq\! N, j \!\leq\! z_{t,i}$. 

Let $\cZ$ denote the set of all possible ``list prefixes'' or ``allocations'' the agent can make at any given round, i.e.,
\begin{align}\label{eq:action_set}
    \cZ \!=\! \left\lbrace \!(z_1,\ldots,z_N): z_i \!\in\! \lbrace 0,1,\ldots,K\rbrace,\sum_{i=1}^N\! z_i\!=\!K\!\right\rbrace.
\end{align}
Let $f^{\mu^*}(z)$ denote the
total expected reward or \emph{utility} of an allocation $z \in \cZ$. If the agent knew $\mu^*$ a priori, she could choose the optimal allocation $z^* \in \argmax_{z \in \cZ} f^{\mu^*}(z)$ at each round $t$.  In this setting, we evaluate the performance of the agent's strategy using \emph{expected cumulative regret} due to not knowing $\mu^*$, that is
\begin{align}\label{eq:regret}
    R_T &= \sum_{t=1}^T \left[f^{\mu^*}(z^*) - f^{\mu^*}(z_t)\right].
    % \\ &= \sum_{t=1}^T \sum_{i=1}^N \left(\sum_{j=1}^{z^*_i}\mu(a_i^j)-\sum_{j=1}^{z_{t,i}}\mu(a_i^j)\right).
\end{align}
For simplicity of presentation, we assume that utility function is additive, i.e., $f^{\mu}(z) =\sum_{i=1}^N \sum_{j=1}^{z_i}\mu_{i,j}$ for any set of parameters $[\mu]_{i,j}$. However, our results would hold for any monotone utility function, i.e., for any $f$ satisfying $f^{\mu}(z) \!\leq\! f^{\mu'}(z)$ if $\mu_{i,j} \!\leq\! \mu'_{i,j}$ for all $i\!\leq\! N, j \!\leq\! M$.

\paragraph{$K$-Best arm identification.} We also study a related objective of the \emph{search goal}, \naga{i.e., the agent has a budget of $T$ rounds (or pulls), and is tasked} to find top-$K$ \naga{items} from the $N$ sorted lists of $M$ \naga{items} each, where $1\!<\!K \!\leq\! M$. Equivalently, with the knowledge of the lists being sorted w.r.t. $\mu^*$, she needs to find the optimal allocation $z^* \in \cZ$, that corresponds to the set of $K$ \naga{items} with the highest mean rewards across all lists. 

The sequential evaluation protocol proceeds as follows: at each round $t\!=\!1,\ldots,T$, the agent chooses an \naga{item} $a_{i,j}$ and observes a reward $r_t(a_{i,j})$,  drawn from $\nu_{i,j}$ independent of the past given $a_{i,j}$. At the end of $T$ rounds, she returns an allocation $ z^{\text{out}} \in \cZ$. We evaluate the performance of the agent's strategy by the \emph{probability of error} (or misidentification), that is
\begin{equation}\label{eq:p-error}
    \delta_T = \prob{z^{\text{out}} \neq z^*}.
\end{equation}

\begin{remark} Note that, without loss of generality, we can trim each of the lists from size $M$ down to size $K$, given (a) our problem structure, i.e., we want the $K$ items returned to be a prefix of the lists, and (b) the 
monotonicity of the utility function. This reduces the search space, and so, in effect, $M = K$.
\label{rem:MandK}
\end{remark}



\section{\CombMAB: Regret minimization}\label{sec:regret_min}
\naga{In this section, we present the learning algorithm for the first objective set up in Section~\ref{sec:setup}, i.e., \textit{optimization} goal, its regret bounds, and show how our guarantees improve over the state-of-the-art results in the literature by exploiting the problem structure.}

\subsection{Algorithm}

Motivated by the simplicity of the widely-used Upper Confidence Bound (UCB) strategy, \citet{kveton2015tight} designed and analyzed an algorithm for stochastic combinatorial semi-bandits for regret minimization. We adapt this algorithm to our setting, where the agent knows the true ordering of items in each list. We call this algorithm \emph{Ordered Combinatorial UCB}.

% is constrained to choose actions only from the prefix of each list. Therefore, we call it Prefix-aware Combinatorial UCB.

Informally, our algorithm consists of three steps at each time $t$. First, we compute the UCBs on the expected reward $\mu^*_e$ of each \naga{item} $e \in \lbrace a_{i,j}\rbrace_{i\leq N,j\leq M}$ as
\begin{equation*}
U_{t}(e) = \hat\mu_{T_{t-1}(e)}(e)+ \beta_{t-1, T_{t-1}(e)},
\end{equation*}
where $T_t(e)$ denotes the number of times \naga{item} $e$ is observed in $t$ rounds, $\hat \mu_s(e)$ denotes the empirical mean of $s$ samples from $\nu_e$ and $\beta_{t,s}$ denotes the radius of a confidence interval around $\hat \mu_s(e)$. Choosing $\beta_{t,s}= \sqrt{\frac{3\log t}{2s}}$, it holds that $\mu^*_e$ lies in the said confidence interval with high probability. Next, we choose an allocation $z_t \in \cZ$ by solving a \emph{combinatorial optimization} problem using UCB estimates:
\begin{equation}\label{eq:opt_prob}
    z_t \in \argmax_{z \in \cZ} f^{U_t}(z) = \argmax_{z \in \cZ}\sum_{i=1}^N \sum_{j=1}^{z_i}U_t(a_{i,j}).
\end{equation}
Now, we play the set of $K$ \naga{items} $a_{i,j}, i \leq N, j \leq z_{t,i}$, given by the allocation $z_t$ and observe rewards of all the \naga{items}. Finally, we update the estimates $T_t(a_{i,j})$ and $\hat \mu_{T_t(a_{i,j})}(a_{i,j})$ of these \naga{items}. See Algorithm~\ref{alg:regret_min} for complete pseudocode.

\paragraph{DP-based optimization solution.} Now, we provide a subroutine to 
find the allocation $z_t$ as given in \eqref{eq:opt_prob}. Our proposed solution is based on dynamic programming (DP), and is computationally efficient.  
Given a set of parameters $\theta_{i,j}, i \leq N, j\leq M$, the 
objective is to compute $\argmax_{z \in \cZ}\sum_{i=1}^N\sum_{j=1}^{z_i} \theta_{i,j}$. In other words, we need to find optimal selection of $K$ items w.r.t. the parameter $\theta$ using prefixes from lists $1,\ldots, N$.

To this end, let $V_{i,j}^\theta$ denote the \emph{value} of the selection of $j$ items using prefixes from lists $1,\ldots, i$. Also, for each list $i$, let $s^\theta_{i,j}=\sum_{k=1}^j \theta_{i,k}$ denote the sum of $\theta$'s of first $j$ items. By definition, $V^\theta_{1,j}\!=\!s_{1,j}^\theta$ for all $j$ and $V_{i,0}^\theta\!=\!s_{i,0}^\theta\!=\!0$ for all $i$.
Now, for each $2 \!\leq \!i\!\leq\! N$ and $1\! \leq\! j\! \leq\! K$, we compute the value $V_{i,j}^\theta$ using the following recurrence:
\begin{equation}\label{eq:DP}
  \!\!V^\theta_{i,j}\! =\! \max \!
  \begin{cases}
  \!V^\theta_{i-1,j}\!+\! s^\theta_{i,0} & \!\text{(no item from list } i)\\
  \!V^\theta_{i-1, j-1} \!+\! s^\theta_{i,1} &\!\text{(first item from list } i) \\
  \!V^\theta_{i-1, j-2} \!+\! s^\theta_{i,2}& \!\text{(first 2 items from list } i)\\
  \vdots\\
  \!V^\theta_{i-1, 0} \!+\!s^\theta_{i,j}&  \!\text{(all $j$ items from list } i) 
  \end{cases}
\end{equation}
We return as $z_t$ the selection of $K$ items from lists $1,\ldots,N$ that attain the value $V_{N,K}^{U_t}$, where $U_t(a_{i,j})$ denotes UCB estimates at round $t$. 
The following lemma shows the optimality of this solution. It can be proved using simple induction argument. 
\begin{lemma}[Optimality of DP]
Let $\cZ$ be given by \eqref{eq:action_set}. Then, for any set of parameters $\theta_{i,j} \!>\! 0$, 
$i \!\in\! [N]$, $j \!\in\! [M]$, we have $V^\theta_{N,K} \!=\! \max_{z \in \cZ} \sum_{i=1}^N\sum_{j=1}^{z_i} \theta_{i,j}$.
\end{lemma}
The time complexity of finding the allocation $z_t$ is $O(NK^2)$ implying our algorithm is also computationally efficient, especially since, in general, $K$ is small.



% Pseudocode (algorithm2e) for Ordered Combinatorial UCB.
% Initialisation plays every item once; each later round computes UCBs,
% picks the allocation z_t, plays the chosen prefixes and updates the
% play counts and empirical means of the played items.
\begin{algorithm}[t]
\caption{Ordered Combinatorial UCB}
\label{alg:regret_min}
\DontPrintSemicolon
\KwIn{$N$ lists of \naga{items} $(a_{i,1},\ldots,a_{i,M}), i \leq N$\naga{, and $K$ (\#items to retrieve)}.}
\textbf{Initialize:} 
Play each arm $a_{i,j}, i\leq N,j\leq M$ once and observe reward $r(a_{i,j}) \sim \nu_{i,j}$.\;
% The subscript of \hat\mu is the number of samples, so the estimate after
% the single initial play is \hat\mu_1, matching its use in the UCB below.
Set $T_{MN}(a_{i,j})=1$, $\hat \mu_{1}(a_{i,j}) = r(a_{i,j})$.\;
% Rounds continue consecutively after the MN initial plays.
\For{round $t=MN+1,MN+2,\ldots$}{
For each $i \leq N, j\leq M$, compute UCBs:
$U_{t}(a_{i,j}) = \hat\mu_{T_{t-1}(a_{i,j})}(a_{i,j})+\sqrt{\frac{3\log(t-1)}{2T_{t-1}(a_{i,j})}}$.\;
Choose allocation
    $z_t \in \cZ$ using \eqref{eq:opt_prob} and \eqref{eq:DP}. \;
For each $ i \leq N$ and each $j \leq z_{t,i}$, play arm $a_{i,j}$ and observe its reward
$ r_t(a_{i,j})$.\;
Update number of plays \naga{for $a_{i,j}$'s played in the above step:} $T_t(a_{i,j})= T_{t-1}(a_{i,j})+1$.\;
Update \naga{their} mean estimates: $\hat \mu_{T_t(a_{i,j})}(a_{i,j})=\frac{\hat \mu_{T_{t-1}(a_{i,j})}(a_{i,j})T_{t-1}(a_{i,j})+r_t(a_{i,j})}{T_t(a_{i,j})}$.
}
\end{algorithm}


\subsection{Cumulative Regret}\label{subsec:results_regret}

We introduce the following notion of gap based on the utility function $f$ and optimal allocation $z^*$, which is essential to characterize the performance of our regret minimization algorithm. First, we define the \emph{gap} of an allocation $z \in \cZ$ as
$\Delta_z = f^{\mu^*}(z^*)-f^{\mu^*}(z)$. Now, we define the minimum gap of any sub-optimal allocation $z=(z_1,\ldots,z_N)$ that selects top $j$ items, $j\!\leq\! M$, from $i$-th list, $i \!\leq\! N$, as
\begin{equation}\label{eq:gap_regret}
 \Delta_{i,j} \!:=\! \!\min_{z\neq z^* : z_i=j} \Delta_{z}\! = \!  f^{\mu^*}(z^*)\!-\!\!\max_{z\neq z^* : z_i=j}f^{\mu^*}(z).
\end{equation}
With this definition of gap, we bound the cumulative regret of Algorithm~\ref{alg:regret_min} as follows.
\begin{theorem}[Cumulative regret]\label{thm:regret}
After $T$ rounds, the Ordered Combinatorial UCB algorithm (Algorithm~\ref{alg:regret_min}) enjoys the regret bound
\begin{align*}
  R_T \leq 
  \sum_{(i,j): z^*_i+1 \leq j \leq M} \frac{C K\log T}{\Delta_{i,j}} \!+\! \left(\frac{\pi^2}{3}\!+\!1 \right)KMN ,  
\end{align*}
where $\Delta_{i,j}$ is given by \eqref{eq:gap_regret} and $C\! >\!0$ is a universal constant.
\end{theorem}
It is worth noting that the summation in the above expression is over the sub-optimal items, i.e., the items not appearing in the optimal allocation $z^*$. At the same time, the sum is over the items that can appear in a sub-optimal allocation, i.e., those within $K$ positions from the top of each list. This is because any item below the $K$-th position of any list would never be played even by a sub-optimal allocation due to the ordering structure. Also, note that $M=K$ for the Ordered Combinatorial UCB algorithm due to Remark~\ref{rem:MandK}.

\paragraph{Comparison with prior work.} 

One can directly employ the \emph{CombUCB1} algorithm of \citet{kveton2015tight} to solve the above regret minimization problem. To do so, one needs to instantiate the feasible set $\Theta$, which CombUCB1 takes as an input, with the allocation set $\cZ$ as given in~\eqref{eq:action_set}. This is possible because any allocation $z \in \cZ$ induces a subset $A_z \subseteq \lbrace a_{i,j}\rbrace_{i \leq N, j \leq M}$ of $K$ items, where $a_{i,j} \in A_z$ if and only if $j \leq z_i$; the feasible set is then $\Theta = \lbrace A_z : z \in \cZ \rbrace$. This simple tweak of CombUCB1 would achieve a regret similar to Theorem~\ref{thm:regret} with the gaps $\Delta_{i,j}$ being replaced by 
\begin{align*}
    \widetilde \Delta_{i,j} = \min_{z \neq z^*:z_i \geq j} \Delta_z~.
\end{align*}
Note that $\Delta_{i,j} \geq  \widetilde \Delta_{i,j}$, since the minimum in~\eqref{eq:gap_regret} is over a smaller set of allocations in $\cZ$. Hence, Algorithm~\ref{alg:regret_min} enjoys a smaller regret bound as compared to CombUCB1. This is because our regret analysis is carefully fine-tuned to the prefix structure present in the problem, whereas \citet{kveton2015tight} present a general analysis for combinatorial action sets oblivious to the ordering in each list.




\subsubsection{Proof Sketch}

In this section, we provide the main ideas to prove Theorem~\ref{thm:regret}, and contrast it to the analysis of \citet{kveton2015tight} when needed. First, we define the event
\begin{align*}
    \cE_t  = \left\lbrace \Delta_{z_t} \leq\!\!\!\sum_{(i,j): z_i^* < j \leq z_{t,i} } 2\sqrt{\frac{1.5\log T}{T_{t-1}(a_{i,j})}}, \Delta_{z_t} > 0 \right\rbrace,
\end{align*}
where $\Delta_z$ denotes the gap of an allocation $z$. Now, defining $\hat R_T= \sum_{t=MN+1}^T \Delta_{z_t}\mathds{1}\lbrace \cE_t \rbrace$, we see from~\citet[Lemma 1]{kveton2015tight} that
\begin{align*}
    R_T \leq \expect{\hat R_T} + (1+\pi^2/3)KMN~.
\end{align*}
Now, let us consider two sequences of constants $(\alpha_l)_{l \geq 1}$ and $(\beta_l)_{l \geq 0}$ as in \citet{kveton2015tight} and define
$m_{l,t}=\frac{\alpha_l K^2 \log T}{\Delta_{z_t}^2}$. Furthermore, let $\tilde A_t$ denote the subset of items included in the allocation $z_t$ but not in $z^*$.
Then, we define a series of mutually exclusive events $(G_{l,t})_{l \geq 1}$, where $G_{l,t}$ denotes the event that
at least $\beta_l K$ items in $\tilde A_t$ were observed at most $m_{l,t}$ times and,
for all $l' < l$, fewer than $\beta_{l'} K$ items in $\tilde A_t$ were observed at most $m_{l',t}$ times. Then, under $\cE_t$, it holds that the event $\bigcup_{l \geq 1}G_{l,t}$ happens, and hence
\begin{align*}
    \hat R_T  =\sum\limits_{l=1}^{\infty} \sum\limits_{t=MN+1}^T \Delta_{z_t} \mathds{1}\lbrace G_{l,t}, \Delta_{z_t}> 0 \rbrace .
\end{align*}
Now, let $G_{a_{i,j},l,t} = G_{l,t} \bigcap F_{a_{i,j},l,t}$ be the event that item $a_{i,j}$ is not observed sufficiently often under $G_{l,t}$, where $F_{a_{i,j},l,t} = \lbrace  z^*_i < j \leq z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace$. Then \citet{kveton2015tight} bound $\hat R_T$ as
\begin{align*}
    \hat R_T \leq \sum_{l} \sum_{t}\sum_{(i,j):z_i^* < j}\mathds{1} \lbrace  j \leq z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace\frac{\Delta_{z_t}}{\beta_l K}.
\end{align*}
Our main analytical novelty is to identify some ``double counting'' present in the above regret expression
under the ordered structure.
To do so, we define for $k \geq 0$, the events
\begin{align*}
    F^k_{a_{i,j},l,t} = \lbrace  z^*_i < j+k = z_{t,i}, T_{t-1}(a_{i,j+k}) \leq m_{l,t}\rbrace~.
\end{align*}
Note that, because of the ordered structure, if $a_{i,j}$ has only been observed a certain number of times, then $a_{i,j+k}$ has been observed at most as many times, i.e., $T_{t-1}(a_{i,j+k}) \leq T_{t-1}(a_{i,j})$, which implies that
\begin{align*}
    \lbrace  z^*_i < j+k = z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace \subseteq F^k_{a_{i,j},l,t}~.
\end{align*}
This yields $F_{a_{i,j},l,t} \subseteq \bigcup_{k=0}^{M-j} F^k_{a_{i,j},l,t}=:H_{a_{i,j},l,t}$, which gives $\bigcup_{j=1}^M \lbrace G_{l,t} \bigcap F_{a_{i,j},l,t} \rbrace \subseteq  \bigcup_{j=1}^M \lbrace G_{l,t} \bigcap H_{a_{i,j},l,t} \rbrace $. Now observe that $H_{a_{i,1},l,t} \supseteq H_{a_{i,2},l,t} \supseteq \ldots \supseteq H_{a_{i,M},l,t}$, implying that the RHS of the above is a union over decreasing sets and hence $\bigcup_{j=1}^M \lbrace G_{l,t} \bigcap F_{a_{i,j},l,t} \rbrace \subseteq  \lbrace G_{l,t} \bigcap H_{a_{i,1},l,t} \rbrace $. This further implies that
\begin{align*}
 \bigcup_{j=1}^M G_{a_{i,j},l,t}  \!\subseteq\! \bigcup_{j=1}^M\! \left\lbrace G_{l,t} \bigcap \lbrace  z^*_i \!<\! j \!=\! z_{t,i}, T_{t-1}(a_{i,j})\! \leq\! m_{l,t}\rbrace\!\right \rbrace.
\end{align*}
Therefore, we can bound $\hat R_T$ as
\begin{align*}
  \hat R_T \leq \sum_{l} \sum_{t}\sum_{(i,j):z_i^* < j}\mathds{1} \lbrace j= z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace\frac{\Delta_{z_t}}{\beta_l K},
\end{align*}
which corrects for the ``double counting'' mentioned above.
To bound the regret, we now need to look at sub-optimal allocations that end at item $a_{i,j}$ for list $i$, which can be accounted with our definition of \emph{minimum gap} $\Delta_{i,j}$ (see~\eqref{eq:gap_regret}). The rest of the proof follows similar arguments as in~\citet{kveton2015tight}. See Appendix \textbf{A.1} for details.


\section{\KBAI: Minimize Error Probability}\label{sec:best_arm}
\naga{In this section, we present the learning algorithm for the second objective set up in Section~\ref{sec:setup}, i.e., the \textit{search} goal. We prove a bound on the probability of error of our algorithm and show that our guarantee improves the state-of-the-art results in the literature by exploiting the problem structure.}

\subsection{Algorithm}


We propose an algorithm for finding top-$K$ \naga{items} from $N$ lists obeying the ordered structure. We adapt the Successive Accepts and Rejects (SAR) strategy of \citet{bubeck2013multiple} to our setting, originally designed for top-$K$ identification in stochastic combinatorial semi-bandits. We call this algorithm \emph{Ordered SAR}.

Informally, our algorithm proceeds as follows. We divide the total budget of $T$ rounds into $MN-1$ phases. At the end of each phase, we either \emph{accept} a\naga{n} \naga{item} from the \emph{top} of a list or \emph{reject} a\naga{n} \naga{item} from the \emph{bottom} of a list. In any case, that \naga{item} is ``deactivated''.
The \naga{item}s that are still active are sampled for an equal number of rounds in the next phase. Now, we describe the procedure for choosing an item to accept or reject. Let $\Phi_k$ denote the set of active items at the start of phase $k$. We pull each item $e \in \Phi_k$ for $T_k\!-\!T_{k-1}$ rounds and update their empirical means with observed rewards, where 
\begin{equation}\label{eq:phase_len}
 T_k \!=\! \left\lceil\! \frac{1}{\overline\log(MN)} \frac{T\!-\!MN}{MN\!+\!1\!-\!k}\!\right\rceil,\; \overline\log(n)\!=\! \frac{1}{2}\!+\!\sum_{i=2}^{n}\frac{1}{i}~,   
\end{equation}
with $T_0:=0$.
Similar to \citet{bubeck2013multiple}, the key to decide whether to accept or reject an item is to consider estimates of the gaps $\Delta_e$. To this end, let $m_k > 0$ denote the number of items left to find at the start of phase $k$. First, we compute the ``empirical gap'' of each item $e \in \Phi_k:$ 
\begin{equation}\label{eq:emp-gap}
   \hat \Delta_{k,e} = \begin{cases}
\hat \mu_{k,e} - \hat\mu_{k,[m_k+1]}, & \text{if}\; \hat \mu_{k,e} \geq \hat\mu_{k,[m_k]}\\
\hat\mu_{k,[m_k]} - \hat\mu_{k,e}, &  \text{if}\;\hat\mu_{k,e} \leq  \hat\mu_{k,[m_k+1]}
\end{cases}, 
\end{equation}
where $\hat \mu_{k,[l]}$ denotes the $l$-th largest empirical mean among all items in $\Phi_k$. 
Then, we find the item $e_k$ which has the largest empirical gap among all active items $\Phi_k$. Now, let $e_k$ be an item from list $i_k$. If $e_k$ is the current empirical best item, we accept the current topmost active item from list $i_k$. Else, we reject the current bottom most active item from it. In any case, we deactivate the accepted or rejected item, and update the top or bottom of the list accordingly.
% Now, let $\Phi_k^{\text{top}}$ and $\Phi_k^{\text{bot}}$ denote the collection of active arms from the top most and the bottom most positions of all the lists, respectively. 
% Then, we deactivate the arm $e_k$ which has the largest empirical gap in the set $\Phi_k^{\text{top}} \cup \Phi_k^{\text{bot}}$ . Finally, if $e_k \in \Phi_k^{\text{top}}$, then it is accepted, otherwise it is rejected. 
See Algorithm~\ref{alg:best_arm} for pseudo-code.



\begin{algorithm}[t]
\caption{Ordered Successive-Accept-Reject}
\label{alg:best_arm}
\DontPrintSemicolon
\KwIn{$N$ lists of \naga{items} $(a_{i,1},\ldots,a_{i,M}), i \leq N$, phase lengths $(T_k)_{0\leq k < MN}$\naga{, and $K$ (\#items to retrieve)}.}
\textbf{Initialize:} $\Phi_1 \!=\! \lbrace a_{i,j} \rbrace_{i\leq N,j \leq M}$, 
% $\Phi_1^{\text{top}} \!=\! \lbrace a_{i,1} \rbrace_{i \leq N}$, $\Phi_1^{\text{bot}} \!=\! \lbrace a_{i,M} \rbrace_{i \leq N}$, 
$m_1 \!=\! K$, $z^{\text{out}}_i\!=\!0$, $\text{top}_i\!=\!1, \text{bot}_i\!=\!M\; \forall i\!\leq\!N$   \;
\For{each phase $k=1,2,\ldots,MN-1$}{
Pull each arm $e \in \Phi_k$ for $T_k-T_{k-1}$ rounds and update its empirical mean $\hat \mu_{k,e}$. \;
Compute empirical gap $\hat \Delta_{k,e}$ for each arm $e \in \Phi_k$ using~\eqref{eq:emp-gap}.\;
% Let $e_k \in \argmax_{e \in \Phi_k^{\text{top}} \cup \Phi_k^{\text{bot}}} \hat \Delta_{k,e}$ (ties broken arbitrarily) and $i_k,j_k$ be such that $e_k \!=\!a_{i_k,j_k}$.\; 
Let $e_k \in \argmax_{e \in \Phi_k} \hat \Delta_{k,e}$ (ties broken arbitrarily) and $i_k$ be such that $e_k \!=\!a_{i_k,j}$ for some $j$.\;
%\todo{why is the RHS of $(i_k, j_k)$ a set?}\;
\If{$\hat \mu_{k,e_k} \geq \hat \mu_{k,[m_k]}$}{
 Set $j_k\!=\!\text{top}_{i_k}$, $\text{top}_{i_k}\!=\!\text{top}_{i_k}\!+\!1$,\;
$m_{k+1}=m_k-1$, $z^{\text{out}}_{i_k}=z^{\text{out}}_{i_k}+1$.  
}
\Else{
Set $j_k\!=\!\text{bot}_{i_k}$, $\text{bot}_{i_k}\!=\!\text{bot}_{i_k}\!-\!1$.
}
Set $\Phi_{k+1}\!=\!\Phi_k \setminus \{a_{i_k,j_k}\}$.
% \If{$e_k \in \Phi_k^{\emph{top}}$}{
%  $\Phi_{k+1}\!=\!\Phi_k \!\setminus\! e_k$, $\Phi_{k+1}^{\text{top}} \!=\! \lbrace\Phi_{k}^{\text{top}}\!\setminus\! e_k\rbrace\! \cup\! \lbrace a_{i_k,j_k+1} \rbrace$,
%  $m_{k+1}=m_k-1$, $z^{\text{out}}_{i_k}=z^{\text{out}}_{i_k}+1$.  
% }
% \Else{
% $\Phi_{k+1}\!=\!\Phi_k \!\setminus\! e_k$, $\Phi_{k+1}^{\text{bot}} \!=\! \lbrace\Phi_{k}^{\text{bot}}\!\setminus\! e_k\rbrace\! \cup\! \lbrace a_{i_k,j_k-1} \rbrace$.
% }
}
\textbf{Output:} Allocation $z^{\text{out}}$ of accepted arms.
\end{algorithm}



\subsection{Probability of Error}\label{subsec:results_topK}

We introduce the following complexity measure to characterize the performance of our top-$K$ identification algorithm. Recall that $(z^*_1,\ldots,z^*_N) \in \cZ$ is the optimal allocation corresponding to the set of $K$ arms with highest mean rewards. Define the set of ``boundary'' arms
\begin{equation}\label{eq:boundary}
    \Phi = \bigcup_{i=1}^N \left\lbrace a_{i,z^*_i}, a_{i,z^*_i+1}\right\rbrace,
\end{equation}
where $a_{i,0}\!:=\!\emptyset$ for all $i \!\in\! [N]$. That is, $\Phi$ contains only the $z^*_i$-th and $(z^*_{i}\!+\!1)$-st arms from the top of each list $i$. Note that the cardinality of this boundary set is at most twice the number of lists
% \todo{at most twice the number of lists?}
, i.e., $|\Phi| \!\leq\! 2N$. 


% Let $\mu^*_{[l]},1 \!\leq\! l \!\leq\! |\Phi|,$ denote 
% the $l$-th largest mean reward among all arms in $\Phi$, i.e., $\mu^*_{[1]}\!\geq\! \ldots \!\geq\! \mu^*_{[|\Phi|]}$. Now, similar to \citet{bubeck2013multiple}, we define the gap of each arm $e \in \Phi$:
% \begin{equation}\label{eq:gap_topK}
%    \Delta_{e} = \begin{cases}
%  \mu^*_{e} - \mu^*_{[K+1]}, & \text{if}\; \mu^*_{e} \geq \mu^*_{[K]}\\
% \mu^*_{[K]} - \mu^*_{e}, &  \text{if}\;\mu^*_{e} \leq  \mu^*_{[K+1]}
% \end{cases}.
% \end{equation}
% Let $\Delta_{[l]}$ denote the $l$-th smallest such gap, i.e., $\Delta_{[1]}\!\leq\! \ldots \!\leq\! \Delta_{[|\Phi|]}$. Now, we define the complexity measure 
% \begin{equation}\label{eq:complexity_topK}
%     H = \max_{l \in \lbrace 1,\ldots,|\Phi| \rbrace} l\; \Delta_{[l]}^{-2}~. 
% \end{equation}


Let $\mu^*_{[l]},1 \!\leq\! l \!\leq\! MN,$ denote 
the $l$-th largest mean reward among all arms, i.e., $\mu^*_{[1]}\!\geq\! \ldots \!\geq\! \mu^*_{[MN]}$. Now, similar to \citet{bubeck2013multiple}, we define the gap of each arm $e \in \lbrace a_{i,j} \rbrace_{i\leq N, j \leq M}$:
\begin{equation}\label{eq:gap_topK}
   \Delta_{e} = \begin{cases}
 \mu^*_{e} - \mu^*_{[K+1]}, & \text{if}\; \mu^*_{e} \geq \mu^*_{[K]}\\
\mu^*_{[K]} - \mu^*_{e}, &  \text{if}\;\mu^*_{e} \leq  \mu^*_{[K+1]}
\end{cases}.
\end{equation}
Let $\Delta_{[l]}$ be the $l$-th smallest such gap, i.e., $\Delta_{[1]}\!\leq\! \ldots \!\leq\! \Delta_{[MN]}$. Let $k_1 < \ldots < k_{|\Phi|}$ be the phases in which Algorithm~\ref{alg:best_arm} accepts or rejects an arm from the boundary set $\Phi$. We define the complexity measure 
\begin{equation}\label{eq:complexity_topK}
    H_{\Phi} = \max_{1 \leq j \leq |\Phi|}\frac{(MN+1-k_j)}{\Delta_{[MN+1-k_j]}^{2}} \;~. 
\end{equation}



With these definitions in place, we bound the probability of error of Algorithm~\ref{alg:best_arm} as follows.
\begin{theorem}[Probability of error]\label{thm:topK}
Given a time budget $T\! >\! MN$, running the Ordered SAR algorithm with the choice of $T_k$'s given in~\eqref{eq:phase_len} achieves the probability of error
\begin{align*}
  \delta_T \leq 2MN |\Phi| \exp\left(-\frac{T-MN}{8\overline \log (MN)H_{\Phi}} \right),  
\end{align*}
where $H_{\Phi}$ is given by \eqref{eq:complexity_topK}.
\end{theorem}
It is worth noting that gaps of only $|\Phi|$ many arms influence the final error in the selected arms, which is a consequence of our algorithm exploiting the prefix structure. Furthermore, if reward gaps are large for these $|\Phi|$ many arms, then $H_{\Phi}$ is small and hence, the probability of error is also small, i.e., it is easy to distinguish the top-$K$ arms from the rest.

Furthermore, in our setting, the dependence on $M$ in the bound above is extraneous as stated in the remark below.
\begin{remark} Given our problem structure and assumption on the utility function, the top-$K$ items must necessarily incorporate prefixes of the lists. So, when $M > K$, the lists can be trimmed to size $K$, before presenting to the algorithm, as mentioned in Remark~\ref{rem:MandK}. Thus, we can replace $M$ with $K$ in the bound of Theorem~\ref{thm:topK}. 
\end{remark}
\paragraph{Comparison with prior work.} Observe that one can directly apply the SAR algorithm of \citet{bubeck2013multiple} to find the optimal allocation $z^*$. This algorithm gives a guarantee that the probability of error is
\[
{}\leq 2M^2N^2 \exp\left(-\frac{T-MN}{8\overline \log (MN)H} \right),
\]
where the complexity measure $H$ is defined as
\begin{align*}
  H= \max_{1 \leq l \leq MN} l\; \Delta_{[l]}^{-2}~.  
\end{align*}
Note that $H_{\Phi}\! \leq\! H$ since the maximum in~\eqref{eq:complexity_topK} is over a much smaller set of arms $\Phi$ of size $\leq 2N$ compared to the maximum over all the $MN$ arms in $H$. Hence, Algorithm~\ref{alg:best_arm} achieves a smaller probability of error compared to the above work. This is because we adapt our strategy to the ordering of the lists, whereas the SAR algorithm does not. Comparing the terms outside the negative exponential\footnote{these arise due to application of union bounds} in both these error guarantees, we can see that our guarantee depends linearly on $M$, whereas the guarantee for the SAR algorithm in \cite{bubeck2013multiple} has a quadratic dependence. Our experiments on $K$-best arm identification in Section \ref{sec:sims} provide good support to these theoretical findings.

% \paragraph{Sample complexity under fixed confidence.} One might wish to design a strategy to find the optimal allocation under the \emph{fixed confidence} setting (i.e., given a fixed failure probability $\delta$)
% % \todo{we should say a sentence on what this means?}
% and find the sample complexity of this strategy (as a function of $\delta$). Let us provide some intuition on this. To do so,
% similar to \citet{bubeck2013multiple}, we define a different but related complexity measure adapted to the prefix structure:
% \begin{equation}
%     \overline H_{\Phi} = \sum_{ e \in \Phi} \Delta_{e}^{-2}.
% \end{equation}
% It turns out that $\overline H_{\Phi}$ is equivalent to $H_{\Phi}$ up to a logarithmic factor: $H_{\Phi} \!\leq\! \overline H_{\Phi} \!\leq\! \log(2MN) H_{\Phi} $, see \citet{audibert2010best}. Now, setting the probability of error to $\delta$, we can see that one would require $T \!=\!  O\!\left(\overline H_{\Phi}\log (MN/\delta)\right)$ samples to find the optimal allocation with a \emph{fixed confidence}. We conjecture that a variant of the \emph{Combinatorial Lower Upper Confidence Bound} algorithm of \citet{kalyanakrishnan2012pac} adapted to the ordering of lists would achieve this sample complexity up to a logarithmic factor.


\subsubsection{Proof Sketch}
In this section we provide a high level sketch of our proof for Theorem~\ref{thm:topK}. Complete details are provided in Appendix \textbf{A.2}. At a high level, our proof uses ideas from the proof of Theorem $1$ in \cite{bubeck2013multiple}. However, there are some crucial differences that take advantage of the known ordered structure between items and therefore lead to better guarantees. Let $k_1 < \ldots < k_{|\Phi|}$ be the phases where an item from $\Phi$ (i.e., the boundary set) was accepted or rejected by Algorithm~\ref{alg:best_arm}. Since we always accept an item that is the top item of some list and reject an item that is the bottom item of some list, the first error can only occur at a boundary item, i.e., there can be no errors before phase $k_1$. During phase $k_1$, there will be $MN+1-k_1$ active items; let us call them $a_1, \ldots, a_{MN+1-k_1}$ such that $\mu_{a_1}\geq \ldots \geq \mu_{a_{MN+1-k_1}}$. 

% We can break this set of active items into two parts $\{a_1, \ldots, a_{K^\prime}\}$ and $\{a_{K^\prime+1}, \ldots, a_{MN_1-k_1}\}$ such that the first set is a subset of the top $K$ items and the second set is a subset of the bottom $MN-K$ items.

Now let's say an error occurs at phase $k_1$ and an item $a_l\in \Phi$ was accepted when it should have actually been rejected. We follow the proof idea in \cite{bubeck2013multiple} and prove that this cannot hold by showing that it leads to a contradiction. In particular we show $\Delta_{[MN+1-k_1]} > \max\{\mu_{a_1}- \mu_K, \mu_{K} - \mu_{a_{MN+1-k_1}}\}$, where $\mu_K$ is the $K$-th largest mean reward. It's a contradiction because at stage $k_1$ only $k_1-1$ items would have been accepted or rejected, implying $\Delta_{[MN+1-k_1]} \leq \max\{\mu_{a_1}- \mu_K, \mu_{K} - \mu_{a_{MN+1-k_1}}\}$. We create a high probability event where this can be shown:
\[
\eta_1\! =\!\left \{\forall \text{ items }a : \abs{\frac{1}{n_{k_1}}\sum\limits_{s=1}^{n_{k_1}}X_{a,s}\! -\! \mu_a} \!<\! \frac{1}{4}\Delta_{[MN+1-k_1]}\right\},
\]
where $X_{a,s}$ is the reward received on the $s$-th pull of item $a$ and $n_{k_1}$ is the number of times each active item has been pulled by the end of phase $k_1$ (i.e., $T_{k_1}$ in~\eqref{eq:phase_len}). The proof of why this holds under $\eta_1$ is technical and is presented with all details in Appendix \textbf{A.2}. The general idea is similar to the one presented in \cite{bubeck2013multiple}. However, the proof needs to be crucially modified at many places to make it work. The proof in \cite{bubeck2013multiple} directly uses the item $a_l$ that was accepted (by mistake) in its technical calculations to show the above inequality. In their SAR algorithm, they accept $a_l$ when it has the largest empirical gap and the largest mean empirical reward among all active items. This fact is crucial in showing the inequality mentioned above. However, our algorithm accepts $a_l$ without actually considering its own empirical mean reward and therefore we cannot directly use the proof in \cite{bubeck2013multiple}. To get around this problem, we note that $a_l$ is accepted by Algorithm~\ref{alg:best_arm} only when there is some item $a_p$ in the same list as $a_l$ with true mean reward $\mu_{a_p}\leq \mu_{a_l}$, largest empirical gap and largest empirical mean reward compared to all other active items. Since $a_l$ should have been rejected, it is not in the top $K$ items and therefore $a_p$ is also not in the top $K$ items. For the rest of the proof we follow the steps in \cite{bubeck2013multiple} but work with $a_p$ instead of $a_l$ and all steps go through. The technical calculations only require that within all the active items, $a_p$ has the largest empirical gap and the largest empirical mean reward and that it is not a part of the top $K$ items. 

Now we can extend this argument to  phase $k_2$ since if there was no error at phase $k_1$, the next error has to happen at the next item from the boundary set i.e. at phase $k_2$. To prevent this we assume the event:
\[
\eta_2 \!=\! \left\{\forall \text{ items }a : \abs{\frac{1}{n_{k_2}}\sum\limits_{s=1}^{n_{k_2}}X_{a,s}\! -\! \mu_a}\! < \!\frac{1}{4}\Delta_{[MN+1-k_2]}\right\}.
\]
Continuing in this fashion, we define the intersection of all these events and our proof holds when this event is assumed. Since we only had to repeat this argument for $|\Phi|$ many phases compared to the SAR algorithm which repeats it for all $MN-1$ phases, we are able to guarantee a better bound.


\section{Simulations}\label{sec:sims}
The objective of this section is to verify if the theoretical guarantees on  improvements over existing methods in terms of upper bounds  hold empirically. We show results on synthetic and semi-synthetic problem instances.

\subsection{Regret minimization}
In this section, we empirically evaluate the regret performance of Algorithm~\ref{alg:regret_min} on bandit instances generated from synthetic and real-world data. We compare against the combinatorial bandit algorithm of \citet{kveton2015tight} which is oblivious to the ordering of the lists. Specifically, we instantiate the algorithm of \citet{kveton2015tight} with the feasible set $\Theta=2^{MN}$ to serve as a baseline (referred to as CombUCB). We plot cumulative regret as a function of number of rounds, and we average results over 20 trial runs with different seeds. 

\textbf{Synthetic bandit instances.}
First, we generate combinatorial bandit instances with $N=5$ ordered lists, with each list consisting of $M=10$ items (arms).
The arm means are sampled uniformly in $[0.25, 0.75]$.  
We consider real-valued rewards sampled from Gaussian (Figure~\ref{fig:Gausss}) and Bernoulli (Figure~\ref{fig:Bernoulli}) distributions with the aforementioned means, projected to $[0,1]$. We show the results for $K =5$ in Figures~\ref{fig:Gausss} and~\ref{fig:Bernoulli}; the growth of cumulative regret for the two algorithms aligns with our theoretical findings in Section~\ref{sec:regret_min}.

\textbf{Semi-synthetic bandit instance.}
Next, we generate bandit instances from Microsoft Learning to Rank dataset MSLR-WEB10K \citep{DBLP:journals/corr/QinL13}. The dataset consists of 1,200,192 rows and 138 columns, where each row corresponds to a query-url pair. The first column is relevance label $\{0, 1, 2, 3, 4\}$ of the pair, which we take as rewards. The second column denotes the
query id, and the rest 136 columns denote contexts of a query-url pair. We cluster the data by running K-means
algorithm with 50 clusters. We treat each cluster as a bandit arm with mean reward as the empirical mean of the individual ratings in the cluster. This way, we obtain a bandit instance with 50 total arms. We then divide them into $N=5$ lists of $M=10$ arms in each. The results are shown in Figure~\ref{fig:Real} for $K = 5$.

In all the simulations above, we observe that the cumulative regret of our algorithm (\textit{Ordered CombUCB}) is much lower than the baseline (Vanilla \textit{CombUCB}), consistent with our 
theoretical result.

\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Gaussian-easy-regret-comparison.pdf}
  \caption{\footnotesize{Comparison of cumulative regret for CombUCB and Ordered CombUCB on synthetic Gaussian bandit instance.}}\label{fig:Gausss}
\end{figure}


\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Bernoulli-easy-regret-comparison.pdf}
  \caption{\footnotesize{Comparison of cumulative regret for CombUCB and Ordered CombUCB on synthetic Bernoulli bandit instance.}}\label{fig:Bernoulli}
\end{figure}




\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Bernoulli-real-regret-comparison.pdf}
  \caption{\footnotesize{Comparison of cumulative regret for CombUCB and Ordered CombUCB on semi-synthetic bandit instance. }}\label{fig:Real}
\end{figure}






\subsection{$K$-Best Arm identification}

In this section, we first aim to find top $K=5$ arms 
of the (first) synthetic bandit instance used in the above experiments, i.e., when means of each of the 50 arms are sampled uniformly in $[0.25,0.75]$. We use the algorithm of \citet{bubeck2013multiple} as baseline (referred to as SAR) against our proposed Algorithm~\ref{alg:best_arm}. We observe that (plot not shown), within 5000 rounds, both the algorithms are able to find top 5 arms. This, we believe, is due to the fact that the \emph{problem instance is easy} (i.e., top-5 arms are easy to find when the mean rewards are fairly spread out). 


To demonstrate the advantage of our algorithm, we generate a \emph{hard instance} by sampling arm means uniformly in $[0.45, 0.55]$. The rewards are sampled from Gaussian (Figure~\ref{fig:Gauss_error}) and Bernoulli (Figure~\ref{fig:Bernoulli_error}) distributions with aforementioned means and projected to $[0,1]$. 
We run both the algorithms for budgets $T\in\{1000,\ldots,10000\}$ for 100 independent trials and compute the fraction of trials for which they fail to output the optimal allocation. In Figures~\ref{fig:Gauss_error} and~\ref{fig:Bernoulli_error}, we compare the probability of error of Ordered SAR (Algorithm~\ref{alg:best_arm}) with the SAR algorithm of \citet{bubeck2013multiple} as a function of the budget, i.e., number of rounds. We find that the failure probability of Ordered SAR is consistently lower than that of SAR, which validates our theory.

\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Gaussian-hard-error-comparison.pdf}
  \caption{\footnotesize{Comparison of probability of error for SAR and Ordered SAR on synthetic Gaussian bandit instance.}}\label{fig:Gauss_error}
\end{figure}


\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Bernoulli-easy-error-comparison.pdf}
  \caption{\footnotesize{Comparison of probability of error for SAR and Ordered SAR on synthetic Bernoulli bandit instance.}}\label{fig:Bernoulli_error}
\end{figure}


\section{Conclusions and Future Work}
\label{sec:concl}
We identify and formulate an important problem arising in two-stage recommendation systems that employ different experts for different categories of items. We propose solutions, adapting existing algorithms for combinatorial multi-arm bandits, and provide regret/error bounds that strictly improve over state-of-the-art for our setting. Our work opens up interesting follow-up research questions: i) can we incorporate user context while selecting top $K$ items, when available? ii) can we design an algorithm to find the optimal allocation with a \emph{fixed confidence}, say $\delta$, and find the sample complexity of this strategy as a function of $\delta$?  
We conjecture that a variant of the \emph{Combinatorial Lower Upper Confidence Bound} algorithm of \citet{kalyanakrishnan2012pac} adapted to the ordering of lists would work in this setting. Another interesting direction is to lift these results to the setting of distributed bandits~\citep{korda2016distributed,mahadik2020fast}.




















% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% \begin{figure}[!htb]
%   \centering
%   \includegraphics[width=0.7\linewidth]{pitt}
%   \caption{A View of a Nice City.}\label{fig:pitt}
% \end{figure}


% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}






% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     SRC and GS contributed jointly as a first author.
% \end{contributions}

%\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
   % Briefly acknowledge people and organizations here.

    % \emph{All} acknowledgements go in this section.
%\end{acknowledgements}


\bibliography{chowdhury_387}
\end{document}
