%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{amsmath,amsfonts,amssymb,amsthm,amsxtra,graphicx,verbatim,epsfig,color,enumerate,array,mathtools,dsfont,mathrsfs,hyperref,url,bookmark, subcaption, wrapfig,thmtools,thm-restate,float, bm}

\usepackage[ruled,linesnumbered,vlined]{algorithm2e}
\usepackage{algpseudocode} % command set for algorithm2e

\usepackage{mathabx}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
%\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%\newtheorem[theorem][Remark]

\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cE}{\mathcal{E}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cJ}{\mathcal{J}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cZ}{\mathcal{Z}}

\newcommand{\kA}{\mathfrak{A}}
\newcommand{\kB}{\mathfrak{B}}
\newcommand{\kC}{\mathfrak{C}}
\newcommand{\kD}{\mathfrak{D}}
\newcommand{\kE}{\mathfrak{E}}
\newcommand{\kF}{\mathfrak{F}}
\newcommand{\kG}{\mathfrak{G}}
\newcommand{\kH}{\mathfrak{H}}
\newcommand{\kI}{\mathfrak{I}}
\newcommand{\kJ}{\mathfrak{J}}
\newcommand{\kK}{\mathfrak{K}}
\newcommand{\kL}{\mathfrak{L}}
\newcommand{\kM}{\mathfrak{M}}
\newcommand{\kN}{\mathfrak{N}}
\newcommand{\kO}{\mathfrak{O}}
\newcommand{\kP}{\mathfrak{P}}
\newcommand{\kQ}{\mathfrak{Q}}
\newcommand{\kR}{\mathfrak{R}}
\newcommand{\kS}{\mathfrak{S}}
\newcommand{\kT}{\mathfrak{T}}
\newcommand{\kU}{\mathfrak{U}}
\newcommand{\kV}{\mathfrak{V}}
\newcommand{\kW}{\mathfrak{W}}
\newcommand{\kX}{\mathfrak{X}}
\newcommand{\kY}{\mathfrak{Y}}
\newcommand{\kZ}{\mathfrak{Z}}

\newcommand{\Real}{\mathbb{R}}
\newcommand{\Nat}{\mathbb{N}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\Hilbert}{\mathcal{H}}
\newcommand{\Pmeas}{\mathfrak{M}_1^+}
\newcommand{\Meas}{\mathcal{M}}

\newcommand{\Esp}{\mathbb{E}}
\newcommand{\Var}{\mathbb{V}}
\renewcommand{\Pr}{\mathbb{P}}
\newcommand{\PP}{{\cal P}}
\newcommand{\Pn}{\mathcal{P}_n}

\newcommand{\Rad}{\mathcal{R}}
\newcommand{\Normal}{\cN}
\newcommand{\Bern}{\mathcal{B}}



\newcommand{\supp}{\mathrm{supp}}
% \newcommand{\kl}{\texttt{KL}}
% \newcommand{\KL}{\texttt{KL}}
% \newcommand{\klber}{\texttt{kl}}
% \newcommand{\TV}{\texttt{TV}}
% \newcommand{\Span}{\mathbb{S}}


\newcommand{\ind}{\mathbb{I}}
\newcommand{\indic}[1]{\mathbb{I}\{#1\}}
\renewcommand{\mod}[2]{[#1 \,\, \mathrm{mod} \,\, #2]}
\newcommand{\fracpartial}[2]{\frac{\partial #1}{\partial  #2}}
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\expect}[1]{\mathbb{E}\left[{#1}\right]}
\newcommand{\prob}[1]{\mathbb{P}\left[{#1}\right]}
\newcommand{\given}{\; \big\vert \;} 
\newcommand{\bydef}{:=}
\newcommand{\inner}[2]{\langle #1, #2 \rangle}
\newcommand{\at}[2][]{#1|_{#2}}
\newcommand*{\as}{\mathrm{(a.s.)}}
\newcommand*{\iid}{i.i.d.\xspace}

\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\Argmax}{\mathop{\mathrm{Argmax}}}
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argsup}{\mathop{\mathrm{argsup}}}
\newcommand{\arginf}{\mathop{\mathrm{arginf}}}

\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\beqn}{\begin{equation*}}
\newcommand{\eeqn}{\end{equation*}}
\newcommand{\beqa}{\begin{eqnarray}}
\newcommand{\eeqa}{\end{eqnarray}}
\newcommand{\beqan}{\begin{eqnarray*}}
\newcommand{\eeqan}{\end{eqnarray*}}

\renewcommand{\phi}{\varphi}
\renewcommand{\epsilon}{\varepsilon}
\renewcommand{\leq}{\leqslant}
\renewcommand{\geq}{\geqslant}
\renewcommand{\hat}{\widehat}
\newcommand{\wh}{\widehat}
\newcommand{\ol}{\overline}
\newcommand{\mt}{\widetilde{\mu}}
\newcommand{\wt}{\widetilde}
\renewcommand{\d}{\mbox{d}}
\newcommand{\nup}{\kappa}
\newcommand{\eps}{\varepsilon}
\newcommand{\ra}{\rightarrow}
\newcommand{\eqdef}{\stackrel{\rm def}{=}}
\newcommand{\Otilde}[1]{\tilde{O}\left(#1\right)}
%\newcommand{\qed}{\hfill$\square$}

\DeclareMathOperator{\Tr}{tr}
\DeclareMathOperator{\T}{T}
\DeclareMathOperator{\Rk}{rank}
\DeclareMathOperator{\Dg}{diag}
\DeclareMathOperator{\F}{F}
\DeclareMathOperator{\HS}{HS}
\DeclareMathOperator{\op}{op}
\DeclareMathOperator{\V}{\mathbb{V}ar}
\DeclareMathOperator{\C}{\mathbb{C}ov}
\DeclareMathOperator{\parent}{par}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\MI}{I}
\DeclareMathOperator{\KL}{KL}
\DeclareMathOperator{\kl}{kl}
\DeclareMathOperator{\TV}{TV}
\DeclareMathOperator{\diff}{d}
\DeclareMathOperator{\bC}{\mathbb{C}}

\DeclareMathOperator{\Reg}{Reg}
\DeclareMathOperator{\Risk}{Risk}

\DeclareMathOperator{\reg}{reg}
\DeclareMathOperator{\risk}{risk}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Combinatorial Categorized Bandits with Expert Rankings\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<t-sayakr@microsoft.com>?Subject=Your UAI 2023 paper}{Sayak Ray Chowdhury$^*$}{}}
\author[1]{Gaurav Sinha$^*$}
\author[1]{Nagarajan Natarajan}
\author[1]{Amit Sharma}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Microsoft Research\\
    Bengaluru, India
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
\begingroup\renewcommand{\thefootnote}{*}\footnotetext{Equal contribution}\endgroup % keep the `*' mark local so later footnotes are numbered normally

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix

% \section{Baseline}

% A natural baseline is a straightforward adaptation of UCB on all the $NK$ keywords in the following way:

% \begin{itemize}
%     \item Play keywords given by each ranked list $\pi_i$ for one round and get reward estimates $\hat\mu(a_i^1),\ldots,\hat\mu(a_i^K)$ of each keyword. Also, compute the UCB estimates $ \text{UCB}(a_i^1),\ldots,\text{UCB}(a_i^K)$
%     \item Put all the $NK$ keywords in one list and denote UCB estimates as $\text{UCB}(1),\ldots,\text{UCB}(NK)$. Play the top $K$ keywords as
%     \begin{align}
%         \argmax_{A \subset 2^{NK}:|A|=K} \sum_{a \in A}\text{UCB}(a)
%     \end{align}
%     (Note that choosing items by merely sorting the UCB's won't help us get a bound since we have assumed additive reward structure. We need to do a combinatorial optimization like above.)
%     \item Update reward and UCB estimates of the chosen $K$ keywords.
% \end{itemize}
% Here, we work with total $NK$ actions and at each round choose $K$ actions. Hence, the regret of UCB would be $O(\sqrt{K\cdot NK\cdot T\log T})=O(K\sqrt{NT \log T})$ \citep{kveton2015tight}.

% Note that this algorithm doesn't use the given ranked lists. With this additional structure of $N$ ranked lists, we can choose just $N$ allocations and hope to get a regret of the order $O(\sqrt{K\cdot N \cdot T\log T})$.

% Let us first adapt the proof idea of \citet{kveton2015tight} to our setting, and see if we can get any improvement over them.

% Let $\cE_t$ denotes the event that for every action $a$, the following relation holds:
% \begin{equation}
% \mid \hat \mu_{t-1}(a) - \mu(a) \mid \leq \sqrt{\frac{3 \log t}{2n_t(a)}}.
% \end{equation}
% It holds that $\prob{\cE_t} \geq 1-2NK/t^2$. Under the event $\cE_t$ and by our choice of $z_t$, we have 
% \begin{equation}
% f_{z_t}(U_{t-1}) \geq f_{z^*}(U_{t-1}) = \sum_{i=1}^N \sum_{j=1}^{z_i^*} U_{t-1}(a_i^j) \geq \sum_{i=1}^N \sum_{j=1}^{z_i^*} \mu(a_i^j) = f_{z^*}(\mu).
% \end{equation}
% Define, for any allocation $z \in \cZ$, the loss in mean reward as $\Delta_z=f_{z^*}(\mu)-f_{z}(\mu)$. Then, we have $\text{Reg}(T)=\sum_{t=1}^T \Delta_{z_t}$, where we can bound $\Delta_{z_t}$ as
% \begin{equation}
%     \Delta_{z_t} \leq f_{z_t}(U_{t-1}) - f_{z_t}(\mu) = \sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \left( U_{t-1}(a_i^j) - \mu(a_i^j) \right)~.
% \end{equation}
% Here $z_t^+ = \lbrace i \in [N]:z_{t,i} > 0\rbrace$ denotes the index of lists those contribute at least one action at round $t$.

% Define, for each list $i \in [N]$, its gap as
% \begin{equation}
%     \Delta_{\min,i} = \min_{z \neq z^*:z_i > 0} \Delta_z~.
% \end{equation}
% Then, for all $i \in z_t^+$, we have $\Delta_{\min,i} \leq \Delta_{z_t}$. Therefore, we obtain $M_{z_t}:=\max_{i \in z_t^+} \Delta_{\min,i} \leq \Delta_{z_t}$.

% Using the above, we upper bound $\Delta_{z_t}$ under the event $\cE_t$ as
% \begin{align}
%     \Delta_{z_t} \leq 2\sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \left( U_{t-1}(a_i^j) - \mu(a_i^j) \right) - M_{z_t} &\leq 4 \sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \sqrt{\frac{3 \log t}{2n_t(a_i^j)}} - M_{z_t}\\
%     & = \sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \left(\sqrt{\frac{24 \log t}{n_t(a_i^j)}} - \frac{M_{z_t}}{K} \right)\\
%     & \leq \sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \left(\sqrt{\frac{24 \log T}{n_t(a_i^j)}} - \frac{\Delta_{\min,i}}{K} \right).
% \end{align}
% Define $g_T(\Delta_{\min,i})=\frac{24K^2\log T}{\Delta_{\min,i}^2}$ and set $\beta_T(\Delta_{\min,i},s) = \sqrt{\frac{24\log T}{s}}\indic{s \leq g_T(\Delta_{\min,i})}$. This yields
% \begin{align}
%     \Delta_{z_t} \leq \sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \beta_T(\Delta_{\min,i},n_t(a_i^j)).
%     \end{align}
% Therefore, we can upper bound regret under the event $\cap_{t=1}^T\cE_t$ as
% \begin{align}
%     \text{Reg}(T) &\leq \sum_{t=1}^T \sum_{i \in z_t^+} \sum_{j \leq z_{t,i}} \beta_T(\Delta_{\min,i},n_t(a_i^j))\\
%     & = \sum_{i=1}^N \sum_{j=1}^K \sum_{s=1}^{n_T(a_i^j)} \beta_T(\Delta_{\min,i},s)\\
%    & =  \sum_{i=1}^N \sum_{j=1}^K \sum_{s=1}^{g_T(\Delta_{\min,i})}\sqrt{\frac{24\log T}{s}}\\
%    & \leq \sum_{i=1}^N \sum_{j=1}^K  2\sqrt{24 g_T(\Delta_{\min,i})\log T }\\
%    & = C\sum_{i=1}^N K\cdot\frac{ K\log T}{\Delta_{\min,i}}.
% \end{align}

% Let's now see what regret vanilla-UCB \citep{kveton2015tight} gets in this setting. This algorithm choose a subset $S$ of $K$ actions form the action space $\cS = \lbrace S \subset 2^{NK} : |S|=K\rbrace$. Define the loss in mean reward for the subset $S$ as
% \begin{align}
%    \widetilde \Delta_S = \sum_{a \in S^*} \mu(a) - \sum_{a \in S}\mu(a).
% \end{align}
% Now, define for each item $a \in [NK]$, its gap
% \begin{align}
%     \widetilde \Delta_{\min,a} = \min_{S\neq S^*: a \in S} \widetilde\Delta_S ~.
% \end{align}
% With this notation, they get the regret bound
% \begin{align}
%     Reg(T) \leq \widetilde C\sum_{a=1}^{NK} \frac{K\log T}{\widetilde \Delta_{\min,a}}
% \end{align}


% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

\section{Proofs}

\subsection{Regret Upper Bound of Algorithm 1}

We start by recalling that $\Delta_{z_t}:=f^{\mu^*}(z^*)-f^{\mu^*}(z_t)$ denotes the regret or \emph{gap} of selecting allocation $z_t$ at round $t$ instead of the optimal allocation $z^*$.
Now, define the event
\begin{align*}
    \cE_t  = \left\lbrace \Delta_{z_t} \leq 2\sum_{(i,j): z_i^* < j \leq z_{t,i} } \sqrt{\frac{1.5\log T}{T_{t-1}(a_{i,j})}}, \Delta_{z_t} > 0 \right\rbrace.
\end{align*}
Also, define $\hat R_T= \sum_{t=MN+1}^T \Delta_{z_t}\mathds{1}\lbrace \cE_t \rbrace$. Then, from~\citet[Lemma 1]{kveton2015tight}, it holds that
\begin{align}\label{eq:high-prob}
    R_T \leq \expect{\hat R_T} + (1+\pi^2/3)KMN~.
\end{align}
Now, let us consider two sequences of constants $(\alpha_i)_{i \geq 1}$ and $(\beta_i)_{i \geq 0}$ such that $\beta_0=1$, $\alpha_i < \alpha_j$, $\beta_i < \beta_j$ for all $i > j$, $\lim_{i \to \infty}\alpha_i=\lim_{i \to \infty}\beta_i=0$ and $\sum_{i \geq 1}\frac{\beta_{i-1}-\beta_i}{\sqrt{\alpha_i}} \leq 1/\sqrt{6}$.

Let $A_t$ denote the subset of items induced by the allocation $z_t$ chosen at round $t$. In other words, an item $a_{i,j} \in A_t$ if $z_{t,i} \geq j$. Similarly, let $A^*$ denote the corresponding subset induced by the optimal allocation $z^*$. Let $\tilde A_t = A_t\setminus A^*$ and $m_{i,t}=\frac{\alpha_i K^2 \log T}{\Delta_{z_t}^2}$. Now, similar to~\citet{kveton2015tight}, define the series of mutually exclusive events $(G_{i,t})_{i \geq 1}$, where $G_{i,t}$ denotes the event that
at least $\beta_i K$ items in $\tilde A_t$ were observed at most $m_{i,t}$ times and
for all $j < i$, fewer than $\beta_j K$ items in $\tilde A_t$ were observed at most $m_{j,t}$ times. Then, under the event $\cE_t$, it holds that the event $\lbrace\bigcup_{i \geq 1}G_{i,t}\rbrace$ happens~\citep[Lemma 3]{kveton2015tight}. Hence, we have
\begin{align*}
    \hat R_T  =\sum_{l=1}^{\infty} \sum_{t=MN+1}^T \Delta_{z_t} \mathds{1}\lbrace G_{l,t}, \Delta_{z_t}> 0 \rbrace .
\end{align*}
Now for any item $a_{i,j}$, let us define the events
\begin{align*}
F_{a_{i,j},l,t} = \lbrace  z^*_i < j \leq z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace,\;\; 
G_{a_{i,j},l,t} = G_{l,t} \bigcap F_{a_{i,j},l,t}.   
\end{align*}
Let us now define the following events:
\begin{align*}
    F^k_{a_{i,j},l,t} = \lbrace  z^*_i < j+k = z_{t,i}, T_{t-1}(a_{i,j+k}) \leq m_{l,t}\rbrace, k \geq 0~.
\end{align*}
Note that, because of the ordered structure, if $a_{i,j}$ has only been observed a certain number of times, then $a_{i,j+k}$ would be observed at most as many times, i.e., $T_{t-1}(a_{i,j+k}) \leq T_{t-1}(a_{i,j})$, which, in turn, implies that
\begin{align*}
    \widetilde F^k_{a_{i,j},l,t} := \lbrace  z^*_i < j+k = z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace \subseteq F^k_{a_{i,j},l,t}~.
\end{align*}
It turns out that $F_{a_{i,j},l,t} =  \bigcup_{k=0}^{M-j} \widetilde F^k_{a_{i,j},l,t}$, which in turn implies $F_{a_{i,j},l,t} \subseteq \bigcup_{k=0}^{M-j} F^k_{a_{i,j},l,t}=: H_{a_{i,j},l,t}$. This implies that $\bigcup_{j=1}^M \lbrace G_{l,t} \bigcap F_{a_{i,j},l,t} \rbrace \subseteq  \bigcup_{j=1}^M \lbrace G_{l,t} \bigcap H_{a_{i,j},l,t} \rbrace $. Now observe that $H_{a_{i,1},l,t} \supseteq H_{a_{i,2},l,t} \supseteq \ldots \supseteq H_{a_{i,M},l,t}$, implying that the RHS of the above is a union over decreasing sets, and hence it holds that $\bigcup_{j=1}^M \lbrace G_{l,t} \bigcap F_{a_{i,j},l,t} \rbrace \subseteq  \lbrace G_{l,t} \bigcap H_{a_{i,1},l,t} \rbrace $. This further implies
\begin{align*}
 \bigcup_{j=1}^M G_{a_{i,j},l,t} = \bigcup_{j=1}^M \left\lbrace G_{l,t} \bigcap F_{a_{i,j},l,t} \right\rbrace \subseteq \bigcup_{k=0}^{M-1} \left\lbrace G_{l,t} \bigcap F^k_{a_{i,1},l,t} \right \rbrace = \bigcup_{j=1}^M \left\lbrace G_{l,t} \bigcap \lbrace  z^*_i < j = z_{t,i}, T_{t-1}(a_{i,j}) \leq m_{l,t}\rbrace\right \rbrace
\end{align*}
Then, it holds that
\begin{align*}
 \mathds{1}\lbrace G_{l,t}, \Delta_{z_t}> 0 \rbrace \leq  \frac{1}{\beta_l K}\sum_{(i,j): z^*_i < j } \mathds{1}\lbrace   z_{t,i}=j, T_{t-1}(a_{i,j}) \leq m_{l,t}, \Delta_{z_t}> 0 \rbrace.
\end{align*}
Therefore, we can bound $\hat R_T$ as
\begin{align*}
    \hat R_T \leq \sum_{(i,j): z^*_i < j } \sum_{l=1}^\infty \sum_{t=MN+1}^T\mathds{1}\lbrace   z_{t,i}=j, T_{t-1}(a_{i,j}) \leq m_{l,t}, \Delta_{z_t}> 0 \rbrace \frac{\Delta_{z_t}}{\beta_l K}.
\end{align*}
Now let each item $a_{i,j}$ that is not included in the optimal allocation $z^*$ be contained in $N_{i,j}$ suboptimal allocations $z$, and let $\Delta_{i,j,1}\geq \Delta_{i,j,2} \geq \ldots \geq  \Delta_{i,j,N_{i,j}}$ be the gaps of these allocations. Then, we have
\begin{align*}
    \hat R_T &\leq \sum_{(i,j): z^*_i < j } \sum_{l=1}^\infty \sum_{t=MN+1}^T
    \sum_{k=1}^{N_{i,j}}\mathds{1}\lbrace   z_{t,i}=j, T_{t-1}(a_{i,j}) \leq \frac{\alpha_lK^2\log T}{\Delta^2_{i,j,k}}, \Delta_{z_t}=\Delta_{i,j,k} \rbrace \frac{\Delta_{i,j,k}}{\beta_l K}\\
    & \leq \sum_{(i,j): z^*_i < j } \sum_{l=1}^\infty \frac{\alpha_l K \log T}{\beta_l}\left( \frac{1}{\Delta_{i,j,1}} + \sum_{k=2}^{N_{i,j}}\Delta_{i,j,k}\left(\frac{1}{\Delta_{i,j,k}^2}-\frac{1}{\Delta^2_{i,j,k-1}} \right) \right),
\end{align*}
where the last term is the solution of the optimization problem:
\begin{align*}
\max_{(z_1,\ldots,z_T)}\sum_{t=1}^T\sum_{k=1}^{N_{i,j}} \mathds{1}\lbrace   z_{t,i}=j > z_i^*, T_{t-1}(a_{i,j}) \leq \frac{\alpha_lK^2\log T}{\Delta^2_{i,j,k}}, \Delta_{z_t}=\Delta_{i,j,k} \rbrace \frac{\Delta_{i,j,k}}{\beta_l K}.
\end{align*}
Then, similar to~\citet{kveton2015tight}, we obtain
\begin{align*}
  \hat R_T \leq   \sum_{(i,j): z^*_i < j } \sum_{l=1}^\infty \frac{2\alpha_l K \log T}{\beta_l \Delta_{i,j,N_{i,j}}} \leq \sum_{(i,j): z^*_i < j } \frac{534K\log T}{\Delta_{i,j}},
\end{align*}
where $\Delta_{i,j}$ denotes the minimum gap as defined in the main paper, and is given by $\Delta_{i,j,N_{i,j}}$. Now, from \eqref{eq:high-prob}, we can complete the proof.




% --------------------------------------------------------------------------------------------------

% Here is a possible improvement in the definition of element level gaps leading to better overall regret.

% Consider Equations (14) and (15) in \cite{kveton2015tight}.
% For any element (keyword) $e$ and time $t$ define the two events

% \begin{align*}
%    & G_{e,1,t} = G_{1,t} \cap \{e\in \Tilde{A}_t, T_{t-1}(e) \leq \alpha K^2 \frac{6}{\Delta_{A_t}^2} \log n\} \\
%    & G_{e,2,t} = G_{2,t} \cap \{e\in \Tilde{A}_t, T_{t-1}(e) \leq \frac{\alpha d^2}{(\sqrt{\alpha}-1)^2}\frac{6}{\Delta_{A_t}^2}\log n\}
% \end{align*}

% These are then used to give the regret as follows:

% \[
% \widehat{R}(n) \leq \sum_{e \in \Tilde{E}}\sum_{t=t_0}^n\mathds{1}\{G_{e,1,t}, \Delta_{A_t}>0\}\frac{\Delta_{A_t}}{d} + \sum_{e\in \Tilde{E}}\sum_{t=t_0}^n \mathds{1}\{G_{e,2,t}, \Delta_{A_t}>0\}\Delta_{A_t}
% \]





% We claim that under the prefix structure the above calculation has a lot of double counting. To correct this define some more events $F_{e,1,t}^j$ for integers $j\geq 0$. Assume $e+j$ represents the element $j$ positions to the right of $e$ in the same nominator. Define the events

% \begin{align*}
%     &F_{e,1,t} = \{e\in \Tilde{A}_t, T_{t-1}(e) \leq \alpha K^2 \frac{6}{\Delta_{A_t}^2} \log n\},\\
%     &F_{e,1,t}^j = \{e+j\in \Tilde{A}_t, e+j+1\notin \Tilde{A}_t , T_{t-1}(e+j) \leq \alpha K^2 \frac{6}{\Delta_{A_t}^2} \log n\}
% \end{align*}

% We can show that 
% \[
% F_{e,1,t} \subseteq F_{e,1,t}^0 \cup F_{e,1,t}^1 \cup \ldots F_{e,1,t}^k = H_{e,1,t}
% \]

% This is true because of the prefix structure i.e. if $e$ has only been observed a certain number of times, $e+j$ would be observed even lesser number of times i.e., $T_{t-1}(e+j) \leq T_{t-1}(e) \Rightarrow $
% \[
% \{e+j\in \Tilde{A}_t, e+j+1\notin \Tilde{A}_t , T_{t-1}(e) \leq \alpha K^2 \frac{6}{\Delta_{A_t}^2} \log n\} \subseteq F_{e,1,t}^j
% \]
%  The union of all the LHS above for $j\geq 0$ is $F_{e,1,t}$ proving the containment. Let $E_i$ be all elements in nominator $i$. The above containment implies that $G_{1,t}\cap F_{e,1,t} \subseteq G_{1,t}\cap H_{e,1,t} \Rightarrow$
 
%  \[
%  \bigcup_{e \in E_i} G_{1,t}\cap F_{e,1,t} \subseteq  \bigcup_{e \in E_i} G_{1,t}\cap H_{e,1,t}
%  \]
%  Now notice that $H_{e,1,t}\supseteq H_{e+1,1,t} \supseteq \ldots$, implying that the RHS of the above is a union over decreasing events (sets), thereby giving
%  \[
%   \bigcup_{e \in E_i} G_{1,t}\cap F_{e,1,t} \subseteq   G_{1,t}\cap H_{e_0,1,t}
%  \]
%  where $e_0$ is the first element in nominator $i$. RHS can be further written as unions of intersection
%  \[
%  G_{1,t}\cap H_{e_0,1,t} = (G_{1,t}\cap F_{e_0, 1, t}^0) \cup (G_{1,t}\cap F_{e_0, 1, t}^1) \cup \ldots
%  \]
%  Our regret calculation will be simplified because
%  \[
%  \sum_{e\in E_i}\sum_{t=t_0}^n \mathds\{G_{e,1,t}, \Delta_{A_t} > 0\} \leq \sum_{j=1}^k\sum_{t=t_0}^n \mathds\{G_{1,t}\cap F_{e_0,1,t}^k, \Delta_{A_t}>0\}
%  \]

%  To bound the individual component corresponding to $G_{1,t}\cap F_{e_0,1,t}^k$, we will only need to look at sub-optimal prefixes that end at $e_0+k$, as compared to the calculation in \cite{kveton2015tight}. So now we will define the gap of an element as $\Delta_{a_i^j, min} = \min_{\bm{z}\neq \bm{z}^* : a_i^j\in \bm{z}, a_i^{j+1}\notin \bm{z}} \Delta_{\bm{z}}$.

 \subsection{Probability of Error of Algorithm 2}
 
 As a first step we define an event $\eta$ which helps us in the rest of the proof. Note that \cite{bubeck2013multiple} also defined a similar event $\xi$ in their proof of Theorem $1$, but our event is different from theirs. In $\eta$ we only consider the $|\Phi|$ ($\leq 2N$) sized subset of phases where some item from the boundary set $\Phi$ is accepted or rejected, whereas \cite{bubeck2013multiple} considered all the $MN-1$ phases. Let $k_1\leq k_2 \leq \ldots \leq k_{|\Phi|}$ be the phases where items in $\Phi$ were accepted or rejected. Under this notation $H_\Phi = \max\limits_{1\leq i\leq |\Phi|} \frac{MN+1-k_i} {\Delta_{[MN+1-k_i]}^{2}}$. Consider the event $\eta$ defined by
\[
\{\forall i\in \{1,\ldots, MN\}, \forall k \in \{k_1, \ldots, k_{|\Phi|}\}, \abs{\frac{1}{T_k}\sum\limits_{s=1}^{T_k}X_{i,s}-\mu_i} \leq \frac{1}{4}\Delta_{[MN+1-k]}\}
\]
Note that by abuse of notation, we have renamed our items as $1, \ldots, MN$ above. Also $X_{i,s}$ denotes the Bernoulli reward received for item $i$ in its $s^{th}$ pull so far. Recall that $T_{k}$ was defined as $\frac{T-MN}{\widebar{\log}(MN) (MN+1-k)}$. By Hoeffding's inequality and union bound, we bound the probability of the complement event $\bar{\eta}$ as

\[
\begin{split}
   \mathbb{P}(\bar{\eta}) &\leq \sum\limits_{i=1}^{MN}\sum\limits_{j=1}^{|\Phi|} \mathbb{P}\left(\abs{\frac{1}{T_{k_j}}\sum\limits_{s=1}^{T_{k_j}}X_{i,s}-\mu_i} > \frac{1}{4}\Delta_{[MN+1-k_j]}\right)\\
   &\leq \sum\limits_{i=1}^{MN}\sum\limits_{j=1}^{|\Phi|} 2 \exp(-2T_{k_j} (\Delta_{[MN+1-k_j]}/4)^2)\\
   &\leq 2MN|\Phi| \exp\left(-\frac{T-MN}{8\widebar{\log}(MN)H_\Phi}\right)
\end{split}
\]

Next, we show that assuming the event $\eta$, the algorithm does not make any error. The proof of this part is very similar to the proof of Theorem $1$ in \cite{bubeck2013multiple}. The main difference is that in our case since we always accept items that are in the top of some list and reject items that are in the bottom of some list, we do not make any error until we reach an item in the boundary i.e. $\Phi$. We will claim that the event $\eta$ prevents these errors from happening. This is done by induction on the phases where some boundary item is accepted or rejected i.e. phases $k_1, \ldots, k_{|\Phi|}$. Since we only need to argue about correctness for these phases, we defined $\eta$ only for $k \in \{k_1, \ldots, k_{|\Phi|}\}$.
Note that this is the critical difference between our event $\eta$ and the corresponding event $\xi$ defined in the proof of Theorem $1$ in \cite{bubeck2013multiple}. Define $k_0=0$ and consider $j\geq 1$. Using an induction approach we assume that no errors have happened till phase $k_{j-1}$. As explained above the next error can only occur at an item in boundary and therefore has to occur at phase $k_{j}$. We show that under event $\eta$, this cannot happen. We will need the following observation.

\begin{observation}
\label{observation}
Let $e$ be the first item to be erroneously accepted or rejected. As mentioned above $e\in \Phi$. From Algorithm 2, we know that there is some active item $e^\prime$ in the same list as $e$ with the highest empirical gap $\widehat{\Delta}_{e^\prime}$ among all the active items. By the design of our algorithm, if $e$ was accepted it is the top item of its list which also contains the active element $e^\prime \Rightarrow \mu_{e^\prime} \leq \mu_e$. Since $e$ was erroneously accepted, it does not belong to the top $K$ items $\Rightarrow$ $e^\prime$ also does not belong to the top $K$ items. Similarly if $e$ was erroneously rejected (i.e. it belongs to top $K$ items), it is the bottom of its list which contains the active element $e^\prime$ $\Rightarrow$ $\mu_e \leq \mu_{e^\prime}$. Thus $e^\prime$ also belongs to top $K$ items.
\end{observation}

Event $\eta$ implies that at the end of stage $k_j$, empirical means of rewards of all items are within $\frac{1}{4}\Delta_{[MN+1-k_j]}$ of their true reward means. Let $A_{k_j} = \{a_1, \ldots, a_{MN+1-k_j}\}$ be the active set of items during phase $k_j$ with decreasing true reward means i.e. $\mu_{a_1} \geq \ldots \geq \mu_{a_{MN+1-k_j}}$. We assume that $K^\prime$ items in the top $K$ are left to be found at the starting of phase $k_j$. Using the induction assumption this implies $\{a_1, \ldots, a_{K^\prime}\} \subseteq \{1,\ldots, K\}$\footnote{By abuse of notation we are using $i$ to denote the $i^{th}$ item from the top.} and $\{a_{K^\prime + 1}, \ldots, a_{MN+1-k_j}\} \subseteq \{K+1, \ldots, MN\}$. Now there can be two types of errors.
\begin{itemize}
    \item \textbf{Type 1 error -} An item $a_l$ is accepted for some $l\geq K^\prime+1$.
    \item \textbf{Type 2 error -} An item $a_l$ is rejected for some $l\leq K^\prime$.
\end{itemize}
As done in \cite{bubeck2013multiple}, we only show that \textbf{Type 1 error} does not occur and the other can be shown symmetrically. We know that a boundary item is accepted or rejected in phase $k_j$, thus $a_l$ is a boundary item. Since $a_l$ is not in the top $K$ items, using Observation \ref{observation}, we get that there is some active item $a_p$ (in the same list as $a_l$) also not in top $K$ i.e. $p\geq K^\prime +1$ such that it has the highest empirical gap among all active items.


From here onwards our proof resembles the proof in \cite{bubeck2013multiple}. We can basically replace $a_j$ in their proof with our $a_p$, $K$ with $MN$ and $k$ with $k_j$, and repeat the steps that follow. However, to make it work we will have to use that $a_p$ is not in the top $K$ items as explained in Observation \ref{observation}. We show that $\Delta_{[MN+1-k_j]} > \max\{\mu_{a_1}-\mu_K, \mu_K - \mu_{a_{MN+1-k_j}}\}$. This cannot hold, since at stage $k_j$ only $k_j-1$ items have been accepted or rejected, implying that $\Delta_{[MN+1-k_j]} \leq \max\{\mu_{a_1}-\mu_K, \mu_K - \mu_{a_{MN+1-k_j}}\}$. This will give us a contradiction similar to \cite{bubeck2013multiple}. However, to show this we need to use the implication of our observation that $a_p$ is also not in the top $K$ items and that it has the highest empirical mean reward i.e. $\widehat{\mu}_{a_p} \geq \widehat{\mu}_{a}$ for all $a\in A_{k_j}$. This is true since it has the highest empirical gap and it led to acceptance of $a_l$ (see Algorithm 2).

\begin{itemize}
    \item Proof of $\Delta_{[MN+1-k_j]} \geq \mu_{a_1} - \mu_{K}$:
    \[
\begin{split}
&\widehat{\mu}_{a_p, T_{k_j}}\geq \widehat{\mu}_{a_1, T_{k_j}}\\
    &\Rightarrow \mu_{a_p} + \frac{1}{4}\Delta_{[MN+1-k_j]} \geq \mu_{a_1} - \frac{1}{4}\Delta_{[MN+1-k_j]}\\
    & \Rightarrow \Delta_{[MN+1-k_j]} \geq \mu_{a_1} - \mu_{a_p}
\end{split}
\]
Now, since $a_p$ is not in the top $K$ items we know that $\mu_{a_p} \leq \mu_K \Rightarrow \Delta_{[MN+1-k_j]} \geq \mu_{a_1} - \mu_{K}$.


\item Proof of $\Delta_{[MN+1-k_j]} > \mu_K - \mu_{a_{MN+1-k_j}}$: 

Let $\sigma:\{1,\ldots,MN+1-k_j\}\rightarrow A_{k_j}$ be a permutation with $\sigma(1)=a_p$, such that $\widehat{\mu}_{\sigma(1), T_{k_j}}\geq \ldots \geq \widehat{\mu}_{\sigma(MN+1-k_j), T_{k_j}}$.
Since $a_p$ has the highest empirical gap in this phase, we know that,
\begin{equation}
\label{equation:best-item}
    \widehat{\mu}_{a_p, T_{k_j}} - \widehat{\mu}_{\sigma(K^\prime+1), T_{k_j}} \geq \widehat{\mu}_{\sigma(K^\prime), T_{k_j}} - \widehat{\mu}_{\sigma(MN+1-k_j), T_{k_j}}
\end{equation}
We claim that there are at least $K^\prime +1$ items ($a_1, \ldots, a_{K^\prime}, a_p$) in $A_{k_j}$ such that their empirical mean rewards are $\geq \mu_K - \frac{1}{4}\Delta_{[MN+1-k_j]}$. This is trivially true for $a_1, \ldots, a_{K^\prime}$ since for $i\leq K^\prime$, event $\eta$ implies $\widehat{\mu}_{a_i, T_{k_j}} \geq \mu_{a_i} - \frac{1}{4}\Delta_{[MN+1-k_j]} \geq \mu_K - \frac{1}{4}\Delta_{[MN+1-k_j]}$. Since $a_p$ is not in the top $K$ items and has empirical mean reward $\widehat{\mu}_{a_p, T_{k_j}}\geq \widehat{\mu}_{a_1, T_{k_j}}$, this also holds for $a_p$. This basically implies that both $\widehat{\mu}_{\sigma(K^\prime)}, \widehat{\mu}_{\sigma(K^\prime+1)}$ are $\geq \mu_K - \frac{1}{4}\Delta_{[MN+1-k_j]}$. Note that, since $\widehat{\mu}_{\sigma(MN+1-k_j)}$ is smallest it is $\leq \widehat{\mu}_{a_{MN+1-k_j}}$ which under $\eta$, is $\leq \mu_{a_{MN+1-k_j}} + \frac{1}{4}\Delta_{[MN+1-k_j]}$. Also under $\eta$, $\widehat{\mu}_{a_p, T_{k_j}} \leq \mu_{a_p} + \frac{1}{4}\Delta_{[MN+1-k_j]}$. Putting all of these together we get that

\[
\begin{split}
    (\mu_{a_p} + \frac{1}{4}\Delta_{[MN+1-k_j]}) &- (\mu_K-\frac{1}{4}\Delta_{[MN+1-k_j]}) \geq (\mu_K - \frac{1}{4}\Delta_{[MN+1-k_j]}) - (\mu_{a_{MN+1-k_j}} + \frac{1}{4}\Delta_{[MN+1-k_j]})\\
    & \Rightarrow \Delta_{[MN+1-k_j]} \geq 2\mu_K - \mu_{a_p} - \mu_{a_{MN+1-k_j}} > \mu_K - \mu_{a_{MN+1-k_j}}
\end{split}
\]
where the last inequality again holds because $a_p$ is not in the top $K$ items i.e. $\mu_K > \mu_{a_p}$.
\end{itemize}
This completes our proof.

\section{Detailed Experiments}



\paragraph{Regret Minimization:} Here, we present simulation results for other choices of parameters $M, N, K$ and for 100 independent trials. In Figure~\ref{fig:Bernoulli-supp}, we plot the results for a synthetic bandit instance with Bernoulli rewards and with $N=10, M=20, K=10$. Next, in Figure~\ref{fig:Gausss-supp}, we plot the results for a synthetic bandit instance with Gaussian rewards and with $N=5, M=20, K=10$.
Finally, in Figure~\ref{fig:Real-supp}, we plot the results for
a semi-synthetic bandit instance with 100 clusters (i.e., 100 total number of arms) and with $N=5, M=20, K=10$.
Similar to those reported in the main paper, in these experiments too we observe that our algorithm \emph{Ordered-CombUCB} fares much better than the baseline \emph{CombUCB}.




\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Bernoulli-easy-regret-comparison-supp.pdf}
  \caption{\footnotesize{Comparison of cumulative regret for CombUCB and Ordered CombUCB on synthetic Bernoulli bandit instance.}}\label{fig:Bernoulli-supp}
\end{figure}

\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Gaussian-easy-regret-comparison-supp.pdf}
  \caption{\footnotesize{Comparison of cumulative regret for CombUCB and Ordered CombUCB on synthetic Gaussian bandit instance.}}\label{fig:Gausss-supp}
\end{figure}

\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Gaussian-real-regret-comparison-supp.pdf}
  \caption{\footnotesize{Comparison of cumulative regret for CombUCB and Ordered CombUCB on semi-synthetic bandit instance. }}\label{fig:Real-supp}
\end{figure}



\paragraph{$K$-Best Arm Identification:}

Similar to the main paper, we generate a \emph{hard bandit instance} by sampling arm means uniformly in $[0.45, 0.55]$ and then sampling the rewards from Gaussian distributions with aforementioned means and projected to $[0,1]$.
We run our algorithm \emph{ordered SAR} and the baseline algorithm \emph{SAR} for rounds $T\in[1000,\ldots,10000]$.
To mitigate the effect of randomness (as seen in the plots reported in the main paper), we increase number of independent trials to 1000 and plot the probability of error for both algorithms in Figure~\ref{fig:Gauss_error-supp}.
Similar to those reported in the main paper, here too we find that the failure probability of Ordered SAR is consistently lower than that of SAR.



\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{Gaussian-hard-error-comparison-supp.pdf}
  \caption{\footnotesize{Comparison of probability of error for SAR and Ordered SAR on Gaussian bandit instance.}}\label{fig:Gauss_error-supp}
\end{figure}




\bibliography{chowdhury_387}

\end{document}
