\documentclass[accepted]{uai2023} 

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{enumitem}
\usepackage{thmtools, thm-restate}
\usepackage{comment}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{cancel}
\usepackage{eqnarray}

%\input{notation.tex}
%\DeclareMathOperator{\ent}{S}
\DeclareMathOperator{\Ex}{\mathbb{E}}
\DeclareMathOperator{\Var}{\text{Var}}
\DeclareMathOperator{\Cov}{\text{Cov}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
%\newcommand{\Pr}{\text{Pr}}
\newcommand{\sign}{\text{sign}}
\newcommand{\err}{\text{err}}

\newcommand{\calA}{\mathcal{A}}
\newcommand{\calB}{\mathcal{B}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calD}{\mathcal{D}}
\newcommand{\calX}{\mathcal{X}}
\newcommand{\calK}{\mathcal{K}}
\newcommand{\calG}{\mathcal{G}}
\newcommand{\calR}{\mathcal{R}}
\newcommand{\calP}{\mathcal{P}}
\newcommand{\calN}{\mathcal{N}}
\newcommand{\calM}{\mathcal{M}}
\newcommand{\calH}{\mathcal{H}}
\newcommand{\calF}{\mathcal{F}}
\newcommand{\calO}{\mathcal{O}}

\newcommand{\IPS}{\text{IPS}}
\newcommand{\BAL}{\text{BAL}}

\newcommand{\x}{{x}}                 % Context
\newcommand{\xRV}{{X}}               % Context Random Variable
\newcommand{\action}{{a}}                 % Action
\newcommand{\actionRV}{{A}}               % Action Random Variable
\newcommand{\reward}{{r}}                 % Reward
\newcommand{\rewardRV}{{R}}               % Reward Random Variable
\newcommand{\Util}{{U}}                 % Reward
\newcommand{\UtilIPS}{{\hat{U}^{\text{IPS}}}}                 % Reward

\newcommand{\old}{{log}}
\newcommand{\eval}{{aug}}
\newcommand{\target}{{tar}}

\newcommand{\policy}{\pi}
\newcommand{\policyspace}{\Pi}
\newcommand{\oldpolicy}{\pi_{\text{\old}}}
\newcommand{\targetpolicy}{\pi_{\text{\target}}}
\newcommand{\targetpolicyspace}{\policyspace_{\text{\target}}}
\newcommand{\evalpolicy}{\pi_{\text{\eval}}}
\newcommand{\blendedpolicy}{\pi_{\text{balanced}}}
\newcommand{\minvarpolicy}{\pi_{\text{minvar}}^{\text{IPS}}}
\newcommand{\prodpolicy}{\pi_{\text{prod}}}
\newcommand{\maxpolicy}{\pi_{\text{max}}}

\newcommand{\paren}[1]{\left(#1\right)}
\newcommand{\sqaren}[1]{\left[#1\right]}
\newcommand{\set}[1]{\left\{#1\right\}}

\newcommand{\dataset}{\calD}
\newcommand{\evaln}{{n_\text{\eval}}}
\newcommand{\oldn}{{n_\text{\old}}}
\newcommand{\evalD}{{\dataset_\text{\eval}}}
\newcommand{\oldD}{{\dataset_\text{\old}}}

\newcommand{\Ball}{{\text{Ball}}}
\newcommand{\Regret}{{\text{Regret}}}
\newcommand{\Reward}{{\text{Reward}}}
\newcommand{\KL}{{\text{KL}}}

\newcommand{\rewardmean}{{\bar \reward(\x,\action)}}
\newcommand{\rewardmeansq}{{\bar \reward^2(\x,\action)}}
\newcommand{\rewardvar}{{\sigma^2(\x,\action)}}
\newcommand{\expectedsquarer}{\paren{\rewardmeansq + \rewardvar}}

\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{optimizationproblem}{Optimization Problem}
\newtheorem*{remark}{Remark}
\newtheorem{definition}{Definition}[section]
\newtheorem{assumption}{Assumption}[section]


\newcommand{\cmark}{\ding{51}}%
\newlist{todolist}{itemize}{2}
\setlist[todolist]{label=$\square$}
\newcommand{\done}{\rlap{$\square$}{\raisebox{2pt}{\large\hspace{1pt}\cmark}}%
\hspace{-2.5pt}}

%\newcommand{\calA}{\mathcal{A}}
\newcommand{\Varm}{V^m_\text{arm}}
\newcommand{\Vcontext}{V_\text{state}}
\newcommand{\Barm}{B_\text{arm}}
\newcommand{\Bcontext}{B_\text{state}}

\newcommand{\width}{\text{width}}

\newcommand{\abbreviation}{BwCRO }
\newcommand{\abbreviationNS}{BwCRO}

%%%%%%%%%%%%% Comments
\newcommand{\tj}[1]{\footnote{\textcolor{magenta}{TJ comments: #1}}}
\newcommand{\at}[1]{\footnote{\textcolor{blue}{AT comments: #1}}}


%\input{sections/fixedn/notation}
\newcommand{\constant}{K}

\title{Bandits with Costly Reward Observations}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[ ]{\href{mailto:aarondtucker@cs.cornell.edu?Subject=Your UAI 2023 paper}{Aaron D. Tucker}}
\author[*]{Caleb Biddulph}
\author[*]{Claire Wang}
\author[ ]{Thorsten Joachims}

\affil[ ]{Department of Computer Science, Cornell University, Ithaca NY USA}
\affil[*]{Equal contribution, authors listed alphabetically}


\pagenumbering{arabic}
\begin{document}

\maketitle
\begin{abstract}
Many machine learning applications rely on large datasets that are conveniently collected from existing sources or that are labeled automatically as a by-product of user actions. However, in settings such as content moderation, 
%with rapidly changing distributions without automatic ground-truth feedback, 
accurately and reliably labeled data comes at substantial cost. If a learning algorithm has to pay for reward information, for example by asking a human for feedback, how does this change the exploration/exploitation tradeoff? We study this question in the context of bandit learning. Specifically, we investigate Bandits with Costly Reward Observations, where a cost needs to be paid in order to observe the reward of the bandit's action. We show that the observation cost implies an $\Omega(c^{1/3}T^{2/3})$ lower bound on the regret. Furthermore, we develop a general non-adaptive bandit algorithm which matches this lower bound, and we present several competitive adaptive learning algorithms for both k-armed and contextual bandits.
\end{abstract}

\section{Introduction}
Machine learning has proven extremely successful on tasks where accurately labeled data is readily available and abundant, such as in speech recognition or online advertising.
%Machine learning works extremely well in areas where there is a large amount of available data (such as image and speech recognition), or where feedback is automatically available (such as content recommendation, search engines, or online advertising).
However, there are many crucial settings such as content moderation of ephemeral messages, where latency constraints force decisions to be made without human assessment, and yet obtaining accurate reward information necessarily involves some costly interaction which would not automatically happen otherwise.
For example, while a search or ad engine can rely on users' clicks as a sufficiently accurate feedback signal, accurately labeling policy violations in content moderation is still based on human feedback.
In these situations, there is a tradeoff between collecting more labels to achieve better performance and collecting fewer labels to avoid the labeling cost.

We first study this exploration/exploitation tradeoff in the k-armed bandit setting, then extend our results to a setting where the algorithm can decide that it needs additional oversight in some contexts but not in others.
This problem is highly relevant to scalable oversight \citep{concreteproblems}, and more generally to modeling human preferences by learning from explicit human feedback \citep{unsolvedproblems}.

We refer to the specific setting studied in this paper as Bandits with Costly Reward Observations (BwCRO) (spoken bwick-roh). In this setting, the bandit problem is modified by adding a decision at each time step $t$ to pay or not to pay a known cost $c$ to observe the otherwise unknown reward $r_t$ \citep{activerl}. As in standard bandit problems, $a_t$ is the arm chosen at time $t$, and $r_t$ depends on $a_t$.
There are many different types of bandits that can be extended to the BwCRO setting, and their definitions are deferred to the relevant sections.

This setting can be used to analyze tradeoffs in a variety of domains. For example, an internet-of-things (IOT) device needs to account for its limited power supply during learning. Specifically, sensing the reward for an action can require substantial power, and the device needs to decide when to pay this updating cost. In chatbot optimization the chatbot needs to respond to questions in real-time, with an action space over possible utterances. However, observing the quality of an utterance can only be done through human assessment. Finally, in holistic recommendation we seek to maximize the value to a user according to a more holistic criterion than engagement metrics such as clicks. In this case, we would like to choose when it is worth it to get feedback from human assessors.

In this paper, we provide the following contributions.
First, we prove an information-theoretic $\Omega(c^{1/3}T^{2/3})$ lower bound on the regret in the BwCRO setting.
Second, we derive a novel algorithm for simple multi-armed \abbreviation which provably matches this lower bound up to a logarithmic factor.
Third, we develop a general method for turning any suitable $O(T^{1/2})$-regret bandit algorithm into an $O(c^{1/3}T^{2/3})$-regret BwCRO algorithm. 
And, fourth, we propose a novel heuristic algorithm for linear contextual \abbreviation which can adaptively choose when to query for a label depending on the context.
Beyond the derivation of the new learning methods and their theoretical characterization, we also present experiments which validate and compare the empirical performance of the different algorithms.


%\input{sections/setting}
\section{Setting Description}

We first recap basic bandit settings and relevant related work, leading to a formal definition of the Bandits with Costly Reward Observations setting.

\subsection{Setting Description}
\paragraph{Standard Bandit Settings.}
The multi-armed bandit setting creates a tradeoff between exploring new actions in order to understand their performance, and exploiting actions which have worked well in the past.
There is a (sometimes null) set of contexts $\calX$, a set of actions $\calA$, and an unknown mapping from context-action pairs to distributions over rewards $\rho: \calX \times \calA \to \Delta(\mathbb{R})$. At each timestep $t$, the policy $\pi: \calX \to \Delta(\calA)$ chooses an action $a_t \in A_t \subset \calA$ based on the context $x_t \in \calX$, and receives a reward $r_t \sim \rho(x_t, a_t)$. However, the agent does not typically know $\rho$, and must learn how to choose high-reward actions over time. If we denote the optimal action at time $t$ as $a^*_t = \argmax_{a \in A_t}\Ex\sqaren{r|X_t, a}$, then the agent seeks to minimize
$$\Regret = \sum_{t=1}^T\paren{\Ex\sqaren{r|X_t, a_t^*} - \Ex\sqaren{r|X_t, a_t}}.$$
It is also sometimes useful to refer to the regret over an interval $[j, k]$, in which case we denote $\Regret_{j:k} = \sum_{t=j}^k\Ex\sqaren{r|X_t, a^*_t} - \sum_{t=j}^k\Ex\sqaren{r|X_t, a_t}.$

\paragraph{Bandits with Costly Observations.} The previous setting does not consider that observing labels may incur direct costs, rather than only having an opportunity cost. Bandits with costly observations (also studied as a special case of active reinforcement learning in \cite{activerl}) adds an additional dimension that the algorithm must request a reward label and incur a known cost $c$ in order to observe the reward. In order to add reward observation costs to a normal bandit setting, the contexts are unchanged and the new action space is $\calA' = \calA \times \{\text{label}, \text{no label}\}$. Define $l_t = 1$ if a label is requested and $0$ otherwise, so that $a_t' = (a_t, l_t)$, and $r_t$ is sampled as before, but the reward is modified to be $r_t' = r_t - c$ if a label is requested or $r_t' = r_t$ otherwise.

Since the algorithm always gets a higher immediate reward by not requesting a label, each requested label increases the regret by $c$. For the sake of clarity, if there are $n$ labels we define the regret ignoring label costs as $\Regret^\circ$ 
$$\Regret^\circ = \sum_{t=1}^T\paren{\max_{a \in A_t}\Ex\sqaren{r|X_t, a} - \Ex\sqaren{r|X_t, a_t}},$$
and the regret including label cost as cRegret
$$\text{cRegret} =  \Regret = \Regret^\circ + cn.$$

\subsection{Background and Related Work}
While the Bandits with Costly Reward Observations setting has been studied before as a special case of different frameworks, our paper more thoroughly investigates this specific setting.

\paragraph{Active Learning.} 
\abbreviation is similar to active learning, but differs in that it pays a reward cost for additional labels rather than having a fixed labeling budget, and also makes labeling decisions one at a time in response to contextual information. There are several more closely related topics.

\paragraph{Partial Monitoring.} BwCRO can be seen as a special case of partial monitoring, which studies sequential decision-making with imperfect feedback that may or may not include the reward. Prior work in partial monitoring shows that when an algorithm must take suboptimal actions in order to learn whether it is taking the optimal action, it incurs an $O(T^{2/3})$ regret rate rather than an $O(T^{1/2})$ regret rate \citep{partialmonitoring}. However, our setting has a more specific structure where the difference in cost between the observed and unobserved actions is exactly $c$, and we prove a novel $O(c^{1/3})$ component of the regret.
%However, this framework does not have the additional structure that each observation costs $c$, and so the dependence on $c$ is not known until this paper.

\paragraph{Best Arm Identification.} BwCRO is related to best arm identification, which seeks to choose the arm with the highest expected reward in a multi-armed bandit setting with a fixed number of arms. All multi-armed bandit algorithms are related to best arm identification in that regret is incurred every time a suboptimal arm is chosen. Since the only way to get zero regret on a timestep is to choose the optimal arm, any algorithm with sublinear regret must eventually play the optimal arm the most often. Our regret lower bound proof uses the difficulty of identifying the optimal arm as a key component of the lower bound on regret, and our proposed Worth-it-Width algorithm can be seen as an $\epsilon$-best arm identification algorithm. However, our other algorithms can achieve high performance in settings where the arms can change every timestep, which is a quite different setting.

\paragraph{Active Reinforcement Learning.} BwCRO is closely related to active reinforcement learning, which adds a cost to observing the rewards in an RL setting. This work primarily focuses on MDPs instead of bandits \citep{activerl, owain}. In contrast to this previous work, we develop new algorithms that not only have empirical advantages, but also proven regret rates. Furthermore, we prove the first lower bound for this setting.
%This paper differs in that while it states the $O(T^{2/3})$ regret rate we provide a proof and include a term for $c$. We also provide novel algorithms, along with some proofs for their regret rates.


\section{Algorithms}
The core challenge of adding costly observations to bandit settings is that the algorithm must now decide when to request a label. We first present an algorithm which analyzes how to adapt the UCB algorithm to the \abbreviation setting, then show more general algorithms which work in the \abbreviation setting without the need for additional specialized analysis.
%But is there an easier and more general strategy for deriving algorithms for this setting? We answer in the affirmative with a simple baseline algorithm that adapts popular algorithms such as Thompson Sampling and UCB, as well as a heuristic algorithm which extends UCB methods into the \abbreviation setting more generally.

%\input{sections/simplemab/main}
\subsection{Algorithm for Multi-armed \abbreviation}
For non-contextual bandits, the question of when to request labels can be simplified to the question of when to stop requesting labels. 
%any algorithm that decides to request or not request a label as a deterministic function of its observations. 
Consider any algorithm that decides whether or not to request a label based on a deterministic function $f(\mathcal{O}_t) \to \{\text{no label}, \text{label}\}$ of its observations $\mathcal{O}_t$ up to time $t$. Without contextual information, if the algorithm does not request a label at time $t$, then $\mathcal{O}_{t+1} = \mathcal{O}_t$. Therefore, $f(\mathcal{O}_t) = \text{no label}$ implies that $f(\mathcal{O}_{t+1}) = \text{no label}$, and once an algorithm stops requesting labels it will never request labels again. This means that many algorithms can be designed by focusing only on when to stop requesting labels, which forms the basis of Algorithms \ref{alg:wiw} and \ref{alg:fixedn}.


The key idea is to stop requesting labels by tracking whether or not it is still plausible that the bandit instance that it is observing is one where it will be worth it to collect enough data to disambiguate between the arms. If the difference between two arms' average rewards $\Delta$ is small enough, then it is cheaper to simply mistakenly commit to an arm than to pay for enough labels to figure out which arm is better. In the one-armed bandit case, it is possible to compute the range of $\Delta$s where it is better to request labels than to commit, and then check if the upper bound on $\Delta$ is such that it is still plausible that it is worth it to request labels. This idea can then be extended to the multi-armed case.

\subsubsection{One-armed Bandit Setting and Algorithm}


We first consider the simplified one-armed bandit setting where there is only one stochastic arm with unknown average reward. This allows us to analyze only the question of when to stop requesting labels compared to committing to a known alternative. In this setting,  there are two options. The first is to choose an arm with a reward $r_t \in [0, 1]$ drawn stochastically from an unknown distribution with an unknown mean $\mu^*$. The second is to choose a holdout arm with a known average reward $\nu$. The per-step regret of choosing the wrong arm is $\Delta = |\mu^* - \nu|$. 
 
Our goal is to decisively claim that one arm is better than another. If we know (with probability at least $1-\delta$) that the stochastic or fixed arm is better, then there is no need for further labels. More formally, we define disambiguation. 

\begin{definition}[Disambiguate]
\label{definition:disambiguate}
	Two arms $a$ and $a'$ with means $\mu^*_a$ and $\mu^*_{a'}$ are disambiguated if with probability at least $1-\delta$ it can be said that either $\mu^*_a > \mu^*_{a'}$  or $\mu^*_a < \mu^*_{a'}$.
\end{definition}

How many labels does it take to disambiguate between the stochastic and the fixed arm?  To help the analysis, define $\hat \mu_n$ to be the empirical mean of the rewards of the stochastic arm based on $n$ samples, and define $\hat \Delta_n = |\hat \mu_n - \nu|$.


\begin{remark}
\label{remark:disambiguate}
The stochastic and fixed arms are disambiguated after $n$ stochastic arm labels if $n > 2\log(2T/\delta)/\hat \Delta_n^2.$ For a fixed $\mu^*$, this occurs by at most
$n \leq 8\log(2T/\delta)/\Delta^2.$
\end{remark}

\begin{proof}
The Azuma-Hoeffding inequality \citep{azuma, hoeffding} bounds the true average reward $\mu^*$ based on the observed rewards $\hat \mu_n$ with probability at least $1-\delta$ for all timesteps \citep{ajks}:

$$|\mu^* - \hat \mu_n|\leq \sqrt{2\log(2T/\delta)/n}$$

If $\mu^* \leq \nu$, then $\nu \leq u_n = \hat \mu_n + \sqrt{2\log(2T/\delta)/n}$ can only hold while $n \leq 2\log(2T/\delta)/\hat \Delta_n^2$. This further implies that $\mu^* \leq \nu$, since $\mu^* \leq u_n$ with probability $1-\delta$. Similarly, if $\nu \leq \mu^*$, then $\hat \mu_n - \sqrt{2\log(2T/\delta)/n} = \ell_n \leq \nu$ can only hold while  $n \leq 2\log(2T/\delta)/\hat \Delta_n^2$. Therefore, with high probability the two arms will be disambiguated once $n > 2\log(2T/\delta)/\hat \Delta_n^2$.


Bounding $\hat \mu_n$ with the Azuma-Hoeffding inequality $|\mu^* - \hat\mu_n| \leq \sqrt{2\log(2T/\delta)/n}$ shows that this will happen within at most $n < 8\log(2T/\delta)/\Delta^2$ steps. If $\nu < \mu^*$, then applying Azuma-Hoeffding we have $\mu^* - 2\sqrt{2\log(2T/\delta)/n} \leq \ell_n = \hat\mu_n - \sqrt{2\log(2T/\delta)/n}$, and so $\ell_n \leq \nu$ can only hold until $n > 8\log(2T/\delta)/\Delta^2$. Similarly, if $\mu^* < \nu$, then applying Azuma-Hoeffding we have $\hat\mu_n + \sqrt{2\log(2T/\delta)/n} = u_n \leq \mu^* + 2\sqrt{2\log(2T/\delta)/n}$, and so $\nu \leq u_n$ can only hold until $n > 8\log(2T/\delta)/\Delta^2$. Therefore, with high probability the arms will be disambiguated by $n < 8\log(2T/\delta)/\Delta^2$.
\end{proof}


Note that the smaller the gap $\Delta$ between the fixed and stochastic arms, the less regret is accumulated by choosing the wrong arm. If $\Delta$ is small enough, then it is better to pay the regret of choosing the wrong arm than to pay the labeling cost needed to disambiguate between the two arms.

\begin{remark}
\label{remark:worthit}
Disambiguating between the stochastic and fixed arms is not worth it if the regret of choosing the wrong arm is lower than the labeling cost, which happens when $$\hat \Delta_n + \sqrt{2\log(2T/\delta)/n}  < \sqrt[3]{8c\log(2T/\delta)/T}.$$
\end{remark}
\begin{proof}

Simply choosing an arm at the beginning yields an expected regret of at most $T\Delta$. The cost for requesting $n$ labels is $cn$. For a given $\Delta$, it takes $n < 8\log(2T/\delta)/\Delta^2$ labels until $\nu < \ell_n$ or $u_n < \nu$. This means that for a given $\Delta$, the labeling cost to disambiguate arms is at most $8c\log(2T/\delta)/\Delta^2$.

If, for a given $\Delta$, it is always worth it to request enough labels that $\nu < \ell_n$ or $u_n < \nu$, then the maximum labeling cost must be less than the regret of simply choosing the wrong arm. Namely, it must satisfy $8c\log(2T/\delta)/\Delta^2 \leq T\Delta.$ Therefore it will not always be worth it to collect labels until either $\nu < \ell_n$ or $u_n < \nu$ if $\Delta < \sqrt[3]{8c\log(2T/\delta)/T}$.

However, the algorithm does not have access to $\Delta = |\mu^* - \nu|$. We can apply the triangle inequality and Azuma-Hoeffding inequality to bound $\Delta$ with probability $1-\delta$.
$$\Delta = |\nu - \mu^*|  \leq |\nu - \hat\mu_n| + |\hat\mu_n - \mu^*| \leq \hat \Delta_n + \sqrt{2\log(2T/\delta)/n}.$$
Combining this bound on $\Delta$ and the definition of when it is always worth it to collect labels yields an expression usable by the algorithm:
$\hat \Delta_n + \sqrt{2\log(2T/\delta)/n}  < \sqrt[3]{8c\log(2T/\delta)/T}.$
\end{proof}

This tells us when to stop requesting labels -- when the arms are disambiguated so we can confidently commit, or when an upper bound on $\Delta$ is small enough that   committing to the wrong arm is cheaper than labeling until disambiguation.


\subsubsection{Multi-armed Bandit Algorithm}
In the multi-armed bandit setting there are multiple stochastic arms, and no arm has a known reward $\nu$. However, the key idea from the single-armed case holds. If two arms $a$ and $a'$ have close enough expected values $\mu^{a}$ and $\mu^{a'}$, then it is better to pick one than to pay the cost of learning which is better. For this setting, the algorithm uses a time-dependent holdout reward $\nu_t$ which represents an expected reward that the algorithm can expect to get by committing now.


First, we set up notation to define $\nu_t$.
Define arm $a$'s expected reward $\mu^{a} = \Ex[r_t|a_t=a]$, the empirical average at time $t$ as $\hat \mu^{a}_t$, and $n_t^{a}$ as the number of times arm $a$ was observed up to time $t$. Define $u_t^{a}$ and $\ell_t^{a}$ as the Azuma-Hoeffding upper/lower bounds from $|\mu^{a} - \hat \mu_t^{a}| \leq \sqrt{2\log(2kT/\delta)/n_t^{a}}$, using the union bound to distribute the failure probability $\delta$ over all timesteps $t \in [T]$ and all $k$ arms. Then if 
$\nu_t = \max_{a\in\calA}\ell_t^{a} = \max_{a\in \calA} \paren{\hat \mu_t^{a} - \sqrt{2\log(2kT/\delta)/n_t^{a}}}$ and 
$a^{\nu}_t = \argmax_{a \in \calA}\ell_t^{a}$, the algorithm can commit to the arm $a^{\nu}_t$, and get at least reward $\nu_t$. For convenience, denote the expected reward of arm $a^{\nu}_t$ as $\mu_t^\nu = \mu^{a^{\nu}_t}$.

Now, we extend the one-armed case by defining the stop conditions analogous to those of the one-armed case. Define the gaps $g_t^{a}$ as $g_t^{a} = u^{a}_t - \nu_t$, which are upper bounds on the per-step regret for choosing the holdout arm $a^\nu_t$ instead of $a$, since with high probability both $\mu^a \leq u^{a}_t$ and $\nu_t \leq \mu^{a^{\nu}_t} = \mu_t^\nu$. Define the maximum gap $\bar g_t = \max_{a \in \calA} g^{a}_t$ and the arm with the maximum gap $a^{\bar g}_t = \argmax_{a \in \calA} g_t^{a}$. Finally, define the worth-it-width $$w = \sqrt[3]{8c\log(2kT/\delta)/T}.$$
Since $\bar g_t$ is an upper bound on the per-step regret of choosing the holdout arm $a^\nu_t$, once $\bar g_t \leq w$, committing to the holdout arm $a^\nu_t$ is better than gathering enough labels to conclude with high probability that some other arm $a'$ has a higher average reward.

\begin{algorithm}
\caption{Worth-it-Width (WiW) Algorithm}\label{alg:wiw}
\noindent At each time step $t$, compute 
the upper/lower bounds $u_t^{a}$ \& $\ell_t^{a}$, holdout value $\nu_t$, and max gap $\overline{g}_t$. \newline
\textbf{If} $\overline{g}_t \leq w = \sqrt[3]{8c\log(2kT/\delta)/T}$, commit to  arm $a^\nu_t$.\newline
\textbf{Else if} $g_t^{a'} \leq w$ for all $a' \neq a_t^{\bar g}$ and the arm with the maximum gap is the holdout arm, i.e., $a_t^{\bar g} = a_t^\nu$, then commit to arm $a_t^\nu$.\newline
\textbf{Otherwise}, label arm $a = \argmin_{a' \in \{a_t^{\bar g}, a^{\nu}_t\}}n^{a'}_t$.
\end{algorithm}

The next theorem establishes the regret of this algorithm. 

\begin{restatable}[Regret Rate for WiW Algorithm]{theorem}{simple}
\label{theorem:simple}
Algorithm \ref{alg:wiw} has a regret rate of $\tilde O(kc^{1/3}T^{2/3})$ with high probability.
\end{restatable}



\begin{proof}
%The proof has two main claims -- that we will hit a termination condition within $\tilde O(k(T/c)^{2/3})$ labels, and that upon doing so the regret will be bounded by $\tidle O(kT^{2/3})$.
Since it takes at most $8\log(2kT/\delta)/\Delta^2$ labels to disambiguate between two arms with a given $\Delta$, we can play an arm at most $n = \sqrt[3]{8\log(2kT/\delta)}(T/c)^{2/3}$ times before concluding that $\mu^a$ or $\mu^\nu_t$ is greater or that $|\mu^a - \mu^\nu_t| = \Delta \leq w$. Since we always play an arm associated with the largest gap we can only gather $k\sqrt[3]{8\log(2kT/\delta)}(T/c)^{2/3}$ labels before terminating, which incurs a regret of $k\sqrt[3]{8c\log(2kT/\delta)T^2}$.

Further, with high probability, $g_t^{a}$ bounds the regret of committing to the holdout arm $a^\nu_t$ instead of arm $a$ since $\mu^{a} - \mu^{a_t^{\nu}} \leq u_t^{a} - \ell_t^{a_t^{\nu}} = g_t^{a}$. At termination $g_t^{a} < \sqrt[3]{8c\log(2kT/\delta)/T}$, so our regret thereafter is bounded by $\sqrt[3]{8c\log(2kT/\delta)T^2}$ with high probability.
Adding these two terms, the regret is $\tilde O(kc^{1/3}T^{2/3})$.
\end{proof}

Algorithm \ref{alg:wiw} (WiW) directly exploits the insight that as arms have more and more similar expected rewards it gets harder to disambiguate between them while becoming cheaper to mistakenly commit. So, it commits if it disambiguates the arms or if the reward difference upper bound is less than $w$. 

%\input{sections/fixedn/main}
\subsection{General Algorithm for \abbreviation}
The previous algorithm showed how to adapt the UCB algorithm to the \abbreviation setting, but required a detailed analysis of the algorithm to prove its regret rate. Is there a more general approach that does not require detailed analysis for each additional setting? 

The affirmative answer is given by the following Fixed-N Algorithm, which is in fact very general and can also work in contextual bandit settings. Its key idea is to use a universally valid stopping criterion for requesting labels that is primarily a function of the horizon $T$ and the label cost $c$.

%The core idea of the Fixed-N Algorithm is that we need to trade off between requesting more labels to get better performance, and requesting fewer labels to incur a smaller labeling cost. This algorithm works for a range of bandit settings without needing special cased reasoning or proofs, by answering the question of how many labels the base algorithm should request as a function of the horizon $T$, and label cost $c$.

\begin{algorithm}
\caption{Fixed-N Algorithm for Multi-armed Bandits}\label{alg:fixedn}
%\begin{algorithmic}[1]
\textbf{Given:} Algorithm $\calA$  that satisfies  Assumption \ref{assumption:uniformregret} with $\Ex\sqaren{\Regret_{1:n}^\circ} \leq \constant \sqrt{n},$ \newline
\textbf{Phase 1:} Play according to $\calA$ while observing the first $n = \paren{\frac{T\constant}{2c}}^{2/3}$ labels.\newline
\textbf{Phase 2:} Play according to $\calA$ without more labels.
%\end{algorithmic}
\end{algorithm}


In order to analyze the performance of the algorithm, we first make an assumption that relates the regret of the algorithm after no longer requesting labels to its earlier performance.

\begin{restatable}[Uniform Regret Rate]{assumption}{uniformregret}
\label{assumption:uniformregret}
An algorithm $\calA$ meets the uniform regret assumption if, for all $n \leq T$ and with randomness taken over the algorithm's choices and environment, a) playing according to $\calA$ while observing labels for the first $n$ timesteps results in
$\Ex\sqaren{\Regret_{1:n}^\circ} \in O(n^{1/2})$, and b)
continuing to play according to $\calA$ while requesting no further labels after the first $n$ timesteps results in
$$\frac{1}{T-n}\Ex\sqaren{\Regret_{n+1:T}^\circ} \leq \frac{1}{n}\Ex\sqaren{\Regret_{1:n}^\circ}.$$ 
\end{restatable}

Part b of this assumption essentially states that average regret does not get worse with more labels. In particular, we can stop the algorithm after $n$ labels at any time and expect an $O(n^{-1/2})$ per-timestep regret rate in retrospect and going forward. 
%Part b is a very mild condition, stating that on average the per-step regret of the algorithm is decreasing. 
Part a states that the algorithm does not have any distinct phases that have qualitatively different regret evolutions. This excludes most explore-then-commit algorithms, but includes popular algorithms such as UCB and Thompson sampling.  This assumption allows us to prove Theorem \ref{thm:fixednregret}, which shows that Algorithm 
\ref{alg:fixedn} achieves a regret rate of $O(c^{1/3}T^{2/3})$. If the base algorithm instead has an $\tilde O(T^{1/2})$ regret rate, then Algorithm \ref{alg:fixedn} has 
the corresponding $\tilde O(c^{1/3}T^{2/3})$ regret rate.



\begin{theorem}[Regret Rate for Fixed N Algorithm]\label{thm:fixednregret}
Assuming that $\calA$ satisfies the uniform regret assumption, the Fixed N algorithm based on $\calA$ has $\text{cRegret} \in O(c^{1/3}T^{2/3})$.%, matching the lower bound.
\end{theorem}

\noindent {\sc Proof Sketch.} 
Assume that $\calA$ satisfies the Uniform Regret assumption, so that $\Ex\sqaren{\Regret_{1:n}^\circ} \leq \constant \sqrt{n}$ for all $n > n_0$ for some $n_0$. In the \abbreviation setting, receiving $n$ labels incurs a regret of $cn$, so the total regret of using $\calA$ while labeling the first $n > n_0$ timesteps can be bounded as follows:
\begin{align*}
    \text{c}\Regret_{1:T} &= cn + \Ex\sqaren{\Regret_{1:n}^\circ} + \Ex\sqaren{\Regret_{n+1:T}^\circ}\\
    &\leq  cn + \dfrac{n}{n}\Ex\sqaren{\Regret_{1:n}^\circ} + \frac{T-n}{n}\Ex\sqaren{\Regret_{1:n}^\circ}\\
    &=cn + T\Ex\sqaren{\Regret_{1:n}^\circ}/n\\
    &\leq cn + T\constant n^{-1/2}
\end{align*}
The first inequality follows from the Uniform Regret assumption, and the second from the definition of $O(\sqrt{n})$.
As shown in Appendix A.3, %\ref*{proof:fixednregret}
$cn + T\constant n^{-1/2}$ is minimized by $n = \paren{T\constant/2c}^{2/3}$. Plugging this value of $n$ into the original bound $cn + T\constant n^{-1/2}$ yields the regret
$O(c^{1/3}\constant^{2/3}T^{2/3}) \subset O(c^{1/3}T^{2/3})$. \qedsymbol{}

Algorithm \ref{alg:fixedn} generalizes any bandit algorithm meeting Assumption \ref{assumption:uniformregret} into a corresponding \abbreviation algorithm. This provides a generic mechanism for constructing \abbreviation\ algorithms, establishing a natural baseline for any special-purpose designed \abbreviation\ algorithms.

%\input{sections/linear/main}
\subsection{Linear Contextual BwCRO}
While the Fixed-N algorithm is very general and can be used to handle costly reward observations in many bandit settings, it is entirely non-adaptive and does not use fewer labels in easier instances, or request labels in more interesting states. We conjecture that this is a substantial miss in many applications (such as healthcare, self-driving cars), where it is useful to request labels or oversight in the right states. The Worth-it-Width Algorithm (Algorithm \ref{alg:wiw}) is adaptive, but has no notion of context or state.
Is there a method for designing \abbreviation algorithms that is both adaptive and general?

We combine the idea of keeping track of upper bounds on the per-step regret, and the idea of requesting labels only when it is worth it, to propose the following $\Delta$ Max Regret Heuristic for general \abbreviation learning.

\subsubsection{$\Delta$ Max Regret Heuristic}
We can reinterpret the proof of the Fixed-N Algorithm regret rate (Theorem \ref{thm:fixednregret}) to arrive at a more general algorithm. The proof places an upper bound on $\Ex\sqaren{\text{cRegret}_{1:T}}$ as $cn + TKn^{-1/2}$, then minimizes that upper bound by selecting $n=(TK/2c)^{2/3}$ labels. However, we can interpret these mechanics instead as upper bounding the future $\Ex\sqaren{\Regret^\circ_{t:T}}$ as $TKn^{-1/2}$ (by Assumption \ref{assumption:uniformregret}), then requesting labels as long as the marginal decrease in our $\Regret^\circ_{t:T}$ upper bound exceeds the marginal labeling cost $c$.%of $TKn^{-1/2}$.

This suggests a new heuristic: request a label if the decrease in an upper bound on $\Ex\sqaren{\Regret^\circ_{t:T}}$ is greater than the labeling cost $c$. Denote the observation at time $t$ as $o_t$, such that $\calO_{t+1} = \calO_t \cup \{o_t\}$ if a label is requested and $\calO_{t+1} = \calO_t$ otherwise, and let $\Phi(\calO_t)$ be an upper bound on the per-step $\Regret^\circ$ given the observations $\calO_t$. Then, request a label if 
\begin{equation}\label{eqn:deltamaxregret}
c \leq (T-t)\Phi(\calO_t) - (T-t)\Phi(\calO_t \cup \{o_t\}). 
\end{equation}
The Fixed-N algorithm can be exactly fit into this heuristic, while the Worth-it-Width algorithm can be seen as a refinement which stops slightly sooner. To recover the Fixed-N algorithm, note that if $\calA$ satisfies the Uniform Regret assumption, then there exists a $K$ such that $$\dfrac{\Ex\sqaren{\Regret^\circ_{n+1:T}}}{T-n} \leq \dfrac{\Ex\sqaren{\Regret^\circ_{1:n}}}{n} \leq \dfrac{K\sqrt{n}}{n} = \dfrac{K}{\sqrt{n}},$$
and therefore there exists a per-step $\Regret^\circ$ upper bound $\Phi(\calO_t) = K/\sqrt{n}$. We loosen this bound to $TK/(\sqrt{n}(T-t))$ then bound $(T-t)\Phi(\calO_t) - (T-t)\Phi(\calO_t \cup \{o_t\})$ by $TK/(2\sqrt{n^3})$, then choose $n$ such that $c = TK/(2\sqrt{n^3})$, which recovers the previous stopping condition of $n = (TK/2c)^{2/3}$.


This heuristic is more adaptive in two ways. First, by using an adaptive upper bound $\Phi$ (such as $\bar g_t$ in the Worth-it-Width Algorithm) instead of the non-adaptive upper bound $Kn^{-1/2}$ of the Uniform Regret Assumption, we can use instance-specific information to choose whether or not to label something. Second, this formulation allows us to take advantage of state-specific and not just instance-specific information in making our labeling decisions. We will concretely demonstrate this property by applying the heuristic to linear contextual bandits.

\subsubsection{Linear Contextual \abbreviation Algorithm}
We adapt the Delta Max Regret heuristic to the linear contextual bandit setting by building on top of LinUCB, a well-studied implementation of the ``optimism in the face of uncertainty'' principle \citep{li, dani, yadkori}. In the linear contextual bandit setting, at each time step $t$ the algorithm chooses an action from among $k$ contexts $X_t = \{x_t^{j}\in\mathbb{R}^d\}_{j=1}^k$ which are drawn (at each time step) from some distribution $\calD$ such that $\|x_t^{j}\| \leq B$. The algorithm receives reward $x_t \cdot \mu^* + \eta_t$ for the chosen $x_t \in X_t$, where $\mu^*$ is unknown, $\|\mu^*\| \leq W$, and $\eta_t$ is $\sigma^2$ sub-Gaussian noise. Following \cite{ajks}, define $\Sigma_t =  (\sigma^2/W^2)I +\sum_{\tau=1}^{t-1}x_\tau x_\tau^T$, mean $\hat \mu_t = \Sigma_t^{-1}\sum_{\tau=1}^{t-1}r_\tau x_\tau$, $\beta_t = \sigma^2\paren{2+4d\log(1+tB^2W^2/d) + 8\log(4/\delta)}$, and an uncertainty region which contains the true $\mu^*$ at all time steps with probability $1-\delta$
$$\Ball_t = \{\mu \mid \paren{\hat \mu_t - \mu}^T \Sigma_t \paren{\hat \mu_t - \mu} \leq \beta_t\}.$$ 
LinUCB bounds the difference between our upper bound on the value of some $x$ and its true value with the width of the uncertainty region along $x$. For any $\mu \in \Ball_t$,
$$|\mu\cdot x - \hat\mu_t\cdot x| \leq \width(\beta_t, \Sigma_t, x) = \sqrt{\beta_t x^T\Sigma_t^{-1}x}.$$
Since $\mu^* \in \Ball_t$ for all $t\leq T$ with probability $1-\delta$, it follows that we can upper bound the value of $\mu^* \cdot x$ as
$\mu^* \cdot x \leq \hat\mu_t \cdot x + \width( \beta_t, \Sigma_t, x).$

We now need a suitable $\Phi$ for the Delta Max Regret heuristic. \cite{ajks} provides a short proof that the per-step regret of choosing $x_t$ is bounded as $2\width(\beta_t, \Sigma_t, x_t)$. Define $x^*_t = \arg\max_{x\in X_t}\mu^* \cdot x$, and $\tilde\mu = \arg\max_{\mu\in\Ball_t}\max_{x \in X_t} \mu \cdot x = \arg\max_{\mu\in\Ball_t}\mu \cdot x_t$. 
\begin{align*}
\Regret_t &= \mu^*\cdot x^*_t - \mu^*\cdot x_t \\
&\leq \tilde\mu\cdot x_t - \mu^*\cdot x_t \\
%&= (\tilde\mu - \mu^*)\cdot x_t \\
&= (\tilde\mu - \hat\mu_t)\cdot x_t + (\hat\mu_t - \mu^*)\cdot x_t \\
&\leq 2\width(\beta_t, \Sigma_t, x_t)
\end{align*}

However, the algorithm does not know what the future $X_t$ will be, so this bound cannot be applied directly. We instead consider the \textit{maximum} width possible for a given covariance matrix, since this bounds the width of $x_t$ for all possible future contexts $X_t$.
Conveniently,
%the change in the maximum width from observing a reward depends on the current $x_t$ but not the observed reward $r_t$,
since the width depends on $\beta_t$ and $\Sigma_t$, and
since $\Sigma_{t+1} = \Sigma_t + x_t x_t^T$ if a label is requested, we can compute $\Phi(\calO_t \cup \{o_t\})$ using information which is known at decision time. The max width is computed as $$\Phi(\calO_t) = \text{mw}(\beta, \Sigma) = \max_{e_i \in \text{eigenvectors}(\Sigma)} B \, \width(\beta, \Sigma, e_i).$$
\begin{algorithm}\caption{Delta Max Regret for LinUCB Algorithm}\label{alg:linear}
%\begin{algorithmic}[1]
At each time step $t$, compute the center $\hat \mu_t$, covariance $\Sigma_t$, and uncertainty region $\Ball_t$.\newline
\textbf{Play arm} $x_t = \arg\max_{x\in X_t}\max_{\mu \in \Ball_t} \mu \cdot x$.\newline
\textbf{Request label} if $x_t$ is such that
$$\!\!(T\!-\!t)\!\sqaren{\text{mw}\!\left(\!\beta_t, \Sigma_t\!\right) \!-\! \text{mw}\!\left(\!\beta_{t+1}, \Sigma_t \!+\! x_tx_t^T\!\right)} > c$$
\textbf{Otherwise} don't request label
%\end{algorithmic}
\end{algorithm}

This algorithm is able to determine which states are useful to label, and our empirical evaluation will demonstrate that it can do this effectively on both synthetic and real data.

% contains the mab_graphs
%\input{sections/lowerbound/main}
\section{Lower bound}
We have described several algorithms, but are they close to optimal? The following proves that the \abbreviation setting has a regret lower bound of $\Omega(c^{1/3}T^{2/3})$, and that therefore the Fixed N (Algorithm \ref{alg:fixedn}) and WiW (Algorithm \ref{alg:wiw}) algorithms match the lower bound on the regret, and no algorithms can achieve a better asymptotic rate.  This proves a novel rate for the labeling cost $c$, as well as agreeing with the rate for $T$ from \cite{activerl} and \cite{partialmonitoring}. %This shows that knowing the additional structure that the labeled and unlabeled actions differ in reward by exactly $c$ does not allow one to avoid the $T^{2/3}$.
Our information-theoretic proof is based on the regret lower bound proof by \cite{mabintro}.

\begin{restatable}[]{theorem}{lowerboundthm}
\label{thm:lowerbound}
The Bandits with Costly Observations setting has a regret lower bound of $\Omega(c^{1/3}T^{2/3}).$
\end{restatable}

%Our information-theoretic proof is based on the regret lower bound proof for multi-armed bandits presented in \cite{mabintro}. Below we provide a shortened version of \ref{proof:lowerbound}'s full proof.
The basic idea of the proof is that we randomize over $K$ instances with different best arms $k^*$, then show that (on average) $k^*$ would not be played that often in a base instance, and therefore cannot be played that often in instance $k^*$. 


\begin{proof}[Proof of Theorem \ref{thm:lowerbound}]
Consider a setting which chooses uniformly at random from $K$ different multi-armed bandit instances, each with $K$ actions where a coin is flipped with reward $1$ for heads and $0$ for tails. Denote the index of the randomly selected instance as $k^*$. In each bandit instance $k$, coin $k$ is biased with expected reward $(1+\epsilon)/2$, and all other $K-1$ coins are fair. Denote the probability of an event $A$ in instance $k$ as $\Pr_k(A)$. 

\begin{figure*}[t]
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/mabcosts/c1T1000}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/mabcosts/c1T10000}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/mabcosts/c1T100000}
    \end{subfigure}
    \caption{Final average per step regret for varying values of gaps $\Delta$, $c=1$, standard error from 20 trials. Dashed red line is at the predicted worst-case $\Delta = \sqrt[3]{c/T}$. Left graph has horizon $T=1000$, middle has $T=10000$, and right has $T=100000$.}
     \label{fig:delta}
\end{figure*}

Consider an additional hypothetical base instance $0$ where all the coins are fair. Our setting never chooses this instance.
%, but we use it in our analysis.
%Denote the probability of an event $A$ in this instance as $\Pr_0(A)$.
Let $Q^{T}_{k}$ denote the number of times coin $k$ is played in $T$ timesteps, and note that by linearity of expectation,
%and definition of $Q^{T}_{k}$,
\begin{equation}\label{eqn:QTK}
\sum_{k=1}^K\Ex_0\sqaren{Q^{T}_{k}} = \Ex_0\sqaren{\sum_{k=1}^KQ^{T}_{k}} = \Ex_0[T] = T.
\end{equation}
\paragraph{How many times do we play $k^*$ in instance $0$?}
Let $J_T = \{k: \Ex_0\sqaren{Q^{T}_{k}} \leq 3T/K\}$ be the set of coins that the algorithm is not expected to play more than $3T/K$ times during the $T$ timesteps in instance $0$. For each coin $k \in J_T$, $\Ex_0[Q^{T}_{k}] \leq 3T/K$ so by the Markov inequality
%$$\Pr_0(Q^{T}_{k} > a = 6T/K) < \dfrac{3T/K}{6T/K} = \dfrac{1}{2},$$
%\noindent and therefore
\begin{equation}\label{eqn:Prleq6TK}
\text{If } k\in J_T \text{, then } \Pr_0\paren{Q^{T}_{k} \leq 6T/K} \geq 1/2.
\end{equation}
Further, $J_T$ must have at least $2K/3$ elements, since its complement $\overline{J_T}$ must have at most $K/3$ elements: otherwise the sum of the expectations of $Q^{T}_{k}$ would be greater than $T$, which contradicts Equation \ref{eqn:QTK}, since
$$\sum_{k \in \overline{J_T}}\Ex_0\sqaren{Q^{T}_{k}} > \sum_{k \in \overline{J_T}}\dfrac{3T}{K} = |\overline{J_T}|\dfrac{3T}{K}.$$
Since the coin $k^*$ is chosen uniformly at random, and since $J_T$ has at least $2K/3$ coins in it, with the randomness over the setting's choice of $k^*$,
\begin{equation}\label{eqn:PrJT}
\Pr(k^* \in J_T) \geq 2/3.
\end{equation}
Combining Equations \ref{eqn:Prleq6TK} and \ref{eqn:PrJT}, we can bound the probability that $Q^{T}_{k^*} \leq 6T/K$ in instance $0$. Denoting the event $Q^{T}_{k^*} \leq 6T/K$ as $\mathcal{E}$,
$$\Pr_0\paren{\mathcal{E}} \geq \Pr_0\paren{\mathcal{E} \Big| k^* \in J_T}\Pr\paren{k^* \in J_T} \geq \dfrac{1}{2}\cdot\dfrac{2}{3} = \dfrac{1}{3}.$$
\paragraph{What is the regret in instance $k^*$?}
The KL Bound Lemma (proof in Lemma 1%\ref*{lemma:event}
, Appendix A.2) states that for any event $A$ based on $n$ observed coin flips
$|\Pr_0(A) - \Pr_{k^*}(A)| \leq \epsilon\sqrt{n}.$

\noindent We can bound $\Pr_{k^*}(\mathcal{E})$ as
\begin{equation}\label{eqn:PrkE}
\Pr_{k^*}\paren{\mathcal{E}} \geq \Pr_0\paren{\mathcal{E}}- \epsilon\sqrt{n} \geq \dfrac{1}{3} - \epsilon\sqrt{n}.
\end{equation}

In instance $k^*$, arm $k^*$ is the optimal choice with expected reward $(1+\epsilon)/2$ while all other arms have expected reward $1/2$. Therefore, every timestep that $k^*$ is not chosen incurs an expected regret of $\epsilon/2$. Since $Q^{T}_{k^*}$ is the number of times that $k^*$ is chosen, and since there are $T$ timesteps,
$$\Ex_{k^*}\sqaren{\Regret^\circ|Q^{T}_{k^*}} = \epsilon(T-Q^{T}_{k^*})/2.$$
Therefore, if $\mathcal{E}$ holds, then the regret in instance $k^*$ satisfies
\begin{equation}\label{eqn:Regret}
\Ex_{k^*}\sqaren{\Regret^\circ|\mathcal{E}} \geq \dfrac{\epsilon(T-6T/K)}{2} = \dfrac{(K-6)T\epsilon}{2K}.
\end{equation}

\paragraph{Conclusion.}

If we collect $n$ labels with a labeling cost $c$, then the regret in instance $k^*$ is
\begin{align*}
\Ex_{k^*}\sqaren{\text{c}\Regret} &= \Ex_{k^*}\sqaren{\Regret^\circ} + cn\\
&\geq \Ex_{k^*}\sqaren{\Regret^\circ|\mathcal{E}}\Pr_{k^*}(\mathcal{E}) + cn\\
&\geq \dfrac{(K-6)T\epsilon}{2K}\paren{\dfrac{1}{3} - \epsilon\sqrt{n}} + cn,
\end{align*}
with the first inequality coming from $\Ex_{k^*}\sqaren{\Regret^\circ} = \Ex_{k^*}\sqaren{\Regret^\circ|\mathcal{E}}\Pr_{k^*}(\mathcal{E}) + \Ex_{k^*}\sqaren{\Regret^\circ|\overline{\mathcal{E}}}\Pr_{k^*}(\overline{\mathcal{E}})$ together with the nonnegativity of the expected regret, and the second coming from using Equations \ref{eqn:Regret} and \ref{eqn:PrkE}. Choosing $\epsilon = \sqrt[3]{c/T}$ for the setting, we have 
$$\Ex_{k^*}\!\sqaren{\text{c}\Regret} \geq \dfrac{(K-6)T\sqrt[3]{c/T}}{2K}\paren{\dfrac{1}{3} - \sqrt[3]{c/T}\sqrt{n}} + cn$$
If the algorithm minimizes this expression with respect to $n$ using $\sqrt{n} = \frac{(K-6)}{4K}\sqrt[3]{T/c}$, we get a regret of 
$$\Ex_{k^*}\sqaren{\text{c}\Regret_T} \geq \frac{(K-6)}{6K}\sqrt[3]{cT^2}  - \frac{(K-6)^2}{16K^2}\sqrt[3]{cT^2},$$
\noindent for an $\Omega(c^{1/3}T^{2/3})$ regret lower bound, as desired.
\end{proof}


\section{Experiments}
%\input{sections/experiments/main}
We first compare the Worth-it-Width (WiW) algorithm (Algorithm \ref{alg:wiw}), the Fixed-N algorithm (Algorithm \ref{alg:fixedn}), and the Delta Max Regret (DMR) algorithm (Equation \ref{eqn:deltamaxregret}) to a prior baseline \footnote{Note that \citet{owain} present another more recent algorithm, however its much higher computational costs limit its evaluation to bandits with up to 40 timesteps in the original paper. As such, we defer its discussion to Appendix
A.1.1.%\ref*{appendix:owain}.
} from \cite{activerl}, and the Naive UCB algorithm on a variety of synthetic non-contextual \abbreviation problems. Then, we demonstrate the performance of the DMR algorithm (Algorithm \ref{alg:linear}) compared to Fixed-N and Naive  UCB  on both real and synthetic contextual problems.

\subsection{Multi-Armed Bandit Evaluation}
We first consider a variety of two-armed Bernoulli bandit settings with average reward $0.5 \pm \Delta/2$ and labeling cost $c=1$. A more detailed study of cost is in Appendix A.1.4.%\ref{appendix:costexperiments}.
\paragraph{Baselines.}
We compare the performance of the WiW, Fixed N, and DMR Algorithms against two baselines. The first baseline is a naive implementation of UCB that always requests a label. The second baseline is the MCCH (Mind-changing Cost Heuristic) algorithm presented in \cite{activerl}, using UCB as the underlying algorithm, so that all algorithms are directly comparable. For Fixed N, the constant in the UCB algorithm is $K=8\sqrt{k\log(Tk/\delta)}.$

\begin{figure}[t]
\centerline{
\includegraphics[width=0.32\textwidth]{images/mabcosts/c1T10000_Pr.png}}
    \caption{Probability of committing to the higher value arm for varying values of gaps $\Delta$, $c=1$, and $T=10000$. Standard error from 20 trials. Dashed red line is at the predicted worst-case $\Delta = \sqrt[3]{c/T}$.}
     \label{fig:pr}
\end{figure}

 

\paragraph{Results.}
Figure \ref{fig:delta} shows that the Fixed N algorithm consistently has low variance in its performance. Furthermore, its performance is invariant to different values of $\Delta$, reflecting the fact that it does not adapt to problem instances at all. Figure \ref{fig:delta} also shows that DMR consistently does at least as well as WiW, and shows substantial improvement for small $\Delta$. Neither the MCCH nor the DMR algorithm dominates the other, with DMR having the advantage for larger $\Delta$ and longer episode lengths. As shown in Figure \ref{fig:pr}, this is a result of the fact that MCCH commits earlier, but is much more likely to commit to the wrong arm. For small arm differences $\Delta$ and short horizons $T$, this tradeoff is less costly. As $\Delta$ or $T$ get bigger, the cost of wrongly committing is higher and the DMR and WiW algorithms do better. 


\paragraph{Worth-it-Width Ablations.}
\begin{figure}
\centering

\includegraphics[width=0.32\textwidth]{images/baselines/c100T10000_legend.png}
\caption{$c=100, T=10000$. The simpler baseline can achieve similar performance to the Worth-it-Width algorithm, but does considerably worse when the difference between arms $\Delta$ is small.}
\label{fig:baseline}
\end{figure}

We also perform two ablations on the WiW algorithm to demonstrate the necessity of each step. ``Baseline'' simply plays UCB while requesting labels until one arm has a higher LCB than any other arm's UCB. This baseline performs comparably to always requesting a label, since it never labels the suboptimal arm often enough to push its UCB below the optimal arm's LCB. ``Baseline$'$'' rectifies this problem by playing and labeling the least played of the arms associated with the highest UCB and highest LCB; however, it does not stop early if the associated gap is smaller than the ``worth-it-width''. This algorithm performs comparably to WiW for large differences $\Delta$ between the two arms but substantially worse for small differences, demonstrating the necessity of the early stopping condition. These results are corroborated over more parameter settings in Appendix A.1.5.%\ref{appendix:wiwablations}.


\subsection{Linear Contextual Evaluation}
We also evaluate the ability of the DMR algorithm to make state-dependent labeling decisions in contextual bandits.

\paragraph{Baselines.}
We compare the DMR algorithm to Fixed N and a Naive LinUCB baseline. The Naive LinUCB algorithm runs LinUCB and always requests a label. For the Fixed N Algorithm we choose $K=8\beta_Td\log(1+\frac{TB^2W^2}{d\sigma^2})$ as per \cite{ajks}. 

\begin{figure}[t]
\centerline{\includegraphics[width=0.24\textwidth]{images/linucb/cost1d5}\hfill
            \includegraphics[width=0.24\textwidth]{images/linucb/cost10d5}}
    \caption{Final average per step regret for varying noises $\sigma^2$, standard error from 20 trials. Note the logarithmic $y$ scale. The contexts have dimension $d=5$, drawn from $\calN(0, 1)^d$ and rescaled to size $1$. Left has $c=1$ and right has $c=10$.}
     \label{fig:noise}
\end{figure}


\paragraph{Results.}
We set $W$ and $B$ (the sizes of $\mu^*$ and $x$) to $1$, set $T=10000$, and set the number of arms $k=5$. As shown in Figure \ref{fig:noise}, increasing noise $\sigma^2$ forces all non-naive algorithms to request more labels, though DMR is able to avoid always requesting labels even with $\eta \sim \sqrt{10} \cdot \calN(0, 1)$ and each $r_t$ constrained within $[-1, 1]$.


\subsection{Evaluation on Real-World Data}
%\input{sections/experiments/yahoo_graphs}
\begin{figure}
    \centerline{\includegraphics[width=0.24\textwidth]{images/yahoo/double_cost0.01.png}
                \includegraphics[width=0.24\textwidth]{images/yahoo/double_cost0.1.png}}
    \caption{Average per-step reward at each timestep for 1000 timesteps rejection sampled using the Yahoo! Frontpage Dataset. Standard error from 20 trials. Left graph has cost $c=0.01$ and right has $c=0.1$. Non-red dashed lines correspond to using the doubling trick.}
    \label{fig:yahoo}
\end{figure}

We also conducted experiments on the Yahoo! Front Page dataset \citep{yahoo} in order to validate the performance of the Delta Max Regret Algorithm on real data. This dataset was collected in an experiment where Yahoo! placed articles on their front page uniformly at random. Each context in $X_t$ has a $D\times5$ dimensional matrix with features for each of the $D$ articles available at the time, as well as a $5$-dimensional vector representing information about the user. For convenience, we only use contexts which have exactly $20$ articles. We create $35$-dimensional vectors for each individual article by concatenating its $5$-d vector to the user's $5$-d vector, along with a $25$-d vector containing the cross-terms of the user and article vectors.
The fact that the articles were selected uniformly at random allows us to run an unbiased simulation of arbitrary policies using rejection sampling \citep{krause_yahoo}. 

\paragraph{Baselines.} In addition to the previously mentioned baselines, we also add a ``Regression'' skyline in order to understand the upper limits of performance for linear models in this setting. This regression model has the unfair advantage of being trained on all datapoints which could be sampled, and never paying a labeling cost. For any context, it then selects the action that has the highest predicted reward.

\paragraph{The Doubling Trick.} In practice, the horizon $T$ may not be known beforehand. A standard method for handling this problem is to use the ``Doubling Trick'', where an initial $T_0$ is used as the horizon, and then the algorithm is rerun with $2T_0$ if the horizon is exceeded, then $4T_0$ if that horizon is exceeded, etc. We use a simpler variant where the horizon $T$ is initially set to $T_0$, then modified in place by doubling it whenever it is exceeded. This reuses the old data rather than restarting, and simply updates the max-regret calculations of the remaining time and size of the confidence region.


\paragraph{Results.} As seen in Figure \ref{fig:yahoo}, the DMR algorithm is able to achieve strong performance, doing as well as the regression model despite needing to request labels and pay the associated cost. In comparison, while the Fixed N algorithm is able to improve its performance over time, it performs poorly because it requests more labels than necessary. Further, the doubling trick does not negatively impact the performance for any algorithm in the experiments, and in fact DMR seems to benefit in early timesteps. This shows that the doubling trick preserves not just asymptotic rate but also finite-sample performance in this setting, allowing the algorithms to easily adapt to unknown horizon lengths. The DMR algorithm is able to substantially save on labeling costs by successfully choosing informative contexts to label, while still attaining high performance and demonstrating the value of scalable oversight in linear contextual \abbreviationNS. 

\section{Conclusions} 

We develop algorithms for Bandits with Costly Reward Observations, and provide theoretical guarantees on their regret. In particular, we develop the Fixed N algorithm for turning a large class of conventional bandit algorithms into algorithms for \abbreviationNS, the WiW algorithm, and the DMR heuristic which can exploit instance-specific information in simple and contextual bandits. Finally, we prove $\Omega(c^{1/3}T^{2/3})$ lower bounds for BwCRO, matching the Fixed N regret rate.


\begin{acknowledgements}
This research was supported in part by NSF Awards IIS-1901168, IIS-2008139, and scholarship funding from Open Philanthropy. All content represents the opinion of the authors, which is not necessarily shared or endorsed by their respective employers and/or sponsors.
\end{acknowledgements}

% References
\bibliography{bibliography}

\end{document}
