\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{enumitem}
\usepackage{thmtools, thm-restate}
\usepackage{comment}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{cancel}
\usepackage{eqnarray}

\usepackage{xr}
\externaldocument{tucker_462}

%\input{notation.tex}
%\DeclareMathOperator{\ent}{S}
\DeclareMathOperator{\Ex}{\mathbb{E}}
\DeclareMathOperator{\Var}{\text{Var}}
\DeclareMathOperator{\Cov}{\text{Cov}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
%\newcommand{\Pr}{\text{Pr}}
\newcommand{\sign}{\text{sign}}
\newcommand{\err}{\text{err}}

\newcommand{\calA}{\mathcal{A}}
\newcommand{\calB}{\mathcal{B}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calD}{\mathcal{D}}
\newcommand{\calX}{\mathcal{X}}
\newcommand{\calK}{\mathcal{K}}
\newcommand{\calG}{\mathcal{G}}
\newcommand{\calR}{\mathcal{R}}
\newcommand{\calP}{\mathcal{P}}
\newcommand{\calN}{\mathcal{N}}
\newcommand{\calM}{\mathcal{M}}
\newcommand{\calH}{\mathcal{H}}
\newcommand{\calF}{\mathcal{F}}
\newcommand{\calO}{\mathcal{O}}

\newcommand{\IPS}{\text{IPS}}
\newcommand{\BAL}{\text{BAL}}

\newcommand{\x}{{x}}                 % Context
\newcommand{\xRV}{{X}}               % Context Random Variable
\newcommand{\action}{{a}}                 % Action
\newcommand{\actionRV}{{A}}               % Action Random Variable
\newcommand{\reward}{{r}}                 % Reward
\newcommand{\rewardRV}{{R}}               % Reward Random Variable
\newcommand{\Util}{{U}}                 % Reward
\newcommand{\UtilIPS}{{\hat{U}^{\text{IPS}}}}                 % Reward

\newcommand{\old}{{log}}
\newcommand{\eval}{{aug}}
\newcommand{\target}{{tar}}

\newcommand{\policy}{\pi}
\newcommand{\policyspace}{\Pi}
\newcommand{\oldpolicy}{\pi_{\text{\old}}}
\newcommand{\targetpolicy}{\pi_{\text{\target}}}
\newcommand{\targetpolicyspace}{\policyspace_{\text{\target}}}
\newcommand{\evalpolicy}{\pi_{\text{\eval}}}
\newcommand{\blendedpolicy}{\pi_{\text{balanced}}}
\newcommand{\minvarpolicy}{\pi_{\text{minvar}}^{\text{IPS}}}
\newcommand{\prodpolicy}{\pi_{\text{prod}}}
\newcommand{\maxpolicy}{\pi_{\text{max}}}

\newcommand{\paren}[1]{\left(#1\right)}
\newcommand{\sqaren}[1]{\left[#1\right]}
\newcommand{\set}[1]{\left\{#1\right\}}

\newcommand{\dataset}{\calD}
\newcommand{\evaln}{{n_\text{\eval}}}
\newcommand{\oldn}{{n_\text{\old}}}
\newcommand{\evalD}{{\dataset_\text{\eval}}}
\newcommand{\oldD}{{\dataset_\text{\old}}}

\newcommand{\Ball}{{\text{Ball}}}
\newcommand{\Regret}{{\text{Regret}}}
\newcommand{\Reward}{{\text{Reward}}}
\newcommand{\KL}{{\text{KL}}}

\newcommand{\rewardmean}{{\bar \reward(\x,\action)}}
\newcommand{\rewardmeansq}{{\bar \reward^2(\x,\action)}}
\newcommand{\rewardvar}{{\sigma^2(\x,\action)}}
\newcommand{\expectedsquarer}{\paren{\rewardmeansq + \rewardvar}}

\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{optimizationproblem}{Optimization Problem}
\newtheorem*{remark}{Remark}
\newtheorem{definition}{Definition}[section]
\newtheorem{assumption}{Assumption}[section]


\newcommand{\cmark}{\ding{51}}%
\newlist{todolist}{itemize}{2}
\setlist[todolist]{label=$\square$}
\newcommand{\done}{\rlap{$\square$}{\raisebox{2pt}{\large\hspace{1pt}\cmark}}%
\hspace{-2.5pt}}

%\newcommand{\calA}{\mathcal{A}}
\newcommand{\Varm}{V^m_\text{arm}}
\newcommand{\Vcontext}{V_\text{state}}
\newcommand{\Barm}{B_\text{arm}}
\newcommand{\Bcontext}{B_\text{state}}

\newcommand{\width}{\text{width}}

\newcommand{\abbreviation}{BwCRO }
\newcommand{\abbreviationNS}{BwCRO}

%%%%%%%%%%%%% Comments
\newcommand{\tj}[1]{\footnote{\textcolor{magenta}{TJ comments: #1}}}
\newcommand{\at}[1]{\footnote{\textcolor{blue}{AT comments: #1}}}


%\input{sections/fixedn/notation}
\newcommand{\constant}{K}

\title{Bandits with Costly Reward Observations Supplementary Material}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[ ]{\href{mailto:<aarondtucker@cs.cornell.edu>?Subject=Your UAI 2023 paper}{Aaron D. Tucker}}
\author[*]{Caleb Biddulph}
\author[*]{Claire Wang}
\author[ ]{Thorsten Joachims}

\affil[ ]{Department of Computer Science, Cornell University, Ithaca NY USA}
\affil[*]{Equal contribution, authors listed alphabetically}


\begin{document}


\onecolumn
\maketitle

\appendix
\setcounter{figure}{5}
%\input{experiments/appendix}


\section{Appendix}
\subsection{Experimental Appendix}


\begin{comment}
\begin{figure*}
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/yahoo/double_cost0.01.png}
        \caption{Label cost $c=0.01$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/yahoo/double_cost0.1_nolegend.png}
        \caption{Label cost $c=0.1$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/yahoo/double_cost1_legend.png}
        \caption{Label cost $c=1$}
    \end{subfigure}
    \caption{Average per-step reward at each timestep for 1000 timesteps rejection sampled using the Yahoo! Frontpage Dataset. Standard error from 20 trials. Non-red dashed lines correspond to using the doubling trick. This Figure is for the same experiment as Figure \ref{fig:yahoo}, but with bigger graphs.}
\end{figure*}
\end{comment}




\subsubsection{Comparison to BAMCP++}\label{appendix:owain}
\begin{figure*}
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/bamcppp/delta.5.png}
        \caption{$\delta=0.5$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/bamcppp/delta50.png}
        \caption{$\delta=50$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/bamcppp/delta250.png}
        \caption{$\delta=250$}
    \end{subfigure}
    \caption{Replication of Figure 3 of \cite{owain} with varying settings for the hyperparameter $\delta$. As in \cite{owain}, mean and standard error are presented over 100 trials.}
    \label{fig:bamcppp}
\end{figure*}
\citet{owain} presents the BAMCP++ algorithm for the Active RL setting, which is built on top of Bayesian Monte-Carlo Tree Search, and is applicable to MDP settings as well as bandits. However, it is much more computationally expensive than the algorithms discussed throughout this paper, and the original publication only evaluated its performance on bandits up to 40 timesteps. In Figure \ref{fig:bamcppp} we show experiments which are directly comparable to the experiment presented in Figure 3 of \cite{owain}. We find that the MCCH heuristic \citep{activerl} is able to achieve higher performance than BAMCP++: it also stays close to the line corresponding to requesting 3 labels and then performing optimally, but it does so at earlier horizons rather than performing at chance until roughly $T=15$. All other algorithms presented perform below chance with their typical hyperparameter settings. While the $\delta$ hyperparameter for the UCB algorithms represents a bound on the probability of the Azuma-Hoeffding bounds failing \citep{ajks}, treating it as a freely-chosen hyperparameter and setting $\delta$ to higher values causes the DMR and Fixed-N algorithms to perform comparably to or better than MCCH and BAMCP++.


\subsubsection{Impact of Dimension on Linear Contextual Bandit Results}
\begin{figure*}
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/d/d5.png}
        \caption{$d=5$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/d/d50.png}
        \caption{$d=50$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/d/d100.png}
        \caption{$d=100$}
    \end{subfigure}
    \caption{Final average per step regret for varying noises $\sigma^2$, standard error from 20 trials. Note the logarithmic $y$ scale. $T=10000$, $k=5$}
    \label{fig:d}
\end{figure*}
Figure \ref{fig:d} shows that DMR maintains its advantage over the Fixed N algorithm over a variety of context dimensions $d$ and noise scales. Note that both have lower regret with higher dimensions, likely because the randomly drawn vectors become more orthogonal with increasing dimension, resulting in smaller differences between the rewards and lower regret.

\subsubsection{Hyperparameters}
The only hyperparameter for the Fixed N and Worth-it-Width algorithms is the parameter $\delta$, which is set to $0.5$. The MCCH heuristic from \cite{activerl} also has a single parameter $\alpha$, which is set to $0.1$; this appeared to be the best setting in the paper's experiments, though they note that the algorithm appears robust to parameter choice.

\subsubsection{Impact of Labeling Cost}\label{appendix:costexperiments}
\begin{figure}
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c0.01T1000}
        \caption{$c=0.01, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c0.01T10000}
        \caption{$c=0.01, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c0.01T100000}
        \caption{$c=0.01, T=100000$}
    \end{subfigure}
    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c0.1T1000}
        \caption{$c=0.1, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c0.1T10000}
        \caption{$c=0.1, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c0.1T100000}
        \caption{$c=0.1, T=100000$}
    \end{subfigure}

    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c1T1000}
        \caption{$c=1, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c1T10000.png}
        \caption{$c=1, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c1T100000.png}
        \caption{$c=1, T=100000$}
    \end{subfigure}
    
    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c10T1000}
        \caption{$c=10, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c10T10000}
        \caption{$c=10, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c10T100000}
        \caption{$c=10, T=100000$}
    \end{subfigure}
    
    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c100T1000}
        \caption{$c=100, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c100T10000}
        \caption{$c=100, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/doubling/c100T100000}
        \caption{$c=100, T=100000$}
    \end{subfigure}
    \caption{Final average per step regret for varying values of gaps $\Delta$, across many different horizons $T$ and costs $c$. Standard error from 20 trials. Dashed vertical red line is at the predicted worst-case $\Delta = \sqrt[3]{c/T}$. Other dashed lines correspond to using the doubling trick.}
    \label{fig:cost}
\end{figure}
Figure \ref{fig:cost} shows that Fixed-N and WiW have an advantage over MCCH in low cost ($c\leq 1$) settings, but that MCCH does better in higher cost settings. Increasing the episode length generally improves the performance of all algorithms, with more dramatic impacts for the WiW algorithm in the regime near the predicted worst-case $\Delta=\sqrt[3]{c/T}$.


\subsubsection{Worth-it-Width Ablations}\label{appendix:wiwablations}
\begin{figure}
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c0.01T1000}
        \caption{$c=0.01, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c0.01T10000}
        \caption{$c=0.01, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c0.01T100000}
        \caption{$c=0.01, T=100000$}
    \end{subfigure}
    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c0.1T1000}
        \caption{$c=0.1, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c0.1T10000}
        \caption{$c=0.1, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c0.1T100000}
        \caption{$c=0.1, T=100000$}
    \end{subfigure}

    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c1T1000}
        \caption{$c=1, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c1T10000.png}
        \caption{$c=1, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c1T100000.png}
        \caption{$c=1, T=100000$}
    \end{subfigure}
    
    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c10T1000}
        \caption{$c=10, T=1000$. Note that Baseline' tracks performance of Baseline (rather than WiW) for $\Delta \leq 0.3$.}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c10T10000}
        \caption{$c=10, T=10000$. Note that Baseline' tracks performance of Baseline (rather than WiW) for $\Delta \leq 0.1$.}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c10T100000}
        \caption{$c=10, T=100000$}
    \end{subfigure}
    
    
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c100T1000}
        \caption{$c=100, T=1000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c100T10000}
        \caption{$c=100, T=10000$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.32\textwidth}
        \includegraphics[width=\textwidth]{images/baselines/c100T100000}
        \caption{$c=100, T=100000$}
    \end{subfigure}
    \caption{Simpler baseline comparisons. Final average per step regret for varying values of gaps $\Delta$, across many different horizons $T$ and costs $c$. Standard error from 20 trials. Dashed red line is at the predicted worst-case $\Delta = \sqrt[3]{c/T}$.  Note the similar performance of the Baseline algorithm to Worth-it-Width for $c<10$, and worse performance in the small arm difference $\Delta$ regime whenever $c\geq 10$.}
    \label{fig:bigbaseline}
\end{figure}

Figure \ref{fig:bigbaseline} repeats the Worth-it-Width ablation experiments across a variety of parameter settings, demonstrating that all steps of the Worth-it-Width algorithm are necessary for best performance.


%\input{sections/lowerbound/appendix}
\subsection{Proof of Theorem 3}
Define $k$ bandit problem instances, with each arm being associated with a flip from one of $k$ coins. If the selected coin lands heads then the agent receives reward $1$, and otherwise it receives reward $0$. Our bandit problem is then drawn with uniform probability from these $k$ settings. We additionally analyze a base instance $0$ in which all coins are unbiased and have reward $1/2$, and in instance $j$ coin $j$ has expected reward $(1+\epsilon)/2$. Denote the probability of an event $A$ in instance $j$ as $\Pr_j(A)$, and the expectation of a random variable $X$ in instance $j$ as $\Ex_j(X)$.

We will analyze how often an algorithm plays a given arm $j^*$ in the base instance $0$, then use the fact that the coins have similar probability distributions to bound the performance in the instance $j^*$ where the coin is preferred. In order to establish the bound, we first need to prove a KL divergence lemma. This proof and lemma are again based on \cite{mabintro}, and adapted to the BwCRO setting.

\begin{lemma}[KL Bound]\label{lemma:event}
For any event $A$ based on $n$ observations of the coin flips, for any $j \in [1..k]$,

$$|\Pr_0(A) - \Pr_j(A)| \leq \epsilon\sqrt{n}.$$
\end{lemma}

\begin{proof}
First, define $p$ and $q$ to be the probability distributions over $n$ independent $(\epsilon/2)$-biased and fair coin flips respectively, and let $p_i$ be the $i$th flip from the biased coin and $q_i$ be the $i$th flip from the fair coin. The $\KL$ divergence between a coin flip $p_i$ with bias $\epsilon/2$ and a fair coin flip $q_i$ is as follows:

\begin{align*}
\KL(p_i; q_i) &= \frac{1+\epsilon}{2}\log\paren{1+\epsilon} + \frac{1-\epsilon}{2}\log\paren{1-\epsilon} \\
&= \frac{1}{2}\log\paren{1-\epsilon^2} + \frac{\epsilon}{2}\log\paren{\dfrac{1+\epsilon}{1-\epsilon}} &\pm \frac{1}{2}\log\paren{1+\epsilon}\\
&\leq  \frac{\epsilon}{2}\log\paren{\dfrac{1+\epsilon}{1-\epsilon}} &\text{since }\frac{1}{2}\log\paren{1-\epsilon^2} < 0\\
&=  \frac{\epsilon}{2}\log\paren{1 + \dfrac{2\epsilon}{1-\epsilon}}\\
&\leq  \dfrac{\epsilon}{2}\dfrac{2\epsilon}{1-\epsilon} &\text{since } \log(1+x) \leq x \text{ for } x > 0\\
&\leq 2\epsilon^2 &\text{since } 0 \leq \epsilon \leq 1/2
\end{align*}

\begin{align*}
|\Pr_0(A) - \Pr_j(A)| &\leq \sqrt{\frac{1}{2}\KL(p;q)} &\text{by Pinsker's inequality}\\
&\leq \sqrt{\frac{1}{2}\sum_{i=1}^{n}\KL(p_i; q_i)} &\text{by KL divergence chain rule for independent draws}\\
&\leq \sqrt{\frac{1}{2}(2n\epsilon^2)} &\text{since } \KL(p_i; q_i) \leq 2\epsilon^2\\
&\leq \epsilon\sqrt{n}
\end{align*}
\end{proof}

\paragraph{Theorem 3}
\emph{The Bandits with Costly Observations setting has a regret lower bound of $\Omega(c^{1/3}T^{2/3}).$}

\begin{proof}\label{proof:lowerbound}
The basic idea of the proof is that for every instance $j^*\neq 0$, we can upper bound how many times we play the optimal arm $j^*$ by looking at how many times we play $j^*$ in instance $0$, then using a KL divergence lemma to upper bound the probability of playing coin $j^*$ in instance $j^*$ in terms of the number of observations $n$. This will establish that we cannot frequently play the coin $j^*$ in the appropriate instance $j^*$ without also playing it in the incorrect instances $j'\neq j^*$, leading to regret.

\paragraph{How many times do we play $j^*$ in instance $0$?}

Let $Q_j^{(t)}$ be the number of times that the algorithm flips coin $j$ by time $t$. Note that by linearity of expectation $$\sum_{j=1}^k\Ex_0\sqaren{Q_j^{(t)}} = \Ex_0\sqaren{\sum_{j=1}^kQ_j^{(t)}} = \Ex_0\sqaren{t} = t.$$
\noindent Let $J_t = \{j: \Ex_0[Q_j^{(t)}] \leq 3t/k\}$ be the set of coins that the algorithm has not played more than $3/k$ of the time over the first $t$ timesteps in instance $0$. As previously shown $\sum_{j=1}^k\Ex_0\sqaren{Q_j^{(t)}}=t$, so $J_t$ must have at least $2k/3$ elements since
$$t = \sum_{j=1}^k \Ex_0\sqaren{Q_j^{(t)}} \geq \sum_{j\notin J_t}\Ex_0\sqaren{Q_j^{(t)}} \geq \sum_{j\notin J_t}\dfrac{3t}{k} = |\set{j: j \notin J_t}|\dfrac{3t}{k} \text{ implies } |\set{j: j \notin J_t}| \leq k/3.$$


By the Markov inequality, $\Ex_0[Q_j^{(t)}] \leq 3t/k$ implies that for any coin $j \in J_t$ and any $a > 0$
$$\Pr_0\paren{Q_j^{(t)} \geq a} \leq \frac{\Ex_0[Q_j^{(t)}]}{a}\leq \frac{3t/k}{a} \text{, and therefore } \Pr_0\paren{Q_j^{(t)} < a} > 1 - \frac{3t}{ka}.$$

Now, we compute the probability that $j^*$ is played less than $a$ times in instance $0$. Let $\mathcal{E}_{j^*}$ be the event that a given $j^* \in J_T$ and that $Q_{j^*}^{(T)} < a$.
\begin{align*}
    \Pr_0(\mathcal{E}_{j^*}) &= \Pr_{\text{inst}}(j^* \in J_T)\Pr_0\paren{Q_{j^*}^{(T)} < a \mid j^* \in J_T} &\text{(Randomness in } \Pr_{\text{inst}} \text{ is over instances)}\\
    &\geq \frac{2}{3}\Pr_0\paren{Q_{j^*}^{(T)} < a \mid j^* \in J_T} &\text{since } |J_T| \geq 2k/3\\
    &> \frac{2}{3}\paren{1 - \frac{3T}{ka}} &\text{Markov inequality with } \Ex_0\sqaren{Q_{j^*}^{(T)}} \leq \dfrac{3T}{k}\\
    &= \frac{2}{3} - \frac{2T}{ka}
\end{align*}

As a sanity check, note that increasing the number of arms raises the lower bound and makes $\mathcal{E}_{j^*}$ more likely, as does increasing the threshold $a$. Increasing $T$ on the other hand makes it less likely.

\paragraph{Expected regret in instance $j^*$?} Assume that the algorithm observed $n$ rewards for arm $j^*$ over the entire history.
%Note that $n$ must be less than $Q_T^{(j^*)}$, since the algorithm can only observe a reward for $j^*$ if it plays $j^*$.
We know from Lemma \ref{lemma:event} that for any event $A$ based on $n$ labels $|\Pr_0(A) - \Pr_j(A)| \leq \epsilon \sqrt{n}$, which lower bounds the probability $\Pr_{j^*}(\mathcal{E}_{j^*})$ of playing $j^*$ less than $a$ times as

$$\Pr_{j^*}(\mathcal{E}_{j^*}) > \frac{2}{3} - \frac{2T}{ka} - \epsilon\sqrt{n}.$$

If $j^*$ is the best arm with bias $(1+\epsilon) / 2$ and all other coins are fair, then the regret in instance $j^*$ if event $\mathcal{E}_{j^*}$ holds is simply the difference of the two rewards, plus the cost of acquiring $n$ labels.

\setcounter{equation}{0}
\begin{align}
    \Ex_{j^*}\sqaren{\Regret_T} &= 
    \Pr_{j^*}(\overline{\mathcal{E}_{j^*}})\Ex_{j^*}\sqaren{\Regret_T | \overline{\mathcal{E}_{j^*}}} + \Pr_{j^*}(\mathcal{E}_{j^*})\Ex_{j^*}\sqaren{\Regret_T | \mathcal{E}_{j^*}} + cn\\
    &\geq \Pr_{j^*}(\mathcal{E}_{j^*})\Ex_{j^*}\sqaren{\Regret_T | \mathcal{E}_{j^*}} + cn\\
    &= \Pr_{j^*}(\mathcal{E}_{j^*})\paren{
    T\frac{1+\epsilon}{2} - \frac{T+Q^{(T)}_{j^*}\epsilon}{2}} + cn\\
    &\geq \Pr_{j^*}(\mathcal{E}_{j^*})\paren{
    T\frac{1+\epsilon}{2} - \frac{T+a\epsilon}{2}} + cn\\
    &= \Pr_{j^*}(\mathcal{E}_{j^*})\frac{(T-a)\epsilon}{2} + cn\\
    &> \paren{\frac{2}{3} - \frac{2T}{ka} - \epsilon\sqrt{n}}\frac{(T-a)\epsilon}{2} + cn
\end{align}

Line $2$ holds because $\Pr_{j^*}(\overline{\mathcal{E}_{j^*}})\Ex_{j^*}\sqaren{\Regret_T | \overline{\mathcal{E}_{j^*}}}$ is positive, line $3$ holds by the definition of regret, line $4$ holds since $\mathcal{E}_{j^*}$ is true and so $Q^{(T)}_{j^*} < a$ and $-a < - Q^{(T)}_{j^*}$, and line $6$ holds from the KL divergence lemmas.

\paragraph{Conclusion.} Now we can conclude the proof. Recall that $a$ is from the Markov inequality, and so we are free to choose $a = 6T/k$, yielding the bound

\begin{align*}
    \Ex_{j^*}\sqaren{\Regret_T} &\geq \paren{\frac{2}{3} - \frac{2Tk}{k6T} - \epsilon\sqrt{n}}\frac{(T-6T/k)\epsilon}{2} + cn\\
    &= \paren{\frac{1}{3} - \epsilon\sqrt{n}}\frac{(k-6)T\epsilon}{2k} + cn\\
    &= \frac{(k-6)T\epsilon}{6k}  - \frac{(k-6)T\epsilon^2}{2k}\sqrt{n} + cn.
\end{align*}

Now, choose $\epsilon = \sqrt[3]{c/T}$ for the coin expected rewards, for a regret bound of 

$$\Ex_{j^*}\sqaren{\Regret_T} \geq \frac{(k-6)}{6k}\sqrt[3]{cT^2}  - \frac{(k-6)}{2k}\sqrt[3]{c^2T}\sqrt{n} + cn.$$

Now, imagine that the algorithm did as well as possible, and minimized this value with respect to $n$. This yields $\sqrt{n} = \frac{(k-6)}{4k}\sqrt[3]{T/c}$, and a regret of 

$$\Ex_{j^*}\sqaren{\Regret_T} \geq \frac{(k-6)}{6k}\sqrt[3]{cT^2}  - \frac{(k-6)^2}{16k^2}\sqrt[3]{cT^2},$$

\noindent for an $\Omega(c^{1/3}T^{2/3})$ regret lower bound, as desired.

\end{proof}

%\input{sections/fixedn/appendix}
\subsection{Proof of Theorem 2}
%\ref{thm:fixednregret}
With the uniform regret assumption, the $O(c^{1/3}T^{2/3})$ regret rate for the Fixed N algorithm is the result of fairly straightforward algebraic manipulations.

\paragraph{Assumption 3.1} (Uniform Regret Rate). \emph{
An algorithm $\calA$ meets the uniform regret assumption if, for all $n \leq T$ and with randomness taken over the algorithm's choices and environment, a) playing according to $\calA$ while observing labels for the first $n$ timesteps results in
$\Ex\sqaren{\Regret_{1:n}^\circ} \in O(n^{1/2})$ and b)
with randomness taken over the algorithm's choices and environment, and if requesting no further labels after the first $n$ timesteps results in
$$\frac{1}{T-n}\Ex\sqaren{\Regret_{n+1:T}^\circ} \leq \frac{1}{n}\Ex\sqaren{\Regret_{1:n}^\circ}.$$ 
}


\begin{proof} \label{proof:fixednregret}
Assume that $\calA$ meets the uniform regret assumption, so that
$$\frac{1}{T-n}\Ex\sqaren{\Regret_{n+1:T}}  \leq \frac{1}{n}\Ex\sqaren{\Regret_{1:n}}.$$


Then, by the definition of $O(n^{1/2})$ regret there is a constant $ \constant$ and $n_0$ such that for all $n > n_0$

$$\Ex\sqaren{\Regret_{1:n}} \leq  \constant \sqrt{n} \text{ and therefore }  \frac{1}{T-n}\Ex\sqaren{\Regret_{n+1:T}}  \leq \frac{1}{n}\Ex\sqaren{\Regret_{1:n}} \leq \frac{ \constant}{\sqrt{n}}.$$

In the BwCRO setting, receiving $n$ labels necessarily incurs a regret of $cn$, so the total regret of using $\calA$ while labeling the first $n$ observations is simply

\begin{align*}
    \Regret_{1:T} &= cn + \Ex\sqaren{\Regret_{1:n}} + (T-n)\frac{1}{T-n}\Ex\sqaren{\Regret_{n+1:T}}\\
    &\leq cn + \Ex\sqaren{\Regret_{1:n}} + (T-n)\frac{1}{n}\Ex\sqaren{\Regret_{1:n}}\\
     &\leq cn +  \constant \sqrt{n} + (T-n)\frac{ \constant}{\sqrt{n}}\\
     &= cn + n\frac{ \constant }{\sqrt{n}} + (T-n)\frac{ \constant}{\sqrt{n}}\\
     &= cn + T  \constant n^{-1/2}
\end{align*}

We can now simply minimize this expression with respect to the number of labels $n$...
$$\dfrac{d}{dn}\paren{cn + T  \constant n^{-1/2} } = c - \frac{T \constant}{2}n^{-3/2}$$

Solving for $c - T  \constant n^{-3/2}/2 = 0$, we have
$$n = \paren{\frac{T \constant}{2c}}^{2/3}$$

Since the second derivative $3T  \constant n^{-5/2}/4$ is always positive, this is a global minimum.

Since the regret $\Regret_{1:T}$ is bounded by $cn + T  \constant n^{-1/2}$, and since $cn + T  \constant n^{-1/2}$ is minimized by setting $n = \paren{\frac{T \constant}{2c}}^{2/3}$, we can minimize the upper bound on regret by requesting $n = \paren{\frac{T \constant}{2c}}^{2/3}$ labels.

Plugging it back into the original expression, we have the desired regret rate
\begin{align*}
    \Regret_{1:T} &\leq cn + T  \constant n^{-1/2}\\
    &= c \paren{\frac{T \constant}{2c}}^{2/3} + T \constant \paren{\paren{\frac{T \constant}{2c}}^{2/3}}^{-1/2}\\
    &= c \paren{\frac{T \constant}{2c}}^{2/3} + T \constant \paren{\frac{T \constant}{2c}}^{-1/3}\\
    &= c^{1/3} \paren{\frac{T \constant}{2}}^{2/3} + \paren{T \constant}^{2/3} \paren{2c}^{1/3}\\
    &\in O\paren{c^{1/3} \constant^{2/3}T^{2/3}}.
\end{align*}

Note that as $c \to 0$, $n \to \infty$ which makes sense since if the labels are free and always improve performance then the algorithm should always get the label. In this case, note that $n$ must be less than or equal to $T$, and therefore we recover the original regret expression.

$$\Regret_{1:T}  \leq cn + T  \constant n^{-1/2} = 0n + T  \constant n^{-1/2} = T \constant T^{-1/2} =  \constant\sqrt{T}$$
\end{proof}

%\input{sections/simplemab/appendix}
\subsection{Proof of Theorem 1}
%\ref{theorem:simple}

\paragraph{Theorem 1} (Regret Rate for WiW Algorithm). 
\emph{Algorithm 1 has a regret rate of $\tilde O(kc^{1/3}T^{2/3})$ with high probability.} %\ref{alg:wiw}

\begin{proof}
The proof has two main claims -- that we will hit a termination condition within $\tilde O(k(T/c)^{2/3})$ labels, and that upon doing so the regret will be bounded by $\tilde O(kT^{2/3})$.


\paragraph{Termination.} We show that the algorithm terminates after $\tilde O(k(T/c)^{2/3})$ labels by showing that the number of labels necessary for the algorithm to terminate can be bounded by the number of labels necessary for $u_t^{(a)} - \ell_t^{(a)} \leq w$ to hold for all arms.



First, note that since $g_t^{(a)} = u_t^{(a)} - \nu_t $ and $\nu_t = \max_{a' \in \calA} \ell_t^{(a')}$, an arm's gap $g_t^{(a)}$ is bounded above by $ u_t^{(a)} - \ell_t^{(a)}$.
$$g_t^{(a)} = u_t^{(a)} - \nu_t = u_t^{(a)} - \max_{a' \in \calA} \ell_t^{(a')} \leq  u_t^{(a)} - \ell_t^{(a)}$$

\noindent Therefore, $u_t^{(a)} - \ell_t^{(a)} \leq w$ implies that $g_t^{(a)} \leq w$. Similarly, if $u_t^{(a)} - \ell_t^{(a)} \leq w$ for all arms $a \in \calA$ then $g_t^{(a)}\leq w$ for all arms $a \in \calA$ and the first termination condition holds.


Now, we solve for how many reward observations for an arm $a$ are necessary for $g_t^{(a)} \leq u_t^{(a)} - \ell_t^{(a)} \leq w$.

$$
 u_t^{(a)} - \ell_t^{(a)} = \mu_t^{(a)} + \sqrt{\dfrac{\log(kT/\delta)}{n_t^{(a)}}}  - \paren{\mu_t^{(a)} - \sqrt{\dfrac{\log(kT/\delta)}{n_t^{(a)}}}} = 2\sqrt{\dfrac{\log(kT/\delta)}{n_t^{(a)}}} = \sqrt{\dfrac{4\log(kT/\delta)}{n_t^{(a)}}}
$$


\begin{align*}
u_t^{(a)} - \ell_t^{(a)} = \sqrt{\dfrac{4\log(kT/\delta)}{n_t^{(a)}}} &\leq \sqrt[3]{\frac{4c\log(kT/\delta)}{T}} = w\\
%\dfrac{4\log(kT/\delta)}{n_t^{(a)}} &\leq \sqrt[3]{\frac{4c\log(kT/\delta)}{T}}^2\\
\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3} &\leq n_t^{(a)}
\end{align*}

\noindent Therefore, an arm $a$ needs to be played at most $\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3}$ times in order for $g_t^{(a)} \leq w$ to hold.

Second, note that since the algorithm always plays the least played arm associated with the maximum gap, it takes at most $2\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3}$ labels for both of the arms associated with a gap to satisfy $u_t^{(a)} - \ell_t^{(a)} \leq w$, and therefore for $g_t^{(a)} \leq w$ to hold. Further, since the algorithm always plays an arm associated with the maximum gap, it will be decreasing all of the $k$ gaps until it terminates. Therefore, the algorithm will reach the first termination condition after at most $2k\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3}$ labels. Note that the second termination condition may be reached sooner than this if all but the holdout arm have $g_t^{(a)} \leq w$.

Therefore in conclusion, the algorithm will commit to an arm after at most $2k\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3}$ labels. We can upper bound the regret incurred during this phase by $(1+c)$ times the length of the labeling phase to represent paying regret for the largest possible reward difference between the arms as well as the labeling cost $c$, totaling in a regret of at most

$$2(1+c)k\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3}.$$

\paragraph{Regret.} There are two regret cases to cover, one for if the first termination is reached, and another for if the second termination condition is reached.

In the first case, we commit to playing the arm $a^\nu_t$ associated with $\nu_t$ after $g^{(a)}_t \leq w$ for all arms. Since
$g^{(a)}_t  = u_t^{(a)} - \nu_t = u_t^{(a)} - \ell_t^{(a^\nu_t)}$ and since with high probability for all arms $a \in \calA$, $\ell_t^{(a)} \leq {\mu^*}^{(a)} \leq u_t^{(a)},$ it follows that $g^{(a)}_t$ is an upper bound on the per-turn regret of choosing $a^\nu_t$ instead of $a$.

\begin{align*}
    g^{(a)}_t &\geq \mu^{(a)} - \nu_t &\text{Hoeffding bound}\\
    &= \mu^{(a)} - \ell_t^{(a^\nu_t)} &\text{Definition of } \nu_t\\
    &\geq \mu^{(a)} - \mu^{(a^\nu_t)} &\text{Hoeffding bound}.
\end{align*}

Since $g^{(a)}_t \leq w$ for all arms, it then follows that the per-turn regret of committing to $a^\nu_t$ is at most $w = \sqrt[3]{\dfrac{4c\log(kT/\delta)}{T}}$. The regret after committing can be bounded by $T$ times the maximum possible per-turn regret, yielding a regret of at most

$$T\sqrt[3]{\dfrac{4c\log(kT/\delta)}{T}} = \sqrt[3]{4c\log(kT/\delta)}T^{2/3}. $$

In the second case, the arm $a$ with the maximum gap $g^{(a)}_t$ is the holdout arm, while every other $a'$ is such that $g^{(a')}_t \leq w$. In this case, $w$ still bounds the per-turn regret of choosing $a$ instead of some other $a'$, and has the same regret bound.


\paragraph{Conclusion.} Adding together the two regret terms, we have $2(1+c)k\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3} +  \sqrt[3]{4c\log(kT/\delta)}T^{2/3}$, for a total $\tilde O(kc^{1/3}T^{2/3})$ regret of

$$2k\sqrt[3]{4\log(kT/\delta)}(T/c)^{2/3} + \paren{1+2k}\sqrt[3]{4c\log(kT/\delta)}T^{2/3} \in \tilde O(kc^{1/3}T^{2/3}).$$
\end{proof}

\bibliography{bibliography2}
\end{document}