\documentclass[accepted]{uai2023}
\usepackage{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{tikz}

\usepackage{xr}
\externaldocument{lalitha_288}

\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage[noend]{algpseudocode}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{bbm}
\usepackage{bm}
\usepackage{color}
\usepackage{dirtytalk}
\usepackage{dsfont}
\usepackage{enumerate}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{mathtools}
\usepackage{subfigure}
\usepackage{times}
\usepackage{url}
\usepackage{xspace}

% \usepackage[usenames,dvipsnames]{xcolor}
\usepackage[bookmarks=false]{hyperref}
\hypersetup{
%   pdftex,
  pdffitwindow=true,
  pdfstartview={FitH},
  pdfnewwindow=true,
  colorlinks,
  linktocpage=true,
  linkcolor=blue,
  urlcolor=blue,
  citecolor=blue
}
\usepackage[capitalize,noabbrev]{cleveref}

\usepackage[textsize=tiny]{todonotes}
\newcommand{\todob}[2][]{\todo[color=red!20,size=\tiny,inline,#1]{B: #2}} % Brano's comments
\newcommand{\todokk}[2][]{\todo[color=blue!20,size=\tiny,inline,#1]{KK: #2}} % Kousha's comments
\newcommand{\todoym}[2][]{\todo[color=green!20,size=\tiny,inline,#1]{YM: #2}} % Yifei's comments

\usepackage{thmtools}
\declaretheorem[name=Theorem,refname={Theorem,Theorems},Refname={Theorem,Theorems}]{theorem}
\declaretheorem[name=Lemma,refname={Lemma,Lemmas},Refname={Lemma,Lemmas},sibling=theorem]{lemma}
\declaretheorem[name=Corollary,refname={Corollary,Corollaries},Refname={Corollary,Corollaries},sibling=theorem]{corollary}
\declaretheorem[name=Assumption,refname={Assumption,Assumptions},Refname={Assumption,Assumptions}]{assumption}
\declaretheorem[name=Proposition,refname={Proposition,Propositions},Refname={Proposition,Propositions},sibling=theorem]{proposition}
\declaretheorem[name=Definition,refname={Definition,Definitions},Refname={Definition,Definitions},sibling=theorem]{definition}
\declaretheorem[name=Example,refname={Example,Examples},Refname={Example,Examples}]{example}
\declaretheorem[name=Remark,refname={Remark,Remarks},Refname={Remark,Remarks}]{remark}

\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cE}{\mathcal{E}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cJ}{\mathcal{J}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cM}{\mathcal{M}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cW}{\mathcal{W}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cZ}{\mathcal{Z}}
\newcommand{\cBR}{\mathcal{BR}}

\newcommand{\integerset}{\mathbb{Z}}
\newcommand{\naturalset}{\mathbb{N}}
\newcommand{\realset}{\mathbb{R}}

\newcommand{\children}{\mathsf{ch}}
\newcommand{\diag}[1]{\mathrm{diag}\left(#1\right)}
\newcommand{\domain}[1]{\mathrm{dom}\left(#1\right)}
\newcommand{\parents}{\mathsf{pa}}
\newcommand{\range}[1]{\mathrm{rng}\left[#1\right]}

\newcommand{\E}[1]{\mathbb{E} \left[#1\right]}
\newcommand{\condE}[2]{\mathbb{E} \left[#1 \,\middle|\, #2\right]}
\newcommand{\Erv}[2]{\mathbb{E}_{#1} \left[#2\right]}
\newcommand{\Et}[1]{\mathbb{E}_t \left[#1\right]}
\newcommand{\prob}[1]{\mathbb{P} \left(#1\right)}
\newcommand{\condprob}[2]{\mathbb{P} \left(#1 \,\middle|\, #2\right)}
\newcommand{\probt}[1]{\mathbb{P}_t \left(#1\right)}
\newcommand{\var}[1]{\mathrm{var} \left[#1\right]}
\newcommand{\condvar}[2]{\mathrm{var} \left[#1 \,\middle|\, #2\right]}
\newcommand{\std}[1]{\mathrm{std} \left[#1\right]}
\newcommand{\condstd}[2]{\mathrm{std} \left[#1 \,\middle|\, #2\right]}
\newcommand{\cov}[1]{\mathrm{cov} \left[#1\right]}
\newcommand{\condcov}[2]{\mathrm{cov} \left[#1 \,\middle|\, #2\right]}

\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\ceils}[1]{\left\lceil#1\right\rceil}
\newcommand*\dif{\mathop{}\!\mathrm{d}}
\newcommand{\floors}[1]{\left\lfloor#1\right\rfloor}
\newcommand{\I}[1]{\mathds{1} \! \left\{#1\right\}}
\newcommand{\maxnorm}[1]{\|#1\|_\infty}
\newcommand{\negpart}[1]{\left[#1\right]^-}
\newcommand{\norm}[1]{\|#1\|}
\newcommand{\normw}[2]{\|#1\|_{#2}}
\newcommand{\pospart}[1]{\left[#1\right]^+}
\newcommand{\set}[1]{\left\{#1\right\}}
\newcommand{\subreal}[0]{\preceq}
\newcommand{\supreal}[0]{\succeq}
\newcommand{\T}{^\top}

\DeclareMathOperator*{\argmax}{arg\,max\,}
\DeclareMathOperator*{\argmin}{arg\,min\,}
\let\det\relax
\DeclareMathOperator{\det}{det}
\DeclareMathOperator{\poly}{poly}
\DeclareMathOperator{\rank}{rank}
\DeclareMathOperator{\sgn}{sgn}
\let\trace\relax
\DeclareMathOperator{\trace}{tr}
\mathchardef\mhyphen="2D

\newcommand{\agape}{\ensuremath{\tt A\mhyphen GapE}\xspace}
\newcommand{\agapev}{\ensuremath{\tt A\mhyphen GapE\mhyphen V}\xspace}
\newcommand{\alg}{\ensuremath{\tt Alg}\xspace}
\newcommand{\gape}{\ensuremath{\tt GapE}\xspace}
\newcommand{\gapev}{\ensuremath{\tt GapE\mhyphen V}\xspace}
\newcommand{\sh}{\ensuremath{\tt SH}\xspace}
\newcommand{\shadavar}{\ensuremath{\tt SHAdaVar}\xspace}
\newcommand{\shvar}{\ensuremath{\tt SHVar}\xspace}
\newcommand{\unif}{\ensuremath{\tt Unif}\xspace}
\newcommand{\vbr}{\ensuremath{\tt VBR}\xspace}

\title{Fixed-Budget Best-Arm Identification with Heterogeneous Reward Variances (Supplementary Material)}

\author[ ]{{Anusha Lalitha}}
\author[ ]{Kousha Kalantari}
\author[ ]{Yifei Ma}
\author[ ]{Anoop Deoras}
\author[ ]{Branislav Kveton}
\affil[ ]{AWS AI Labs}
\affil[ ]{\texttt{\{anlalith,kkalant,yifeim,adeoras,bkveton\}@amazon.com}}

\begin{document}

\onecolumn
\appendix
\maketitle

\section{Proof of Theorem \ref{thm:shvar}}
\label{sec:shvar proof}

First, we decompose the probability of choosing a suboptimal arm. For any $s \in [m]$, let $E_s = \set{1 \in \cA_{s + 1}}$ be the event that the best arm is not eliminated in stage $s$ and $\bar{E}_s$ be its complement. Then by the law of total probability,
\begin{align*}
  \prob{\hat{I} \neq 1}
  = \prob{\bar{E}_m}
  = \sum_{s = 1}^m \prob{\bar{E}_s, E_{s - 1}, \dots, E_1}
  \leq \sum_{s = 1}^m \condprob{\bar{E}_s}{E_{s - 1}, \dots, E_1}\,.
\end{align*}
We bound $\condprob{\bar{E}_s}{E_{s - 1}, \dots, E_1}$ based on the observation that the best arm can be eliminated only if the estimated mean rewards of at least half of the arms in $\cA_s$ are at least as high as that of the best arm. Specifically, let $\cA_s' = \cA_s \setminus \set{1}$ be the set of all arms in stage $s$ but the best arm and
\begin{align*}
  N_s'
  = \sum_{i \in \cA_s'} \I{\hat{\mu}_{s, i} \geq \hat{\mu}_{s, 1}}\,.
\end{align*}
Then by Markov's inequality,
\begin{align*}
  \condprob{\bar{E}_s}{E_{s - 1}, \dots, E_1}
  \leq \condprob{N_s' \geq \frac{\abs{\cA_s}}{2}}{E_{s - 1}, \dots, E_1}
  \leq \frac{2 \, \condE{N_s'}{E_{s - 1}, \dots, E_1}}{\abs{\cA_s}}\,.
\end{align*}
The key step in bounding the above expectation is understanding the probability that any arm has a higher estimated mean reward than the best one. We bound this probability next.

\begin{lemma}
\label{lem:arm error} For any stage $s \in [m]$ with the best arm, $1 \in \cA_s$, and any suboptimal arm $i \in \cA_s$, we have
\begin{align*}
  \prob{\hat{\mu}_{s, i} \geq \hat{\mu}_{s, 1}}
  \leq \exp\left[- \frac{n_s \Delta_i^2}{4 \sum_{j \in \cA_s} \sigma_j^2}\right]\,.
\end{align*}
\end{lemma}
\begin{proof}
The proof is based on concentration inequalities for sub-Gaussian random variables \citep{boucheron13concentration}. In particular, since $\hat{\mu}_{s, i} - \mu_i$ and $\hat{\mu}_{s, 1} - \mu_1$ are sub-Gaussian with variance proxies $\sigma_i^2 / N_{s, i}$ and $\sigma_1^2 / N_{s, 1}$, respectively, their difference is sub-Gaussian with a variance proxy $\sigma_i^2 / N_{s, i} + \sigma_1^2 / N_{s, 1}$. It follows that
\begin{align*}
  \prob{\hat{\mu}_{s, i} \geq \hat{\mu}_{s, 1}}
  & = \prob{\hat{\mu}_{s, i} - \hat{\mu}_{s, 1} \geq 0}
  = \prob{(\hat{\mu}_{s, i} - \mu_i) - (\hat{\mu}_{s, 1} - \mu_1) \geq \Delta_i} \\
  & \leq \exp\left[- \frac{\Delta_i^2}
  {2 \left(\frac{\sigma_i^2}{N_{s, i}} + \frac{\sigma_1^2}{N_{s, 1}}\right)}\right]
  = \exp\left[- \frac{n_s \Delta_i^2}{4 \sum_{j \in \cA_s} \sigma_j^2}\right]\,,
\end{align*}
where the last step follows from the definitions of $N_{s, i}$ and $N_{s, 1}$ in \cref{lem:shvar allocation}.
\end{proof}

The last major step is bounding $\condE{N_s'}{E_{s - 1}, \dots, E_1}$ with the help of \cref{lem:arm error}. Starting with the union bound, we get
\begin{align*}
  \condE{N_s'}{E_{s - 1}, \dots, E_1}
  & \leq \sum_{i \in \cA_s'} \prob{\hat{\mu}_{s, i} \geq \hat{\mu}_{s, 1}}
  \leq \sum_{i \in \cA_s'}
  \exp\left[- \frac{n_s \Delta_i^2}{4 \sum_{j \in \cA_s} \sigma_j^2}\right] \\
  & \leq \abs{\cA_s} \max_{i \in \cA_s'}
  \exp\left[- \frac{n_s \Delta_i^2}{4 \sum_{j \in \cA_s} \sigma_j^2}\right]
  = \abs{\cA_s} \exp\left[- \frac{n_s \min_{i \in \cA_s'} \Delta_i^2}
  {4 \sum_{j \in \cA_s} \sigma_j^2}\right]\,.
\end{align*}
Now we chain all inequalities and get
\begin{align*}
  \prob{\hat{I} \neq 1}
  \leq 2 \sum_{s = 1}^m \exp\left[- \frac{n_s \min_{i \in \cA_s'} \Delta_i^2}
  {4 \sum_{j \in \cA_s} \sigma_j^2}\right]\,.
\end{align*}
To get the final claim, we use that
\begin{align*}
  m
  = \log_2 K\,, \quad
  n_s
  = \frac{n}{\log_2 K}\,, \quad
  \min_{i \in \cA_s'} \Delta_i^2
  \geq \Delta_{\min}^2\,, \quad
  \sum_{j \in \cA_s} \sigma_j^2
  \leq \sum_{j \in \cA} \sigma_j^2\,.
\end{align*}
This concludes the proof.


\section{Proof of Theorem \ref{thm:shvar2}}
\label{sec:shvar2 proof}

This proof has the same steps as that in \cref{sec:shvar proof}. The only difference is that $N_{s, i}$ and $N_{s, 1}$ in \cref{lem:arm error} are replaced with their lower bounds, based on the following lemma.

\begin{lemma}
\label{lem:shvar pulls} Fix stage $s$ and arm $i \in \cA_s$ in \shvar. Then
\begin{align*}
  N_{s, i}
  \geq \frac{\sigma_i^2}{\sigma_{\max}^2} \left(\frac{n_s}{\abs{\cA_s}} - 1\right)\,,
\end{align*}
where $\sigma_{\max} = \max_{i \in \cA} \sigma_i$ is the maximum reward noise and $n_s$ is the budget in stage $s$.
\end{lemma}
\begin{proof}
Let $J$ be the most pulled arm in stage $s$ and $\ell \in [n_s]$ be the round where arm $J$ is pulled for the last time. By the design of \shvar, since arm $J$ is pulled in round $\ell$,
\begin{align*}
  \frac{\sigma_J^2}{N_{s, \ell, J}}
  \geq \frac{\sigma_i^2}{N_{s, \ell, i}}
\end{align*}
holds for any arm $i \in \cA_s$. This can be further rearranged as
\begin{align*}
  N_{s, \ell, i}
  \geq \frac{\sigma_i^2}{\sigma_J^2} N_{s, \ell, J}\,.
\end{align*}
Since arm $J$ is the most pulled arm in stage $s$ and $\ell$ is the round of its last pull,
\begin{align*}
  N_{s, \ell, J}
  = N_{s, J} - 1
  \geq \frac{n_s}{\abs{\cA_s}} - 1\,.
\end{align*}
Moreover, $N_{s, i} \geq N_{s, \ell, i}$. Now we combine all inequalities and get
\begin{align}
  N_{s, i}
  \geq \frac{\sigma_i^2}{\sigma_J^2} \left(\frac{n_s}{\abs{\cA_s}} - 1\right)\,.
  \label{eq:shvar pull lower bound}
\end{align}
To eliminate dependence on random $J$, we use $\sigma_J \leq \sigma_{\max}$. This concludes the proof.
\end{proof}

When \cref{lem:shvar pulls} is plugged into \cref{lem:arm error}, we get
\begin{align*}
  \prob{\hat{\mu}_{s, i} \geq \hat{\mu}_{s, 1}}
  \leq \exp\left[- \frac{\Delta_i^2}
  {2 \left(\frac{\sigma_i^2}{N_{s, i}} + \frac{\sigma_1^2}{N_{s, 1}}\right)}\right]
  \leq \exp\left[- \frac{\left(\frac{n_s}{\abs{\cA_s}} - 1\right) \Delta_i^2}
  {4 \sigma_{\max}^2}\right]\,.
\end{align*}
This completes the proof.


\section{Proof of Theorem \ref{thm:shadavar}}
\label{sec:shadavar proof}

This proof has the same steps as that in \cref{sec:shvar proof}. The main difference is that $N_{s, i}$ and $N_{s, 1}$ in \cref{lem:arm error} are replaced with their lower bounds, based on the following lemma.

\begin{lemma}
\label{lem:shadavar pulls} Fix stage $s$ and arm $i \in \cA_s$ in \shadavar. Then
\begin{align*}
  N_{s, i}
  \geq \frac{\sigma_i^2}{\sigma_{\max}^2} \alpha(\abs{\cA_s}, n_s, \delta)
  \left(\frac{n_s}{\abs{\cA_s}} - 1\right)\,,
\end{align*}
where $\sigma_{\max} = \max_{i \in \cA} \sigma_i$ is the maximum reward noise, $n_s$ is the budget in stage $s$, and
\begin{align*}
  \alpha(k, n, \delta)
  = \frac{1 - 2 \sqrt{\frac{\log(1 / \delta)}{n / k - 2}}}
  {1 + 2 \sqrt{\frac{\log(1 / \delta)}{n / k - 2}} +
  \frac{2 \log(1 / \delta)}{n / k - 2}}
\end{align*}
is an arm-independent constant.
\end{lemma}
\begin{proof}
Let $J$ be the most pulled arm in stage $s$ and $\ell \in [n_s]$ be the round where arm $J$ is pulled for the last time. By the design of \shadavar, since arm $J$ is pulled in round $\ell$,
\begin{align*}
  \frac{U_{s, \ell, J}}{N_{s, \ell, J}}
  \geq \frac{U_{s, \ell, i}}{N_{s, \ell, i}}
\end{align*}
holds for any arm $i \in \cA_s$. Analogously to \eqref{eq:shvar pull lower bound}, this inequality can be rearranged and loosened as
\begin{align}
  N_{s, i}
  \geq \frac{U_{s, \ell, i}}{U_{s, \ell, J}} \left(\frac{n_s}{\abs{\cA_s}} - 1\right)\,.
  \label{eq:shadavar pull lower bound}
\end{align}
We bound $U_{s, \ell, i}$ from below using the fact that $U_{s, \ell, i} \geq \sigma_i^2$ holds with probability at least $1 - \delta$, based on the first claim in \cref{lem:concentration}. To bound $U_{s, \ell, J}$, we apply the second claim in \cref{lem:concentration} to bound $\hat{\sigma}_{s, \ell, J}^2$ in $U_{s, \ell, J}$, and get that
\begin{align*}
  U_{s, \ell, J}
  \leq \sigma_J^2 \frac{1 + 2 \sqrt{\frac{\log(1 / \delta)}{N_{s, \ell, J} - 1}} +
  \frac{2 \log(1 / \delta)}{N_{s, \ell, J} - 1}}
  {1 - 2 \sqrt{\frac{\log(1 / \delta)}{N_{s, \ell, J} - 1}}}
\end{align*}
holds with probability at least $1 - \delta$. Finally, we plug both bounds into \eqref{eq:shadavar pull lower bound} and get
\begin{align*}
  N_{s, i}
  \geq \frac{\sigma_i^2}{\sigma_J^2}
  \frac{1 - 2 \sqrt{\frac{\log(1 / \delta)}{N_{s, \ell, J} - 1}}}
  {1 + 2 \sqrt{\frac{\log(1 / \delta)}{N_{s, \ell, J} - 1}} +
  \frac{2 \log(1 / \delta)}{N_{s, \ell, J} - 1}} \left(\frac{n_s}{\abs{\cA_s}} - 1\right)\,.
\end{align*}
To eliminate dependence on random $J$, we use that $\sigma_J \leq \sigma_{\max}$ and $N_{s, \ell, J} \geq n_s / \abs{\cA_s} - 1$. This yields our claim and concludes the proof of \cref{lem:shadavar pulls}.
\end{proof}

Similarly to \cref{lem:shvar pulls}, this bound is asymptotically tight when all reward variances are identical. Also $\alpha(\abs{\cA_s}, n_s, \delta) \to 1$ as $n_s \to \infty$. Therefore, the bound has the same shape as that in \cref{lem:shvar pulls}.

The application of \cref{lem:shadavar pulls} requires more care. Specifically, it relies on high-probability confidence intervals derived in \cref{lem:concentration}, which need $N_{s, t, i} > 4 \log(1 / \delta) + 1$. This is guaranteed whenever $n \geq K \log_2 K (4 \log(1 / \delta) + 1)$. Moreover, since the confidence intervals need to hold in any stage $s$ and round $t$, and for any arm $i$, we need a union bound over $K n$ events. This leads to the following claim.

Suppose that $n \geq K \log_2 K (4 \log(1 / \delta) + 1)$. Then, when \cref{lem:shadavar pulls} is plugged into \cref{lem:arm error}, we get that
\begin{align*}
  \prob{\hat{\mu}_{s, i} \geq \hat{\mu}_{s, 1}}
  \leq \exp\left[- \frac{\Delta_i^2}
  {2 \left(\frac{\sigma_i^2}{N_{s, i}} + \frac{\sigma_1^2}{N_{s, 1}}\right)}\right]
  \leq \exp\left[- \frac{\alpha(\abs{\cA_s}, n_s, K n \delta)
  \left(\frac{n_s}{\abs{\cA_s}} - 1\right) \Delta_i^2}
  {4 \sigma_{\max}^2}\right]\,.
\end{align*}
This completes the proof.

\bibliography{References}

\end{document}
