\onecolumn
% \section{Notation table}

\begin{table*}
 
\centering
\begin{tabular}{@{}ll@{}}\toprule
\textbf{Symbol} & \textbf{Meaning} \\
\midrule
$d$ & Dimension of environment \\
$T$ & Time horizon \\
$L$ & Number of phases \\
$\theta^*$ & True Reward Function Parameter \\
$\theta$ & Demonstrator's Reward Function Parameter \\
$\hat{\theta}$ & Inverse Estimator's Estimated Reward Parameter\\
$\gamma$ & Closeness parameter of action set \\
$a_t$ & Action taken by demonstrator at time $t$ \\
$x_t$ & Reward seen by demonstrator at time $t$ \\
$\eta_t$ & Noise in reward function seen at time $t$ \\
$\mu^*$ & Reward of optimal arm \\
$a^*$ & Optimal action with the highest reward \\
$\chi_1 = \min_{a \in \mathcal{A}} \|a\|_2$ & Smallest-norm action in action set \\
$\chi_2 = \max_{a \in \mathcal{A}} \|a\|_2$ & Largest-norm action in action set \\
$\mathcal{A}_{\ell}$ & Set of remaining arms at phase $\ell$ \\
$\mathcal{A}_{\ell} \setminus \mathcal{A}_{\ell - 1}$ & Set of eliminated arms before phase $\ell$\\
$\epsilon_{\ell}$ & $2^{-\ell}$, used as the criterion for elimination \\
$\nu_{\ell}$ & Error parameter for G-Optimal Design\\
$\delta$ &  Probability Parameter for G-Optimal Design\\

\bottomrule
\end{tabular}
\caption{Table of notation used in main paper and proofs}
\label{tab:notation}
\end{table*}

% \section{Technical Lemmas}
% \subsection{Proof of Lemma B.1}
% \begin{restatable}{lemma}{gammacloserewards}
% \label{lem:gam_close_rewards}
% Given two arms $a, b$ that are $\gamma$-close, i.e. $\|a - b\|_2 \leq \gamma$, the difference in their rewards is bounded by 
% $$\langle a, \theta^* \rangle  - \langle b, \theta^* \rangle \leq \gamma \norm{\theta^*}_2 \text{.}$$
% \end{restatable}
% \begin{proof}
% Simply,
% \begin{align}
%     \langle a, \theta^* \rangle  - \langle b, \theta^* \rangle &= \langle a-b, \theta^* \rangle \nonumber\\
%     &\leq \norm{a-b}_2 \norm{\theta^*}_2\nonumber\\
%     &\leq \gamma \norm{\theta^*}_2\nonumber
% \end{align}
% \end{proof}

We now collect the proofs of our main results, for which Table~\ref{tab:notation} summarizes relevant mathematical notation.


\section{Phased Elimination Proofs}
\label{sec:appendphasedelim}
% We first prove that the estimate of the reward parameter for the forward algorithm is an accurate estimate of $\theta^*$. The central intuition behind this is that the G-Optimal design is chosen to ensure the forward algorithm explores each dimension in $\mathbb{R}^d$. This exploration helps ensure that the demonstrator's estimate of $\theta$ accurately predicts the sample mean rewards for any arm in the active set, not just ones that point in specific favorable directions. Formally, it ensures that the demonstrator's estimate of the reward of any arm in the remaining active set of any phase $\ell$ is bounded by a $\nu_{\ell}$. This lemma is similar to that of Lemma 6.1 in \citet{batchedbandits}.

First, we collect properties of the forward algorithm (the phased elimination algorithm) that will be useful for analyzing our inverse estimator.
The following lemma, essentially Lemma 6.1 in~\citet{batchedbandits}, shows that the error in the forward algorithm's estimate of the mean reward of any (active) arm decreases as more epochs are executed.
%
\begin{restatable}[\textbf{Demonstrator's Estimation Error}~\citep{batchedbandits}]{lemma}{errorgoodterm}
\label{lem:error_good_term}
Suppose that~\Cref{alg:phased_elim} is run, and denote by $\theta_{\ell}$ the forward algorithm's estimate of the reward parameter $\theta^*$. 
Denote the ``good event''
\begin{align}
    \mathcal{E}_{\mathsf{good}} := \{\lvert \langle a, \theta_{\ell} - \theta^* \rangle \rvert \leq \nu_{\ell} \text{ for all } a \in \mathcal{A}_{\ell}, \ell \in [L]\}.
\end{align}
Then, the good event $\mathcal{E}_{\mathsf{good}}$ occurs with probability at least $1 - |\mathcal{A}|L\delta$.
% Here, $\theta_{\ell}$ estimates the forward algorithm reward parameter after the $l$th phase. 
\end{restatable}

\begin{proof}
    Fix an epoch $\ell \in [L]$, an active arm $a \in \mathcal{A}_{\ell}$, and recall that the failure probability parameter $\delta$ was given as input to~\Cref{alg:phased_elim}.
    Then, Lemma 6.1 of~\citet{batchedbandits} tells us that $$|\langle a, \theta_{\ell} - \theta^*\rangle| \leq \nu_{\ell}$$ with probability at least $1 - \delta$.
    Taking a union bound over all active arms $a \in \mathcal{A}_{\ell}$ and all $\ell \in [L]$, and noting that $|\mathcal{A}_{\ell}| \leq |\mathcal{A}|$ completes the proof of the lemma.
    % From Lemma 6.1 of \cite{batchedbandits}, for any $\delta, \nu_{\ell} \geq 0$, we know that we can find a multiset where after playing the multiset in batched bandits fashion, the least-squares estimate error for any arm $a$ is $|\langle a, \theta_{\ell} - \theta^*\rangle| \leq \nu_{\ell}$ with probability $1 - \delta$. 
    % Therefore, we know that we can form a multiset such that for every arm $a \in \mathcal{A}$ and all phases $l$, $$|\langle a,  \theta_{\ell}  - \theta^*\rangle| \leq \nu_{\ell}\text{.}$$ 
    % To get a lower bound on the probability that this event occurs, we need to find the probability of the union of all these events not happening. We can upper bound this by taking the union bound of all events. For all $|\mathcal{A}|$ arms and $L$ phases, we get that the probability of any of these events not happening is upper bounded by $|\mathcal{A}|L\delta$. 
\end{proof}

Henceforth, we work on the good event $\mathcal{E}_{\mathsf{good}}$ and state and prove a series of simple lemmas.
The first such lemma shows that the optimal arm $a^*$ is not eliminated in any of the phases $\ell \in [L]$.
% This accuracy of the forward algorithm's $\theta_{\ell}$ helps maintain its low regret properties. Given the accuracy of its reward parameter, it is intuitive that with high probability, the forward algorithm knows which arms are suboptimal and which are not. This intuition should include that of the optimal arm $A^*$, which is not suboptimal by definition. Therefore, with high probability, the forward algorithm does not eliminate the optimal arm. 
\begin{restatable}{corollary}{bestarmactive}
\label{corr:best_arm_active}
The optimal arm $a^*$ remains active throughout, i.e. $a^* \in \mathcal{A}_{\ell}$ for all $\ell \in [L]$, under the good event $\mathcal{E}_{\mathsf{good}}$.
\end{restatable}
\begin{proof}
    From Lemma \ref{lem:error_good_term}, for any suboptimal arm $a$, we have
    $$\langle a,  \theta_{\ell}  \rangle -  \langle a^*, \theta_{\ell}  \rangle \leq (\langle a, \theta^* \rangle + \nu_{\ell}) -  (\langle a^*, \theta^* \rangle - \nu_{\ell}) \leq 2\iota\epsilon_{\ell} \leq 2 \epsilon_{\ell}$$
    on the good event $\mathcal{E}_{\mathsf{good}}$.
    Thus, the elimination criterion is \emph{not} satisfied by arm $a^*$ in any epoch $\ell \in [L]$.
    This completes the proof of the corollary.
    % The event from Lemma \ref{lem:error_good_term} occurs with probability $1 - \delta$, so this result also happens with probability $1 - \delta$.
\end{proof}
% Given the event that the optimal arm remains in the active set, we can state with a high probability that suboptimal arms will be eliminated. This is clear from the elimination criteria; if an arm's reward is much worse than the best-estimated reward for any arm in the active set, it will be eliminated. Given that the optimal arm is still in the active set and the reward estimate is accurate, arms with a true reward much worse than the optimal arm will most likely also have an estimated reward worse than the optimal arm. This will lead to the elimination of that arm. We formalize this in \Cref{lem:armwillbeeliminated}.
The next lemma shows a related property, i.e. that sufficiently suboptimal arms will be eliminated (and that more suboptimal arms will be eliminated in earlier epochs).
\armwillbeeliminated*
\begin{proof}
    Let $b_{\ell - 1}$ be the arm that maximizes the reward in epoch $\ell - 1$, i.e. $b_{\ell - 1} = \underset{b \in \mathcal{A}_{\ell - 1}}{\argmax} \langle b, \theta_{\ell -1}\rangle$. 
    From \Cref{lem:error_good_term}, we have that any arm $a$ satisfying \Cref{armwillbeeliminatedequation} satisfies
    \begin{align}
        \langle b_{\ell-1} - a, \theta_{\ell-1}\rangle  &\leq \langle b_{\ell-1} - a, \theta^*\rangle  + 2\nu_{\ell - 1}\label{eq:129}\\
        &\leq \langle a^* - a, \theta^*\rangle   + 2\nu_{\ell - 1}\nonumber \\
        &\leq 4(1 - \iota)\epsilon_{\ell}   + 2\iota\epsilon_{\ell - 1}\nonumber \\
         &\leq 2(1 - \iota)\epsilon_{\ell - 1}   + 2\iota\epsilon_{\ell - 1} \label{eq:vid1} \\
         &= 2\epsilon_{\ell - 1} \nonumber
    \end{align}
    where~\Cref{eq:129} follows from the good event in Lemma~\ref{lem:error_good_term} and~\Cref{eq:vid1} follows because $2\epsilon_{\ell} = \epsilon_{\ell-1}$. This implies that arm $a$ will not be eliminated in phase $\ell - 1$.
    On the other hand, for epoch $\ell$, we have
    \begin{align}
         \langle b_{\ell} - a, \theta_{\ell}\rangle &=  \langle b_{\ell}, \theta_{\ell}\rangle - \langle a, \theta_{\ell}\rangle\nonumber \\
         &\geq \langle a^*, \theta_{\ell}\rangle - \langle a, \theta_{\ell} \rangle \nonumber \\
         &\geq  \langle a^* - a, \theta^*\rangle - 2\nu_{\ell} \label{eq:140} \\
         &=  \langle a^* - a, \theta^*\rangle - 2\iota\epsilon_{\ell} \nonumber \\
         &\geq  2(1-\iota)\epsilon_{\ell} - 2\iota\epsilon_{\ell}  \nonumber \\
         &= 2\epsilon_{\ell} \nonumber
    \end{align}
     where~\Cref{eq:140} also follows from the good event in Lemma \ref{lem:error_good_term}.  Therefore, arm $a$ will be eliminated in phase $\ell$.
    This proves the first statement of the lemma.
    The second statement of the lemma, i.e. $\mu^* - 4 (1 - \iota) \epsilon_{\ell} \leq \langle a, \theta^* \rangle \leq \mu^*$, follows by rearranging the original inequalities and noting that $\mu^* := \langle a^*, \theta^* \rangle$.

%     By the definition of $\mu^*$, $$\langle a, \theta^*\rangle \leq \mu^*\text{.}$$
%     Given arm $a_i$ is in $\mathcal{A}_{\ell} \setminus \mathcal{A}_{\ell - 1}$, it was not eliminated in the previous phase. For notational ease, let $b = \argmax_{b \in \mathcal{A}_{\ell-1}} \langle b, \theta_{\ell-1}\rangle$. Therefore,
%     \begin{align}
%         2\epsilon_{l-1} &\geq \langle b - a, \theta_{\ell-1}\rangle \nonumber \\
%         &= \langle b , \theta_{\ell-1}\rangle  - \langle a, \theta_{\ell-1}\rangle \nonumber \\
%         &= \langle b , \theta_{\ell-1}\rangle  - \langle a, \theta_{\ell-1} - \theta^*\rangle - \langle a, \theta^*\rangle \nonumber\\
%         &\geq \langle b , \theta_{\ell-1}\rangle  - \nu_{\ell-1} - \langle a, \theta^*\rangle \label{eq:135}\\
%         &\geq \langle a^* , \theta_{\ell-1}\rangle  - \nu_{\ell-1} - \langle a, \theta^*\rangle \label{eq:136} \\
%         &= \langle a^* , \theta_{\ell-1} - \theta^*\rangle + \langle a^* , \theta^* \rangle   - \nu_{\ell-1} - \langle a, \theta^*\rangle  \nonumber\\
%         &\geq \langle a^* , \theta^* \rangle   - 2\nu_{\ell-1} - \langle a, \theta^*\rangle \label{eq:138}
%     \end{align}


% Here, \Cref{eq:135} comes from Lemma \ref{lem:error_good_term}, which happens with probability at least $1 - |\mathcal{A}|L\delta$. \Cref{eq:136} comes from the fact that $b$ achieves the maximum reward in $\mathcal{A}_{l-1}$ and $a^* \in \mathcal{A}_{l-1}$ with the same probability according to Corollary \ref{corr:best_arm_active}. Also, \Cref{eq:138} comes from applying Lemma \ref{lem:error_good_term} again. Therefore, we have
% \begin{align}
% \langle a, \theta^*\rangle &\geq \mu^* - 2\epsilon_{\ell-1} - 2\nu_{\ell-1}\nonumber \\
% &= \mu^* - 4\epsilon_{\ell} - 4\iota\epsilon_{\ell}\nonumber\\
% &= \mu^* - 4(1 + \iota)\epsilon_{\ell}\nonumber
% \end{align}
\end{proof}
%
The following is a useful corollary to Lemma~\ref{lem:armwillbeeliminated} for arms that are ``close'' in Euclidean distance to arms satisfying the condition in Lemma~\ref{lem:armwillbeeliminated}.
\begin{restatable}{corollary}{closeisdeleted}
\label{lem:closeisdeleted}
    Consider an arm $a$ that is $\gamma$-close to some arm $b$ in Euclidean distance, i.e. $\|b - a\|_2 \leq \gamma$, such that arm $b$ satisfies
    \begin{equation}
        \mu^* - 4(1-\iota)\epsilon_{\ell} + \gamma\|\theta^*\|_2^2 \leq \langle a^* - b, \theta^* \rangle \leq \mu^*-2(1-\iota)\epsilon_{\ell} -\gamma\|\theta^*\|_2^2 \text{.} \label{eq:closeisdeleted}
    \end{equation}
    
    Then, under the good event $\mathcal{E}_{\mathsf{good}}$, arm $a$ will be eliminated before phase $L$, i.e. $a \in \mathcal{A}_L \setminus \mathcal{A}_{L-1}$.
\end{restatable}
\begin{proof}
    We have that $|\langle b - a, \theta^*\rangle| \leq \gamma\|\theta^*\|_2$ since $\|a - b\|_2 \leq \gamma$. 
    Therefore, 
    \begin{align}
        \langle a^* - a, \theta^* \rangle &\leq \langle a^* - b, \theta^* \rangle + \gamma\|\theta^*\|_2\nonumber\\
        &\leq \mu^*-4(1-\iota)\epsilon_{\ell}\nonumber
    \end{align}
    Moreover, we have
    \begin{align}
        \langle a^* - a, \theta^* \rangle &\geq \langle a^* - b, \theta^* \rangle - \gamma\|\theta^*\|_2\nonumber\\
        &\geq \mu^*-2(1-\iota)\epsilon_{\ell}\nonumber
    \end{align}
   Thus, directly applying Lemma \ref{lem:armwillbeeliminated} shows that arm $a$ will be eliminated.
\end{proof}
%
This completes our set of lemmas that work on the good event $\mathcal{E}_{\mathsf{good}}$.
Finally, we provide a lemma that characterizes the total number of phases $L$, which is technically a random variable, in terms of a deterministic upper bound that is logarithmic in $T$.
% Moreover, for simplicity, throughout this paper, we will do most of our calculations based on phase numbers, including $L$, the last phase number. However, given that the last phase is technically a random variable based on the G-optimal design, we provide a lower bound on the phase $L$ in terms of $T$. Here, we see that $L$ is lower bounded by the logarithm of $T$ up to constants. 
\begin{restatable}{lemma}{conntl}
    \label{lem:conntl}
    The total number of rounds of Algorithm~\ref{alg:phased_elim} and the total number of phases $L$ exhibit the relationship
    $$\log(T) \leq  \log(2\iota^{-2}dJ) + 2\log(2^L) + \log \left(2\right) \text{.}$$ Here, $J$ is notational shorthand, defined as $J \coloneqq  \left(\frac{|\mathcal{A}|L(L+1)}{\delta}\right)$.
\end{restatable}
\begin{proof}
    Let $N_{\ell}$ be the number of arms played in phase $\ell$. From \cite{lattimore_szepesvári_2020}, we have
    \begin{align}
        N_{\ell} - \frac{d(d+1)}{2} &\leq \frac{2d}{\nu_{\ell}^2}\log\left(\frac{|\mathcal{A}|\ell(\ell+1)}{\delta}\right)\\
       &\leq 2\iota^{-2}d\cdot2^{2\ell} \left(\frac{|\mathcal{A}|\ell(\ell+1)}{\delta}\right) \nonumber
    \end{align}
    Recall the notational shorthand $J \coloneqq  \left(\frac{|\mathcal{A}|L(L+1)}{\delta}\right)$.
    We have
    \begin{align}
        \log\left(\sum_{\ell}^{L-1} N_{\ell}\right) &\leq  \log\left(\sum_{\ell}^{L-1} 2\iota^{-2}d \cdot 2^{2\ell} \cdot \left(J\right) + \frac{d(d+1)}{2}\right)\nonumber\\
        &= \log\left(2\iota^{-2}d\left(J\right)\sum_{\ell}^{L-1} 2^{2\ell}   + \sum_{\ell}^{L-1}\frac{d(d+1)}{2}\right)\nonumber\\
        &= \log\left(2\iota^{-2}d\left(J\right)\sum_{\ell}^{L-1} 2^{2\ell}\right) + \log \left(1 + \frac{\sum_{\ell}^{L-1}\frac{d(d+1)}{2}}{2\iota^{-2}d\left(J\right)\sum_{\ell}^{L-1} 2^{2\ell}}\right)\nonumber\\
        % &= \log\left(2\iota^{-2}d\left(J\right)\sum_{\ell}^{L-1} 2^{2l}\right) + \log \left(1 + \frac{\sum_{\ell}^{L-1}\frac{d+1}{4}}{2\iota^{-2}d\left(J\right)\sum_{\ell}^{L-1} 2^{2l})}\right)\nonumber\\
        &\leq \log\left(2\iota^{-2}d\left(J\right)\sum_{\ell}^{L-1} 2^{2\ell}\right) + \log \left(2\right)\nonumber\\
        % &= \log\left(2\iota^{-2}d\left(J\right)(4^L - 4)\right) + \log \left(2\right)\nonumber\\
        % &\leq \log(2\iota^{-2}dJ) + \log(4^L) + \log \left(2\right)\nonumber\\
        &\leq \log(2\iota^{-2}dJ) + 2\log(2^L) + \log \left(2\right)\nonumber
    \end{align}
    This completes the proof of the lemma.
\end{proof}

\section{Inverse Estimator Properties}
\label{sec:appendinversestimate}

The proof of Theorem~\ref{thm:accuracy_theta_est} relies on several intermediate lemmas. 
We first state and prove these lemmas, and then provide the proof of Theorem~\ref{thm:accuracy_theta_est}.

\subsection{Lemmas for inverse estimator analysis}

First, we state and prove a simple lemma that upper bounds our normalized inverse estimation error as a function of the condition number of the matrix whose rows constitute the set of selected arms $\mathcal{A}^e$ and the normalized estimation error of the rewards of the arms in $\mathcal{A}^e$.

\begin{restatable}{lemma}{slacknessinbounds}
    \label{lem:slackness_in_bounds}
    Suppose $r$ and $\hat{r}$ are the vectors of true rewards and estimated rewards for the arms in $\mathcal{A}^e$, where $\hat{r}_i$ denotes the estimated reward of arm $a^i$. If the arms in $\mathcal{A}^e$ are linearly independent, then the least-squares solution $\hat{\theta} = \argmin \sum_{a^i \in \mathcal{A}^e} (\hat{r}_i -\langle\theta, a^i \rangle)^2$ satisfies the following bound on the error in estimating $\theta^*$: $$\frac{\left\|\hat{\theta} - \theta^*\right\|_2}{\left\|\theta^*\right\|_2} \leq \operatorname{cond}(\mathcal{A}^e) \frac{ \|\hat{r} - r\|_2}{\|r\|_2}\text{.}$$
\end{restatable}
\begin{proof}
    We consider the design matrix $\mathbf{A}$ whose $d$ rows are given by the arms in $\mathcal{A}^e$. More formally, we define
$$\mathbf{A} = \left[\begin{array}{c} 
a^1\\
a^2 \\
\vdots \\
a^d \\

\end{array}\right]$$ where $a^1, \dots, a^d \in \mathcal{A}^e$.
Therefore, the least squares problem $\hat{\theta} = \argmin \sum_{a^i \in \mathcal{A}^e} (\hat{r}_i -\langle\theta, a^i \rangle)^2$ is solved by
    \begin{align}
        \hat{\theta} &= (\mathbf{A}^{T}\mathbf{A})^{-1}\mathbf{A}^{T}\hat{r} = \mathbf{A}^{-1}\hat{r},
    \end{align}
    where the last equality follows because $\mathbf{A}$ is a square matrix and the arms are linearly independent (Lemma~\ref{lem:conda}).
    % \Cref{eq:from invertible} comes from the fact that the arms in $\mathcal{A}^e$ are linearly independent. 
    Therefore, we have
    \begin{align}
        \|\hat{\theta} - \theta^*\|_2 &= \|\mathbf{A}^{-1}(r - \hat{r})\|_2 \nonumber\\
        &\leq \|\mathbf{A}^{-1}\|_2 \|r - \hat{r}\|_2  \nonumber
    \end{align}
    Moreover, we have
    $$\|r\|_2 = \|\mathbf{A}\theta^*\|_2 \leq \|\mathbf{A}\|_2 \|\theta^*\|_2\text{.}$$
    Combining the inequalities above completes the proof of the lemma.
\end{proof}

Next, we restate and prove our main technical lemma, which characterizes the condition number of the design matrix $\mathbf{A}$ whose rows consist of the arms in $\mathcal{A}^e$.
\conda*

\begin{proof}


% We can now prove the original claim. For the help of this proof, we will denote $\mathbf{A}$ as the matrix version of $\mathcal{A}^e$, i.e.
% $\mathbf{A} = \left[\begin{array}{c} 
% a^1\\
% a^2 \\
% \vdots \\
% a^d \\

% \end{array}\right]$ where $a^1, \dots, a^d \in \mathcal{A}^e$.
We will break down the proof of the bound of the condition number into two parts. First, we decompose $\mathbf{A}$ into the following convenient form: 
$$\mathbf{A} = \mathbf{D}\tilde{\mathbf{A}} + \mathbf{N}\text{.}$$ 
Above, $\mathbf{D}$ is a diagonal matrix such that $D_{i, i} = \|a^i\|_2$, and $\tilde{\mathbf{A}}$ is a matrix whose $i$th row, which we denote as shorthand by $v_i$, is $v_i = \frac{\operatorname{proj}(a^i, i)}{\|\operatorname{proj}(a^i, i)\|_2}$.
(Recall that $\operatorname{proj}(a^i, i)$ was defined in Section~\ref{sec:construct} and is the projection of the arm $a^i$ onto the plane spanned by the optimal arm $a^*$ and the $i$-th element of the regular simplex $s_i$.)
Finally, $\mathbf{N}$ constitutes an ``error'' matrix term whose $i$-th row is equal to $a^i - \operatorname{proj}(a^i, i)$. 
We expect $\mathbf{N}$ to be ``small'' in the sense of operator-norm under Assumption~\ref{rem:shape}; we will show this formally shortly.

Since $\operatorname{cond}(\mathcal{A}^e) = \frac{\sigma_{\max}(\mathbf{A})}{\sigma_{\min}(\mathbf{A})}$, it suffices to lower bound $\sigma_{\min}(\mathbf{A})$ and upper bound $\sigma_{\max}(\mathbf{A})$ in order to upper bound the condition number.
First, we provide a lower bound on the minimum singular value $\sigma_{\min}(\mathbf{A})$. 
By Weyl's theorem, we have
\begin{align}
    \sigma_{\min}(\mathbf{A}) &= \sigma_{\min}(\mathbf{D}\tilde{\mathbf{A}} + \mathbf{N})\nonumber\\
    &\geq \sigma_{\min}(\mathbf{D}\tilde{\mathbf{A}}) - \sigma_{\max}( \mathbf{N})\label{eq:335}
\end{align}
% Here, \Cref{eq:335} comes from \citet{loyka2015singular}. 
% We upper bound the $\sigma_{\max}( \mathbf{N})$ term via the following
Then, we can upper bound $\sigma_{\max}(\mathbf{N})$ as below:
\begin{align}
    \sigma_{\max}( \mathbf{N}) &= \sqrt{\|\mathbf{N}^\top\mathbf{N}\|_2}\label{eq:340}\\
    &= \sqrt{\underset{x \text{ s.t. } \|x\|_2 = 1}{\max}x^\top\mathbf{N}^\top\mathbf{N}x }\nonumber\\
    &\leq \sqrt{d\gamma^2}\nonumber\\
    &= \gamma \sqrt{d}\nonumber
\end{align}
Above, the inequality in the chain beginning at \Cref{eq:340} comes from noticing that the rows of $\mathbf{N}$ have $\ell_2$ norm at most $\gamma$, so that $\|\mathbf{N}x\|_2^2 \leq d\gamma^2$ for any unit vector $x$ --- this is because $\|a^i - \operatorname{proj}(a^i,i)\|_2 =: \operatorname{dist}(a^i,i) \leq \gamma$, where the last inequality uses part 2 of Assumption~\ref{rem:shape}.
Thus, we have $\sigma_{\min}(\mathbf{A}) \geq \sigma_{\min}(\mathbf{D} \tilde{\mathbf{A}}) - \gamma \sqrt{d}$.
A symmetric argument for the maximum singular value gives us $\sigma_{\max}(\mathbf{A}) \leq \sigma_{\max}(\mathbf{D} \tilde{\mathbf{A}}) + \gamma \sqrt{d}$.

Next, we characterize the minimum and maximum singular values of the product matrix $\mathbf{D} \tilde{\mathbf{A}}$.
Starting with the minimum singular value, note that $ \sigma_{\min}(\mathbf{D}\tilde{\mathbf{A}}) \geq  \sigma_{\min}(\mathbf{D}) \sigma_{\min}(\tilde{\mathbf{A}}) $. 
Since $\mathbf{D}$ is a diagonal matrix, we have $\sigma_{\min}(\mathbf{D}) = \min_{i \in [d]} D_{i,i} \geq \min_{a \in \mathcal{A}} \|a\|_2 =: \chi_1$.
Therefore, we have
$$\sigma_{\min}(\mathbf{A}) \geq \chi_1 \sigma_{\min}(\tilde{\mathbf{A}}) - \gamma \sqrt{d}\text{.}$$
Similarly, for the maximum singular value we have $\sigma_{\max}(\mathbf{D}) = \max_{i \in [d]} D_{i,i} \leq \max_{a \in \mathcal{A}} \|a\|_2 =: \chi_2$.
This gives us 
$$\sigma_{\max}(\mathbf{A}) \leq \chi_2 \sigma_{\max}(\tilde{\mathbf{A}}) + \gamma \sqrt{d}\text{.}$$

We now only need to analyze the minimum and maximum singular values of $\tilde{\mathbf{A}}$; this forms the technical crux of our proof.
Recall that the rows of $\tilde{\mathbf{A}}$ are equal to $v_i := \frac{\operatorname{proj}(a^i, i)}{\|\operatorname{proj}(a^i, i)\|_2}$. 
Further, define the normalized matrix $\mathbf{B} = \frac{1}{\sqrt{d}} \tilde{\mathbf{A}}$ for convenience.
We will characterize the eigenvalues of the matrix $\mathbf{B}^\top \mathbf{B}$, noting that $\sigma_j(\tilde{\mathbf{A}}) = \sqrt{d \cdot \lambda_j (\mathbf{B}^\top \mathbf{B})}$.
Note that $(\mathbf{B}^\top \mathbf{B})_{i,j} = \frac{1}{d} \langle v_i, v_j \rangle$, and so $$(\mathbf{B}^\top \mathbf{B})_{i,i} = \frac{1}{d}$$ for all $i \in [d]$.
We now characterize the off-diagonal terms.
Note that $\langle v_i, v_j \rangle = 1 - \frac{\|v_i - v_j\|_2^2}{2}$, so it suffices to characterize the terms $\|v_i - v_j\|_2^2$.

% First, we list three properties of our $\tilde{\mathbf{A}}$ matrix. 
% We know that each row of $\tilde{\mathbf{A}}$ forms an angle of $\tau(a^i, i) \geq \beta$ with the optimal arm $a^*$ from \Cref{rem:shape}. We wish to find the condition number for the matrix $\tilde{\mathbf{A}}$. The smallest possible condition number is achieved when $\tau(a^i, i)$ is the smallest for each row $v_i$, i.e. $\tau(a^i, i) = \beta$. This is when the rows are the most colinear, leading to poor conditioning. To analyze the condition number of $\tilde{\mathbf{A}}$, we will first analyze the condition number of $\mathbf{B}$.
%     We define the matrix $\mathbf{B} = \frac{1}{\sqrt{d}}\tilde{\mathbf{A}}$. 
%     Observe that $\sigma_{\max}(\mathbf{B}) = \frac{1}{\sqrt{d}} \sigma_{\max}(\tilde{\mathbf{A}})$ and $\sigma_{\min}(\mathbf{B}) = \frac{1}{\sqrt{d}} \sigma_{\min}(\tilde{\mathbf{A}})$, so we only need to characterize the singular values of the latter matrix $\mathbf{B}$.
%     We will do this by characterizing the eiganvalues of the matrix $\mathbf{B}^*\mathbf{B}$ --- observe that for all $j \in [d]$, $$\lambda_j(\mathbf{B}^*\mathbf{B}) = \sigma_j(\mathbf{B})\text{.}$$ We note that $[\mathbf{B}^*\mathbf{B}]_{ij} = \frac{1}{d} \langle v_i, v_j \rangle$ where $v_i$ and $v_j$ are the $i$th and $j$th rows of $\tilde{\mathbf{A}}$. Note then that $[\mathbf{B}^*\mathbf{B}]_{ii} = \frac{1}{d}$. For $i \neq j$, then $\langle v_i, v_j \rangle$ is the following. We will assume the worst case, where the angle $\tau(a^i, i)$ is as small as possible,  i.e. $\tau(a^i, i)  = \beta$. 
We wish to first find the angle between our $\alpha$ vectors. We remind the reader that our $\alpha$ vectors form a $(d-1)$-dimensional simplex centered at the unit vector $u = \frac{a^*}{\|a^*\|_2}$. We will first find the radius of this simplex, i.e., $\|u - v_i\|_2$. The vectors $u$, $v_i$, and the origin form an isosceles triangle where $u$ and $v_i$ are unit-norm by definition. Therefore, by the Law of Sines,
    \begin{align}
        \|u - v_i\|_2 &= \frac{\sin(\tau(a^i, i))}{\sin\left(\frac{\pi - \tau(a^i, i)}{2}\right)}\nonumber\\
        &= 2\sin\left(\frac{\tau(a^i, i)}{2}\right) \nonumber
    \end{align}
   Therefore, we have that the radius of the simplex is $2\sin\left(\frac{\tau(a^i, i)}{2}\right)$, which we will call $\rho$ for now. From \citet{krasnodkebski1971dihedral}, the angle formed between $u - v_i$ and $u - v_j$ is $\arccos \left(-\frac{1}{d-1}\right)$. Therefore, we have that the distance between $v_j$ and $v_i$ satisfies 
    \begin{align}
        \|v_j - v_i\|_2^2 &= \|u - v_i\|_2^2 + \|u - v_j\|_2^2 - 2\|u - v_i\|_2\|u - v_j\|_2 \cos\left(\arccos\left(-\frac{1}{d-1}\right)\right) \nonumber\\
        &= 2\rho^2 \left(1 - \cos\left( \arccos\left(-\frac{1}{d-1}\right)\right)\right)\nonumber\\
        &= 2\rho^2 \frac{d}{d-1}\nonumber
    \end{align}
% On the other hand, we can write 
% We also have that the angle we are looking for $\beta$, which is the angle between $v_i$ and $v_j$, satisfies 
% Therefore, we have
% $$\|v_j - v_i\|_2^2 = 2 - 2\cos(\beta) \text{.}$$
Therefore, we have 
\begin{align}
    \langle v_i, v_j \rangle &= 1 - \frac{\rho^2d}{d-1} =: \cos(\beta) \nonumber.
\end{align}
% Next, we consider the structure of matrix $\mathbf{B}^*\mathbf{B}$. 
% Its diagonal elements are $\frac{1}{d}$, and its nondiagonal elements are $\frac{1}{d} \cos(\beta)$, leading to an explicit unitary diagonalization. This matrix has singular values:
We have shown that we can decompose the matrix $\mathbf{B}^\top \mathbf{B}$ as $\mathbf{B}^\top \mathbf{B} = \frac{1 - \cos(\beta)}{d} \cdot \mathbf{I} + \frac{\cos(\beta)}{d} \cdot \mathbf{1} \mathbf{1}^\top$.
This matrix has maximum eigenvalue equal to $\frac{1 - \cos(\beta)}{d} + \cos(\beta)$ and minimum eigenvalue equal to $\frac{1 - \cos(\beta)}{d} $.
% $$\sigma_1, \ldots, \sigma_{d-1} = \frac{1}{d} - \frac{1}{d} \cos(\beta)$$
% $$\sigma_d =  \frac{d-1}{d} \cos(\beta) + \frac{1}{d}\text{.}$$ We will upper bound the maximum singular value. 
Thus, we can upper bound the maximum eigenvalue as
\begin{align*}
    \lambda_{\max}(\mathbf{B}^\top \mathbf{B}) &= \frac{d-1}{d} \cos(\beta) + \frac{1}{d}\\
     &\leq \frac{d-1}{d}   + \frac{1}{d} = 1.
\end{align*}
% where the first inequality comes from the fact that $\cos(\beta) \leq 1$. 
Next, we can write the minimum eigenvalue as
\begin{equation}
    \lambda_{\min}(\mathbf{B}^\top \mathbf{B}) =\frac{1}{d} - \frac{1}{d} \cos(\beta) \geq\frac{\rho^2}{d-1}. \label{eq:334}
\end{equation}
Further, we can lower bound $\rho^2$ on the interval $\tau(a^i, i) \in [-\frac{\pi}{2}, \frac{\pi}{2}]$ via its Taylor expansion as 
\begin{equation}
    \rho^2 \geq \frac{\tau(a^i, i)^2}{2}\text{.}\label{eq:338}
\end{equation}
Combining \Cref{eq:334} with \Cref{eq:338} gives us the following lower bound on the minimum eigenvalue:
\begin{align}\label{eq:468}
    \lambda_{\min}(\mathbf{B}^\top \mathbf{B}) &\geq \frac{\tau(a^i, i)^2}{2d} \geq \frac{\beta^2}{2d}.
\end{align}
% Here, \Cref{eq:468} comes from our assumption \Cref{rem:shape}. 
% Therefore, the maximum singular value for $\tilde{\mathbf{A}}$ is upper bounded by $1$ and the minimum singular value for $\tilde{\mathbf{A}}$ is lower bounded by $(2d)^{-\frac{1}{2}}\beta$.
% We have proved the condition number of $\tilde{\mathbf{A}}$. Now, we can find the total condition number for $\mathbf{A}$.
Thus, we have characterized the minimum and maximum eigenvalues of $\mathbf{B}^\top \mathbf{B}$.
Putting all of the steps together, we have
\begin{align}
    \operatorname{cond}(\mathbf{A}) = \frac{\sigma_{\max}(\mathbf{A})}{\sigma_{\min}(\mathbf{A})} &\leq \frac{\chi_1 \sigma_{\max}(\tilde{\mathbf{A}}) + \gamma \sqrt{d}}{\chi_2 \sigma_{\min}(\tilde{\mathbf{A}}) - \gamma \sqrt{d}} \nonumber\\
    &= \frac{\chi_1 \sigma_{\max}(\mathbf{B}) + \gamma d}{\chi_2 \sigma_{\min}(\mathbf{B}) - \gamma d} \nonumber \\
    &= \frac{\chi_1 \sqrt{\lambda_{\max}(\mathbf{B}^\top \mathbf{B})} + \gamma d}{\chi_2 \sqrt{\lambda_{\min}(\mathbf{B}^\top \mathbf{B})} - \gamma d} \nonumber \\
    &\leq \frac{\chi_1 + \gamma d}{\chi_2 \left[ (2d)^{-\frac{1}{2}}\beta\right] - \gamma d}\nonumber.
\end{align}
This completes the proof of the lemma. 
\end{proof}

Next, we restate and prove a lemma that bounds the normalized estimation error of the rewards of the selected arms in $\mathcal{A}^e$.
\boundb*

\begin{proof}
% $r$ is a vector of rewards of arms in $\mathcal{A}_L \setminus \mathcal{A}_{L-1}$. Therefore, for an element $r_a$ associated with an arm 
Consider an arm $a^i \in \mathcal{A}^e$ (where $i \in [d]$), and denote $r_i := R_{\theta^*}(a^i)$ as shorthand.
% Since, by definition $a \in \mathcal{A}_L \setminus \mathcal{A}_{L-1}$, we know that $a \notin \mathcal{A}_{L-1} \setminus \mathcal{A}_{L-2}$. 
Via Lemma \ref{lem:armwillbeeliminated}, we have
\begin{align}\label{eq:rilowerbound}
    \mu^* - 4(1+\iota)\epsilon_L \leq r_i \leq \mu^* \text{.}
\end{align}
Denote the corresponding estimator of the mean reward of this arm by $\hat{r}_i := \mu^* - 2(1 + \iota) \epsilon_L$.
Clearly, we have $|r_i - \hat{r}_i| \leq 2(1+\iota)\epsilon_L$. 
Thus, we have $$\norm{\hat{r} - r}_2 \leq 2(1+\iota)\epsilon_L \sqrt{d} \text{.}$$ 
Next, we lower bound the denominator $\norm{r}_2$. \Cref{eq:rilowerbound} tells us that $|r_i| \geq \mu^* - 4(1+\iota)\epsilon_L$ for every $i \in [d]$.
This gives us the lower bound $\norm{r}_2 \geq \sqrt{d} (\mu^* -4(1+\iota)\epsilon_L)$. 
Putting the pieces together yields
$$\frac{\norm{r - \hat{r}}_2}{\norm{r}_2} \leq \frac{2(1+\iota)\epsilon_L}{\mu^* - 4(1+\iota)\epsilon_L}\text{.}$$ Since $\iota \leq 1$ from \Cref{rem:shape}, we have $$ \frac{2(1+\iota)\epsilon_L}{\mu^* - 4(1+\iota)\epsilon_L} \leq  \frac{4\epsilon_L}{\mu^* - 8\epsilon_L} = \mathcal{O}\left(2^{-L}\right)\text{.}$$
This completes the proof of the lemma.
\end{proof}

% \accuracythetaest*
\subsection{Proof of Theorem~\ref{thm:accuracy_theta_est}}
We are now ready to prove Theorem~\ref{thm:accuracy_theta_est}.
% \begin{proof}
First, Lemma \ref{lem:slackness_in_bounds} tells us that 
$$\frac{\left\|\hat{\theta} - \theta^*\right\|_2}{\left\|\theta^*\right\|_2} \leq \operatorname{cond}(\mathcal{A}^e) \frac{ \|\hat{r} - r\|_2}{\|r\|_2}\text{.}$$
From Lemma~\ref{lem:conda}, we get $$\operatorname{cond}(\mathcal{A}^e) \leq  \frac{\chi_1 + \gamma d}{\chi_2 \left[ (2d)^{-\frac{1}{2}}\beta\right] - \gamma d} \text{.}$$ 
% Further, plugging in~\Cref{rem:shape} $\beta = (3(1-\iota)\epsilon_{L})^{\frac{1}{\omega}}$. 
Moreover, from Lemma \ref{lem:boundb}, we have
$$\frac{\norm{r - \hat{r}}_2}{\norm{r}_2} \leq \frac{4\epsilon_L}{\mu^* - 8\epsilon_L} \text{.}$$
Plugging in Assumption~\ref{rem:shape} (which stipulates that $\beta = (3(1-\iota)\epsilon_{L})^{\frac{1}{\omega}}$) and the above bounds gives us
% Combining these in Lemma \ref{lem:slackness_in_bounds}, we have that 
\begin{align}
    \frac{\left\|\hat{\theta} - \theta^*\right\|_2}{\left\|\theta^*\right\|_2}  &\leq \frac{\chi_1 + \gamma d}{\chi_2 \left[ (2d)^{-\frac{1}{2}}\left[3(1-\iota)\epsilon_L\right]^{\frac{1}{\omega}}\right] - \gamma d} \cdot \frac{4\epsilon_L}{\mu^* - 8 \epsilon_L}\nonumber\\
    &\leq \frac{\chi_1 + \gamma d}{2^{\frac{L(\omega - 1)}{\omega}} \cdot \chi_2 \left[ (2d)^{-\frac{1}{2}}\left[3(1-\iota)\right]^{\frac{1}{\omega}}\right] - 2^{L}\gamma d} \cdot \frac{4 }{\mu^* - 8\epsilon_L}\nonumber.
\end{align} 
It remains to express the above upper bound in terms of the deterministic quantity $T$ of interest (rather than the total number of phases $L$, which is random).
% Now, for the last phase number, we wish to express this in terms of $T$ instead of our dependence on $L$. 
For this, Lemma~\ref{lem:conntl} tells us that 
$$\log(T) \leq  \log(2\iota^{-2}dJ) + 2\log(2^L) + \log \left(2\right).$$ Using this, we have 
\begin{align*}
    \left[\frac{T}{4\iota^{-2}dJ}\right]^{\frac{1}{2}} &\leq 2^L  \\
    \implies 2^{\frac{L(1-\omega)}{\omega}} &\leq \left[\frac{T}{4\iota^{-2}dJ}\right]^{\frac{1-\omega}{2\omega}}.
\end{align*}
% $$\left[\frac{T}{4\iota^{-2}dJ}\right]^{\frac{1}{2}} \leq 2^L \text{.}$$
% Since $\frac{1-\omega}{\omega}$ is negative, we have $$ 2^{\frac{L(1-\omega)}{\omega}} \leq \left[\frac{T}{4\iota^{-2}dJ}\right]^{\frac{1-\omega}{2\omega}} \text{.}$$ 
Plugging this into the upper bound then gives us
\begin{align}
    \frac{\left\|\hat{\theta} - \theta^*\right\|_2}{\left\|\theta^*\right\|_2}  &\leq \frac{\chi_1 + \gamma d}{\left[\frac{T}{4\iota^{-2}dJ}\right]^{\frac{\omega - 1}{2\omega}}\chi_2 \left[ (2d)^{-\frac{1}{2}}\left[3\right]^{\frac{1}{\omega}}\right] - 2^{L}\gamma d} \cdot \frac{4}{\mu^* - 8\epsilon_L}.\nonumber
\end{align}
Given $\gamma \leq  \frac{\epsilon_{\bar{L}}}{\|\theta^*\|_2 d} \leq \frac{2^{-L}}{\|\theta^*\|_2 d}$ from \Cref{rem:shape} and $\frac{4}{\mu^* - 8\epsilon_L} = \mathcal{O}(1)$, and noting that the term $\left[\frac{T}{4\iota^{-2}dJ}\right]^{\frac{\omega-1}{2\omega}}$ is increasing in $T$, we get
\begin{align}
    \frac{\left\|\hat{\theta} - \theta^*\right\|_2}{\left\|\theta^*\right\|_2}  = \mathcal{O}\left(\frac{\chi_1d^{\frac{2\omega-1}{2\omega}}J^{\frac{\omega -1}{2\omega}}}{\chi_2T^\frac{\omega-1}{\omega}}\right).\nonumber
\end{align}
This completes the proof of the theorem.
\qed
% \end{proof}

\section{Proof of Lower Bound (Theorem~\ref{thm:lower_bound})}
\label{sec:appendlowerbound}

In this section, we provide the proof of Theorem~\ref{thm:lower_bound}, which leverages the classical Le Cam binary testing approach~\citep{lecam1973convergence} between a null instance and a random alternative instance.
We will actually show the lower bound on estimation error assuming access to \emph{both} the sequence of actions and observed rewards by the forward algorithm, as additionally observing rewards only makes the estimation problem easier.

Formally, we establish two bandit instances:
\begin{enumerate}
    \item The first instance $\mathcal{M}$ is one in which the linear reward parameter is the true parameter of interest $\theta^*$.
    \item The second random instance $\mathcal{M}'(v)$ is one in which the linear reward parameter is given by $\theta'(v) = \theta^* - \epsilon v$, where $\epsilon > 0$ is a parameter that will be chosen appropriately at a later point, and $v \sim \text{Unif}(S^{d-1})$, i.e.~$v$ is chosen uniformly at random from the $d$-dimensional unit sphere.
    Eventually, we will take an expectation over the binary testing error.
\end{enumerate}
%
Before proceeding with the proof, we set up some more relevant notation.
Let $\mathcal{E}_T$ denote the observed sequence of action-reward pairs $(a_1,r_1),\ldots,(a_T,r_T)$ (which is random), and $\mathcal{F}_T$ denote the associated sigma-algebra of possible events.
Further, for any arm $a$ let $\mathcal{V}(a)$ and $\mathcal{V}'_v(a)$ denote the associated reward distributions under bandit instances $\mathcal{M}$ and $\mathcal{M}'(v)$ respectively.
For convenience, we will assume that the noise in the rewards is drawn from an isotropic Gaussian distribution, meaning that $\mathcal{V}(a) = \mathcal{N}(\langle \theta^*, a \rangle, \mathbf{I})$ and $\mathcal{V}'_v(a) = \mathcal{N}(\langle \theta'(v), a \rangle, \mathbf{I})$.
Next, we denote $\mathbb{E}_0[\cdot], \mathbb{E}'_v[\cdot]$ as the expectations over all randomness in the observation $\mathcal{E}_T$ induced by the bandit instances $\mathcal{M}, \mathcal{M}'(v)$ respectively, and $\mathbb{E}[\cdot]$ will denote any additional expectations, typically to be taken over the randomness in the parameter $v$ only.
Further, let $\mathbb{P}_0[\cdot],\mathbb{P'}_v[\cdot]$ denote the probability distributions over the observation $\mathcal{E}_T$ under bandit instances $\mathcal{M},\mathcal{M}'(v)$ respectively.
Finally, we use $D_{\operatorname{KL}}(\cdot, \cdot)$ to denote the Kullback--Leibler divergence between two probability distributions.

% First, under~\Cref{rem:shape}, \citet{Banerjee2022} shows that the maximum eigenvalue $\lambda_d$ of the gram matrix $\sum^T a_ta_t^{\top} = \mathcal{O}(T)$ and  for all other eigenvalues $\lambda_i$ for all $i \in [d-1]$ satisfies $\lambda_i = \mathcal{O}\left(\frac{T}{d}\right)$.
Note that $\hat{\theta}$ can only be a functional of the observation $\mathcal{E}_T$.
Therefore, for any fixed $v \in S^{d-1}$, the Le Cam method gives us
\begin{align*}
    \max\left\{\mathbb{E}_0\left[\|\hat{\theta} - \theta^*\|_2\right], \mathbb{E}'_v\left[\|\hat{\theta} - \theta'(v)\|_2\right]\right\} &\geq \frac{1}{2} \|\epsilon v\|_2 \left(1 - \|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right) \\
    &= \frac{\epsilon}{2} \left(1 - \|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right),
\end{align*}
%
where the last equality follows because $v \in S^{d-1}$.
Taking a further expectation over $v \sim \text{Unif}(S^{d-1})$ and using linearity of expectation yields
\begin{align}\label{eq:lowerbound_lecam}
    \mathbb{E}\left[\max\left\{\mathbb{E}_0\left[\|\hat{\theta} - \theta^*\|_2\right], \mathbb{E}'_v\left[\|\hat{\theta} - \theta'(v)\|_2\right]\right\}\right] &\geq \mathbb{E}\left[\frac{\epsilon}{2} \left(1 - \|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right)\right] \nonumber \\
    &= \frac{\epsilon}{2} \left(1 - \mathbb{E}\left[\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right]\right).
\end{align}
%
Therefore, it suffices to upper bound the term $\mathbb{E}\left[\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right]$.
First, we consider the total variation distance $\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}$ for a fixed $v$.
By the definition of total variation distance, we have $\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}} := \sup_{\mathcal{E}_T \in \mathcal{F}_T} |\mathbb{P}_0(\mathcal{E}_T) - \mathbb{P}'_v(\mathcal{E}_T)|$.
Then, an adaptation of Lemma 19 of~\cite{Kauffman2014} gives us
\begin{align}\label{eq:kauffmanlemma}
\sup_{\mathcal{E}_T \in \mathcal{F}_T} |\mathbb{P}_0(\mathcal{E}_T) - \mathbb{P}'_v(\mathcal{E}_T)| \leq \sum_{t=1}^T \mathbb{E}_0\left[D_{\operatorname{KL}}(\mathcal{V}(a_t), \mathcal{V}'_v(a_t))\right].
\end{align}
%
Next, note that for any fixed $a$, we have that $\mathcal{V}(a) = \mathcal{N}(\langle \theta^*, a \rangle, \mathbf{I})$ and $\mathcal{V}'_v(a) = \mathcal{N}(\langle \theta'(v), a \rangle, \mathbf{I})$.
Therefore, we have $D_{\operatorname{KL}}(\mathcal{V}(a), \mathcal{V}'_v(a)) = \frac{\epsilon^2}{2} (\langle a, v \rangle)^2$.
Plugging this into~\Cref{eq:kauffmanlemma} gives us
\begin{align*}
\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}} &\leq \sum_{t=1}^T \mathbb{E}_0\left[\frac{\epsilon^2}{2} (\langle a_t, v \rangle)^2\right] \\
&= \frac{\epsilon^2}{2} \cdot v^\top \mathbb{E}_0\left[\sum_{t=1}^T a_t a_t^\top\right] v.
\end{align*}
Henceforth, we denote $\overline{M}_T := \mathbb{E}_0\left[\sum_{t=1}^T a_t a_t^\top\right]$ as the expected Gram matrix composed of the actions $a_1,\ldots,a_T$.
Note that $\overline{M}_T$ is a deterministic quantity.
We leverage the following key fact that was proved by~\cite{Banerjee2022}, restated below: for some universal positive constant $C > 0$ that does not depend on $T$ or $d$, we have
\begin{align}\label{eq:banerjeelemma}
\lambda_{\max}(\overline{M}_T) &\leq C T \nonumber \\
\lambda_i(\overline{M}_T) &\leq \frac{CT}{d} \text{ for all } i > 1.
\end{align}
%
We will leverage this fact to complete the proof of our main result.
Let $\{u_i\}_{i=1}^d$ denote the unit-normalized eigenvectors of $\overline{M}_T$, and let $v = \sum_{i=1}^d \alpha_i u_i$ (note that while $\{\alpha_i\}_{i=1}^d$ are random variables, the eigenvectors $\{u_i\}_{i=1}^d$ are deterministic).
Then, taking an expectation on our point-wise bound on the total variation distance over $v \sim \text{Unif}(S^{d-1})$ yields
\begin{align*}
    \mathbb{E}\left[\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right] &\leq \frac{\epsilon^2}{2} \cdot \mathbb{E}\left[v^\top \overline{M}_T v \right] \\
    &= \frac{\epsilon^2}{2d} \cdot \text{trace}(\overline{M}_T),
\end{align*}
%
where the last equality follows because $v$ is uniformly distributed on the sphere, and therefore $\mathbb{E}[vv^\top] = \frac{1}{d} \mathbf{I}$.
We then plug in~\Cref{eq:banerjeelemma}, which gives us $\text{trace}(\overline{M}_T) := \sum_{i=1}^d \lambda_i(\overline{M}_T) \leq 2CT$.
Ultimately, we get
\begin{align}
    \mathbb{E}\left[\|\mathbb{P}_0 - \mathbb{P'}_v\|_{\operatorname{TV}}\right] &\leq \frac{\epsilon^2}{2d} \cdot 2CT = \frac{\epsilon^2 CT}{d}.
\end{align}
Substituting this in~\Cref{eq:lowerbound_lecam} ultimately gives us
\begin{align*}
    \mathbb{E}\left[\max\left\{\mathbb{E}_0\left[\|\hat{\theta} - \theta^*\|_2\right], \mathbb{E}'_v\left[\|\hat{\theta} - \theta'(v)\|_2\right]\right\}\right] &\geq \frac{\epsilon}{2} \left(1 - \frac{\epsilon^2 CT}{d}\right).
\end{align*}
Finally, we select $\epsilon = \sqrt{\frac{d}{2C'T}}$ for some sufficiently large constant $C' > C$.
This yields the lower bound
\begin{align*}
    \mathbb{E}\left[\max\left\{\mathbb{E}_0\left[\|\hat{\theta} - \theta^*\|_2\right], \mathbb{E}'_v\left[\|\hat{\theta} - \theta'(v)\|_2\right]\right\}\right] &\geq \sqrt{\frac{d}{8C'T}} \left(1 - \frac{C}{2C'}\right) = \Omega\left(\sqrt{\frac{d}{T}}\right).
\end{align*}
%
This is the desired statement and completes the proof of the theorem.
\qed

% \lowerbound*
% \begin{proof}
%     This proof will follow the proof of Theorem 1 from \cite{guo2021learning}.
%     We will establish two bandit instances. The first instance $\mathcal{M}$ is parameterized by the true $\theta_1^*$. The second instance is $\mathcal{M}'$ which is parameterized by $\theta_2^*$ where $\theta_2^* \coloneqq \theta_1^*- \epsilon v$ where $\epsilon \in \mathbb{R}$. We will choose $v \in \mathbb{R}^d$ as a random vector on the unit ball according to a uniform distribution. Suppose one of instances $\mathcal{M}$ and $\mathcal{M}'$ are chosen and we observe the sequence $\mathcal{E}_T \coloneqq \{ a_1, a_2, \dots, a_T\}$. We denote the reward distribution for an arm $a_t$ under bandit instances $\mathcal{M}$ and $\mathcal{M}'$ as $\mathcal{V}(a_t)$ and $\mathcal{V}'(a_t)$ respectively. Furthermore, we state that the rewards of these bandit instances are a sample from Normal Distributions with variance $\Sigma^2$. Formally, we state that $\mathcal{V}(a_t) \sim N(\langle \theta_1^*, a_T \rangle, \Sigma^2)$ and $\mathcal{V}'(a_t) \sim N(\langle \theta_2^*, a_T \rangle, \Sigma^2)$. We reduce the reward estimation error to that of binary testing between these two instances, as in the Le-Cam approach. 
    
%     Given some series of actions $\mathcal{E} \coloneqq \{a_1, a_2, \dots, a_T\}$ generated by our demonstrator where $\mathcal{E} \in \mathcal{F}$ and $\mathcal{F}$ is the sigma-algebra of possible events, i.e. $\mathcal{F}_T = \sigma(\{a_1, a_2, \dots, a_T\})$. Our bandit instances $\mathcal{M}$ and $\mathcal{M}'$ have the probability distributions over all possible series of actions $\mathbb{P}$ and $\mathbb{P}'$, acting over $\mathcal{F}_T$. Given \cite{lecam1973convergence}, any algorithm choosing between the two bandit instances with a decision $\hat{\theta}$, it must at least suffer an error
% \begin{align}
%   \underset{v}{\mathbb{E}}\left[\max \{ \mathbb{E}_1\left(\left\| \hat{\theta} - \theta_2^*\right\|_2\right), \mathbb{E}_2\left(\left\| \hat{\theta} - \theta_1^*\right\|_2\right)\}\right] & \geq \underset{v}{\mathbb{E}} \left[\frac{1}{2} \| \epsilon v\|(1 - \| \mathbb{P}' -  \mathbb{P}\|_{\text{TV}})\right] \nonumber\\
%   &\geq \underset{v}{\mathbb{E}} \left[\frac{1}{2}\| \epsilon v\|\left(1 - \underset{\mathcal{E}\in\mathcal{F}_T}{\sup} \lvert \mathbb{P}(\mathcal{E}) - \mathbb{P}'(\mathcal{E}) \rvert \right)\right] \label{eq:570}
% \end{align}
% where \Cref{eq:570} comes from the definition of the total variation. Here, we rely on the result of Lemma 19 from \cite{Kauffman2014} stating that 
% $$\underset{\mathcal{E}\in\mathcal{F}_T}{\sup} \lvert \mathbb{P}(\mathcal{E}) - \mathbb{P}'(\mathcal{E}) \rvert \leq  \sum_{t=1}^T \operatorname{KL}(\mathcal{V}(a_t), \mathcal{V}'(a_t))\text{.}$$
% However, remembering that the reward distributions are normally distributed with well-defined means and variances, we get
% $$\operatorname{KL}(\mathcal{V}(a_t), \mathcal{V}'(a_t)) = \frac{\epsilon^2(\langle a_t, v \rangle)^2}{2\Sigma^2}\text{.}$$
% Here, we introduce the term $\alpha_{t, d} = \langle a_t, v \rangle$. 
% \begin{align}
%     \underset{v}{\mathbb{E}}\left(\sum_{t=1}^T \left(\langle a_t, v \rangle\right)^2\right) &= \underset{v}{\mathbb{E}}\left(\sum_{t=1}^T v^\top a_ta_t^\top v\right)\nonumber\\
%     &= \underset{v}{\mathbb{E}}\left(v^\top\left(\sum_{t=1}^T  a_ta_t^\top\right) v\right)\nonumber\\
%     &= \underset{v}{\mathbb{E}} \left(\sum_{i}^{d} \alpha_i^2 e_i^\top \left(\sum_{t=1}^T  a_ta_t^\top\right) e_i^\top\right)\nonumber\\
%     &= \underset{v}{\mathbb{E}} \left(\sum_{i}^{d} \alpha_i^2 e_i^\top \left(\sum_{t=1}^T  a_ta_t^\top\right) e_i^\top\right)\nonumber\\
%     &= \sum_{i}^{d} \frac{1}{d} \|e_i\|_2^2 \lambda_i\\
%     &\leq  \|e_i\|_2^2 \frac{T}{d} + \sum_{i}^{d-1} \frac{1}{d} \|e_i\|_2^2 \lambda_i\nonumber\\
%     &\leq  \|e_i\|_2^2 \frac{T}{d} + \underset{i \in [d-1]}{\max}\left(\frac{1}{d}\lambda_i\|e_i\|_2^2\right)\nonumber\\
%     &\leq \frac{T}{d}\underset{i \in [d]}{\max}\left(\|e_i\|_2^2\right) \label{eq:finaleigen}
% \end{align}
% where \Cref{eq:finaleigen} comes from  \cite{Banerjee2022} saying $\lambda_i \leq \mathcal{O}\left(\frac{T}{d}\right)$ for $i \leq d -1$ and $\lambda_d \leq \mathcal{O}\left(T\right)$. We will call the quantity from \Cref{eq:finaleigen} as $\Lambda = \frac{T}{d}\underset{i \in [d-1]}{\max}\left(\|e_i\|_2^2\right)$.
% We finally have 
% \begin{equation}
%     \underset{\mathcal{E}\in\mathcal{F}_T}{\sup} \lvert \mathbb{P}(\mathcal{E}) - \mathbb{P}'(\mathcal{E}) \rvert \leq \frac{\epsilon^2 \Lambda}{\Sigma^2} \nonumber
% \end{equation}
% Therefore, we arrive at the final
% \begin{align}
%   \underset{v}{\mathbb{E}} \left(\max \{ \mathbb{E}_1\left(\left\| \hat{\theta} - \theta_2^*\right\|_2\right), \mathbb{E}_2\left(\left\| \hat{\theta} - \theta_1^*\right\|_2\right)\}\right) & \geq \frac{1}{2} \| \epsilon v\|\left(1 -  \frac{\epsilon^2 \Lambda}{\Sigma^2}\right)  \nonumber\\
%   &\geq  \frac{\epsilon \| v\|}{2} -  \frac{\epsilon^3 \| v\| \Lambda}{2\Sigma^2} \nonumber
% \end{align}
% To maximize the lower bound, we set $\epsilon = \frac{\Sigma}{\sqrt{2\Lambda}}$ to get ,
% $$\underset{v}{\mathbb{E}} \left(\max \{  \mathbb{E}_1\left(\left\| \hat{\theta} - \theta_2^*\right\|_2\right), \mathbb{E}_2\left(\left\| \hat{\theta} - \theta_1^*\right\|_2\right)\}\right) \leq \frac{\Sigma \|v\|}{3\sqrt{3\Lambda}} \text{.}$$
% Substituting in $\Lambda$, we get
% \begin{align}
% \underset{v}{\mathbb{E}} \left(\max \{ \ \mathbb{E}_1\left(\left\| \hat{\theta} - \theta_2^*\right\|_2\right), \mathbb{E}_2\left(\left\| \hat{\theta} - \theta_1^*\right\|_2\right)\} \right)&\leq \frac{\Sigma \|v\|}{3\sqrt{3\Lambda}}  \nonumber\\
% &\leq  \frac{\Sigma \|v\|\sqrt{d}}{3\sqrt{3T}\underset{i \in [d-1]}{\max}\left(\|e_i\|_2\right)} \nonumber\\
% &\leq \frac{\Sigma \sqrt{d}}{3\sqrt{3T}\underset{i \in [d-1]}{\max}\left(\|e_i\|_2\right)} \nonumber
% \end{align}
% Therefore, we get our final claim
% $$\underset{v}{\mathbb{E}} \left(\max \{  \mathbb{E}_1\left(\left\| \hat{\theta} - \theta_2^*\right\|_2\right), \mathbb{E}_2\left(\left\| \hat{\theta} - \theta_1^*\right\|_2\right)\}\right) \leq \mathcal{O}\left(\sqrt{\frac{d\Sigma^2}{T}}\right)\text{.}$$
% Therefore, in expectation of $v$, we have the desired quantity. 

% \end{proof} 



\section{Proof of Lemma \ref{lem:validass}}
\label{sec:appendvalidass}

Recall that \Cref{rem:shape} is parametrized by scalars $(\omega, \gamma)$ (and through $\omega$, the scalar $\beta$). We construct a family of action set/$\theta^*$ pairs in two dimensions and claim that each pair in the family satisfies \Cref{rem:shape}. We denote the coordinate system using $(x, y) \in \mathbb{R}^2$. Our instance is parametrized by an angle parameter $\kappa$.

\paragraph{Constructing the action set:}
\begin{itemize}
    \item Choose $\theta^*$ forming angle $\kappa$ with the vector $(1, 0)$. Set $G = \cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L$ for convenience.
    \item Construct action set $\mathcal{A}$ by including the convex hull\footnote{Any discretization of the boundary of this set also suffices.} of the following points: $(-1, 0)$, $(1, 0)$, $(0, 1)$, $(0, -1)$ as well as the points $\left(\frac{G\cos\left(\beta\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2},\frac{G\sin\left(\beta\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2}\right)$ and $\left(\frac{G\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2},\frac{G\sin\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right)$.
\end{itemize}

See \Cref{fig:vizvalidass} for an illustration of the set. Before we characterize this set, we define a helper function.

\begin{definition}
    The function $\operatorname{atan2}(y,x)$ is defined as 
    \begin{equation}
\operatorname{atan2}(y, x) = 
\begin{cases} 
\arctan\left(\frac{y}{x}\right) & \text{if } x > 0 \\
\arctan\left(\frac{y}{x}\right) + \pi & \text{if } y \geq 0 \text{ and } x < 0 \\
\arctan\left(\frac{y}{x}\right) - \pi & \text{if } y < 0 \text{ and } x < 0 \\
+\frac{\pi}{2} & \text{if } y > 0 \text{ and } x = 0 \\
-\frac{\pi}{2} & \text{if } y < 0 \text{ and } x = 0 \\
\text{undefined} & \text{if } y = 0 \text{ and } x = 0
\end{cases}\nonumber
\end{equation}
\end{definition}

The two crucial and readily verifiable properties about this set that will be used in the sequel are that
\begin{enumerate}
\item All arms $(x,y) \in \mathcal{A} \setminus \{(1, 0)\}$ satisfy $$\cos(\kappa + \operatorname{atan2}(y, x)) \|\theta^*\|_2\sqrt{x^2 + y^2} < \cos(\kappa)\|\theta^*\|_2\text{.}$$
\item $\mathcal{A}$ contains the points $P_1 = \left(\frac{G\cos\left(\beta\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2},\frac{G\sin\left(\beta\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2}\right)$ and $P_3 = \left(\frac{G\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2},\frac{G\sin\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right)$.
\end{enumerate}


\begin{figure}
    \centering
    \begin{tikzpicture}[scale=3]  % Adjust the scale factor for a larger drawing
    % Draw the grid
    \draw[step=0.2,gray,very thin] (-1.2,-1.2) grid (1.2,1.2);
    
    % Draw the axes
    \draw[thick,->] (-1.3,0) -- (1.3,0) node[right] {$x$};
    \draw[thick,->] (0,-1.3) -- (0,1.3) node[above] {$y$};
    
    % Draw the points with swapped coordinates
    \fill[green!50!black] (0.923,0.092) circle (1pt) node[above right] {Point 1};  % Smaller point size
    \fill[green!50!black] (1,0) circle (1pt) node[below right] {Point 2};  % Smaller point size
    \fill[green!50!black] (.886,-.0889) circle (1pt) node[below right] {Point 3};  % Smaller point size
    \fill[blue] (0,0.5) circle (1pt) node[above] {Example Convex Polytope};  % Blue point
    \fill[blue] (-0.5,0) circle (1pt);  % Blue point
    \fill[blue] (0,-0.5) circle (1pt);  % Blue point
    % Draw lines between the points
    \draw[green!50!black] (0.923,0.092) -- (1,0);
    \draw[green!50!black] (.886,-.0889) -- (1,0);

    \draw[orange,dotted,very thick,domain=.76:1.2] plot (\x,4.9333*\x-4.9333) node[left] {Arms with optimal reward};

    \draw[blue] (0.923,0.092) -- (0,0.5);
    \draw[blue] (0,0.5) -- (-0.5,0);
    \draw[blue] (-0.5,0) -- (0,-0.5);
    \draw[blue] (0,-0.5) -- (.886,-.0889);

    \draw[red,->] (0,0) -- (1,0) node[midway, above] {$a^*$};
    \draw[red,->] (0,0) -- (.98,-.19) node[midway, below] {$\theta^*$};
    
    % Add the angle label
    \draw[red] (.49, -.085) arc (-.2:0:20.45) node[midway,right] {$\kappa$};
\end{tikzpicture}
    \caption{Example Configuration of action set detailed by the proof for Lemma \ref{lem:validass}. The green points are the three points referenced by the proof, the orange line is the line of vectors with the same optimal reward as the optimal Point 2, and the blue lines are example continuations of drawing the convex hull of the action set that satisfy \Cref{ass:algass}. These are done when $\kappa = .2, L=5, \text{ and } \beta = .1$.}
    \label{fig:vizvalidass}
\end{figure}

\begin{lemma}
    Provided 
        \begin{align*}
            \kappa \in \bigg[&\max\bigg(-\cos^{-1}\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right),\cos^{-1}\left(0\right)+\beta-\pi\bigg), \\
            &\min\left(\cos^{-1}\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right),\cos^{-1}\left(0\right)-\beta\right)\bigg],
        \end{align*}
        the pair $(\theta^*, \mathcal{A})$ constructed above (or any rotation thereof) satisfies \Cref{rem:shape}. 
\end{lemma}
\begin{proof}
% \Etash{Visualize this and justify d=2} 

%For visualization purposes, we will demonstrate the existence of an action set $\mathcal{A} \subset \mathbb{R}^2$, which satisfies our assumptions for a prechosen value $\omega$. We provide an example visualization in \Cref{fig:vizvalidass}. 

We now verify several claims, which when taken together will prove the lemma.

\paragraph{Claim 1: The optimal arm is $a^* = P_2 = (1, 0)$:} 
Recall that $\beta = \left(3(1-\iota)\epsilon_L\right)^\frac{1}{\omega}$, and that every point $(x, y) \neq (1, 0)$ in the action set satisfies $\cos(\kappa + \operatorname{atan2}(y, x)) \|\theta^*\|_2\sqrt{x^2 + y^2} < \cos(\kappa)\|\theta^*\|_2$. 
Any arm $(x, y) \in \mathcal{A}$ forms an angle of $\kappa + \operatorname{atan2}(y, x)$ with the reward vector $\theta^*$ and has magnitude $\sqrt{x^2 +y^2}$. Therefore, its reward is 
$$\cos(\kappa + \operatorname{atan2}(y, x)) \|\theta^*\|_2\sqrt{x^2 + y^2} \text{.}$$

The reward of the optimal arm by definition is also $$\cos(\kappa)\|\theta^*\|_2\text{.}$$
Therefore, by the first property, any arm in the action set other than $a^*$ has reward strictly less than that of the optimal arm. 

\paragraph{Claim 2: Point $P_3$ forms an angle of $\beta$ with $a^* = (1,0)$, thereby satisfying the first item of \Cref{rem:shape}:} 
We can explicitly characterize the angle that $P_3$ forms with $a^*$ as follows:

\begin{align}
    \arccos\left(\frac{\frac{G\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}}{\sqrt{\left(\frac{G\sin\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right)^{2}+\left(\frac{G\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right)^{2}}}\right) &= \arccos\left(\frac{\frac{G\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}}{G\sqrt{\left(\frac{\sin\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right)^{2}+\left(\frac{\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right)^{2}}}\right) \nonumber \\
    &= \arccos\left(\frac{\frac{G\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}}{\frac{G}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\sqrt{\left(\sin\left(-\beta\right)\right)^{2}+\left(\cos\left(-\beta\right)\right)^{2}}}\right) \nonumber \\
    &= \arccos\left(\frac{\cos\left(-\beta\right)}{\sqrt{\left(\sin\left(-\beta\right)\right)^{2}+\left(\cos\left(-\beta\right)\right)^{2}}}\right) \nonumber \\
    &= \beta \nonumber
\end{align}
Therefore, $P_3$ forms an angle of $\beta$ with $a^*$. Similar logic holds for proving Point 1 forms an angle of $\beta$ with $a^* = (1,0)$. 

\paragraph{Claim 3: The second item of \Cref{rem:shape} is satisfied:} 

We will now also prove that the second constraint from \Cref{rem:shape} holds in this setting. 
We will evaluate the reward of Point 1. Point 1 forms an angle of $\beta$ with the optimal arm $a^*$ and, thus, forms an angle of $\beta + \kappa$ with $\theta^*$. Moreover, the $\ell_2$ norm of Point 1 is $$\left|\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2}\right|\text{.}$$ Given the restriction on $\kappa$, we have that $\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2}$ is strictly positive. Indeed, since $$-\arccos\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right) \leq \kappa \leq \arccos\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right)\text{,}$$ the numerator is positive. Moreover, since $\arccos\left(0\right)+\beta-\pi \leq \kappa \leq \arccos\left(0\right)-\beta$, the denominator is positive. Therefore, its reward is 
\begin{align}
    \frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2}\|\theta^*\|_2 \cos(\beta + \kappa) &= \cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\nonumber\\
    &= \mu^* - 3(1-\iota)\epsilon_L\nonumber
\end{align}

We now do this similarly for Point 3. Point 3 forms an angle of $-\beta$ with the optimal arm $a^*$ and, thus, forms an angle of $\kappa-\beta$ with $\theta^*$. Moreover, the $\ell_2$ norm of Point 3 is $$\left|\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\right|\text{.}$$ Given the restrictions on $\kappa$, the value $\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}$ is strictly positive. Indeed, since $$-\arccos\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right) \leq \kappa \leq \arccos\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right)\text{,}$$ the numerator is positive. Moreover, since $\arccos\left(0\right) + \beta-\pi \leq \kappa \leq \arccos\left(0\right)-\beta$, the denominator is positive. Therefore, its reward is 
\begin{align}
    \frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2}\|\theta^*\|_2 \cos(\kappa - \beta) &= \cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\nonumber\\
    &= \mu^* - 3(1-\iota)\epsilon_L\nonumber
\end{align}

\paragraph{Claim 4:  The action set is sufficiently dense as in $\operatorname{dist}(a^i, i) \leq \gamma \leq  \frac{\epsilon_{\bar{L}}}{\|\theta^*\|_2 d}$.}

\begin{align}
    \operatorname{dist}(a^i, i) &= \|\operatorname{proj}(a^i, i) - a^i\|_2 \nonumber \\
    &= \|a^i - a^i\|_2 \nonumber \\
    &= 0 \nonumber \\
    &\leq \gamma 
\end{align}

% \apcomment{Old proof below.}
% Without loss of generality, set the optimal arm $a^*$ to be the vector $(1,0)$. Let \begin{align*}
%             \kappa \in \bigg[\max\bigg(-\arccos&\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right),\arccos\left(0\right)+\beta-\pi\bigg), \\
%             &\min\left(\arccos\left(\frac{3(1-\iota)\epsilon_L}{\|\theta^*\|_2}\right),\arccos\left(0\right)-\beta\right)\bigg]
%         \end{align*} be the angle formed between $\theta^*$ and $a^*$ where $a^*$ is the reference point and $\theta^*\in\mathbb{R}^d$. In this setting, $\mu^* = \cos(\kappa) \|\theta^*\|_2$. We remind the reader that we set $\beta = \left(3(1-\iota)\epsilon_L\right)^\frac{1}{\omega}$.  

% The claim is that the following conditions are sufficient for an action set to satisfy \Cref{rem:shape} for a given $\omega$.
% \begin{enumerate}
%     \item $\forall (x, y) \in \mathcal{A} \text{ s.t. } (x, y) \neq a^*,  \cos(\kappa + \operatorname{atan2}(y, x)) \|\theta^*\|_2\sqrt{x^2 + y^2} < \cos(\kappa)\|\theta^*\|_2$ 
%     \item The points $\left(\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)\cos\left(\beta\right)}{\cos\left(\kappa+\beta\right)\|\theta^*\|_2},\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)\sin\left(\beta\right)}{\cos\left(k+\beta\right)\|\theta^*\|_2}\right)$ \\ and $\left(\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)\cos\left(-\beta\right)}{\cos\left(\kappa-\beta\right)\|\theta^*\|_2},\frac{\left(\cos\left(\kappa\right)\|\theta^*\|_2-3(1-\iota)\epsilon_L\right)\sin\left(-\beta\right)\|\theta^*\|_2}{\cos\left(k-\beta\right)\|\theta^*\|_2}\right)$ are both in $\mathcal{A}$. 
% \end{enumerate}

% In the visualization (Figure~\ref{fig:vizvalidass}), the orange line denotes the first constraint so that all points to the left of the orange line satisfy the first constraint. Moreover, the points from the second constraint are Points 1 and 3 in \Cref{fig:vizvalidass}. 

 
% \begin{align}
%     \arccos\left(\frac{\frac{G\cos\left(-\beta\right)}{\cos\left(k-\beta\right)n}}{\sqrt{\left(\frac{G\sin\left(-\beta\right)}{\cos\left(k-\beta\right)n}\right)^{2}+\left(\frac{G\cos\left(-\beta\right)}{\cos\left(k-\beta\right)n}\right)^{2}}}\right) &= \arccos\left(\frac{\frac{G\cos\left(-\beta\right)}{\cos\left(k-\beta\right)\|\theta^*\|_2}}{G\sqrt{\left(\frac{\sin\left(-\beta\right)}{\cos\left(k-\beta\right)\|\theta^*\|_2}\right)^{2}+\left(\frac{\cos\left(-\beta\right)}{\cos\left(k-\beta\right)\|\theta^*\|_2}\right)^{2}}}\right) \nonumber \\
%     &= \arccos\left(\frac{\frac{G\cos\left(-\beta\right)}{\cos\left(k-\beta\right)\|\theta^*\|_2}}{\frac{G}{\cos\left(k-\beta\right)\|\theta^*\|_2}\sqrt{\left(\sin\left(-\beta\right)\right)^{2}+\left(\cos\left(-\beta\right)\right)^{2}}}\right) \nonumber \\
%     &= \arccos\left(\frac{\cos\left(-\beta\right)}{\sqrt{\left(\sin\left(-\beta\right)\right)^{2}+\left(\cos\left(-\beta\right)\right)^{2}}}\right) \nonumber \\
%     &= \beta \nonumber
% \end{align}
% Similar logic holds for proving Point 1 forms an angle of $\beta$ with $a^* = (1,0)$. 


% Moreover,  $a^*$, or Point 2 in \Cref{fig:vizvalidass} has a reward of $\mu^* = \cos(\kappa)\|\theta^*\|_2$. The $\ell_2$ norm of Point 2 is $1$. It forms an angle of $\kappa$ with $\theta^*$. Therefore, its reward is $\cos(\kappa)\|\theta^*\|_2$. Given that all points in the action set obey constraint 1 except for $a^*$, by definition, they have a reward less than $\cos(\kappa) \|\theta^*\|_2$, which is the reward of $a^*$. Therefore, all points in $\mathcal{A}$ will be rewarded less than $a^*$. Also, Points 1 and 3 satisfy the first constaint as well. Therefore, these conditions are sufficient to satisfy \Cref{rem:shape}.


\end{proof}


\section{Implementation details for Phased Elimination used in experiments}
\label{sec:practical_phased_elim}
\RestyleAlgo{ruled}
\LinesNumbered % uncomment to add line numbers
\begin{algorithm}[H]
\caption{Phased Elimination}\label{alg:practical_phased_elim}
  \SetKwInOut{Input}{Input}
  \Input{$\delta \text{ (probability parameters)}, L \text{ (number of phases)},\newline \{\nu_1, \dots, \nu_L\} \text{ (error parameters)}$}
  \KwResult{$a_1, \dots, a_T$}
  $\ell \leftarrow 1$\\
  $\mathcal{A}_1 \leftarrow \mathcal{A}$\\
  $t_\ell \leftarrow 0$\\ % need to show how this is updated...
  \While{$\ell \leq L$}{
        $\varepsilon_\ell \leftarrow 2^{-\ell}$ \\
        $\pi_\ell \leftarrow \text{G-Optimal design of } \mathcal{A}_\ell \text{ with } \delta \text{ and } \nu_{\ell}$ \label{alg:pracitcal_phased_goptimal}\\
        % \For{$a \in \mathcal{A}_l$}{
        %     \State{$n_l(a) \leftarrow \ceil{\pi_l(a) \cdot \frac{g(\pi_l) }{\varepsilon_l^2}\log{\frac{1}{\delta}}}$}\\
        %     \State{$N_{\ell} \leftarrow N_{\ell} + n_l(a)$} \\
        % }
        $N_\ell \leftarrow 0$\\
        \For{$a \in \mathcal{A}_\ell$}{
            $N_\ell(a) \leftarrow \left\lceil\frac{2d\pi_{\ell}(a)}{\nu_\ell^2} \log\left(\frac{k\ell(\ell+1)}{\delta} \right)\right\rceil$\\
            $\text{Play action } a \text{ for } N_\ell(a) \text{ rounds}$ \\
            $N_\ell \leftarrow N_\ell + N_\ell(a)$
        }
        $V_\ell \leftarrow \sum_{a \in \mathcal{A}_\ell} \pi_\ell(a) aa^\top$ \\
        $\theta_\ell \leftarrow V_\ell^{-1} \sum_{t=t_\ell}^{t_\ell + N_\ell} a_t x_t$ \\
        $\mathcal{A}_{\ell+1} \leftarrow \{ a \in \mathcal{A}_\ell \text{ s.t. } \underset{b \in \mathcal{A}_\ell}\max(\langle \theta_\ell, b - a \rangle) \leq 2\varepsilon_\ell\}$\\
        $t_\ell \leftarrow t_\ell + N_\ell$\\
        $\ell \leftarrow \ell + 1$\\
    }
\end{algorithm}

\Cref{alg:practical_phased_elim} formally describes the implementation of Phased Elimination used in our experiments. The behavior of this implementation only differs from \Cref{alg:phased_elim} in the choice of stopping criterion; here, we stop after a maximum number of phases, while \Cref{alg:phased_elim} fixes $T$ and allows $L$ to vary. The G-optimal design in \Cref{alg:pracitcal_phased_goptimal} is computed via a convex program using the Gurobi solver~\citep{gurobi}. 