\section{Proof of Theorem \ref{theorem: UB-CRM}}\label{secappendix: proof of CRM}
Throughout the proof we use $a^*$ to denote the optimal arm. First, we prove a few lemmas, and then use them to bound the expected cumulative regret of \CRM. Recall the definitions of $E_t, O_t$ and $C_t^x$ from Section \ref{secappendix: proof of CRM}. In this section we need to keep the context of which $X_i$ the $C_t^x$ corresponds to, therefore, we refer to it as $C_t^{i,x}$ instead. The following lemma shows that the expectation of $\widehat{\mu}_{i,x}$ as defined in Equation \ref{equation: emprical estimate for arm i,x modified} is equal to $\mu_{i,x}$ for every $i,x$.
%
\begin{lemma}\label{lemma: unbiased muix}
$\widehat{\mu}_{i,x}(t)$ is an unbiased estimator of $\mu_{i,x}$, that is $\mathbb{E}[\widehat{\mu}_{i,x}(t)] = \mu_{i,x}$. Moreover $\mathbb{P}(|\widehat{\mu}_{i,x}(t) - \mu_{i,x}| \geq \epsilon) \leq 2\exp(-2(N_t^{i,x} + C_t^{i,x})\epsilon^2)$~.
\end{lemma}
\begin{proof}
We begin by restating the definition of $\widehat{\mu}_{i,x}$ from Equation~\ref{equation: emprical estimate for arm i,x modified}.
\begin{equation*}
    \widehat{\mu}_{i,x}(t) = \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{N^{i,x}_t + C^{i,x}_t}
\end{equation*}
We note that in Equation \ref{equation: emprical estimate for arm i,x modified}, $Y_c^{i,x}$ is a random variable such that $\mathbb{E}[Y_c^{i,x}] = \mu_{i,x}$. Note that this holds because we partition the time steps where arm $a_0$ was pulled into odd and even instances $O_t$ and $E_t$. Taking expectation on both sides of the above equation we have
\begin{align*}
&\mathbb{E}[\widehat{\mu}_{i,x}(t)] \nonumber \\
&= \mathbb{E}\bigg[ \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{N^{i,x}_t + C^{i,x}_t} \bigg] \nonumber \\
&= \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} \mathbb{E}\bigg[ \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{N^{i,x}_t + C^{i,x}_t} ~\Bigl\vert~ N_t^{i,x} = a, C_t^{i,x} = b \bigg] \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
% &= \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} E\bigg[ \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{a + b} \bigg] \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} \bigg(\frac{a\mu_{i,x} + b\mu_{i,x}}{a + b}\bigg) \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \mu_{i,x} \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \mu_{i,x}
\end{align*}

Next we prove the concentration inequality part of the lemma, which is similar to the Chernoff--Hoeffding inequality (Lemma~\ref{lemma: chernoff-hoeffding inequality}) for our estimator.

\begin{align}
&\mathbb{P}\bigg(\frac{\sum_{j\in S_t^{i,x}} \mathds{1}\{Y_j = 1\} + \sum_{c\in [C^{i,x}_t]}Y^{i,x}_c}{N_t^{i,x} + C_t^{i,x}} \geq \mu_{i,x} + \epsilon \bigg) \nonumber \\
&= \mathbb{P}\bigg(\sum_{j\in S_t^{i,x}} \mathds{1}\{Y_j = 1\} + \sum_{c\in [C^{i,x}_t]}Y^{i,x}_c \geq (N_t^{i,x} + C_t^{i,x})\mu_{i,x} + (N_t^{i,x} + C_t^{i,x})\epsilon \bigg) \nonumber \\
&\stackrel{(i)}{\leq} \min_{\lambda > 0} \mathbb{E}\bigg[\exp{\Big(\lambda\big(\sum_{j\in S_t^{i,x}} (\mathds{1}\{Y_j = 1\} - \mu_{i,x}) + \sum_{c \in [C_t^{i,x}]}(Y^{i,x}_c - \mu_{i,x}) \big) \Big)} \bigg]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&= \min_{\lambda > 0} \mathbb{E}\bigg[ \prod_{j \in S_t^{i,x}} \exp \big(\lambda (\mathds{1}\{Y_j = 1\} - \mu_{i,x}) \big) \prod_{c \in [C_t^{i,x}]} \exp \big(\lambda (Y_c^{i,x} - \mu_{i,x}) \big) \bigg]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&\stackrel{(ii)}{=} \min_{\lambda > 0} \prod_{j \in S_t^{i,x}} \mathbb{E}\bigg[\exp \big(\lambda (\mathds{1}\{Y_j = 1\} - \mu_{i,x}) \big) \bigg] \prod_{c \in [C_t^{i,x}]} \mathbb{E}\bigg[ \exp \big(\lambda (Y_c^{i,x} - \mu_{i,x}) \big) \bigg]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&\stackrel{(iii)}{\leq} \min_{\lambda > 0} \exp \bigg(\frac{N_t^{i,x}\lambda^2}{8} + \frac{C_t^{i,x}\lambda^2}{8} - \lambda(N_t^{i,x} + C^{i,x}_t) \epsilon\bigg) \nonumber \\
&\leq \exp{(-2(N_t^{i,x} + C^{i,x}_t)\epsilon^2)}
\end{align}
In the above equations, the inequality in $(i)$ follows from Lemma~\ref{lemma: Chernoff bound}, the equality in $(ii)$ follows from the fact that the terms in the product are independent, and $(iii)$ follows from Lemma~\ref{lemma: Hoeffding's lemma}.
%We use that each $|S^{i,x}_{\mathbf{z},t}| = C^{i,x}_{t}$ in $(i)$ and it is for this step we truncate $S^{i,x}_{\mathbf{z},t}$ to $C^{i,x}_{t}$ elements. 
Following the same steps as above we get the following two sided bound
\begin{align}
    \mathbb{P}(|\widehat{\mu}_{i,x}(t) - \mu_{i,x}| \geq \epsilon) \leq 2\exp{(-2(N_t^{i,x} + C_t^{i,x})\epsilon^2)} \ .
\end{align}
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%-------COMMENT----------------------%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\gau{This is Old -
\begin{lemma}
$\widehat{\mu}_{i,x}$ is an unbiased estimator of $\mu_{i,x}$, that is $E[\widehat{\mu}_{i,x}] = \mu_{i,x}$.
\end{lemma}
\begin{proof}
By our definition
\begin{equation*}\label{equation: emprical estimate for arm 0}
    \widehat{\mu}_{i,x}(t) = \frac{(\sum_{s=1}^t\mathds{1}\{Y=1, a_s=a_{i,x}\}) + (\sum_\mathbf{z}\sum_{s \in S^{i,x}_{\mathbf{z},t}} \mathds{1}\{Y=1\}\widehat{p}^{\ i}_{\mathbf{z},t})}{N^{i,x}_t + \sum_\mathbf{z}\sum_{s \in S^{i,x}_{\mathbf{z},t}} \widehat{p}^{\ i}_{\mathbf{z},t}}
\end{equation*}
Taking expectation on both sides,
\begin{align*}
&E[\widehat{\mu}_{i,x}(t)] \nonumber \\
&= E\bigg[ \frac{(\sum_{s=1}^t\mathds{1}\{Y=1, a_s=a_{i,x}\}) + (\sum_\mathbf{z}\sum_{s \in S^{i,x}_{\mathbf{z},t}} \mathds{1}\{Y=1\}\widehat{p}^{\ i}_{\mathbf{z},t})}{N^{i,x}_t + \sum_\mathbf{z}\sum_{s \in S^{i,x}_{\mathbf{z},t}} \widehat{p}^{\ i}_{\mathbf{z},t}} \bigg] \nonumber \\
&= E\bigg[ \frac{\sum_{s=1}^t\mathds{1}\{Y=1, a_s=a_{i,x}\}}{N^{i,x}_t} \big| a_s \neq a_0 \bigg]\mathbb{P}(a_s \neq a_0) \nonumber \\
&+ E\bigg[ \frac{\sum_\mathbf{z}\sum_{s \in S^{i,x}_{\mathbf{z},t}} \mathds{1}\{Y=1\}\widehat{p}^{\ i}_{\mathbf{z},t}}{\sum_\mathbf{z}\sum_{s \in S^{i,x}_{\mathbf{z},t}} \widehat{p}^{\ i}_{\mathbf{z},t}} \big| a_s = a_0\bigg]\mathbb{P}(a_s = a_0) \nonumber \\
&= \mu_{i,x} \mathbb{P}(a \neq a_0) \nonumber \\
&+ E\bigg[ \frac{\sum_\mathbf{z}C_t^{i,x}\widehat{p}(Y=1 | X_i=x, \mathbf{Pa}(X_i) = \mathbf{z})\widehat{p}^{\ i}_{\mathbf{z},t}}{C_t^{i,x}} \bigg]\mathbb{P}(a = a_0) \nonumber \\
&= \mu_{i,x} \mathbb{P}(a_s \neq a_0) + \mu_{i,x} \mathbb{P}(a_s = a_0) \nonumber \\
&= \mu_{i,x}
\end{align*}
\end{proof}}

\begin{lemma}\label{lemma: unbiased muix new}
$\widehat{\mu}_{i,x}$ is an unbiased estimator of $\mu_{i,x}$, that is $E[\widehat{\mu}_{i,x}] = \mu_{i,x}$.
\end{lemma}
\begin{proof}
\gau{
\begin{equation*}
\begin{split}
    & S_t^{i,x} = \{s\in [t] \mid a_s=a_{i,x}\}\hspace{2em}
    S_t^0 = \{s\in [t] \mid a_s = a_0\}\\
    & S^{i,x}_{\mathbf{z}, t} = \{s\in [t] \mid a_s=a_0, X_i=x, pa(X_i)=z\} = \{s_1, \ldots,s_{C^{i,x}_{\mathbf{z}, t}}\}, \hspace{2em}
    C^{i,x}_{\mathbf{z}, t} = |S^{i,x}_{\mathbf{z}, t}|\\
    & S_t^0  = S_1\sqcup\ldots\sqcup S_{C^{i,x}_{\mathbf{z}, t}}\hspace{1em} s_j\notin S_j, j\in [C^{i,x}_{\mathbf{z}, t}]\\
    & \widehat{p}^{i,\mathbf{z}}_{t,j} = \frac{\sum\limits_{S_j} \mathds{1}\{pa(X_i)=\mathbf{z}\}}{|S_j|}, 
    Y^{i,x}_j = \sum\limits_{\mathbf{z}}\mathds{1}\{Y(s_j)=1\} \widehat{p}^{i,\mathbf{z}}_{t,j}, \hspace{1em} j\in [C^{i,x}_{\mathbf{z}, t}]\\
    & \widehat{\mu}_{i,x}(t) = \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{j \in [C_t^{i,x}]} Y_j^{i,x}}{N^{i,x}_t + C^{i,x}_t}
\end{split}
\end{equation*}
}




Let $S_t^{i,x} = \{s \mid a_s=a_{i,x}, s\leq t\}$, that is, $S_t^{i,x}$ is the set of time steps where the algorithm pulls the arm $a_{i,x}$, until time $t$. Similarly, let $S_t^0 = \{s \mid a_s = a_0, s \leq t\}$. Denote the $c$-th element of $S^{i,x}_{\mathbf{z}, t}$ by $s^{i,x}_{\mathbf{z}, c}$, and recall $|S^{i,x}_{\mathbf{z}, t}| = C^{i,x}_t$. Partition the set $S_t^0$ into $C_t^{i,x}$ components, where the $c$-th partition is denoted by $S_{t,c}^{0,i}$ and $s_{\mathbf{z}, c}^{i,x} \notin S_{t,c}^{0,i}$. Compute $\widehat{p}_{t, c}^{i, \mathbf{z}} = \sum_{s \in S_{t,c}^{0,i}} \mathds{1}\{\mathbf{Pa}(X_i) = \mathbf{z}\}/|S_{t, c}^{0, i}|$, and define the random variable $Y_c^{i,x}$ as follows: Take the $c$-th element of $S_{\mathbf{z},t}^{i,x}$ and multiply by $\widehat{p}_{t,c}^{i,\mathbf{z}}$ for all $\mathbf{z}$ and sum them, that is $Y_c^{i,x} = \sum_{\mathbf{z}} \mathds{1}\{Y_{s_{\mathbf{z},c}^{i,x}=1}\}\widehat{p}_{t,c}^{i,\mathbf{z}}$. Note that $E[Y_c^{i,x}] = \mu_{i,x}$.

\AUR{Note that each term in the summation of $Y_c^{i,x}$ can be represented as Gaurav had written in the chat where $s_1$ are the timesteps in $S_{\mathbf{z}, t}^{i,x}$ and $s_2$ are the timesteps in $S_{t,c}^{0, i}$}

By our definition
\begin{equation*}\label{equation: emprical estimate for arm 0}
    \widehat{\mu}_{i,x}(t) = \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{N^{i,x}_t + C^{i,x}_t}
\end{equation*}
Taking expectation on both sides,
\begin{align*}
&E[\widehat{\mu}_{i,x}(t)] \nonumber \\
&= E\bigg[ \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{N^{i,x}_t + C^{i,x}_t} \bigg] \nonumber \\
&= \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} E\bigg[ \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{N^{i,x}_t + C^{i,x}_t} \mid N_t^{i,x} = a, C_t^{i,x} = b \bigg] \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} E\bigg[ \frac{\sum_{j \in S_t^{i,x}}\mathds{1}\{Y_j=1\} + \sum_{c \in [C_t^{i,x}]} Y_c^{i,x}}{a + b} \bigg] \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} \frac{a\mu_{i,x} + b\mu_{i,x}}{a + b} \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \mu_{i,x} \sum_{a=1}^{\infty} \sum_{b=0}^{\infty} \mathbb{P}(N_t^{i,x} = a, C_t^{i,x} = b) \nonumber \\
&= \mu_{i,x}
\end{align*}
\end{proof}

Next we prove concentration inequality similar to Chernoff-Hoeffding inequality for our estimator. 

\begin{align}
&\mathbb{P}\bigg(\frac{\sum_{j\in S_t^{i,x}} \mathds{1}\{Y_j = 1\} + \sum_{c\in [C^{i,x}_t]}Y^{i,x}_c}{N_t^{i,x} + C_t^{i,x}} \geq \mu_{i,x} + \epsilon \bigg) \nonumber \\
&= \mathbb{P}\bigg(\sum_{j\in S_t^{i,x}} \mathds{1}\{Y_c = 1\} + \sum_{c\in [C^{i,x}_t]}Y^{i,x}_c \geq (N_t^{i,x} + C_t^{i,x})\mu_{i,x} + (N_t^{i,x} + C_t^{i,x})\epsilon \bigg) \nonumber \\
&\leq \min_{\lambda \geq 0} E\bigg[\exp{\Big(\lambda\big(\sum_{j\in S_t^{i,x}} (\mathds{1}\{Y_c = 1\} - \mu_{i,x}) + \sum_{c \in [C_t^{i,x}]}(Y^{i,x}_c - \mu_{i,x}) \big) \Big)} \bigg]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&\leq \min_{\lambda \geq 0} E\bigg[ \prod_{j \in S_t^{i,x}} \exp \big(\lambda (\mathds{1}\{Y_c = 1\} - \mu_{i,x}) \big) \prod_{c \in [C_t^{i,x}]} \exp \big(\lambda (Y_c^{i,x} - \mu_{i,x}) \big) \bigg]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&\leq \min_{\lambda \geq 0} \prod_{j \in S_t^{i,x}} E\bigg[\exp \big(\lambda (\mathds{1}\{Y_c = 1\} - \mu_{i,x}) \big) \bigg] \prod_{c \in [C_t^{i,x}]} E\bigg[ \exp \big(\lambda (Y_c^{i,x} - \mu_{i,x}) \big) \bigg]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&\leq \min_{\lambda \geq 0} \exp \bigg(\frac{N_T^{i,x}\lambda^2}{8} + \frac{C_t^{i,x}\lambda^2}{8} - \lambda(N_T^{i,x} + C^{i,x}_t) \epsilon\bigg) \nonumber \\
&\leq \exp{(-2(N_t^{i,x} + C^{i,x}_t)\epsilon^2)}
\end{align}
Following the same steps as above we get the following two sided bounds,
\begin{align}
    \mathbb{P}(|\widehat{\mu}_{i,x}(t) - \mu_{i,x}(t)| \geq \epsilon) \leq 2\exp{(-2(N_t^{i,x} + S_t)\epsilon^2)}
\end{align}
\end{comment}
%%%%%%%%%%%%%%%%%%%%%end of comment%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Next we show that the estimates of $\mu_a$ at the end of $T$ rounds are good with high probability.

\begin{lemma} \label{lemma: concentration bounds on mu in cumulative regret}
Let $p = \min_{i,x,\mathbf{z}} \mathbb{P}(X_i = x, \mathbf{Pa}(X_i) = \mathbf{z})$.  Then for sufficiently large $T \in \mathbb{N}$, at the end of $T$ rounds the following hold:  %for  \gau{Notation $P\rightarrow \mathbb{P}$}:
\begin{enumerate}
    \item $\mathbb{P} \Big(|\widehat{\mu}_0(T) - \mu_0| \geq \frac{\Delta_0}{4} \Big) \leq  2T^{-\frac{\Delta_0^2}{8}}$~.
    \item Let $\widehat{p}^{\ i,x}_{\mathbf{z}, T} = \frac{1}{|O_T|}\sum_{t \in O_T} \mathds{1}\{X_i(t)=x, \mathbf{Pa}(X_i)(t)=\mathbf{z}\}$, and $\widehat{p}^{\ i,x}_T = \min_\mathbf{z} \widehat{p}^{\ i,x}_{\mathbf{z},T}$. Then
    $\mathbb{P}\big(\widehat{p}^{i,x}_T \geq \frac{p}{2} \big) \geq  1-Z_iT^{-\frac{p^2}{4}}$, where $Z_i$ is the size of the domain from which $\mathbf{Pa}(X_i)$ takes values.
    \item $\mathbb{P}\Big(|\widehat{\mu}_{i,x}(T) - \mu_{i,x}| \geq \frac{\Delta_0}{4} \Big) \leq 2T^{-\frac{p\Delta_0^2}{32}} + Z_iT^{-\frac{p^2}{4}}$~.
\end{enumerate}
\end{lemma}
\begin{proof}
a) Since $\beta\geq 1$, at the end of $T$ rounds arm $a_0$ is pulled by Algorithm~\ref{CR-algorithm} at least $\ln{T}$ times. Hence, $N^0_T \geq \ln{T}$, and by Lemma~\ref{lemma: chernoff-hoeffding inequality},
\begin{equation*}\label{equation: empirical estimate of mu_0 is close}
\mathbb{P}\Big(|\widehat{\mu}_0(T) - \mu_0| \geq \frac{\Delta_0}{4} \Big) \leq 2e^{-\frac{\Delta_0^2}{8}\ln T} = 2T^{-\frac{\Delta_0^2}{8}}
\end{equation*}

b) \AUR{In this part we show, using the union bound, that the event that the estimate $\widehat{p}^{\ i,x}_{T}$ is less than $p/2$ has low probability.}
Since, $|O_T| \geq N_T^0/2$, by Lemma \ref{lemma: chernoff-hoeffding inequality}, we have,
\begin{equation*}
\mathbb{P}\bigg(\widehat{p}^{i,x}_{\mathbf{z}, T} > p^{i,x}_{\mathbf{z}} - \frac{p}{2} \geq \frac{p}{2}\bigg) \geq 1 - e^{-2 \frac{p^2}{4} \frac{\ln T}{2}} = 1 - T^{-\frac{p^2}{4}} 
\end{equation*}
Now using this we get,
\begin{align}
\label{p-ix-greater-p-2}
\mathbb{P}\bigg(\widehat{p}^{\ i,x}_{T} \leq \frac{p}{2}\bigg) =  \mathbb{P}\bigg(\min_\mathbf{z} \widehat{p}^{\ i,x}_{\mathbf{z}, T} \leq \frac{p}{2}\bigg) \leq \sum_\mathbf{z} \mathbb{P}\bigg(\widehat{p}^{\ i,x}_{\mathbf{z}, T} \leq \frac{p}{2}\bigg) \leq Z_iT^{-\frac{p^2}{4}}
\end{align}

 %\gau{$P\rightarrow \mathbb{P}$} \vin{Let $S_{N_t^{i,x}}$ be the set of timesteps $s$ when $a_s = a_{i,x}$}. Let $S^{i,x}_{\mathbf{z}, t} = \{s^{i,x}_{\mathbf{z},1}, \dots, s^{i,x}_{\mathbf{z},C^{i,x}_t} \}$. Define the random variable $Y^{i,x}_c = \sum_\mathbf{z} \mathds{1}\{Y_{s^{i,x}_{\mathbf{z}, c}} = 1\}\widehat{p}^{i}_{\mathbf{z}, t}$. Hence, by Chernoff bound followed by Hoeffding's lemma,
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%comment%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\vin{This part of the proof needs to be edited because most of this is proved in the previous lemma!}
\gau{This is proved earlier. Let $S_t^{i,x} = \{s \mid a_s=a_{i,x}, s\leq t\}$, that is, $S_t^{i,x}$ is the set of time steps where the algorithm pulls the arm $a_{i,x}$, until time $t$. Denote the $c$-th element of $S^{i,x}_{\mathbf{z}, t}$ by $s^{i,x}_{\mathbf{z}, c}$, and recall $|S^{i,x}_{\mathbf{z}, t}| = C^{i,x}_t$. Define a random variable $Y_c^{i,x}$ as follows: Take the $c$-th element of $S^{i,x}_{\mathbf{z}, t}$ and multiply it by $\widehat{p}^i_{\mathbf{z}, t}$ for all $\mathbf{z}$ and sum them, that is $Y^{i,x}_c = \sum_\mathbf{z} \mathds{1}\{Y_{s^{i,x}_{\mathbf{z}, c}} = 1\}\widehat{p}^{i}_{\mathbf{z}, t}$. Note that $E[Y_c^{i,x}] = \mu_{i,x}$ Hence by Chernoff Bound followed by Hoeffding Lemma we get,
\begin{align}
&\mathbb{P}\bigg(\frac{\sum_{j\in S_t^{i,x}} \mathds{1}\{Y_j = 1\} + \sum_{c\in [C^{i,x}_t]}Y^{i,x}_c}{N_t^{i,x} + C_t^{i,x}} \geq \mu_{i,x} + \epsilon \bigg) \nonumber \\
&= \mathbb{P}\bigg(\sum_{j\in S_t^{i,x}} \mathds{1}\{Y_c = 1\} + \sum_{c\in [C^{i,x}_t]}Y^{i,x}_c \geq (N_t^{i,x} + C_t^{i,x})\mu_{i,x} + (N_t^{i,x} + C_t^{i,x})\epsilon \bigg) \nonumber \\
&\leq \min_{\lambda \geq 0} E\big[\exp{\big(\lambda(\sum_{j\in S_t^{i,x}} (\mathds{1}\{Y_c = 1\} - \mu_{i,x}) + \sum_{c \in [C_t^{i,x}]}(Y^{i,x}_c - \mu_{i,x}) \big)} \big]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&= \min_{\lambda \geq 0} E\big[ \Pi_{j\in S_t^{i,x}} \exp{(\mathds{1}\{Y_c = 1\} - \mu_{i,x})} \Pi_{c \in [C_t^{i,x}]} \exp{(Y^{i,x}_c - \mu_{i,x})}\big]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&= \min_{\lambda \geq 0} \Pi_{j \in S_t^{i,x}} E[\exp{(\mathds{1}\{Y_c = 1\} - \mu_{i,x})}] \Pi_{c \in [C_t^{i,x}]} E[\exp{(Y^{i,x}_c - \mu_{i,x})}]e^{-\lambda(N_t^{i,x} + C_t^{i,x})\epsilon} \nonumber \\
&\leq \min_{\lambda \geq 0} \exp \bigg(\frac{N_T^{i,x}\lambda^2}{8} + \frac{C_t^{i,x}\lambda^2}{8} - \lambda(N_T^{i,x} + C^{i,x}_t) \epsilon\bigg) \nonumber \\
&\leq \exp{(-2(N_t^{i,x} + C^{i,x}_t)\epsilon^2)}
\end{align}
Following the same steps as above we get the following two sided bounds,
\begin{align}
    \mathbb{P}(|\widehat{\mu}_{i,x}(t) - \mu_{i,x}(t)| \geq \epsilon) \leq 2\exp{(-2(N_t^{i,x} + S_t)\epsilon^2)}
\end{align}}
\end{comment}
%%%%%%%%%%%%%%%%%comment%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%c) %
c) Let the conditional probability distribution $\mathbb{P}(\,\cdot \mid \widehat{p}^{\ i,x}_T > \frac{p}{2})$ be denoted by $\mathbb{P}_p$. Since $\beta \geq 1$, $N_T^0 \geq \ln{T}$. Further, if $\widehat{p}^{\ i,x}_T > \frac{p}{2}$ then $C^{i,x}_T > \frac{p}{2} \frac{N^0_T}{2} \geq \frac{p}{4} \ln T$ (from the definition of $C^{i,x}_T$). Hence, from Lemma~\ref{lemma: unbiased muix} we have
\begin{equation}
    \label{bounding-mu-ix-delta}
    \mathbb{P}_p\bigg(|\widehat{\mu}_{i,x}(T) - \mu_{i,x}| \geq \frac{\Delta_0}{4}\bigg) \leq 2\exp\bigg(-\frac{\Delta_0^2}{32}p\ln T\bigg) = 2T^{-\frac{p\Delta_0^2}{32}}
\end{equation}
Finally by the law of total probability and using Equations \ref{p-ix-greater-p-2} and  \ref{bounding-mu-ix-delta}
\begin{align*}
\mathbb{P}\bigg(|\widehat{\mu}_{i,x}(T) - \mu_{i,x}| \geq \frac{\Delta_0}{4}\bigg) &\leq \mathbb{P}_p\bigg(|\widehat{\mu}_{i,x}(T) - \mu_{i,x}| \geq \frac{\Delta_0}{4}\bigg) + \mathbb{P}\bigg(\widehat{p}^{\ i,x}_T \leq \frac{p}{2}\bigg) \nonumber \\
&\leq 2T^{-\frac{p\Delta_0^2}{32}} + Z_iT^{-\frac{p^2}{4}}
\end{align*}
\end{proof}

Next we show that $\beta$ as set in \CRM\ is bounded in expectation. Lemma~\ref{bounding E[beta^2]} and its proof are similar to Lemma 8.6 in~\cite{NairPS21}.
\begin{lemma} \label{bounding E[beta^2]}
Let $L = \arg \min_{t \in \mathds{N}} \bigg\{\frac{t^\frac{p^2\Delta_0^2}{32}}{\ln{t}} \geq 3N(Z+3)\bigg\}$, where $Z = \max_{i} Z_i$, and suppose \CRM\ pulls arms for $T$ rounds, where $T \geq \max(L, e^{\frac{50}{\Delta_0^2}})$, and let $a^* \neq a_0$. Then at the end of $T$ rounds, $\frac{8}{9\Delta_0^2} \leq \mathbb{E}[\beta^2] \leq \frac{50}{\Delta_0^2}$.
\end{lemma}
\begin{proof}
Before proceeding to the proof of the lemma we make the following two observations.
\begin{observation}
1. If $a^{*} \neq a_0$ then $\Delta_0 = \mu_{a^*} - \mu_0$ \\
2. Let $\widehat{\mu}^* = \max_{i,x}(\widehat{\mu}_{i,x}(T))$. If $|\widehat{\mu}_0(T) - \mu_0| \leq \frac{\Delta_0}{4}$ and $|\widehat{\mu}_{i,x}(T) - \mu_{i,x}| \leq  \frac{\Delta_0}{4}$ for all $(i,x)$ then $\frac{\Delta_0}{2} \leq \widehat{\mu}^* - \widehat{\mu}_0(T) \leq \frac{3\Delta_0}{2}$, and $\frac{32}{9\Delta_0^2} \leq \beta^2 \leq \frac{32}{\Delta_0^2}$. Notice that since $T\geq e^{\frac{50}{\Delta_0^2}}$, $\frac{32}{\Delta_0^2} \leq \ln T$.
\end{observation}

Let $U_0$ be the event that $|\widehat{\mu}_0(T) - \mu_0| \leq \frac{\Delta_0}{4}$, and for any $i,x$ let $U_{i,x}$ be the event $|\widehat{\mu}_{i,x}(T) - \mu_{i,x}| \leq \frac{\Delta_0}{4}$. Also let $U = (\cap_{i,x} U_{i,x}) \cap U_0$. If $\overline{U}_0$, $\overline{U}_{i,x}$, and $\overline{U}$ denote the complements of the events $U_0, U_{i,x}$, and $U$ respectively, then

$$
\mathbb{P}(\overline{U}_0) \leq 2T^{-\frac{\Delta_0^2}{8}}~, \text{~and}$$
$$\text{for a fixed } (i,x)~~~\mathbb{P}(\overline{U}_{i,x}) \leq 2T^{-\frac{p\Delta_0^2}{32}} + Z_iT^{-\frac{p^2}{4}}~.
$$

Hence applying union bound, 
\begin{align*}
\mathbb{P}(\overline{U}) &\leq 2N\left(\frac{2}{T^{\frac{p\Delta_0^2}{32}}} +  \frac{Z}{T^{\frac{p^2}{4}}}\right) + \frac{2}{T^{\frac{\Delta_0^2}{8}}} \nonumber \\
&\leq 2N\left(\frac{2}{T^{\frac{p^2\Delta_0^2}{32}}} +  \frac{Z}{T^{\frac{p^2\Delta_0^2}{32}}}\right) + \frac{2N}{T^{\frac{p^2\Delta_0^2}{32}}} ~~~~~~~~~~~~\text{as }~p\leq 1, \Delta_0\leq 1 \nonumber \\
&\leq \frac{2N(Z+3)}{T^{\frac{p^2\Delta_0^2}{32}}} =\delta
\end{align*}

We will use the above arguments to first show that $\mathbb{E}[\beta^2] \geq \frac{8}{9\Delta_0^2}$. From part 2 of Observation we have that the event $U$ implies $\beta^{2} \geq \frac{32}{9\Delta_0^2}$. Since $\mathbb{P}\{U\} \geq 1-\delta$,
$$\mathbb{E}[\beta^{2}] \geq \frac{32}{9\Delta_0^2} (1-\delta) = \frac{32}{9\Delta_0^2} - \frac{32\delta}{9\Delta_0^2} $$
Since $T$ satisfies $\frac{T^{\frac{p^2\Delta_0^2}{32}}}{\ln T} \geq 3N(Z + 3)$, this implies $\frac{32\delta}{9\Delta_0^2} \leq \frac{24}{9\Delta_0^2}$, and hence $\mathbb{E}[\beta^2] \geq \frac{8}{9\Delta_0^2}$. Similarly, from part 2 of Observation we have that the event $U$ implies $\beta^{2} \leq \frac{32}{\Delta_0^2}$. If $U$ does not hold then $\beta^2 \leq \ln T$. Hence using the fact that $T$ satisfies $\frac{T^{\frac{p^2\Delta_0^2}{32}}}{\ln T} \geq 3N(Z+3)$, and hence $\delta \ln T \leq \frac{18}{\Delta_0^2}$, we get,
$$\mathbb{E}[\beta^{2}] \leq \frac{32}{\Delta_0^2} (1-\delta) + \delta\ln T \leq \frac{32}{\Delta_0^2} + \delta \ln T \leq \frac{50}{\Delta_0^2}~.$$

\end{proof}

\begin{lemma} \label{lemma: bound E[N_T^ix]}
Suppose $a^* \neq a_{i,x}$. Then at the end of $T$ rounds the following holds:
$$\mathbb{E}[N^{i,x}_{T}] \leq \max\left(0, \frac{8\ln T}{\Delta_{i,x}^2} + 1 - \frac{1}{4} \cdot p_{i,x} \cdot \eta_T^{i,x} \cdot \mathbb{E}[N_T^0]\right) + \frac{\pi^2}{3}~.$$
Further if $a^* \neq a_0$ then
$$\mathbb{E}[N^{0}_T] \leq \Big( \mathbb{E}[\beta^2]\ln T + ~\frac{8\ln T}{\Delta_{0}^2} + 1\Big) + \frac{\pi^2}{3} ~.$$
\end{lemma}
\begin{proof}
Let $F_T^{i,x} = N_T^{i,x} + C_T^{i,x}$ denote the effective number of pulls of arm $a_{i,x}$ at the end of $T$ rounds. By definition,
\begin{equation}\label{equation: value of N^i,x_T}
    N^{i,x}_{T} = \sum_{t\in [T]} \mathds{1}\{a_t = a_{i,x}\} ~,
\end{equation}
and hence, for a threshold $\ell$ to be fixed later,
\begin{align}
N_T^{i,x} \leq \max(0, \ell - C_T^{i,x}) + \sum_{t \in [T]} \mathds{1}\{a_t = a_{i,x}, F_t^{i,x}\geq \ell\} ~. \label{N-T-i-x-l}
\end{align}

Here, we make an observation regarding the expected value of $C_T^{i,x}$.
\begin{observation}
\label{obs:eta}
$\mathbb{E}[C_T^{i,x}] = \mathbb{E}[\min_\mathbf{z} \widehat{p}^{i,x}_{\mathbf{z},T} \lceil N_T^0/2 \rceil] \geq \frac{1}{4} \cdot p_{i,x} \cdot \mathbb{E}[N_T^0] \cdot \max\big\{0, 1 - Z_i T^{-\frac{p_{i,x}^2}{4}}\big\} = \frac{1}{4} \cdot p_{i,x} \cdot \eta_T^{i,x} \cdot \mathbb{E}[N_T^0]$
\end{observation}
\begin{proof}
Note that the expectation of $\min_{\mathbf{z}}\widehat{p}^{i,x}_{\mathbf{z},T}$ is over the distribution of the CBN and that of $N_T^0$ over the distribution in the observation across all $T$ rounds. Recall $p_{i,x} = \min_{\mathbf{z}} p_{\mathbf{z}}^{i,x}$. By Lemma \ref{lemma: chernoff-hoeffding inequality}, we have,
\begin{equation*}
\mathbb{P}\bigg(\widehat{p}^{\ i,x}_{\mathbf{z}, T} > p^{i,x}_{\mathbf{z}} - \frac{p_{i,x}}{2} \geq \frac{p_{i,x}}{2}\bigg) \geq 1 - e^{-2 \frac{p_{i,x}^2}{4} \frac{\ln T}{2}} = 1 - T^{-\frac{p_{i,x}^2}{4}} 
\end{equation*}
Now using this we get,
\begin{align*}
\mathbb{P}\bigg(\widehat{p}^{\ i,x}_{T} \leq \frac{p_{i,x}}{2}\bigg) \leq  \mathbb{P}\bigg(\min_\mathbf{z} \widehat{p}^{\ i,x}_{\mathbf{z}, T} \leq \frac{p_{i,x}}{2}\bigg) \leq \sum_\mathbf{z} \mathbb{P}\bigg(\widehat{p}^{\ i,x}_{\mathbf{z}, T} \leq \frac{p_{i,x}}{2}\bigg) \leq Z_iT^{-\frac{p_{i,x}^2}{4}}
\end{align*}
We can now bound the expectation of $C_T^{i,x}$ for sufficiently large $T$ as follows:
\begin{align}
\mathbb{E}[\min_\mathbf{z} \widehat{p}^{i,x}_{\mathbf{z},T} \lceil N_T^0/2 \rceil] 
&\geq \frac{1}{2} \mathbb{E}[\min_{\mathbf{z}} \widehat{p}^{i,x}_{\mathbf{z},T} N_T^0] \nonumber \\
&= \frac{1}{2} \sum_{a=1}^{\infty} a\cdot\mathbb{E}[\min_{\mathbf{z}} \widehat{p}^{i,x}_{\mathbf{z},T} \mid N_T^0 = a] \mathbb{P}(N_T^0 = a) \nonumber \\
&\geq \frac{1}{2} \sum_{a=1}^{\infty} a \cdot\frac{p_{i,x}}{2} \cdot \mathbb{P}\bigg(\min_\mathbf{z} \widehat{p}^{i,x}_{\mathbf{z},T} > \frac{p_{i,x}}{2} \mid N_T^0 = a\bigg) \mathbb{P}(N_T^0 = a) \nonumber \\
&\geq \frac{p_{i,x}}{4} \mathbb{E}[N_T^0] \cdot \max\Big\{0, 1 - Z_i T^{-\frac{p_{i,x}^2}{4}}\Big\} \nonumber \\
&= \frac{1}{4} \cdot p_{i,x} \cdot \eta_T^{i,x} \cdot \mathbb{E}[N_T^0]
% &= \frac{1}{2} \mathbb{E}[N_T^0] \int_0^1 \mathbb{P}(\min_\mathbf{z} \widehat{p}^{i,x}_{\mathbf{z},T} > x) dx \nonumber \\
% &\geq \frac{1}{2} \mathbb{E}[N_T^0] \int_0^1 \Pi_\mathbf{z} \mathbb{P}(\widehat{p}^{i,x}_{\mathbf{z},T} > x) dx \nonumber \\
% &\geq \frac{1}{2} \mathbb{E}[N_T^0] \Pi_\mathbf{z} \int_0^1 \mathbb{P}(\widehat{p}^{i,x}_{\mathbf{z},T} > x) dx \nonumber \\
% &= \frac{1}{2} \Pi_\mathbf{z} p^{i,x}_{\mathbf{z}} \mathbb{E}[N_T^0]
\end{align}
\end{proof}

Taking expectation of Equation \ref{N-T-i-x-l}, we get
\begin{align} \label{equation: bounding the expected number of pulls of sub-optimal arm}
\mathbb{E}[N_T^{i,x}] \leq \max\big\{0, \ell - \frac{p_{i,x}}{4} \cdot \eta_T^{i,x} \cdot \mathbb{E}[N_T^0] \big\} + \sum_{t \in [\ell+1, T]} \mathbb{P}(a_t = a_{i,x}, F_t^{i,x}\geq \ell)
\end{align}

Now we bound $\sum_{t\in [\ell+1,T]} \mathbb{P}(a_t = a_{i,x}, F^{i,x}_{t} \geq \ell)$, assuming $a^*\neq a_0$; the proof for $a^*=a_0$ is similar. We use $F^{a^*}_T$ to denote the effective number of pulls of $a^*$ at the end of $T$ rounds. Also, for better clarity, we use $\widehat{\mu}_{i,x}(F^{i,x}_T,T)$ (instead of $\widehat{\mu}_{i,x}(T)$) and $\widehat{\mu}_{0}(N^{0}_T,T)$ (instead of $\widehat{\mu}_{0}(T)$) to denote the empirical estimates of $\mu_{i,x}$ and $\mu_0$ computed by Algorithm~\ref{CR-algorithm} at the end of $T$ rounds.

\begin{align*}
&\sum_{t\in [\ell+1,T]} \mathbb{P}\Bigg(a_t = a_{i,x}, F^{i,x}_{t} \geq \ell\Bigg) \\
&= \sum_{t\in [\ell,T-1]} \mathbb{P}\Bigg(\widehat{\mu}_{a^*}(F^{a^*}_t,t) + \sqrt{\frac{2\ln t}{F^{a^*}_{t}}} \leq \widehat{\mu}_{i,x}(F^{i,x}_t, t) + \sqrt{\frac{2\ln (t)}{F^{i,x}_{t}}},~~ F^{i,x}_{t} \geq \ell \Bigg) \\
&\leq  \sum_{t\in [0,T-1]} \mathbb{P}\Bigg( \min_{s\in [0,t]}\Big(\widehat{\mu}_{a^*}(s,t) + \sqrt{\frac{2\ln t}{s}}\Big) \leq \max_{s_j\in [\ell-1,t]}\Big(\widehat{\mu}_{i,x}(s_j,t) + \sqrt{\frac{2\ln t}{s_j}}\Big) \Bigg) \\
&\leq \sum_{t\in [T]} \sum_{s\in [0,t-1]} \sum_{s_j \in [\ell-1,t]} \mathbb{P}\Bigg(\widehat{\mu}_{a^*}(s,t) + \sqrt{\frac{2\ln t}{s}} \leq \widehat{\mu}_{i,x}(s_j, t) + \sqrt{\frac{2\ln t}{s_j}}\Bigg)
\end{align*}

If $\widehat{\mu}_{a^*}(s,t) + \sqrt{\frac{2\ln t}{s}} \leq \widehat{\mu}_{i,x}(s_j,t) + \sqrt{\frac{2\ln t}{s_j}}$ is true then at least one of the following events is true
\begin{subequations}
\begin{align}
   \widehat{\mu}_{a^*}(s,t) &\leq \mu_{a^*} - \sqrt{\frac{2\ln t}{s}} ~, \label{equation: ucb event a}\\
   \widehat{\mu}_{i,x}(s_j,t) &\geq \mu_{i,x} + \sqrt{\frac{2\ln t}{s_j}} ~, \label{equation: ucb event b}\\
   \mu_{a^*} &\leq \mu_{i,x} + 2\sqrt{\frac{2\ln t}{s_j}}~. \label{equation: ucb event c}
\end{align}
\end{subequations}
The probability of the events in Equations \ref{equation: ucb event a} and \ref{equation: ucb event b} can be bounded using Chernoff-Hoeffding inequality
$$\mathbb{P}\Bigg(\widehat{\mu}_{a^*}(s,t) \leq \mu_{a^*} - \sqrt{\frac{2\ln t}{s}}\Bigg) \leq t^{-4} ~,$$ 
$$\mathbb{P}\Bigg(\widehat{\mu}_{i,x}(s_j,t) \geq \mu_{i,x} + \sqrt{\frac{2\ln t}{s_j}}\Bigg) \leq t^{-4} ~.$$

Also, if $\ell \geq \lceil \frac{8\ln T}{\Delta_{i,x}^2} \rceil$ then the event in Equation \ref{equation: ucb event c} is false, i.e., $\mu_{a^*} >  \mu_{i,x} + 2\sqrt{\frac{2\ln t}{s_j}}$. Thus, for $\ell = \frac{8\ln T}{\Delta_{i,x}^2} + 1 \geq \lceil\frac{8\ln T}{\Delta_{i,x}^2}\rceil$, we have
\begin{equation}\label{equation: probability of a suboptimal pull}
\sum_{t\in [\ell+1,T]} \mathbb{P}\{a(t) = a_{i,x}, F^{i,x}_{t} \geq \ell\} \leq \sum_{t\in [T]}\sum_{s\in [0,t-1]}\sum_{s_j\in [\ell-1, t]} 2t^{-4} \leq \frac{\pi^2}{3}
\end{equation}

If $a^* = a_0$ then using the exact arguments as above we can show that Equation \ref{equation: probability of a suboptimal pull} still holds. Hence, using Equations \ref{equation: bounding the expected number of pulls of sub-optimal arm} and \ref{equation: probability of a suboptimal pull} we have if $a^* \neq a_{i,x}$ then
$$\mathbb{E}[N^{i,x}_{T}] \leq \max\bigg\{0, \frac{8\ln T}{\Delta_{i,x}^2} + 1 - \frac{p_{i,x}}{4} \cdot \eta_T^{i,x} \cdot  \mathbb{E}[N_T^0]\bigg\} + \frac{\pi^2}{3} ~.$$
The argument used to bound $\mathbb{E}[N^0_T]$ when $a^*\neq a_0$ is similar. In this case the equation corresponding to Equation \ref{equation: bounding the expected number of pulls of sub-optimal arm} is
\begin{equation}\label{equation: bounding the expected number of pulls of sub-optimal arm a_0}
    \mathbb{E}[N^{0}_{T}] \leq \mathbb{E}[\beta^2]\ln T + \ell + \sum_{t\in [\ell+1, T]} \mathbb{P}\{a(t) = a_0, N^{0}_{t} \geq \ell\} ~.
\end{equation}
Also the same arguments as above can be used to show that for $\ell =  \frac{8\ln T}{\Delta_{0}^2} + 1$,
\begin{equation}\label{equation: probability of a suboptimal pull a_0}
\sum_{t\in [\ell+1, T]} \mathbb{P}\{a(t) = a_{0}, N^{0}_{t} \geq \ell\}  \leq \frac{\pi^2}{3}~.
\end{equation}
Finally using Equations \ref{equation: bounding the expected number of pulls of sub-optimal arm a_0} and \ref{equation: probability of a suboptimal pull a_0}, we have
$$\mathbb{E}[N^{0}_{T}] \leq \left(\mathbb{E}[\beta^{2}]\ln T + \frac{8\ln T}{\Delta_{0}^2} + 1 \right)  + \frac{\pi^2}{3}~.$$

\begin{lemma} \label{lemma: bound E[N_T^0]}
If $a^* = a_0$ then at the end of $T$ rounds the following is true: 
$$\mathbb{E}[N^0_T] \geq T - \left( 2N(1+\frac{\pi^2}{3}) + \sum_{i,x} \frac{8\ln T}{\Delta_{i,x}^2} \right)~.$$
\end{lemma}
\begin{proof}
At the end of $T$ rounds we have
$$N^{0}_T + \sum_{i,x} N^{i,x}_T = T~. $$
Taking expectation on both sides of the above equation and rearranging the terms we have,
$$\mathbb{E}[N^{0}_T] = T -  \sum_{i,x} \mathbb{E}[N^{i,x}_T]~. $$
Now we use Lemma \ref{lemma: bound E[N_T^ix]} to conclude that
$$\mathbb{E}[N^{0}_T] \geq T - \left( 2N(1+\frac{\pi^2}{3}) + \sum_{i,x} \frac{8\ln T}{\Delta_{i,x}^2} \right)~. $$
\end{proof}

Now that we have bounds on $\mathbb{E}[N_T^0]$ and $\mathbb{E}[N_T^{i,x}]$, we can bound the regret as follows.

\textbf{Case a} ($a^* = a_0$): In this case we bound the expected cumulative regret of Algorithm~\ref{CR-algorithm}. From Lemmas~\ref{lemma: bound E[N_T^ix]} and~\ref{lemma: bound E[N_T^0]}, for any $T$ satisfying both $T^{\frac{p_{i,x}^2}{4}} > Z_i$ (so that $\eta_T^{i,x} > 0$) and
\begin{align} \label{constraint on T}
    T \geq \frac{4}{p_{i,x} \cdot \eta_T^{i,x}} \bigg(1 + \frac{8\ln T}{\Delta^2_{i,x}} \bigg) + \bigg( 2N(1+\frac{\pi^2}{3}) + \sum_{i,x} \frac{8\ln T}{\Delta^2_{i,x}} \bigg)
\end{align}
we have $\mathbb{E}[N_T^{i,x}] \leq \frac{\pi^2}{3}$. Notice that Equation \ref{constraint on T} holds for sufficiently large $T$. Hence the cumulative regret caused by pulling sub-optimal arms $a_{i,x}$ is
\begin{align}
    \mathbb{E}[R(T)] \leq \sum_{\Delta_a > 0} \Delta_a \frac{\pi^2}{3}
\end{align}

\textbf{Case b} ($a^* \neq a_0$): In this case we bound the regret of pulling sub-optimal arms when $T \geq \max (L, e^{\frac{50}{\Delta_0^2}})$, where $L$ is as defined in Lemma \ref{bounding E[beta^2]}. Note that this is satisfied for sufficiently large $T$. Hence from Lemma \ref{bounding E[beta^2]} and Lemma \ref{lemma: bound E[N_T^ix]}, we have for $a^* \neq a_{i,x}$ and for $a_0$
\VIN{
\begin{align}
    \mathbb{E}[N_T^{i,x}] \leq \max \bigg\{0, 1 + 8\ln T\bigg(\frac{1}{\Delta_{i,x}^2} - \frac{p_{i,x} \cdot \eta_T^{i,x}}{36\Delta_0^2} \bigg)\bigg\} + \frac{\pi^2}{3}
\end{align}
\begin{align}
    \mathbb{E}[N_T^0] \leq \frac{58 \ln T}{\Delta_0^2} + 1 + \frac{\pi^2}{3}
\end{align}}
% \begin{comment}
% \begin{align}
%     E_T[E[N_T^{i,x}]] \leq \max \bigg(0, 1 + 8\ln T\bigg(\frac{1}{\Delta
    
%     _{i,x}^2} - \frac{p_{i,x} \cdot \eta_T^{i,x}}{36\Delta_0^2} \bigg)\bigg) + \frac{\pi^2}{3}
% \end{align}
% \begin{align}
%     E_T[E[N_T^0]] \leq \frac{50 \ln T}{\Delta_0^2} + 1 + \frac{\pi^2}{3}
% \end{align}
% \end{comment}

Hence the cumulative regret can be written as
\begin{align}
\mathbb{E}[R(T)] \leq \Delta_0 \bigg(\frac{58 \ln T}{\Delta_0^2} + 1 + \frac{\pi^2}{3}\bigg) + \sum_{\Delta_{i,x} > 0} \Delta_{i,x} \bigg(\max \bigg\{0, 1 + 8\ln T\bigg(\frac{1}{\Delta_{i,x}^2} - \frac{p_{i,x} \cdot \eta_T^{i,x}}{36\Delta_0^2} \bigg)\bigg\} + \frac{\pi^2}{3} \bigg)
\end{align}

\end{proof}
