\begin{proof}
Let \( a^* = \arg\max\limits_{a} \mu_a \) be the optimal arm. Using Hoeffding's inequality, we can derive the following bounds for any time step \( 1 \leq t \leq T \):

If \( a = a_t = \arg\max\limits_{a} \left( \text{UCB}_a \right) \), we have:
\[
    |\hat{\mu}_{a} - \mu_{a}| \leq \sqrt{\frac{\alpha \log(t)}{2T_{a, o}}},
\]
with probability at least \( 1 - 2t^{-\alpha} \). Name this ``good event'' \( A_{t} \).

Now, define \( \epsilon_a = \sqrt{\frac{\alpha \log(t)}{2T_{a, o}}} \). For \( a = a_t = \arg\max\limits_{a} \left( \text{UCB}_a \right) \), we get the following inequality:
\begin{equation}
    \mu_a + 2\epsilon_a \geq \hat{\mu}_a + \epsilon_a = \text{UCB}_a \geq \text{UCB}_{a^*} = \hat{\mu}_{a^*} + \epsilon_{a^*} \geq \mu_{a^*} \quad \Rightarrow \quad \epsilon_a \geq \frac{\Delta_a}{2},
    \label{eq:mcar_proof_1}
\end{equation}
where \( \Delta_a = \mu_{a^*} - \mu_a \).

Now, if \( E_t \) denotes the ``good event'' \( A_t \) at time step \( t \), then under \( E = \bigcap\limits_{t} E_t \), using~\eqref{eq:mcar_proof_1} we obtain:
\[
T_{a, o} \leq 4\alpha \log(T) \Delta_a^{-2}.
\]

Thus, we have:
\begin{align}
    \mathbb{E}[T_{a, o}] &= \sum\limits_{t=1}^T \mathbb{E}[\mathbb{I}(I_t = a, O^Y_t = 1)] \nonumber \\
    &\leq 4\alpha \log(T) \Delta_a^{-2} + \sum\limits_{t=1}^T \mathbb{E}[\mathbb{I}(E_t^c)]
    \nonumber \\
    &= 4\alpha \log(T) \Delta_a^{-2} + \sum\limits_{t=1}^T \mathbb{E}[\mathbb{I}(A_t^c)] \nonumber \\
    &\leq 4\alpha \log(T) \Delta_a^{-2} + \sum\limits_{t=1}^T 2t^{-\alpha} \nonumber \\
    &\leq 4\alpha \log(T) \Delta_a^{-2} + \frac{2\alpha}{\alpha - 1}.
\end{align}

Since we observe the reward with probability \( \gamma \), and \( O^Y \indep (A, Y) \), we have \( \mathbb{E}[T_{a, o}] = \gamma \mathbb{E}[T_a] \). Therefore:
\[
\mathbb{E}[T_a] \leq \frac{4\alpha \log(T) \Delta_a^{-2} + \frac{2\alpha}{\alpha - 1}}{\gamma}.
\]

Let \( x = \sqrt{\frac{4\alpha n \log(T)}{T\gamma}} \). Then, we have:
\begin{align}
    \mathbb{E}[R_T] &= \sum\limits_{a} \Delta_a \mathbb{E}[T_a] \nonumber \\
                  &= \sum\limits_{\Delta_a < x} \Delta_a \mathbb{E}[T_a] + \sum\limits_{\Delta_a \geq x} \Delta_a \mathbb{E}[T_a] \nonumber \\
                  &\leq Tx + \sum\limits_{\Delta_a \geq x} \Delta_a \frac{4\alpha \log(T) \Delta_a^{-2} + \frac{2\alpha}{\alpha - 1}}{\gamma} \nonumber \\
                  &\leq Tx + \frac{4n\alpha \log(T)}{x\gamma} + \frac{2n\alpha}{(\alpha - 1)\gamma} \nonumber \\
                  &= 2\sqrt{\frac{4n\alpha T \log(T)}{\gamma}} + \frac{2n\alpha}{\gamma(\alpha - 1)} \nonumber \\
                  &= O\left(\sqrt{\frac{\alpha n T \log(T)}{\gamma}}\right).
\end{align}

\end{proof}
