\begin{proof}
As before, let \( a^* = \arg\max\limits_{a} \mu_a \) denote the optimal arm, and define \( T_1 = \sum\limits_{a} T_{1,a} \) as the total number of pulls made during the initial rounds, where \( T_{1,a} \) is the number of times arm \( a \) is sampled in that phase. After the first \( T_1 \) rounds, we can derive the following bounds at any time step \( 1 \leq t \leq T_2 = T - T_1 \).

For each arm \( a \), let the reward samples observed when \( M = m \) be denoted by \( Y^o_{m, a}(1), \dots, Y^o_{m, a}(T_{m, a, o}) \). Applying Hoeffding's inequality, we obtain:
\[
\mathbb{P}\left( \left| \sum_{m \in [K]} p_{m,a} \frac{\sum_{j = 1}^{T_{m, a, o}} Y^o_{m, a}(j)}{T_{m, a, o}} - \mu_a \right| \geq \epsilon_a \right) \leq 2\exp\left( -\frac{2\epsilon_a^2}{\sum_{m \in [K]} \frac{p_{m,a}^2}{T_{m, a, o}}} \right)
\]

This result holds because the sub-Gaussian norm of the random variable \( p_{m,a} \frac{Y^o_{m, a}(j)}{T_{m, a, o}} \) is \( \frac{p_{m,a}}{T_{m, a, o}} \).

By setting \( \epsilon_a = \sqrt{\frac{\alpha \log(t)}{2} \sum_{m \in [K]} \frac{p_{m,a}^2}{T_{m, a, o}}} \) and writing \( \hat{\mu}_a \) for the weighted empirical mean appearing inside the probability above, we obtain the following inequality, which holds with probability at least \( 1 - 2t^{-\alpha} \):
\[
|\hat{\mu}_a - \mu_a| \leq \sqrt{\frac{\alpha \log(t)}{2} \sum_{m \in [K]} \frac{p_{m,a}^2}{T_{m, a, o}}}
\]
Name the above ``good event'' \( A_{t, a} \).

As before, for the selected arm \( a = a_t = \arg\max\limits_{a} \left( \text{UCB}_a \right) \), on the good events \( A_{t, a} \) and \( A_{t, a^*} \) we get the following inequality:
\begin{equation}
    \mu_a + 2\epsilon_a \geq \hat{\mu}_a + \epsilon_a = \text{UCB}_a \geq \text{UCB}_{a^*} = \hat{\mu}_{a^*} + \epsilon_{a^*} \geq \mu_{a^*} \quad \Rightarrow \quad \epsilon_a \geq \frac{\Delta_a}{2},
    \label{eq:mar_proof_1}
\end{equation}
where \( \Delta_a = \mu_{a^*} - \mu_a \).

Next, let \( T_{m, a} \) represent the number of times arm \( a \) is pulled and \( M = m \) is observed, and let \( T_a \) represent the total number of times arm \( a \) is pulled. Using Hoeffding's inequality, we can bound the deviation between \( p_{m,a} \) (the probability of observing \( M = m \)) and the empirical ratio \( \frac{T_{m, a}}{T_a} \) as follows:
\[
    p_{m,a} - \frac{T_{m, a}}{T_a} \leq \sqrt{\frac{\alpha \log(t)}{2T_a}} \leq \sqrt{\frac{\alpha \log(T)}{2T_a}},
\]
with probability at least \( 1 - t^{-\alpha} \). Name this ``good event'' \( B_{t, m, a} \).

Similarly, we bound the deviation between \( \gamma_{m, a} \) and \( \frac{T_{m, a, o}}{T_{m, a}} \), where \( T_{m, a, o} \) is the number of times reward is observed for arm \( a \) and \( M = m \):
\[
    \gamma_{m, a} - \frac{T_{m, a, o}}{T_{m, a}} \leq \sqrt{\frac{\alpha \log(t)}{2T_{m, a}}} \leq \sqrt{\frac{\alpha \log(T)}{2T_{m, a}}},
\]
again with probability at least \( 1 - t^{-\alpha} \). Name this ``good event'' \( C_{t, m, a} \).

For sufficiently large \( T \), we have:
\[
\log(T)^2 \geq 2\alpha \log(T) \frac{1}{p^2},
\]
which, since \( T_a \geq T_{1,a} = \log(T)^2 \), implies \( T_a \geq \log(T)^2 \geq 2\alpha \log(T) \frac{1}{p^2} \). This allows us to use the inequality \( \sqrt{\frac{\alpha \log(T)}{2T_a}} \leq \frac{p_{m,a}}{2} \) together with the event \( B_{t, m, a} \) to derive a lower bound for \( T_{m, a} \):
\[
    T_{m, a} \geq \frac{T_a p_{m,a}}{2}.
\]

Furthermore, for sufficiently large \( T \), we know that \( T_a \geq T_{1,a} = \log(T)^2 \geq 2\alpha \log(T) \frac{2}{\gamma_{m, a}^2 p} \). Combined with \( T_{m, a} \geq \frac{T_a p_{m,a}}{2} \), this gives us \( \sqrt{\frac{\alpha \log(T)}{2T_{m, a}}} \leq \frac{\gamma_{m, a}}{2} \), which, on the event \( C_{t, m, a} \), allows us to establish a lower bound for \( T_{m, a, o} \):
\[
    T_{m, a, o} \geq \frac{\gamma_{m, a}}{2} T_{m, a}.
\]

Combining this with \( T_{m, a} \geq \frac{T_a p_{m,a}}{2} \), we derive:
\[
    T_{m, a, o} \geq \frac{T_a p_{m,a} \gamma_{m, a}}{4}.
\]


Let \( E_t \) represent the intersection of the ``good events'' at time step \( t \). Under \( E = \bigcap\limits_{t} E_t \), we obtain:

\begin{align}
    \epsilon_a
    &= \sqrt{\frac{\alpha \log(t)}{2} \sum_{m \in [K]} \frac{p_{m,a}^2}{T_{m, a, o}}} \nonumber \\
    &\leq \sqrt{\frac{\alpha \log(T)}{2} \sum_{m \in [K]} \frac{4p_{m,a}^2}{T_a p_{m,a} \gamma_{m, a}}} \nonumber \\
    &= \sqrt{\frac{2\alpha \log(T)}{T_a} \sum_{m \in [K]} \frac{p_{m,a}}{ \gamma_{m, a}}} \nonumber \\
    &= \sqrt{\frac{2\alpha \log(T)}{T_a} P_a}. \label{eq:epsilon_a}
\end{align}

Using inequality \eqref{eq:mar_proof_1}, we have:
\[
    T_a \leq \frac{8 \alpha \log(T) P_a}{\Delta_a^2}
\]

Thus, we get:
\begin{align}
    \mathbb{E}[T_a] &= \sum\limits_{t=1}^T \mathbb{E}[\mathbb{I}(I_t = a)] \nonumber \\
    &\leq  \frac{8 \alpha \log(T) P_a}{\Delta_a^2} + \sum\limits_{t=1}^T \mathbb{E}[\mathbb{I}(E_t^c)] \nonumber \\
    & \leq  \frac{8 \alpha \log(T) P_a}{\Delta_a^2} + \sum\limits_{t=1}^T \mathbb{E}[\mathbb{I}(\bigcup\limits_{m} \left( B_{t, m, a}^c \cup C_{t, m, a}^c \right) \cup A_{t, a}^c  )] \nonumber \\
    &\leq  \frac{8 \alpha \log(T) P_a}{\Delta_a^2} + \sum\limits_{t=1}^T 4K t^{-\alpha} \nonumber \\
    &\leq  \frac{8 \alpha \log(T) P_a}{\Delta_a^2} + \frac{4K\alpha}{\alpha - 1}.
    \label{eq:mar_proof_2}
\end{align}

To conclude, let \( R_2 \) denote the regret of the second part of the algorithm, so that \( \mathbb{E}[R_2] = \sum\limits_{a} \Delta_a \mathbb{E}[T_a] \).
We now split the arms into two groups: those with \( \Delta_a < x \) and those with \( \Delta_a \geq x \), where \( x = \sqrt{\frac{8\alpha \log(T)nS}{T}} \).
Since \( S = \frac{\sum\limits_{a} P_a}{n} \), i.e., \( \sum\limits_{a} P_a = nS \), we obtain:
\[
\begin{split}
    \mathbb{E}[R_2] &= \sum\limits_{a} \Delta_a \mathbb{E}[T_a] = \sum\limits_{\Delta_a < x} \Delta_a \mathbb{E}[T_a] + \sum\limits_{\Delta_a \geq x} \Delta_a \mathbb{E}[T_a] \\&\leq Tx + \frac{8\alpha \log(T)}{x} nS + \frac{4K\alpha n}{\alpha - 1} = 2\sqrt{8\alpha T \log(T) n S} + \frac{4K\alpha n}{\alpha - 1}.
\end{split}
\]

Finally, for the total regret \( R = R_1 + R_2 \), we have:
\begin{align}
    \mathbb{E}[R] &\leq 2\sqrt{8\alpha T \log(T) n S} + \frac{4K\alpha n}{\alpha - 1} + \sum\limits_{a} T_{1,a} \nonumber \\
    &\leq 2\sqrt{8\alpha T \log(T) n S} 
    + \frac{4K\alpha n}{\alpha - 1} 
    + n \log(T)^2 \nonumber \\
    &= O\left( \sqrt{\alpha T \log(T) n S} \right).
\end{align}

\end{proof}
