
\begin{proof}
We follow the same approach as the previous proof. From the previous result, we know the following inequality holds with probability at least \( 1 - 2t^{-\alpha} \):
\[
\left| \sum\limits_{m \in \mathbb{M}} p_{m,a} \hat{\mu}_{m, a} - \mu_a \right| \leq \sqrt{\frac{\alpha \log (t)}{2} \sum\limits_{m \in \mathbb{M}} \frac{p_{m,a}^2}{T_{m, a, o}}}.
\]
Let this ``good event'' be denoted as \( A_{t, a} \).

Now, we consider:
\begin{align}
    \label{eq:mar_proof2_1}
    \left| \sum\limits_{m \in \mathbb{M}} \hat{p}_{m, a} \hat{\mu}_{m, a} - \mu_a \right| 
    &= \left| \sum\limits_{m \in \mathbb{M}} (\hat{p}_{m, a} - p_{m,a}) \hat{\mu}_{m, a} + \sum\limits_{m \in \mathbb{M}} p_{m,a} \hat{\mu}_{m, a} - \mu_a \right| \\
    &\leq \sum\limits_{m \in \mathbb{M}} \left| \hat{p}_{m, a} - p_{m,a} \right| \hat{\mu}_{m, a} + \left| \sum\limits_{m \in \mathbb{M}} p_{m,a} \hat{\mu}_{m, a} - \mu_a \right|.
\end{align}


From the previous proof, since \( \hat{p}_{m, a} = \frac{T_{m, a}}{T_a} \), we have:
\[
\left| p_{m,a} - \hat{p}_{m, a} \right| \leq \sqrt{\frac{\alpha \log(t)}{2T_a}},
\]
with probability at least \( 1 - 2t^{-\alpha} \). Denote this ``good event'' as \( B_{t, m, a} \). Under this event, for \( T_a \geq \log(T)^2 \) and sufficiently large \( T \), we have \( \frac{p_{m, a}}{2} \leq \hat{p}_{m, a} \leq 2 p_{m, a} \).

Additionally, we have:
\[
\left| \gamma_{m, a} - \frac{T_{m, a, o}}{T_{m, a}} \right| \leq \sqrt{\frac{\alpha \log(t)}{2T_{m, a}}} \leq \sqrt{\frac{\alpha \log(T)}{2T_{m, a}}},
\]
again with probability at least \( 1 - 2t^{-\alpha} \), denoted as ``good event'' \( C_{t, m, a} \).

If these ``good events'' hold, we know:
\[
T_{m, a, o} \geq \frac{\gamma_{m, a} p_{m,a} T_a}{4}.
\]

We also know:
\[
\left| \mu_{m, a} - \hat{\mu}_{m, a} \right| \leq \sqrt{\frac{\alpha \log(t)}{2T_{m, a, o}}},
\]
which leads to:
\[
\hat{\mu}_{m, a} \leq \mu_{m, a} + \sqrt{\frac{\alpha \log(t)}{2T_{m, a, o}}} \leq 1 + \sqrt{\frac{\alpha \log(t)}{2T_{m, a, o}}},
\]
with probability at least \( 1 - 2t^{-\alpha} \), denoted as ``good event'' \( D_{t, m, a} \).

Under all these ``good events,'' we have:
\begin{align*}
    \sum\limits_{m \in \mathbb{M}} \left| \hat{p}_{m, a} - p_{m,a} \right| \hat{\mu}_{m, a} 
    &\leq \sum\limits_{m \in \mathbb{M}} \left| \hat{p}_{m, a} - p_{m,a} \right| + \sum\limits_{m \in \mathbb{M}} \left| \hat{p}_{m, a} - p_{m,a} \right| \times \sqrt{\frac{\alpha \log(t)}{2T_{m, a, o}}} \\
    &\leq \sum\limits_{m \in \mathbb{M}} \left| \hat{p}_{m, a} - p_{m,a} \right| + \sum\limits_{m \in \mathbb{M}} \sqrt{\frac{\alpha \log(T)}{2T_a}} \times \sqrt{\frac{2\alpha \log(T)}{p_{m,a} \gamma_{m, a} T_a}}.
\end{align*}

Using Lemma~7 from~\cite{kamath2015learning}, we get:
\[
\sum\limits_{m \in \mathbb{M}} \left| \hat{p}_{m, a} - p_{m,a} \right| \leq \sqrt{\frac{2 (k - 1)}{\pi T_a}} + \frac{4k^{\frac{1}{2}} (k - 1)^{\frac{1}{4}}}{T_a^{\frac{3}{4}}}.
\]

Now, combining everything with the initial inequality~\eqref{eq:mar_proof2_1}, we have:
\[
\begin{split}
    \left| \sum\limits_{m \in \mathbb{M}} \hat{p}_{m, a} \hat{\mu}_{m, a} - \mu_a \right| &\leq 
    \sqrt{\frac{2 (k - 1)}{\pi T_a}} + \frac{4k^{\frac{1}{2}} (k - 1)^{\frac{1}{4}}}{T_a^{\frac{3}{4}}} \\&+ \sum\limits_{m \in \mathbb{M}} \sqrt{\frac{\alpha \log(T)}{2T_a}} \times \sqrt{\frac{2\alpha \log(T)}{p_{m,a} \gamma_{m, a} T_a}} + \sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{p_{m,a}^2}{T_{m, a, o}}}.
\end{split}
\]

For sufficiently large \( T \), since \( T_a > \log(T)^2 \), we have:
\[
\sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{p_{m,a}^2}{T_{m, a, o}}} \geq \sum\limits_{m \in \mathbb{M}} \sqrt{\frac{\alpha \log(T)}{2T_a}} \times \sqrt{\frac{2\alpha \log(T)}{p_{m,a} \gamma_{m, a} T_a}},
\]
and
\[
\sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{p_{m,a}^2}{T_{m, a, o}}} \geq \max\left\{ \sqrt{\frac{2 (k - 1)}{\pi T_a}},\; \frac{4k^{\frac{1}{2}} (k - 1)^{\frac{1}{4}}}{T_a^{\frac{3}{4}}} \right\}.
\]

Thus, we conclude:
\[
\left| \sum\limits_{m \in \mathbb{M}} \hat{p}_{m, a} \hat{\mu}_{m, a} - \mu_a \right| \leq 4\sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{p_{m,a}^2}{T_{m, a, o}}}.
\]

Finally, since \( \frac{p_{m,a}}{2} \leq \hat{p}_{m, a} \), we have:
\[
\left| \sum\limits_{m \in \mathbb{M}} \hat{p}_{m, a} \hat{\mu}_{m, a} - \mu_a \right| \leq 8\sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{\hat{p}_{m, a}^2}{T_{m, a, o}}}.
\]

Following the exact reasoning in the proof of Theorem~\ref{theo:mar_upper_1}, and using the fact that \( \hat{p}_{m, a} \leq 2p_{m, a} \), we conclude:
\[
\mathbb{E}[R] = O\left( \alpha \sqrt{T \log(T) n S} \right).
\]   

\end{proof}

