\begin{proof}
First, assume that the missingness probabilities are consistently estimated.
That is, each \( \hat{\gamma}_{m, a} \) is a consistent estimator of \(\gamma_{m,a}\), so that for every \( \epsilon > 0 \):
\[
\lim\limits_{T \rightarrow \infty} \mathbb{P}(|\hat{\gamma}_{m, a} - \gamma_{m, a}| \geq \epsilon) = 0 .
\]

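Hence, with probability at least \( 1 - \mathbb{P}(|\hat{\gamma}_{m, a} - \gamma_{m, a}| \geq \epsilon)\), we have
\[
|\hat{\gamma}_{m, a} - \gamma_{m, a}| \leq \epsilon ,
\]
so that, for \( \epsilon < \gamma_{m, a} \) (which guarantees \( \hat{\gamma}_{m, a} \geq \gamma_{m, a} - \epsilon > 0 \)),
\begin{align*} 
    \left|\frac{1}{\hat{\gamma}_{m, a}} - \frac{1}{\gamma_{m, a}}\right|
    = \frac{|\hat{\gamma}_{m, a} - \gamma_{m, a}|}{\gamma_{m, a} \hat{\gamma}_{m, a}}
    \leq \frac{\epsilon}{\gamma_{m, a} \hat{\gamma}_{m, a}} 
    \leq \frac{\epsilon}{\gamma_{m, a} (\gamma_{m, a} - \epsilon)} .
\end{align*}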

Now we have:
\begin{equation}
    \label{eq:db_1}
    \hat{\mu}_a = \sum\limits_{t = 1}^T\bigg[
        \sum_{m\in\mathcal{M}}\frac{\mathbbm{1}\{M_t=m\}}{\hat{\gamma}_{m,a}}\Big(
        Y^o_t\mathbbm{1}\{\oo_t=1\}-(\mathbbm{1}\{\oo_t=1\}-\hat{\gamma}_{m,a})\hat{\mu}_{m, a}
        \Big)
    \bigg]
\end{equation}


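Now let \( T_{m, o} \) be the number of rounds in which \( M_t = m \) and the reward is observed, let \( T_{m, o'} \) be the number of rounds in which \( M_t = m \) and the reward is missing, and let \( T_m = T_{m, o} + T_{m, o'} \) be the total number of rounds in which \( M_t = m \). Then: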
\begin{align*}
    \hat{\mu}_a 
    &= \sum\limits_{m \in \mathcal{M}} \bigg[
    T_{m, o'} \hat{\mu}_{m, a} 
    -\frac{1 - \hat{\gamma}_{m, a}}{\hat{\gamma}_{m, a}} T_{m, o} \hat{\mu}_{m, a}
    + \frac{1}{\hat{\gamma}_{m, a}} \sum\limits_{t \,:\, M_t = m,\ \oo_t = 1} Y^o_t
    \bigg] \\
    &= \sum\limits_{m \in \mathcal{M}} \bigg[
    \hat{\mu}_{m, a} \Big(T_m - \frac{1}{\hat{\gamma}_{m, a}}T_{m, o}\Big)
    + \frac{1}{\hat{\gamma}_{m, a}} \sum\limits_{t \,:\, M_t = m,\ \oo_t = 1} Y^o_t
    \bigg] 
 \end{align*}

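Now, combining~\eqref{eq:db_1} with the bound on \( |1/\hat{\gamma}_{m, a} - 1/\gamma_{m, a}| \) and a union bound over \( m \), with probability at least \( 1 - \sum\limits_{m \in \mathcal{M}} \mathbb{P}(|\hat{\gamma}_{m, a} - \gamma_{m, a}| \geq \epsilon)\) we have:
\[
\hat{\mu}_a = \sum\limits_{m \in \mathcal{M}} \bigg[
    \hat{\mu}_{m, a} \Big(T_m - \frac{1}{\gamma_{m, a}}T_{m, o}\Big)
    + \frac{1}{\gamma_{m, a}} \sum\limits_{t \,:\, M_t = m,\ \oo_t = 1} Y^o_t
    \bigg] + O(\epsilon)
\]
% NOTE(review): replacing 1/\hat{\gamma}_{m,a} by 1/\gamma_{m,a} multiplies the error in the reciprocal by sums over up to T rounds,
% so the remainder looks like O(\epsilon T) unless \hat{\mu}_a is normalized by 1/T or the rewards are otherwise controlled --- TODO confirm.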

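Now using Chebyshev's inequality, with \( \operatorname{Var}(T_{m, o}/T_m) = \gamma_{m, a}(1 - \gamma_{m, a})/T_m \), we have:
\[
\mathbb{P}\left(\left|\frac{T_{m, o}}{T_m} - \gamma_{m, a}\right| \geq \frac{\epsilon \gamma_{m, a}}{T_m}\right)
\leq \frac{\gamma_{m, a} (1 - \gamma_{m, a})}{T_m} \cdot \frac{T_m^2}{\epsilon^2 \gamma_{m, a}^2}
= \frac{T_m (1 - \gamma_{m, a})}{\epsilon^2 \gamma_{m, a}}
\]
which, after multiplying the event through by \( T_m / \gamma_{m, a} \), yields:
\[
\mathbb{P}\left(\left|\frac{T_{m, o}}{\gamma_{m, a}} - {T_m}\right| \geq \epsilon\right) \leq \frac{T_m (1 - \gamma_{m, a})}{\epsilon^2 \gamma_{m, a}}
\]
% NOTE(review): Chebyshev's bound is Var/k^2 (the threshold must be squared). With the threshold as stated, the bound grows with T_m;
% a vanishing bound needs a deviation of order \epsilon T_m, i.e. P(|T_{m,o}/\gamma_{m,a} - T_m| >= \epsilon T_m) <= (1 - \gamma_{m,a})/(T_m \epsilon^2 \gamma_{m,a}) --- TODO confirm intended scaling.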

\end{proof}