\begin{algorithm}[H]
\caption{MAR Algorithm with unknown \( p_{m,a} \)}\label{alg:mar_algorithm2}
\begin{algorithmic}[1]
\State \textbf{Input:} Number of arms $n$, time horizon $T$, exploration parameter $\alpha$
\State \textbf{Initialize:} 
\For{each arm $a \in [n]$ and $m \in \mathbb{M}$} 
\State $\hat{\mu}_{m, a} = 0$ \Comment{estimated mean reward for arm $a$ when $M = m$} 
\State $T_{m, a, o} = 0$ \Comment{number of times arm $a$ is pulled with $M = m$ and reward observed} 
\State $T_{m, a} = 0$ \Comment{number of times $M = m$ was observed for arm $a$}

\EndFor


\For{each arm $a \in [n]$} \For{$\lceil (\log T)^2 \rceil$ rounds}
    \State Pull arm $a$, observe $m$ and reward $r$
    \State Update $T_{m, a}$ for observed $M = m$
    \If{reward is observed} \State Update $T_{m, a, o}$ and $\hat{\mu}_{m, a}$ \EndIf
\EndFor \EndFor

\State $T_1 = n \lceil (\log T)^2 \rceil$, $T_2 = T - T_1$

\For{each round $t = 1, \dots, T_2$}
    \For{each arm $a \in [n]$}
        \State Estimate \( \hat{p}_{m,a} = \frac{T_{m,a}}{T_a} \) for each $m \in \mathbb{M}$, where $T_a = \sum_{m' \in \mathbb{M}} T_{m',a}$ is the number of pulls of arm $a$ so far
        % \vspace{0.5em}
        \State Compute $\hat{\mu}_{a} = \sum_{m\in\mathbb{M}} \hat{p}_{m,a}\, \hat{\mu}_{m,a}$
        \State Compute $\text{UCB}_a(t) = \hat{\mu}_a + 8\sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{\hat{p}_{m, a}^2}{T_{m, a, o}}}$
    \EndFor
    \State Select arm $a_t = \arg \max_a \text{UCB}_a(t)$
    \State Pull arm $a_t$, observe $m$ and reward $r_t$
    \State Update $T_{m, a_t}$ and, if reward is observed, update $T_{m, a_t, o}$ and $\hat{\mu}_{m, a_t}$
\EndFor

\end{algorithmic}
\end{algorithm}
