\begin{algorithm}[H]
\caption{MAR Algorithm with known \( p_{m,a} \)}\label{alg:mar_algorithm}
\begin{algorithmic}[1]
\State \textbf{Input:} Number of arms $n$, time horizon $T$, exploration parameter $\alpha$
\State \textbf{Initialize:}  
\For{each arm $a \in [n]$ and $m \in \mathbb{M}$} 
\State $\hat{\mu}_{m, a} = 0$ \Comment{estimated mean reward for arm $a$ when $M = m$} 
\State $T_{m, a, o} = 0$ \Comment{number of times arm $a$ is pulled with $M = m$ and the reward is observed}
\State $T_{m, a} = 0$ \Comment{number of times $M = m$ was observed for arm $a$}
\EndFor

\For{each arm $a \in [n]$} \For{$\log(T)^2$ rounds}
    \State Pull arm $a$, observe $m$ and, when available, reward $r$
    \State Update $T_{m, a}$ for observed $M = m$
    \If{reward is observed} \State Update $T_{m, a, o}$ and $\hat{\mu}_{m, a}$ \EndIf
\EndFor \EndFor

\State $T_1 = n \log(T)^2 $, $T_2 = T - T_1$

\For{each round $t = 1, \dots, T_2$}
    \For{each arm $a \in [n]$}
        \State Compute $\hat{\mu}_{a} = \sum\limits_{m \in \mathbb{M}} p_{m, a} \hat{\mu}_{m, a}$ \Comment{estimated mean reward for arm $a$}
        \State Compute $\text{UCB}_a(t) = \hat{\mu}_a + \sqrt{\frac{\alpha \log(T)}{2} \sum\limits_{m \in \mathbb{M}} \frac{p_{m, a}^2}{T_{m, a, o}}}$ \Comment{Upper Confidence Bound for arm $a$}
    \EndFor
    \State Select arm $a_t = \arg \max_a \text{UCB}_a(t)$
    \State Pull arm $a_t$, observe $m$ and, when available, reward $r_t$
    \State Update $T_{m,a_t}$ and, if reward is observed, update $T_{m, a_t, o}$ and $\hat{\mu}_{m, a_t}$
\EndFor

\end{algorithmic}
\end{algorithm}
