\begin{algorithm}[H]
\caption{MAR Algorithm \( p_{m,a} \)}\label{alg:mar_algorithm2}
\begin{algorithmic}[1]
\State \textbf{Input:} Number of arms $n$, number of values $K$ taken by $M$, time horizon $T$, exploration lengths $T_{1, a}$ for each arm $a$, $\alpha \geq 1$
\State \textbf{Initialize:} 
\\ Set $\hat{\mu}_{i, a} = 0$ for all arms $a = 1, 2, \dots, n$ and all $i = 1, 2, \dots, K$
\\ Set $n_{i, a} = 0$ for all arms $a$ and all $i \in [K]$ (the number of times arm $a$ was pulled, $M = i$ was observed, and the reward was observed)
\\ Set $m_{i,a} = 0$ for all arms $a$ and all $i \in [K]$ (the number of times $M = i$ was observed when arm $a$ was pulled)
\vspace{0.5em}

\For{each arm $a = 1, 2, \dots, n$}
    \For{$T_{1, a}$ rounds}
        \State Pull arm $a$ and observe $m$ and reward $r$
        \State Increment $m_{i, a}$ for the observed value $i = m$ (number of times $M = i$ is observed for arm $a$)
        \If{reward is observed}
            \State Update $n_{m, a}$ and $\hat{\mu}_{m, a}$
        \EndIf
    \EndFor
\EndFor
\vspace{0.5em}

\State $T_1 = \sum\limits_{a} T_{1, a}$
\State $T_2 = T - T_1$

\vspace{0.5em}
\For{each round $t = 1, 2, \dots, T_2$}
    \For{each arm $a = 1, 2, \dots, n$}
        \State Estimate \( p_{i,a} = \frac{m_{i,a}}{T_a} \) for each $i \in [K]$, where $T_a = \sum\limits_{j \in [K]} m_{j,a}$ is the number of times arm $a$ has been pulled so far
        \vspace{0.5em}
        \State Compute $\hat{\mu}_{a} = \sum\limits_{i \in [K]} p_{i,a} \hat{\mu}_{i, a}$
        \State Compute $\text{UCB}_a(t) = \hat{\mu}_a + \sqrt{\frac{\alpha \log t}{2}} \sum\limits_{i \in [K]} p_{i,a} \sqrt{\frac{1}{n_{i,a}}}$
    \EndFor
    \State Select arm $a_t = \arg \max_a \text{UCB}_a(t)$
    \State Pull arm $a_t$ and observe $m$ and reward $r_t$
    \State Increment $m_{i,a_t}$ for the observed value $i = m$ (number of times $M = i$ is observed for arm $a_t$)
    \If{reward is observed}
        \State Update $n_{m, a_t}$ and $\hat{\mu}_{m, a_t}$
    \EndIf
\EndFor

\end{algorithmic}
\end{algorithm}
