\begin{algorithm}[H]
\caption{MNAR Algorithm}\label{alg:mnar_algorithm}
\begin{algorithmic}[1]
\State \textbf{Input:} Number of arms $n$, time horizon $T$, exploration parameter $\alpha$
\State \textbf{Initialize:} 
\For{each arm $a \in [n]$ and $m \in \mathbb{M}$} 
    \State Set $b_{m, 0 \mid a} = 0$ \Comment{Estimate of \( \mathbb{P}(M = m, O^Y = 0 \mid a) \)}
    \State Set $\hat{\theta}_{a} = [0]_{k \times L}$ \Comment{Estimate of the matrix \( \theta_a[m, y] = \mathbb{P}(M = m, Y = y, O^Y = 1 \mid a) \)}
    \State Set $q_{m, y \mid 1, a} = 0$ \Comment{Estimate of \( \mathbb{P}(M = m, Y = y \mid a, O^Y = 1) \)}
    \State Set $T_{a} = 0$ \Comment{Count of pulls of arm $a$}
    \State Set $T_{a, o} = 0$ \Comment{Count of pulls of arm $a$ with observed reward}
\EndFor

\For{each arm $a \in [n]$} 
    \For{$\log^2(T)$ rounds}
        \State Pull arm $a$; observe mediator $m$ and, if it is not missing, reward $y$
        \State Update $T_a$ 
        \If{reward is observed} 
            \State Update $T_{a, o}$, $\hat{\theta}_{a}[m, y]$, and $q_{m, y \mid 1, a}$ 
        \Else
            \State Update $b_{m, 0 \mid a}$
        \EndIf
    \EndFor
\EndFor

\State Set $T_1 = n \log^2(T)$ and $T_2 = T - T_1$

\For{each round $t = 1, \dots, T_2$}
    \For{each arm $a \in [n]$}
        \State Solve $x_a = \hat{\theta}_a^{\dagger} b_a$ \Comment{\( b_a = [b_{m, 0 \mid a}]_{m \in \mathbb{M}} \)}
        \State Update $x_a = x_a + [1]_{L \times 1}$
        \State Compute $\hat{p}(m, y) = x_a[y] \times q_{m, y \mid 1, a}$ 
        \State Compute $\hat{p}(y) = \sum\limits_{m \in \mathbb{M}} \hat{p}(m, y)$ 
        \State Compute $\hat{\mu}_a = \sum\limits_{y \in \mathbb{Y}} y \times \hat{p}(y)$ 
        \State Compute $\hat{\gamma}_a = \frac{1}{\max\limits_{y \in \mathbb{Y}}(x_a[y])}$ 
        \State Compute $\text{UCB}_a(t) = \hat{\mu}_a + 8\frac{L C_a}{\lVert \hat{\theta}_a \rVert_\infty \hat{\gamma}_a}\sqrt{\frac{\alpha \log(T)}{T_a}} + \frac{K}{\hat{\gamma}_a}\sqrt{\frac{\alpha \log(T)}{T_{a, o}}}$ 
    \EndFor
    \State Select arm $a_t = \arg\max_{a \in [n]} \text{UCB}_a(t)$ 
    \State Pull arm $a_t$; observe mediator $m$ and, if it is not missing, reward $y$
    \State Update $T_{a_t}$ 
    \If{reward is observed} 
        \State Update $T_{a_t, o}$, $\hat{\theta}_{{a_t}}[m, y]$, and $q_{m, y \mid 1, {a_t}}$ 
    \Else
        \State Update $b_{m, 0 \mid {a_t}}$
    \EndIf
\EndFor

\end{algorithmic}
\end{algorithm}
