\clearpage
\appendix
\onecolumn
\begin{center}
    \Large\bfseries
    \vspace{-1em}Appendix\vspace{-.1em}
\end{center}
This appendix is organized as follows.
Section~\ref{apx:alg} includes the omitted algorithms referred to in the main text.
Section~\ref{apx:proofs} includes the technical proofs of our results.
Section~\ref{apx:Missing M} provides our algorithm for the case where both the outcome and the mediator are missing not at random (MNAR), along with the regret analysis and proofs.
Finally, Section~\ref{appendix:additional_simulations} includes additional empirical evaluation results.
\vspace{-.5em}
\section{Main Algorithms}\label{apx:alg}
\input{algorithms/mcar_algorithm}
\input{algorithms/mar_algorithm}
\input{algorithms/mar_algorithm2}
\clearpage
\input{algorithms/mnar_algorithm}


\clearpage
\section{Technical Proofs}\label{apx:proofs}
\paragraph{Double Robustness of the AIPW estimator.}
Following \nameref{discussion2} in Section~\ref{subsec:mar}, let $\hat{\gamma}_{m,a}$ and $\hat{\mu}_{m,a}$ be models for $\gamma_{m,a}$ and $\ex{Y^o_t\mid m,a,\oo_t=1}$, respectively.
Define
\begin{equation}\label{eq:dre}
\begin{split}
    \hat{\mu}_a &= \mathbb{E}\big[
        \sum_{m\in\mathcal{M}}\frac{\mathbbm{1}\{M_t=m\}}{\hat{\gamma}_{m,a}}\big(
        Y^o_t\mathbbm{1}\{\oo_t=1\}
        -
        (\mathbbm{1}\{\oo_t=1\}-\hat{\gamma}_{m,a})\hat{\mu}_{m,a}
        \big)
        \mid A_t=a
    \big],
\end{split}
\end{equation}
as an estimator for $\mu_a$ of Eq.~\eqref{eq:dr}.
Herein, we prove that $\hat{\mu}_a$ is \emph{doubly robust}, in the sense that if either the missingness probability models ($\hat{\gamma}_{m,a}$) or the outcome regression models ($\hat{\mu}_{m,a}$), but not necessarily both, are correctly specified, then $\hat{\mu}_a$ of Eq.~\eqref{eq:dre} is consistent for $\mu_a$ of Eq.~\eqref{eq:dr}.
We discuss the two cases separately:

Case (i): the missingness probabilities are correctly specified; i.e., $\hat{\gamma}_{m,a}=\gamma_{m,a}$.
In this case,
\[
\begin{split}
&\ex{\sum_{m\in\mathcal{M}}(\frac{\mathbbm{1}\{\oo_t=1\}}{\hat{\gamma}_{m,a}}-1)\hat{\mu}_{m,a}\mathbbm{1}\{M_t=m\}\mid A_t=a}
\\&\overset{(a)}{=}
\sum_{m\in\mathcal{M}}\ex{(\frac{\mathbbm{1}\{\oo_t=1\}}{{\gamma}_{m,a}}-1)\hat{\mu}_{m,a}\mathbbm{1}\{M_t=m\}\mid A_t=a}
\\&\overset{(b)}{=}
\sum_{m\in\mathcal{M}}\ex{(\frac{\mathbbm{1}\{\oo_t=1\}}{{\gamma}_{m,a}}-1)\mid A_t=a,M_t=m}
\hat{\mu}_{m,a}p_{m,a}
\\&\overset{(c)}{=}
\sum_{m\in\mathcal{M}}(\frac{\gamma_{m,a}}{{\gamma}_{m,a}}-1)
\hat{\mu}_{m,a}p_{m,a}\\&=0,
\end{split}
\]
where $(a)$ uses linearity of expectation and the fact that $\hat{\gamma}_{m,a}$ is correctly specified,
$(b)$ is an application of the law of total expectation, and $(c)$ is by definition of $\gamma_{m,a}=\ex{\oo_t\mid A_t=a,M_t=m}$.
As a result, we get
\[
\hat{\mu}_a= \mathbb{E}\big[
        \sum_{m\in\mathcal{M}}\frac{\mathbbm{1}\{M_t=m\}}{{\gamma}_{m,a}}
        Y^o_t\mathbbm{1}\{\oo_t=1\}\mid A_t=a
    \big],
\]
which matches Eq.~\eqref{eq:ht}, and therefore $\hat{\mu}_a$ is consistent for $\mu_a$.

Case (ii): the outcome regression models are correctly specified; i.e., 
$\hat{\mu}_{m,a}=\ex{Y^o_t\mid m,a,\oo_t=1}$.
Then,
\[\begin{split}
    &\mathbb{E}\big[
        \sum_{m\in\mathcal{M}}\frac{\mathbbm{1}\{M_t=m\}}{\hat{\gamma}_{m,a}}(
        Y^o_t
        -
        \hat{\mu}_{m,a})\mathbbm{1}\{\oo_t=1\}
        \mid A_t=a
    \big]
    \\&\overset{(a)}{=}
    \sum_{m\in\mathcal{M}}\mathbb{E}\big[
        \frac{\mathbbm{1}\{\oo_t=1\}}{\hat{\gamma}_{m,a}}(
        Y^o_t
        -
        \hat{\mu}_{m,a})
        \mid A_t=a, M_t=m
    \big]p_{m,a}
    \\&\overset{(b)}{=}
    \sum_{m\in\mathcal{M}}\frac{\gamma_{m,a}}{\hat{\gamma}_{m,a}}\mathbb{E}\big[
        Y^o_t
        -
        \hat{\mu}_{m,a}
        \mid A_t=a, M_t=m, \oo_t=1
    \big]p_{m,a}
    \\&\overset{(c)}{=}
    \sum_{m\in\mathcal{M}}\frac{\gamma_{m,a}}{\hat{\gamma}_{m,a}}\big(\mathbb{E}\big[
        Y^o_t
        \mid A_t=a, M_t=m, \oo_t=1
    \big]-
        \hat{\mu}_{m,a}\big)p_{m,a}
        \\&\overset{(d)}{=}0,
    \end{split}
\]
where $(a)$ and $(b)$ are due to the law of total expectation, $(c)$ is by linearity of expectation, and $(d)$ follows from the correctness of $\hat{\mu}_{m,a}$.
Substituting this into Eq.~\eqref{eq:dre}, we obtain
\[
\begin{split}
    \hat{\mu}_{a}&=\ex{\sum_{m\in\mathcal{M}}\mathbbm{1}\{M_t=m\}\hat{\mu}_{m,a}\mid A_t=a}\\
    &= \ex{\sum_{m\in\mathcal{M}}\mathbbm{1}\{M_t=m\}\ex{Y^o_t\mid m, a, \oo_t=1}\mid A_t=a}
    \\ &= \ex{\ex{Y^o_t\mid M_t, a, \oo_t=1}\mid A_t=a},
\end{split}
\]
which matches Eq.~\eqref{eq:idmar}, and therefore $\hat{\mu}_{a}$ is consistent for $\mu_{a}$.\qed
\theomcarupper*
\input{proofs/mcar_proof}

\theomcarlower*
\input{proofs/mcar_lower_proof}


\theomarupperfirst*
\input{proofs/mar_proof1}

\theomaruppersecond*
\input{proofs/mar_proof2}

\theomarlower*
\input{proofs/mar_lower_proof}

\thmignoremed*
\input{proofs/ignore_m_proof}

\theomnarupper*
\input{proofs/mnar_proof}

% \textbf{Double Robustness} In this part we will prove the double robustness of the following estimator in MAR environment.
% \begin{equation}
%     \mu_a = \mathbb{E}\big[
%         \sum_{m\in\mathcal{M}}\frac{\mathbbm{1}\{M_t=m\}}{\gamma_{m,a}}\big(
%         Y^o_t\mathbbm{1}\{\oo_t=1\}
%         -
%         (\mathbbm{1}\{\oo_t=1\}-\gamma_{m,a})\ex{Y^o_t\mid m, a, \oo_t=1}
%         \big)
%         \mid A_t=a
%     \big]
% \end{equation}
% Suppose $\hat{\mu}_{m,a}$ is an estimator for $\mu_{m,a}=\ex{Y^o_t\mid m, a, \oo_t=1}$, and $\hat{\gamma}_{m,a}$ is an estimator for $\gamma_{m,a}$.
% We claim that if either $\hat{\mu}_{m,a}$s or $\hat{\gamma}_{m,a}$s (but not necessarily both) are consistent, then the following estimator is consistent for $\mu_a$:
% \[
%     \hat{\mu}_a = \frac{1}{\sum_{t=1}^T\mathbbm{1}\{A_t=a\}}\sum_{t=1}^T\mathbbm{1}\{A_t=a\}
%         \sum_{m\in\mathcal{M}}\frac{\mathbbm{1}\{M_t=m\}}{\hat{\gamma}_{m,a}}\big(
%         Y^o_t\mathbbm{1}\{\oo_t=1\}
%         -
%         (\mathbbm{1}\{\oo_t=1\}-\hat{\gamma}_{m,a})\hat{\mu}_{m,a}
%         \big)
% \]
% \begin{proof}
% First suppose the missingness probability estimators are consistent.
%     By symmetry, it suffices to show that 
%     \[\begin{split}
%         \frac{1}{\sum_{t=1}^T\mathbbm{1}\{A_t=a\}}
%         &\sum_{t=1}^T
%         \frac{\mathbbm{1}\{A_t=a,M_t=m\}}{\hat{\gamma}_{m,a}}\big(
%         Y^o_t\mathbbm{1}\{\oo_t=1\}
%         -
%         (\mathbbm{1}\{\oo_t=1\}-\hat{\gamma}_{m,a})\hat{\mu}_{m,a}
%         \big)\\
%         &=\frac{1}{\sum_{t=1}^T\mathbbm{1}\{A_t=a\}}\sum_{t=1}^T
%         \frac{\mathbbm{1}\{A_t=a,M_t=m,\oo_t=1\}}{\hat{\gamma}_{m,a}}
%         Y^o_t
%         \\&-
%         \frac{1}{\sum_{t=1}^T\mathbbm{1}\{A_t=a\}}\sum_{t=1}^T
%         \frac{\mathbbm{1}\{A_t=a,M_t=m\}}{\hat{\gamma}_{m,a}}
%         (\mathbbm{1}\{\oo_t=1\}-\hat{\gamma}_{m,a})
%         \hat{\mu}_{m,a}
%     \end{split}
%     \]
% is consistent for
%     \[
%     \begin{split}
%         \mathbb{E}\big[
%         &\frac{\mathbbm{1}\{M_t=m\}}{\gamma_{m,a}}\big(
%         Y^o_t\mathbbm{1}\{\oo_t=1\}
%         -
%         (\mathbbm{1}\{\oo_t=1\}-\gamma_{m,a})\mu_{m,a}
%         \big)
%         \mid A_t=a
%     \big]\\
%     &=\mathbb{E}\big[
%         \frac{\mathbbm{1}\{M_t=m\}}{\gamma_{m,a}}
%         Y^o_t\mathbbm{1}\{\oo_t=1\}
%         \mid A_t=a
%     \big].
%     \end{split}
%     \]
% \end{proof}
% \input{proofs/db}

\section{Theoretical Results on Missing Outcome and Missing Mediator}\label{apx:Missing M}
\input{Missing_Mediator/main_text}


\clearpage
\section{Additional Empirical Evaluation}
\label{appendix:additional_simulations}

\begin{figure*}[h]
    \centering
    \begin{subfigure}[b]{0.38\textwidth}
        \includegraphics[width=\textwidth]{figures/UCB-MNAR_2.png}
        \caption{MNAR and UCB algorithms in the MNAR bandit environment.}
        \label{fig:MNAR-UCB}
    \end{subfigure}
    % \hspace{1cm}
    \begin{subfigure}[b]{0.38\textwidth}
        \includegraphics[width=\textwidth]{figures/PBC-data.png}
        \caption{MAR and UCB algorithms in a real-world MAR bandit environment.}
        \label{fig:RW-1}
    \end{subfigure}
    % \hfill
    % % \vspace{0.3cm} % Adds vertical space between rows
    % \begin{subfigure}[b]{0.31\textwidth}
    %     \includegraphics[width=\textwidth]{figures/MAR-1.png}
    %     \caption{MAR algorithm with different p initializations on MAR environment.}
    %     % \label{fig:MAR-2}
    % \end{subfigure}
    % % \hfill
    % \begin{subfigure}[b]{0.31\textwidth}
    %     \includegraphics[width=\textwidth]{figures/MAR-2.png}
    %     \caption{MAR and UCB algorithms in the MAR bandit environment.}
    %     % \label{fig:MAR-3}
    % \end{subfigure}
    % \hspace{1cm}
    % \begin{subfigure}[b]{0.31\textwidth}
    %     \includegraphics[width=\textwidth]{figures/MNAR-1.png}
    %     \caption{Performance of the MNAR algorithm in the described environment.}
    %     % \label{fig:MNAR-1}
    % \end{subfigure}
    \hfill
    \caption{Complementary evaluation results for our proposed algorithms.}
    % \label{fig:combined}
\end{figure*}

In Figure~\ref{fig:MNAR-UCB}, we compare the performance of the UCB and MNAR algorithms in the MNAR bandit environment. The results clearly demonstrate that the cumulative regret of the UCB algorithm is consistently higher than that of the MNAR algorithm. Note that the y-axis is displayed on a logarithmic scale, so the visible gap corresponds to a considerable difference in performance between our algorithm and the UCB algorithm. The environment is generated as before, with a horizon of \( T = 100{,}000 \), and the experiment is repeated 10 times.

% \begin{refsection}
%     \nocite{kamath2015learning,higham1994survey,lattimore2020bandit}
%     \printbibliography[heading=bibintoc, title={References}]
% \end{refsection}

\subsection{Real-World Simulation}

The dataset used in this study is the Primary Biliary Cirrhosis (PBC) dataset from the Mayo Clinic\footnote{\url{https://www.openml.org/d/200}}, containing 418 observations and 19 variables. Collected over a 10-year span (1974--1984), it focuses on a randomized, placebo-controlled trial of D-penicillamine for treating PBC, and includes both trial participants and observational data from non-participants.

To simulate a real-world setting, we structured the data as follows: the \textbf{Z1} variable (1 for D-penicillamine, 2 for placebo) was treated as the \textit{arms of the bandit}, representing treatment groups. The \textbf{X} variable, denoting the time in days from registration to death, liver transplantation, or censoring, was used as the \textit{outcome}. The \textbf{D} variable, indicating whether \textbf{X} measures time until death (1) or censoring (0), served as the \textit{mediator}.

The \textbf{D} mediator captures whether the time interval \textbf{X} is associated with death or censoring, offering key insights into the progression of the disease and the effect of treatment. This setup allows us to model the pathways from treatment to outcome, where \textbf{Z1} represents the action taken, \textbf{X} is the reward (days survived), and \textbf{D} explains the intermediate state between treatment and survival or death.

Applying the MAR algorithm to this MAR bandit environment yielded results consistent with those seen in synthetic data, as shown in Figure~\ref{fig:RW-1}.