% \renewcommand{\labelenumi}{\alph{enumi})}

\section{Estimation of Reward from Observation}\label{secappendix: estimating rewards from observations}
In Algorithm \ref{alg: estimating rewards from observations} below we explain our strategy (derived from \cite{BhattacharyyaGKMV20}) for estimating the reward of the interventional arms $a_{i,x}$ using $T/2$ observational samples collected by playing the observational arm $a_0$. This is followed by details on each of the steps involved. Recall, $N$ is the number of intervenable nodes.

% For each $a_{i,x}$ the algorithm consists of three steps: \textbf{(a)} Reduce ADMG $G$ to an ADMG $H$ using the reduction process described in Section \ref{subsection:graph-reduction} such that the number of observable nodes under consideration is reduced, in-degree and size of c-components are still bounded and the required causal effect of $do(X_i=x)$ in $G$ remains identifiable in $H$, \textbf{(b)} Using $H_i$, construct another bayes net $D_{i,x}$ simulating the intervention $do(X_i=x)$ as described in Section \ref{subsection:intervention-reduced-graph}, and finally $(c)$ Create an estimator $\widehat{D}_{i,x}$ for $D_{i,x}$ using the $T/2$ observational samples in Algorithm \ref{SR-algorithm}, followed by estimate $\mu_{i,x}$ by using samples from the estimated distribution $\widehat{D}_{i,x}$, as explained in Section \ref{subsection:estimate-effect}.

\begin{algorithm}[H]
\caption{Estimating Rewards from Observational Samples} \label{alg: estimating rewards from observations}
\begin{algorithmic}
\State INPUT: $\mathsf{His}$ containing the $T/2$ observational samples collected by playing arm $a_0$, and $\mathcal{G}$
\end{algorithmic}

\begin{algorithmic}[1]
% \State \todo{This Algorithm should $T/2$ samples and output estimates for all arms}
\State For each $i\in [N]$, reduce the input ADMG $\mathcal{G}$ to ADMG $\mathcal{H}_i$ as outlined in Algorithm \ref{alg:graph-reduction}.
\State Next, for each $i\in [N]$ and $x\in \{0,1\}$, construct the Bayes net $D_{i,x}$ which simulates the causal effect of intervention $do(X_i=x)$ on the reduced graph $\mathcal{H}_i$.
\State Using Algorithm \ref{Dix-algorithm} on the input samples, compute an estimate $\widehat{D}_{i,x}$ of each distribution $D_{i,x}$. Then, for each $i\in [N]$ and $x\in \{0,1\}$, generate samples from the learned $\widehat{D}_{i,x}$ to estimate the marginal of $Y$, and return these estimates as the estimated rewards.
\end{algorithmic}
\end{algorithm}

\textbf{Step $1$ :} This step executes Algorithm~\ref{alg:graph-reduction}, which is based on the reduction algorithm from~\cite{BhattacharyyaGKMV20}.


\begin{algorithm}[H]
\caption{Reducing $\mathcal{G}$ to $\mathcal{H}_i$} 
\label{alg:graph-reduction}
\begin{algorithmic}
\State INPUT: ADMG $\mathcal{G}$ and index $i\in [N]$.
\end{algorithmic}
\begin{algorithmic}[1]
% \State \todo{This Algorithm should $T/2$ samples and output estimates for all arms}
\State Let $\mathbf{W} = \{Y, X_i\} \cup \mathbf{Pa}^c(X_i)$, and let $\mathcal{G}'_i$ be the graph obtained from $\mathcal{G}$ by treating the variables in $\mathbf{V}\backslash \mathbf{W}$ as hidden. Let $\mathbf{V}_i$ denote the nodes in $\mathcal{G}'_i$.
\State \textbf{Projection Algorithm:} It reduces $\mathcal{G}'_i$ to $\mathcal{H}_i$ as follows:
  \begin{enumerate}
        \item Add all observable variables in $\mathcal{G}'_i$ to $\mathcal{H}_i$.
        \item For every pair of observable variables $V_j^i, V_k^i \in \mathbf{V}_i$, add a directed edge from $V_j^i$ to $V_k^i$ in $\mathcal{H}_i$ if $(a)$ there exists a directed edge from $V_j^i$ to $V_k^i$ in $\mathcal{G}'_i$, or $(b)$ there exists a directed path from $V_j^i$ to $V_k^i$ in $\mathcal{G}'_i$ whose intermediate nodes are all unobservable.
        \item For every pair of observable variables $V_j^i, V_k^i \in \mathbf{V}_i$, add a bi-directed edge between $V_j^i$ and $V_k^i$ in $\mathcal{H}_i$ if there exists an unobserved variable $U$ with two directed paths in $\mathcal{G}'_i$, one from $U$ to $V_j^i$ and one from $U$ to $V_k^i$, whose intermediate nodes are all unobservable.
    \end{enumerate}
\State Return $\mathcal{H}_i$.
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[H]
\caption{Estimating distributions of $D_{i,x}$} \label{Dix-algorithm}
\begin{algorithmic}
\State INPUT: ADMG $\mathcal{H}_i$, $x\in \{0,1\}$, and the observational samples $\mathsf{His}$.
\end{algorithmic}
\begin{algorithmic}[1]
\For {every $V_j \in \mathbf{S_1}$}
    \For {every assignment $V_j = v$ and $\mathbf{Z_j} = \mathbf{z}$, where $\mathbf{Z_j}$ are the effective parents of $V_j$ in $\mathcal{H}_i$}
        \State $N_j \leftarrow$ the number of samples with $\mathbf{Z_j} = \mathbf{z}$
        \State $N_{j,v} \leftarrow$ the number of samples with $\mathbf{Z_j}=\mathbf{z}$ and $V_j = v$
        \State $\widehat{D}_{i,x}(V_j = v \mid \mathbf{Z_j} = \mathbf{z}) \leftarrow \frac{N_{j,v}+1}{N_j+2}$
        % \State $\widehat{D}_{i,x}(V_j | \mathbf{Z_i} = \mathbf{z}) \leftarrow$ add-1 empirical distribution at $V_j$ in the subset of samples where $\mathbf{Z_i} = \mathbf{z}$
        % \vin{What is the meaning of add-1 empirical distribution?} \aur{Instead of a simple estimate like $n/d$, we do $(n+1)/(d+\sigma)$, where $\sigma$ is the number of values taken. Another way of thinking this is - each value has occurred at once and after that we are observing the dataset.}
    \EndFor
\EndFor
\For {every $V_j \in \mathbf{V}_i \backslash \mathbf{S_1}$}
    \For {every $V_j = v$ and $\mathbf{Z_j} \backslash \{X_i\} = \mathbf{z}$, where $\mathbf{Z_j}$ are the effective parents of $V_j$ in $\mathcal{H}_i$}
        \If {$X_i \in \mathbf{Z_j}$}
            \State $N_{j} \leftarrow$ the number of samples with $\mathbf{Z_j} \backslash \{X_i\} = \mathbf{z}$ and $X_i = x$
            \State $N_{j,v} \leftarrow$ the number of samples with $V_j = v$, $\mathbf{Z_j} \backslash \{X_i\} = \mathbf{z}$ and $X_i = x$
            \If {$N_{j} \geq t$}
                \State $\widehat{D}_{i,x}(V_j = v \mid \mathbf{Z_j} \backslash \{X_i\} = \mathbf{z}, X_i = x) \leftarrow \frac{N_{j,v}+1}{N_j+2}$
                % \State $\widehat{D}_{i,x}(V_j | \mathbf{Z_j} - \{X_i\} = \mathbf{z}, X_i = x) \leftarrow$ the add-1 empirical distribution at node $i$ in the subset of samples where $\mathbf{Z_i} - \{X_i\} = \mathbf{z}$ and $X_i = x$
            \Else
                \State $\widehat{D}_{i,x}(V_j = v \mid \mathbf{Z_j} \backslash \{X_i\} = \mathbf{z}, X_i = x) \leftarrow \frac{1}{2}$
            \EndIf
        \Else
            \State $N_{j} \leftarrow$ the number of samples with $\mathbf{Z_j} = \mathbf{z}$
            \State $N_{j,v} \leftarrow$ the number of samples with $V_j = v$ and $\mathbf{Z_j} = \mathbf{z}$
            \If {$N_{j} \geq t$}
                \State $\widehat{D}_{i,x}(V_j = v \mid \mathbf{Z_j} = \mathbf{z}) \leftarrow \frac{N_{j,v}+1}{N_j+2}$
                % \State $\widehat{D}_{i,x}(V_j | \mathbf{Z_j} = \mathbf{z}) \leftarrow$ the add-1 empirical distribution at node $i$ in the subset of samples where $\mathbf{Z_i} = \mathbf{z}$
            \Else
                \State $\widehat{D}_{i,x}(V_j = v | \mathbf{Z_j} = \mathbf{z}) \leftarrow \frac{1}{2}$
            \EndIf
        \EndIf
    \EndFor
\EndFor
\State Return $\widehat{D}_{i,x}$.
\end{algorithmic}
\end{algorithm}

\textbf{Step $2$ :} The construction of $D_{i,x}$ follows the method described in Section $4.1$ of \cite{BhattacharyyaGKMV20}. Without loss of generality, let $\mathbf{S_1}$ be the c-component containing $X_i$. To construct $D_{i,x}$, we start with $\mathcal{H}_i$. Then, for each $V_j \notin \mathbf{S_1}$ such that $X_i$ is in the set $\mathbf{Z_j}$ of ``effective parents'' (Section $4$, \cite{BhattacharyyaGKMV20}) of $V_j$, we create a clone of $X_i$ and fix its value to $x$ (i.e., the clone has no parents). Then we remove all the outgoing edges from the original $X_i$. Note that, for any assignment $\bm{v}$ of all variables except $X_i$ in $\mathcal{H}_i$, the causal effect satisfies $\mathbb{P}_{\mathcal{H}_i}(\bm{v} \mid do(X_i = x)) = \sum_{x^\prime} \mathbb{P}_{D_{i,x}} (\bm{v}, X_i = x^\prime)$, where the sum ranges over the values $x^\prime \in \{0,1\}$ of the original node $X_i$ (not its clones).


% \subsection{Reduction of $G$ to $H_{i,x}$}
% \label{subsection:graph-reduction}
% If $V_1, \dots, V_N$ are the nodes of $G$ in topological order, then the effective parents of $V_i$ are denoted by $\mathbf{Pa}^+(V_i) \cap [i-1]~$. In order to estimate $E[Y | do(X_i = x)]$, we first reduce the graph $G$ to ADMG $H_i$ in the following manner:
% \begin{enumerate}
%     \item Let $\mathbf{W} = Y \cup X_i \cup \mathbf{Pa}^c(X_i)$, and $G'_i$ be the graph obtained by considering $\mathbf{V}\backslash \mathbf{W}$ as hidden variables. Let $\mathbf{V_i}$ denote the nodes in $G'_i$
%     \item \textbf{Projection Algorithm:} It reduces $G'_i$ to $H_i$ as follows:
%     \begin{enumerate}
%         \item For each observable variable $V$ in $G'_i$ add $V$ in $H_i$
%         \item For each pair of observable variable $V_j, V_k \in \mathbf{V_i}$ there exists a directed edge from $V_j$ to $V_k$ in $G'$ or if there exists a directed path from $V_j$ to $V_k$ that contains only unobservable variables in $G'_i$ then add a directed edge from $V_j$ to $V_k$ in $H_i$.
%         \item For each pair of unobservable variable $V_j, V_k \in \mathbf{V_i}$, if there exists an unobserved variable $U$ such that there exists two directed paths in $G'_i$ from $U$ to $V_j$ and from $U$ to $V_k$ such that both the paths contains only unobserved variables then add a bidirected edge between $V_j$ and $V_k$ in $H_i$.
%     \end{enumerate}
% \end{enumerate}


\textbf{Step $3$ :}
In this step, we estimate the distributions of all $D_{i,x}$ using the $T/2$ samples that were provided as input. Details are described in Algorithm \ref{Dix-algorithm}. From each estimated distribution $\widehat{D}_{i,x}$, we then draw $O(T)$ samples and compute an empirical estimate $\widehat{\mu}_{i,x}$ of the reward $\mu_{i,x} = \mathbb{P}_{\mathcal{G}}(Y = 1 \mid do(X_i = x))$. That $\mu_{i,x}$ equals the marginal probability of $Y=1$ under $D_{i,x}$ follows from the construction of $D_{i,x}$ in Step $2$, which implies
\begin{equation*}
\mu_{i,x} = \mathbb{P}_{\mathcal{G}}(Y=1 \mid do(X_i = x)) = \mathbb{P}_{\mathcal{H}_i}(Y=1 \mid do(X_i = x)) = \sum_{x^\prime, \bm{v}^\prime} \mathbb{P}_{D_{i,x}} (Y=1, \bm{v}^\prime, X_i = x^\prime),
\end{equation*}
where $\bm{v}^\prime$ is an assignment of nodes in $D_{i,x}$ other than $X_i$ and $Y$, and $x^\prime \in \{0,1\}$ ranges over the values of the original node $X_i$.
