\section{Confounding Robust Eligibility Traces}\label{sec:_3}
The extended causal Bellman equations described so far require one to have precise estimations for the full models of the nominal transition distribution $\Tilde{\1T}$, reward function $\Tilde{\1R}$, and the propensity score $P(x\mid s)$. However, in practice, the detailed parameterizations of these probability models are generally assumed to be unknown. The learner must recover them from finite samples drawn from the confounded observational distribution.

This section will introduce novel model-free algorithms, using eligibility traces \citep{sutton1988learning}, to bound value functions from finite observational samples.
We consider the episodic framework, where the agent interacts with the environment for repeated episodes $n = 1, 2, 3, \dots$; each episode contains a finite number of time steps $t = 1, 2, \dots, T_n$. At each episode, the environment starts at state $s_1$ following the initial distribution $P(S_1)$. At each time step $t$, taking the observed state $s_t$ of the environment as input, the behavior policy selects an action $x_t$. In response to intervention $\doo(x_t)$, the environment produces a subsequent reward $y_t$ and moves to the next observed state $s_{t+1}$. If the next state $s_{t+1}$ is \emph{terminal}, the episode terminates at time step $T_n = t + 1$; the learner receives observational data $\Braces{\bar{\*x}_{1:T_n-1},  \bar{\*s}_{1:T_n}, \bar{\*y}_{1:T_n-1}}$.

\begin{algorithm}[t]
	\caption{Causal Temporal Difference (\texttt{C-TD($\lambda$)})}
	\label{alg:_3_ctd}
	\setlength{\textfloatsep}{0pt}
	\begin{algorithmic}[1]
		\Require Observational data $\1D$ and a policy $\pi(x \mid s)$.
		\State Update the eligibility traces for all states $s \in \1S$,
		\begin{align*}
			e_t(s) = \gamma \lambda \pi(x_{t-1} \mid s_{t-1})e_{t-1}(s) + \I_{s = s_t}
		\end{align*}
		where $\lambda \in [0, 1]$ is an eligibility trace decay factor.
		\State Compute the temporal difference error
		\begin{align*}
			\delta_t &=
			\pi(x_t \mid s_t) \left(y_t  + \gamma V_t(s_{t+1})\right) \\
            &+ \pi(\neg x_t \mid s_t) \left(w + \gamma V_t(s^*) \right) - V_t(s_t)
		\end{align*}
		\State Update the value function $V_{t+1}(s) \gets V_t(s) + \alpha e_t(s)\delta_t$ for all states $s \in \1S$.
	\end{algorithmic}
\end{algorithm}
\subsection{Causal Temporal Difference}\label{sec:_3_1}

\begin{wrapfigure}[16]{r}{0.2\textwidth}
	\vspace{-0.1in}
	\centering
	\begin{tikzpicture}
		\def\outerr{3}
		\def\innerr{2.7}

		\node[vertex, label={[label distance=0.01in]180: \small S\textsubscript{1}}] (S1) at (0, 0) {};
		% \node[uvertex] (U_1) at (0.2, 1.05);
		\node[action, label={[label distance=0.04in]180: \small X\textsubscript{1}}] (X1) at (0, -1) {};
		\node[vertex, label={[label distance=0.01in]180: \small S\textsubscript{2}}] (S2) at (0, -2) {};
		\node[vertex, label={[label distance=0.01in]0: \small s*}] (S2p) at (0.952, -1.3) {};
		\node[action, label={[label distance=0.04in]180: \small X\textsubscript{2}}] (X2) at (0, -3) {};
		\node[vertex, label={[label distance=0.01in]180: \small S\textsubscript{3}}] (S3) at (0, -4) {};
		\node[vertex, label={[label distance=0.01in]0: \small s*}] (S3p) at (0.952, -3.3) {};

		\draw[dir] (S1) -- (X1);
		\draw[dir] (X1) -- (S2) node [anchor = east, midway] {\small $\pi$};
		\draw[dir] (X1) -- (S2p) node [above, midway, sloped] {\small $1 - \pi$};
		\draw[dir] (S2) -- (X2);
		\draw[dir] (X2) -- (S3) node [anchor = east, midway] {\small $\pi$};
		\draw[dir] (X2) -- (S3p) node [above, midway, sloped] {\small $1 - \pi$};
		\path (S3) -- node {} (0, -5) node [font=\large, midway, sloped] {$\dots$};
	\end{tikzpicture}
        \vspace{-0.1in}
	\caption{Backup diagram for \texttt{C-TD($\lambda$)}.}
	\label{fig:_3_ctd}
\end{wrapfigure}
We first introduce a novel augmentation procedure on the celebrated temporal difference (\texttt{TD}, \citep{sutton1988learning,precup2000eligibility}) that allows one to estimate the bounds over state value functions, which we call the \emph{causal temporal difference} (\texttt{C-TD}). \Cref{fig:_3_ctd} shows the backup diagram illustrating the idea of our proposed algorithm. Similar to the standard off-policy \texttt{TD}, our algorithm will update the estimation of state value functions $\underline{V_{\pi}}, \overline{V_{\pi}}$ using the sampled trajectories of transitions in the observational data. It could use a finite number of $n$-step trajectories or the entire trajectory. Different from the standard off-policy \texttt{TD}, our proposed algorithm does not weight each step of the transition using importance sampling (or equivalently, inverse propensity weighting) since the true behavior policy $f_X$ (propensity score) is not recoverable from the observational data. Instead, \texttt{C-TD} weights each transition using the target policy $\pi$ and adjusts for the misalignment between the target and behavior policies using an overestimation/underestimation of value function at state $s^*$. Such $s^*$ is set as the best-case state associated with the highest value in our current estimation when computing upper bounds and the worst-case state estimate for lower bounds.

To formally introduce the estimation algorithm, we first introduce some necessary notations. Let $\*N(s)$ denote the set of indices of episodes containing a state $s \in \1S$, and let $\*t_n(s)$ be the collection of time steps in the $n$-th episode such that for every $t \in \*t_n(s)$, $s_t = s$. For any time step $t$, let $\pi_t = \pi(x_t \mid s_t)$ and $\neg \pi_t = 1 - \pi(x_t \mid s_t)$. We iteratively define the estimator for bounds over the state value function $V_{\pi}(s)$ as follows, for any state $s \in \1S$,
\begin{equation}
    \begin{split}
        \widehat{V_{\pi}}(s) = \frac{1}{N} \sum_{n \in \*N(s)} \sum_{t \in \*t_n(s)} \sum_{k = 0}^{T_n - t} \gamma^k \Big (\pi_{t+k} y_{t + k}&\\
        + \neg \pi_{t+k} \big (w + \gamma V(s^*) \big ) \Big )\prod_{i = t}^{t + k-1}\pi_i&
    \end{split} \label{eq:_3_1_ctd}
\end{equation}
In the above equation, $N$ represents the total number of occurrences of the event $s_t = s$ in the observational data. We set parameters $w = a$ and $V(s^*) = \min_{s} V(s)$ when estimating the lower bound $\underline{V_{\pi}}(s)$, and parameters $w = b$ and $V(s^*) = \max_{s} V(s)$ when estimating the upper bound $\overline{V_{\pi}}(s)$.

An eligibility-trace version of our proposed estimation strategy is described in \Cref{alg:_3_ctd}. The algorithm keeps track of eligibility traces for every state in a similar manner to standard off-policy temporal difference algorithms. The main difference is that here the eligibility trace is multiplied by the target policy $\pi(x_{t-1} \mid s_{t-1})$ and a decay-rate $\lambda$, not including the nominal propensity score $P(x_{t-1} \mid s_{t-1})$. When computing the temporal difference error, the algorithm adjusts for the misalignment between the target and behavior policies by adding a regularized term $w + \gamma V_t(s^*)$, weighted by the probability $1 - \pi(x_t \mid s_t)$. We describe in \Cref{alg:_3_ctd} a version of \texttt{C-TD($\lambda$)} using \emph{online update}. This means that the bound estimates are updated at every time step. The \emph{offline} version of the algorithm will use the same temporal difference error and eligibility traces. However, the update only occurs at the end of each episode; the increments and decrements are accumulated on the side, and the value function estimates do not change during the episode.
\begin{restatable}{theorem}{thmctd}\label{thm:_3_ctd}
	For any behavior policy, for any choice of $\lambda \in [0, 1]$ that does not depend on the actions chosen at each state, let parameters $w$ and $s^*$ be defined as follows: (1) Lower Bound $\underline{V_{\pi}}$: $w = a$ and $s^* = \argmin_{s} V_t(s)$; (2) Upper Bound $\overline{V_{\pi}}$: $w = b$ and $s^* = \argmax_{s} V_t(s)$.
	Then, \Cref{alg:_3_ctd} with offline updating converges with probability $1$ to lower bound $\underline{V_{\pi}}$ and upper bound $\overline{V_{\pi}}$, respectively, under the usual step-size conditions on $\alpha$.
\end{restatable}
The proof of \Cref{thm:_3_ctd} first shows a contraction property for estimates $\widehat{V}_{\pi}$, and then follows the general convergence theorem in \citep{jaakkola1994reinforcement}.


\begin{algorithm}[t]
	\caption{Causal Tree-Backup (\texttt{C-TB($\lambda$)})}
	\label{alg:_4_ctb}
	\setlength{\textfloatsep}{0pt}
	\begin{algorithmic}[1]
		\Require Observational data $\1D$ and a policy $\pi(x|s)$.
		\State Update the eligibility traces for all state-action pairs $s, x \in \1S \times \1X$:
		\begin{align*}
			e_t(s, x) = \gamma \lambda \pi(x_t \mid s_t)\I_{x_{t-1} = x} e_{t-1}(s, x) + \I_{s = s_t}
		\end{align*}
		where $\lambda \in [0, 1]$ is an eligibility trace decay factor.
		\State Compute the temporal difference error for every action $x \in \1X$. More specifically, if $x = x_t$,
		\begin{align*}
			\delta_t(x) =
				y_t  + \gamma \sum_{x'} \pi(x' \mid s_{t+1}) Q_t(s_{t+1}, x') - Q_t(s_t, x)
            \end{align*}
            Otherwise,
            \begin{align*}
                \delta_t(x) =
				w + \gamma \sum_{x'} \pi(x' \mid s^*) Q_t(s^*, x') - Q_t(s_t, x)
		\end{align*}
		\State Update the action-value function $Q_{t+1}(s, x) \gets Q_t(s, x) + \alpha e_t(s, x)\delta_t(x)$ for all  $s, x \in \1S \times \1X$.
	\end{algorithmic}
\end{algorithm}

\subsection{Causal Tree Backup}\label{sec:_3_2}
The algorithm described so far focuses on the estimation of the state value functions. We next introduce a novel algorithm to bound the state-action value function $Q_{\pi}$ from finite samples of the observational distribution.

\begin{wrapfigure}[19]{r}{0.2\textwidth}
	\centering
	\begin{tikzpicture}
		\def\outerr{3}
		\def\innerr{2.7}

		\node[action, label={[label distance=0.04in]0: \small (s, x)}] (start) at (0, 0) {};
		\node[vertex, label={[label distance=0.01in]180: \small S\textsubscript{1}}] (S1) at (0, -1) {};
		\node[action, label={[label distance=0.01in]180: \small X\textsubscript{1}}] (X1) at (0, -2) {};
		\node[vertex, label={[label distance=0.01in]0: \small s*}] (S2p) at (0.85, -2.85) {};
		\node[action] (X1l) at (-0.85, -1.65) {};
		\node[action] (X1r) at (0.85, -1.65) {};
		\node[vertex, label={[label distance=0.01in]180: \small S\textsubscript{2}}] (S2) at (0, -3.2) {};
		\node[action, label={[label distance=0.01in]180: \small X\textsubscript{2}}] (X2) at (0, -4.2) {};
		\node[vertex, label={[label distance=0.01in]0: \small s*}] (S3p) at (0.85, -5.05) {};
		\node[action] (X2l) at (-0.85, -3.85) {};
		\node[action] (X2r) at (0.85, -3.85) {};
		\node[vertex, label={[label distance=0.01in]180: \small S\textsubscript{3}}] (S3) at (0, -5.4) {};

		\draw[dir] (start) -- (S1);
		\draw[dir] (S1) -- (X1);
		\draw[dir] (S1) -- (X1l);
		\draw[dir] (S1) -- (X1r);
		\draw[dir] (X1) -- (S2)node [below, midway, sloped] {\small $= x$};
		\draw[dir] (X1) -- (S2p) node [above, midway, sloped] {\small $\neq x$};
		\draw[dir] (S2) -- (X2);
		\draw[dir] (S2) -- (X2l);
		\draw[dir] (S2) -- (X2r);
		\draw[dir] (X2) -- (S3)node [below, midway, sloped] {\small $= x$};
		\draw[dir] (X2) -- (S3p) node [above, midway, sloped] {\small $\neq x$};
		\path (S3) -- node {} (0, -6.4) node [font=\large, midway, sloped] {$\dots$};
	\end{tikzpicture}
 \vspace{-0.1in}
	\caption{Backup diagram for \texttt{C-TB($\lambda$)}.}
	\label{fig:_3_ctb}
\end{wrapfigure}

Our algorithm is based on an augmentation on the standard tree backup (\texttt{TB} \citep{precup2000eligibility}), which we call the \emph{causal tree backup} (\texttt{C-TB($\lambda$)}). The main idea of this new algorithm is illustrated in the backup diagram of \Cref{fig:_3_ctb}. Similar to the standard tree backup, our algorithm updates the value estimates for the action selected by the behavior policy at each time step based on the subsequent reward and the current estimation for the value of the next state. The algorithm then forms a new estimate for the target value function, using the old value estimates for the actions not observed in the observational data and the new estimated value for the action taken by the behavior policy. On the other hand, the main differences include the following. (1) Eligibility traces will not only be weighted by the target policy $\pi(x_t \mid s_t)$ using the observed trajectories, but also by an indicator function $\I_{x_{t-1} = x}$ returning $1$ if the previous action $x_{t-1}$ coincides with the target action $x$. (2) When the behavior policy takes the same action $x_t = x$ as the target action, the update follows standard \texttt{TB} and uses the next sampled state $s_{t+1}$; when the sampled action $x_t \neq x$ differs from the target, our algorithm updates, instead, using the value function associated with the next worst-case or best-case state $s^*$, corresponding to the estimation of the lower and upper bounds respectively. The $n$-step causal tree-backup estimator is defined as
\begin{equation}
    \begin{split}
    &\widehat{Q_{\pi}}(s, x) =\frac{1}{N} \sum_{n \in \*N(s)}\sum_{t \in \*t_n(s)} \gamma^{n} Q(s_{t + n}, x_{t+n})\\
    &\cdot \prod_{i = t}^{t + n - 1} \pi_{i+1} \I_{x_i= x} + \sum_{k = t}^{t + n} \gamma^{k - t + 1} \prod_{i = t}^{t + k-1}\pi_{i+1} \I_{x_i =x} \\
    &\cdot \Bigg (\I_{x_k \neq  x} \bigg (w + \sum_{x'} \pi(x' \mid s^*) Q(s^*, x') \bigg) + \I_{x_k = x} \\
    & \cdot \bigg(y_{k} + \sum_{x' \neq x} \pi(x' \mid s_{k+1}) Q(s_{k+1}, x') \bigg) \Bigg)
    \end{split}\label{eq:_3_2_ctb_est}
\end{equation}
The above tree backup estimator also has a simple incremental implementation using eligibility traces. An online version of this implementation is shown in \Cref{alg:_4_ctb}.

\begin{figure}[t]
\centering
\hfill
        \begin{subfigure}{0.49\linewidth}\centering%(d)
		\includegraphics[width=\linewidth]{figures/lava_maze_2.png}
		\caption{}
		\label{fig:_4_1_a}
	\end{subfigure}\hfill
	\begin{subfigure}{0.49\linewidth}\centering
		\includegraphics[width=\linewidth]{figures/lava_policy_2.png}
		\caption{}
		\label{fig:_4_1_b}
	\end{subfigure}\hfill\null
	\caption{An alternative windy gridworld environment where the lava is placed at both the top and bottom. The wind is the weakest at the center row of the map.}
    \label{fig:_4_1_windy}
\end{figure}

\begin{restatable}{theorem}{thmctb}\label{thm:_4_ctb}
	For any behavior policy, for any choice of $\lambda \in [0, 1]$ that does not depend on the actions chosen at each state, let parameters $w$ and $s^*$ be defined as follows: (1) Lower Bound $\underline{Q_{\pi}}$: $w = a$ and $s^* = \argmin_{s} \sum_{x'} \pi(x' \mid s) Q_t(s, x')$; (2) Upper Bound $\overline{Q_{\pi}}$: $w = b$ and $s^* = \argmax_{s} \sum_{x'} \pi(x' \mid s) Q_t(s, x')$.
	Then, \Cref{alg:_4_ctb} with offline updating converges with probability $1$ to lower bound $\underline{Q_{\pi}}$ and upper bound $\overline{Q_{\pi}}$, respectively, under the usual step-size conditions on $\alpha$.
\end{restatable}
The proof of the above theorem relies on a contraction property on the estimates $\widehat{Q}_{\pi}$ and follows from the general convergence theorem in \citep{jaakkola1994reinforcement}.