\begin{figure*}[t]
	\begin{subfigure}{0.24\linewidth}\centering%(d)
		\includegraphics[width=\linewidth]{figures/lava_2_is.png}
		\caption{Off-Policy TD}
		\label{fig:_4_2_a}
	\end{subfigure}\hfill
	\begin{subfigure}{0.24\linewidth}\centering
		\includegraphics[width=\linewidth]{figures/lava_2_tree.png}
		\caption{Tree Backup}
		\label{fig:_4_2_b}
	\end{subfigure}\hfill
	\begin{subfigure}{0.24\linewidth}\centering
		\includegraphics[width=\linewidth]{figures/lava_2_opt_v.png}
		\caption{$V^{*}(s)$}
		\label{fig:_4_2_c}
	\end{subfigure}\hfill
	\begin{subfigure}{0.24\linewidth}\centering
		\includegraphics[width=\linewidth]{figures/lava_2_opt_q.png}
		\caption{$Q^*(s, \texttt{right})$}
		\label{fig:_4_2_d}
	\end{subfigure}\hfill\null

	\begin{subfigure}{0.24\linewidth}\centering
		\includegraphics[width=\linewidth]{figures/lava_2_lower_v.png}
		\caption{$\underline{V^*}(s)$}
		\label{fig:_4_2_e}
	\end{subfigure}\hfill
	\begin{subfigure}{0.24\linewidth}\centering%(d)
		\includegraphics[width=\linewidth]{figures/lava_2_upper_v.png}
		\caption{$\overline{V^*}(s)$}
		\label{fig:_4_2_f}
	\end{subfigure}\hfill
	\begin{subfigure}{0.24\linewidth}\centering
		\setlength{\abovecaptionskip}{5pt}
		\includegraphics[width=\linewidth]{figures/lava_2_lower_q.png}
		\caption{$\underline{Q^*}(s, \texttt{right})$}
		\label{fig:_4_2_g}
	\end{subfigure}\hfill
	\begin{subfigure}{0.24\linewidth}\centering
		\includegraphics[width=\linewidth]{figures/lava_2_upper_q.png}
		\caption{$\overline{Q^*}(s, \texttt{right})$}
		\label{fig:_4_2_h}
	\end{subfigure}\hfill\null
	\caption{Estimates of value functions obtained by (\subref{fig:_4_2_a}, \subref{fig:_4_2_b}) standard off-policy methods, (\subref{fig:_4_2_c}, \subref{fig:_4_2_d}) value iteration in the ground-truth model, and (\subref{fig:_4_2_e}--\subref{fig:_4_2_h}) causally enhanced off-policy algorithms using eligibility traces (\texttt{C-TD($\lambda$)} and \texttt{C-TB($\lambda$)}). The offline data are generated by a confounded behavior policy determining the agent's actions based on the wind direction.}
	\label{fig:_4_2}
\end{figure*}

\section{Experiments} \label{sec:_4}
We demonstrate our algorithms in different variations of the Windy Gridworlds adapted from DeepMind's AI Safety Gridworlds \citep{leike2017ai}. We found that simulation results support our findings, and the proposed causal eligibility trace algorithms consistently obtain informative bounds over target value functions. All experiments use $5 \times 10^4$ offline observational samples; at this sample size the error bars are negligible and are therefore not explicitly shown. The decay factor is set to $\lambda = 0.5$ and the discount factor to $\gamma = 0.9$. For details on the experimental setup, we refer readers to the complete technical report \citep{zhang2024eligibility}.

\paragraph{Experiment 1.} Consider again the learning setting in \Cref{exp:_2_1} where the demonstrator, following the behavior policy, decides whether to stay put and where to move based on the agent's state and the wind direction. Consequently, the offline data is contaminated with the unobserved confounding bias. We apply \texttt{C-TD($\lambda$)} to derive bounds over the optimal value function $V^*(s)$ and provide them in \Cref{fig:_2_2_e,fig:_2_2_f}. The analysis reveals the derived bounds are consistent, containing the target value function in \Cref{fig:_2_2_c}.

Additionally, we compute the optimal state-action value function $Q^*(s, x)$ for action $x = \texttt{right}$ and provide it in \Cref{fig:_2_2_d}. We then estimate its bounds using \texttt{C-TB($\lambda$)} from offline data; the bounding results are shown in \Cref{fig:_2_2_g,fig:_2_2_h}. By inspection, one can see that our proposed algorithms are robust against the causal inconsistency in the offline data and consistently recover informative bounds containing the actual value functions in the ground-truth model.

\paragraph{Experiment 2.} We now consider an alternative Windy Gridworld described in \Cref{fig:_4_1_a} where the lava is placed at both the top and the bottom. Without sensing the wind, a preferable policy for the agent is to move along the center of the map where the wind strength is weak (highlighted in \textcolor{red}{red} in \Cref{fig:_4_1_b}). At the same time, the demonstrator takes the shortest path (\textcolor{orange}{orange} in \Cref{fig:_4_1_b}) along the lava since it can sense the wind and take safe actions. Similar to the previous setting, the wind direction becomes an unobserved confounder in the offline data, making the shorter route appear safer than it actually is.

We apply standard off-policy algorithms to evaluate the effect of the target policy $\pi^*$ and provide their evaluations in \Crefrange{fig:_4_2_a}{fig:_4_2_b}. We also compute bounds over the target value functions using our proposed algorithms, \texttt{C-TD($\lambda$)} and \texttt{C-TB($\lambda$)}, and provide their evaluations in \Crefrange{fig:_4_2_e}{fig:_4_2_f} and \Crefrange{fig:_4_2_g}{fig:_4_2_h}, respectively. Comparing the bounds with the ground-truth value functions in \Cref{fig:_4_2_c,fig:_4_2_d}, we find that \texttt{C-TD($\lambda$)} and \texttt{C-TB($\lambda$)} consistently obtain informative bounds. As expected, standard off-policy methods are not robust against causal inconsistency and deviate significantly from the target value functions.