%
\documentclass[accepted]{uai2022} %
                                    %
                                    %
                                    %
%
%
                                         %
%
                                          %
%
%
%

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs} %
\usepackage{nicefrac}
\usepackage{amsfonts,amsmath,amssymb,mathtools,amsthm}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{multicol}
\usepackage[capitalise,noabbrev]{cleveref}
\usepackage{color,caption,subcaption}
\hypersetup{colorlinks=true,citecolor=black,linkcolor=black,urlcolor=black}

%
%

%
\usepackage{natbib} %
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\renewcommand{\cite}[1]{\citep{#1}}

    
%
%
%
%
%
\usepackage{derivative}
% ---- Core notation macros --------------------------------------------------
% \norm{x}: auto-sized double-bar norm of x.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% Sets: blackboard-bold R, calligraphic S and calligraphic A.
\newcommand{\real}{\mathbb{R}}
\newcommand{\states}{\mathcal{S}}
\newcommand{\actions}{\mathcal{A}}
% Superscript suffixes, written directly after a symbol (e.g. v\opt, x\tr):
% star, pi-star, pi, and transpose.
\newcommand{\opt}{^\star}
\newcommand{\popt}{^{\pi\opt}}
\newcommand{\pI}{^{\pi}}
\newcommand{\tr}{^\top}
\newcommand{\data}{\Psi}
% Helpers for aligned optimization problems. Each expands to text that
% contains an alignment tab (&), so they can only be used where a tab is legal.
% \cs: line break with 1ex extra space, then a tab.
\newcommand{\cs}{\\[1ex] & }
% \minimize{x} / \maximize{x}: operator with x set as a limit, then \quad and a tab.
\newcommand{\minimize}[1]{\operatorname*{minimize}_{#1} \quad &}
\newcommand{\maximize}[1]{\operatorname*{maximize}_{#1} \quad &}
\newcommand{\R}{\mathcal{R}}
% Bold 1 and bold 0.
% NOTE(review): \bm needs the bm package, which is not loaded in the visible
% preamble -- confirm that the class (or another package) provides it.
\newcommand{\one}{\bm{1}}
\newcommand{\zero}{\bm{0}}
\newcommand{\p}{\mathbb{P}}
\newcommand{\PP}{\mathcal{P}}
% Overrides the pre-existing \ss command with a thin-spaced \mid.
\renewcommand{\ss}{\,\mid\,}
\newcommand{\rw}{\mathfrak{r}}
% rho with superscripts S, D, R and RA.
\newcommand{\srob}{\rho^S}
\newcommand{\drob}{\rho^D}
\newcommand{\rrob}{\rho^R}
\newcommand{\sarob}{\rho^{RA}}
% v^pi with subscripts SA and S.
\newcommand{\saV}{v_{SA}^{\pi}}
\newcommand{\sV}{v_{S}^{\pi}}
\newcommand{\sr}{R}				%
\usepackage[font=small,labelfont=bf]{caption}
%
%
%
% ---- Number sets -------------------------------------------------------------
\newcommand{\Real}{\mathbb{R}}
\newcommand{\RealPlus}{\Real_{+}}
\newcommand{\Int}{\mathbb{Z}}
\newcommand{\Natural}{\mathbb{N}}
\newcommand{\NaturalPlus}{\Natural_{+}}

% ---- Miscellaneous symbols ---------------------------------------------------
\newcommand{\eps}{\varepsilon}
\newcommand{\To}{\longrightarrow}
\newcommand{\BX}{\bm{B}(X)}
% ---- Probability / expectation helpers ---------------------------------------
\newcommand{\Pow}[1]{\mathcal{P}\left(#1\right)}  % calligraphic P applied to #1, auto-sized parentheses
% \E{X}: E[X];  \Ex{p}{X}: E_p[X] (auto-sized brackets).
\newcommand{\E}[1]{\mathbb{E}\left[ #1 \right]}
\newcommand{\Ex}[2]{\mathbb{E}_{#1}\left[ #2 \right]}
% \P{A}: P[A]. Overrides the pre-existing \P command.
\renewcommand{\P}[1]{\mathbb{P}\left[ #1 \right]}
% NOTE(review): this two-argument \Pr is overwritten by the zero-argument
% \renewcommand{\Pr} a few lines below, so this definition is never in effect
% after the preamble.
\renewcommand{\Pr}[2]{\mathbb{P}_{#1}\left[ #2 \right]}
\newcommand{\Var}[1]{\bm{Var}\left[ #1 \right]}
\newcommand{\Proj}[1]{\bm{P}_{#1}}
\newcommand{\cond}{\textrm{if }}
% \dotp{x}{y}: angle-bracket pairing <x, y>.
\newcommand{\dotp}[2]{\langle #1, #2 \rangle}
\newcommand{\eye}{\bm{I}}
\newcommand{\setp}{\mathsf{P}^s}
\newcommand{\Pf}{f}
% \st: upright "s.t.";  \stc: "subject to", then \quad and an alignment tab
% (only usable where a tab is legal).
\newcommand{\st}{\operatorname{s.t.}}
\newcommand{\stc}{\operatorname{subject\,to} \quad &}
% \Exp{p}{X}: same output as \Ex, apart from source spacing.
\newcommand{\Exp}[2]{\mathbb{E}_{#1} \left[ #2 \right] }
% Final meaning of \Pr in this group: a bare blackboard-bold P (no arguments).
\renewcommand{\Pr}{\mathbb{P}}
% Parameters fitted on the original and on the perturbed data.
\newcommand{\torg}{\theta_\text{org}}
\newcommand{\tpert}{\theta_\text{pert}}

% ---- Operator symbols ----------------------------------------------------------
% \Bell and \Topt expand to the same fraktur T; \T adds a pi superscript to \Bell.
\newcommand{\Bell}{\mathfrak{T}}
\newcommand{\T}{\Bell^{\pi}}
\newcommand{\Topt}{\mathfrak{T}}
% Fraktur L and B with S / SA superscripts.
\newcommand{\SRopt}{\mathfrak{L}^S}
\newcommand{\SARopt}{\mathfrak{L}^{SA}}
\newcommand{\SBopt}{\mathfrak{B}^S}
\newcommand{\SABopt}{\mathfrak{B}^{SA}}
\newcommand{\SigmaQ}{\widetilde{\Sigma}}
\newcommand{\muQ}{\tilde{\mu}}
\newcommand{\w}{w}
% ---- Method-name abbreviations -------------------------------------------------
% \msbr is set as a math operator; the remaining names are set with \text.
\newcommand{\msbr}{\operatorname{MSBR}}
\newcommand{\brm}{\text{BRM}}
\newcommand{\wis}{\text{WIS}}
\newcommand{\pdis}{\text{PDIS}}
\newcommand{\cpdis}{\text{CPDIS}}
\newcommand{\dr}{\text{DR}}
\newcommand{\wdr}{\text{WDR}}

% ---- Theorem-like environments ---------------------------------------------------
% "thm" is numbered within sections; every other environment below shares the
% thm counter, so all statements follow one running number per section.
\theoremstyle{plain}
\newtheorem{thm}{Theorem}[section]
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{exm}[thm]{Example}
% Definition style for definitions and assumptions.
\theoremstyle{definition}
\newtheorem{defn}[thm]{Definition}
\newtheorem{asm}[thm]{Assumption}
% Remark style for remarks and facts.
\theoremstyle{remark}
\newtheorem{rem}[thm]{Remark}
\newtheorem{fact}[thm]{Fact}
\usepackage[ruled]{algorithm2e}
%
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\dst}{dst}
\usepackage{nameref}

\usepackage{xr-hyper}
\usepackage{hyperref}
\usepackage{xcite}


% \addFileDependency{<file>}
%   Writes "(<file>)" to the log/terminal, appends <file> to LaTeX's file list
%   via \@addtofilelist, and prints "No file <file>." when the file is missing.
%   Presumably this lets build tools detect the dependency -- confirm against
%   the build setup.
\makeatletter
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<name>}
%   Calls \externaldocument{<name>} and registers both <name>.tex and
%   <name>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%

%
\myexternaldocument{lobo_674-supp}




%
% Re-asserts \Pr as a bare blackboard-bold P (same body as the earlier
% zero-argument redefinition in this preamble).
\renewcommand{\Pr}{\mathbb{P}}
% ---- Author annotation macros ----------------------------------------------------
% \elita{..} / \hs{..}: set the argument in orange / magenta.
\newcommand{\elita}[1]{\textcolor{orange}{#1}}
\newcommand{\hs}[1]{\textcolor{magenta}{#1}}

% xifthen supplies \ifthenelse/\isempty; ulem supplies \sout (strike-out).
\usepackage{xifthen}
\usepackage[normalem]{ulem}
% \mm[<old>]{<new>}: if <old> is non-empty, prints it struck out inside red
% brackets; then prints <new> in green.
\newcommand{\mm}[2][]{\ifthenelse{\isempty{#1}}{}{\textcolor{red}{[\sout{#1}]}}\textcolor{green}{#2}}
% \marek{..} / \hima{..}: bracketed note in red.
\newcommand{\marek}[1]{\textcolor{red}{[#1]}}
\newcommand{\hima}[1]{\textcolor{red}{[#1]}}

% cleveref format for references to several "thm" labels at once:
% "Theorems~A and~B" for two, "Theorems~A, B and~C" for more.
\crefmultiformat{thm}{Theorems~#2#1#3}{ and~#2#1#3}{, #2#1#3}{ and~#2#1#3}


% mprog: three-column array (r l l) with every cell in \displaystyle.
% NOTE(review): the >{...} column prefix needs the array package, which is not
% loaded explicitly in the visible preamble -- confirm another package provides it.
\newenvironment{mprog}{\begin{array}{>{\displaystyle}r>{\displaystyle}l>{\displaystyle}l}}{\end{array}}
% Comment style for algorithm2e: footnotesize, typewriter, magenta.
\newcommand\commentfont[1]{\footnotesize\ttfamily\textcolor{magenta}{#1}}
\SetCommentSty{commentfont}

\title{Data Poisoning Attacks on Off-Policy Policy Evaluation Methods}

%
%
%
%
%
%
%
%
\author[1]{\href{mailto:<loboelita@gmail.com>}{Elita Lobo}{}}
\author[2]{Harvineet Singh}
\author[1]{Marek Petrik}
\author[3]{Cynthia Rudin}
\author[4]{Himabindu Lakkaraju}
%
\affil[1]{%
    University of New Hampshire\\
    Durham, NH, USA
}
\affil[2]{%
    New York University\\
    New York, NY, USA
}
\affil[3]{%
    Duke University\\
    Durham, NC, USA
  }
  
\affil[4]{%
    Harvard University\\
    Boston, MA, USA
  }
\begin{document}
\maketitle

\begin{abstract}%
Off-policy Evaluation (OPE) methods are a crucial tool for evaluating policies in high-stakes domains such as healthcare, where exploration is often infeasible, unethical, or expensive. However, the extent to which such methods can be trusted under adversarial threats to data quality is largely unexplored. In this work, we make the first attempt at investigating the sensitivity of OPE methods to marginal adversarial perturbations to the data.
We design a generic data poisoning attack framework leveraging influence functions from robust statistics to carefully construct perturbations that maximize error in the policy value estimates. We carry out extensive experimentation with multiple healthcare and control datasets. Our results demonstrate that many existing OPE methods are highly prone to generating value estimates with large errors when subject to data poisoning attacks, even for small adversarial perturbations. These findings question the reliability of policy values derived using OPE methods and motivate the need for developing OPE methods that are statistically robust to train-time data poisoning attacks.
%
\end{abstract}

\section{Introduction}
\label{sec:intro}

In reinforcement learning (RL), off-policy evaluation~(OPE) methods are popularly used to estimate the value of a policy from previously collected data~\cite{Thomas2015Highconfidence,voloshin2020empirical,levine2020offline}. These methods are instrumental in high-stakes decision problems such as in medicine and finance, where deploying a policy directly is often infeasible, unethical, or expensive~\cite{gottesman2020interpretable,hiv}. In such cases, one must estimate the value solely from a batch of data collected using a different and possibly unknown policy. Only if the OPE methods estimate the value of a policy to be sufficiently high will stakeholders deploy it. Otherwise, the policy will be rejected. It is therefore essential that OPE methods do not severely overestimate the values of bad policies or underestimate the values of good policies~\cite{gottesman2020interpretable}.

Despite the importance of OPE methods, their sensitivity to adversarial contamination of logged data is not well understood. The complexity of OPE methods offers ample opportunities for attackers to introduce significant errors in OPE estimates with only small perturbations to the input data. For example, some OPE methods compute the value of a policy in a given state as a function of its value in future states. Therefore, even small errors introduced in the value estimates of these future states can accumulate and result in significant errors in the value estimates at the initial states, where critical strategic decisions are often made.
This property could be exploited by attackers. Another possible avenue for an attack is the \textit{importance sampling weights}. Popular OPE methods, such as the Doubly Robust and the Importance Sampling methods~\cite{jiang16doublyrobust,voloshin2020empirical} use importance sampling weights to correct for dataset mismatch when evaluating the given policy with logged data from a different policy. The weights depend on the estimate of the policy used for the logged data. Attackers could perturb the data in a way that forces the agent to wrongly estimate the policy used to collect data and consequently introduce significant errors in the value estimates. Such vulnerabilities motivate the need for a thorough analysis of the effect of data poisoning attacks on OPE methods. 


While some prior works have studied adversarial attacks in the context of policy learning in online and batch RL settings~\cite{rakhsha2020policy,Ma2019PolicyPI,Chen2019adversarial}, they mainly focus on teaching an agent to learn an adversarial policy or driving the agent to an adversarial state~\cite{rakhsha2020policy,zhang2021corruptionrobust}, and do not specifically investigate the effect of these attacks on OPE methods. In this work, we address the aforementioned gaps and study the effect of data poisoning attacks on OPE methods. More specifically, our work answers the following question: \textit{Can we construct small perturbations to the training data that significantly change a given OPE method's estimate of the value of a given policy?} To this end, we propose a novel data poisoning framework to analyze the sensitivity of model-free OPE methods to adversarial data contamination at train time. We formulate the data poisoning problem as a bilevel optimization problem and show that it can be adapted to diverse model-free OPE methods, namely, Bellman Residual Minimization (\brm)~\cite{Farahmand2008BRM}, Weighted Importance Sampling~(\wis), Per-Decision Importance Sampling (\pdis)~\cite{Precup00temporalabstraction,powell1966wis,rubenstein1981MC}, Consistent Weighted Per-Decision Importance Sampling (\cpdis)~\cite{Thomas2015SafeRL}, and Weighted Doubly Robust methods (\wdr)~\cite{jiang16doublyrobust}. To solve the aforementioned bilevel optimization problem in a computationally tractable manner, we derive an approximate algorithm using influence functions from robust statistics~\cite{koh2018stronger,koh2017influence,diakonikolas2019recent,broderick2021automatic}. To the best of our knowledge, our work is the first to study the sensitivity of a wide range of OPE methods to train-time data poisoning attacks. 

We evaluate our framework using five different datasets spanning medical (Cancer and HIV) and control (Mountain Car, Cartpole, Continuous Gridworld) domains. Our experiments show that corrupting only $3\%$--$5\%$ of the observed states achieves more than $340\%$ and $100\%$ error in the estimate of the value function of the optimal policy in the HIV and MountainCar domains, respectively. Through our experimental results, we show that out of the five OPE methods, WDR, PDIS, and BRM are generally the least statistically robust, and CPDIS and WIS are relatively more statistically robust to such adversarial contamination. Finally, our findings question the reliability of policy values derived using OPE methods and motivate the need for developing OPE methods that are statistically robust to train-time data poisoning attacks.


\section{Preliminaries}\label{sec:background}

We model a sequential decision-making problem as a Markov Decision Process~(MDP). An MDP is a tuple of the form $\langle \states,\actions, R, P,p_0,\gamma \rangle$ representing the set of states, set of actions, reward function, transition probability model, initial state distribution, and discount factor respectively. 
When taking an action $a\in \actions$ in a state $s\in \states$ and transitioning to state $s'\in \states$, the scalar $R(s,a,s')$ denotes the reward received by the agent and $P(s,a,s')$ denotes the probability of transitioning to a state $s'$ on taking an action $a$ in a state $s$.

A randomized policy $\pi : \states \to \Delta^{|\actions|}$ prescribes the probability of taking each action in each state. The value function $v^{\pi}\colon \mathcal{S} \to \real$ of a policy $\pi$ at a state $s$ is the expected discounted returns of the policy starting from state $s$ and is given by
\[
  v^{\pi}(s) = \Exp{}{\sum_{t=0}^{\infty} \gamma^t R(S_t,A_t,S_{t+1}) \mid \pi,S_0 = s} ~.
\]
The value of a policy is computed as $p_0\tr v^{\pi}$. The state-action value function (also known as the Q-value function) $q^{\pi}\colon \states\times \actions \to \real$ of a policy $\pi$ at a state $s$ and an action $a$ is the expected discounted returns obtained by taking action $a$ in state $s$ and following policy $\pi$ thereafter. The state-action value function $q^{\pi}$ for a policy $\pi$ is the unique fixed point of the \emph{Bellman operator} $\mathcal{T}^{\pi}\colon \real^{\states \times \actions} \to \real^{\states \times \actions}$ defined as
%
\begin{gather}\label{eq:bellman}
(\mathcal{T}^{\pi} q) (s,a) =  \\
\nonumber
\sum_{s'\in \mathcal{S}} P(s,a,s') \Bigl( R(s,a,s') + \gamma \sum_{a'\in \mathcal{A}} \pi(s',a') q(s',a') \Bigr)\,.
\end{gather}

We assume the standard batch RL setting~\citep[e.g.,][]{levine2020offline} in which the agent is given a batch of $n{=}N\times T$ transition tuples  $D {=} ((s^i_j, a^i_j, r^i_j)_{j=1}^{T})_{i=1}^{N}$, generated by a behavior policy $\pi_b$ for $N$ episodes of length $T$. The \emph{goal} of OPE is to use $D$ to evaluate the value of the evaluation policy $\pi$. Let $D_0$ be a set of initial states sampled from the distribution $p_0$.

The value function is approximated using features $\xi\colon \mathcal{S} \to \real^d$. As is standard in linear value function approximation, we assume also that the state-action value function $q^{\pi}$ is approximated as a linear combination of state-action features $\phi\colon \mathcal{S} \times \mathcal{A} \to \real^{|\actions|\cdot d}$. The state-action features for a given state-action pair $(s,a)$ are constructed by using the state features $\xi(s)$ at the indices corresponding to $a$ and zero elsewhere, i.e., $\phi(s,a)[ad: (a{+}1)d]\leftarrow\xi(s)$. Because the value function is estimated from data, we need to define a sample feature matrix $\Phi \in \real^{n \times |\actions|\cdot d}$ where the rows correspond to the state-action features $\phi(s,a)$ for the $n$ state-action pairs in $D$.  Similarly, $\Phi_p\in \real^{n \times |\actions|\cdot d}$ denotes the sample feature matrix for the \emph{next states} such that each row corresponds to $\phi(s'_i,\pi(s'_i))$ for the next states $s'_i$ in $D$. The sample reward matrix $r \in \real^{n \times 1}$ is constructed such that the $i^\text{th}$ row corresponds to the reward $r_i$ in $D$. More details on the construction of the sample feature matrices $\Phi$, $\Phi_p$ and reward matrix $r$ can be found in Section 4 in~\cite{Lagoudakis2003LSPI}.

OPE methods are broadly classified into three categories: Direct, Importance Sampling, and Hybrid Methods~\cite{voloshin2020empirical}. 
\emph{Direct Methods} estimate the value of the evaluation policy by solving for the fixed point of the Bellman Equation~\eqref{eq:bellman} with an assumed model for the state-action value function $q$ or the transition model $P$. We illustrate our attack on one of the most popular Direct Methods, namely the \emph{Bellman Residual Minimization}~(BRM) method~\cite{voloshin2020empirical,Farahmand2008BRM}. BRM solves a sequence of supervised learning problems with state-action features $\phi(s, a)$ as the predictor and the 1-step Bellman update $\mathcal{T}^{\pi}q=r + \gamma Pq$ as the target response. 
%
The objective optimized in BRM is the Mean Squared Bellman residual (MSBR), defined as a weighted $L_2$ norm:
%
\begin{equation} \label{BRM_projectedbellman}
  \msbr(\eta) \;=\;  \|q_{\eta}-\mathcal{T}^{\pi} q_{\eta}\|^2_W~.
\end{equation}
%
Here, the linear Q-value function $q_\eta$ is parameterized by $\eta$ as $q_\eta=\Phi\eta$. The weight matrix is computed as $W=\diag[\mu^{\pi}]$ where $\mu^\pi\in[0,1]^{|\states|}$ represents the stationary state distribution of policy $\pi$. The value of a policy is then computed as
%
\begin{equation}
\label{eq:brm}
  v_\text{BRM} \;=\;  \sum_{s\in D_0} \sum_{a\in\actions} p_0(s) \cdot \pi(s,a) \cdot q_{\eta}(s,a)~.
\end{equation}

\emph{Importance Sampling Methods}~(IS)~\cite{kahn1953Methods} are based on Monte-Carlo techniques and compute unbiased but high-variance value estimates.
The key idea is to compute the value of policy $\pi$ as the weighted average of the returns of the trajectories in $D$, where each trajectory is re-weighted by its probability of being observed under evaluation policy $\pi$. We focus on attacking three popular variants of importance sampling methods, namely the \emph{Per-Decision, Consistent Weighted Per-Decision}, and \emph{Weighted} IS methods (PDIS, CPDIS, WIS)~\cite{Precup00temporalabstraction,Thomas2015SafeRL,rubenstein1981MC}.
Let $g^i_{T} = \sum_{t=0}^{T} \gamma^t r^i_t$ represent the returns observed for the $i^\text{th}$ trajectory in the dataset $D$ and assume that the behavior policy is parameterized by $\theta_b$ and estimated from data $D$ using maximum likelihood estimation (MLE)~\cite{vaart1998Asymptotic}. In this setting, the MLE method effectively minimizes the Cross Entropy Loss (CEL) on the predictions of the behavior policy. In order to define the OPE estimates of the value functions, we need the importance sampling weights $\rho^i_{0:t}$ for time step $t$ defined as
%
\[
  \rho^i_{0:t} = \prod_{t'=0}^t \frac{\pi(s^i_{t'}, a^i_{t'} )}{\pi_b^{\theta_b}(a^i_{t'}| s^i_{t'})}~.
\]
Here, the estimate of the behavior policy is defined as $\pi^{\theta_b}_{b}(a |s) = \exp( \phi(s,a) \theta_b ) (\sum_{a'\in\actions}\exp(\phi(s,a') \theta_b))^{-1}$ for each $s\in\states$ and $a\in\actions$.  Then
the WIS, PDIS, and CPDIS value function estimates are  defined as
\begin{align}
   v_\text{WIS}&= \left(\sum_{i=1}^N \rho^i_{0:T}\right)^{-1} \sum_{i=1}^N \rho^i_{0:T} g^i_T, \label{eq:wis}\\ 
  v_\text{PDIS} &= \frac{1}{N} \sum_{i=1}^N \sum_{t=1}^{T} \gamma^{t-1} \rho^i_{0:t} r^i_t, \label{eq:pdis}\\
  v_\text{CPDIS} &= \sum_{t=1}^{T} \gamma^{t-1} \frac{ \sum_{i=1}^N \rho^i_{0:t} r^i_t}{\sum_{i=1}^N \rho^i_{0:t}} \label{eq:cwpdis}.
 \end{align}

\emph{Hybrid Methods} combine both Direct and IS methods to generate value estimates with low bias and variance. An important hybrid method is the \emph{Doubly Robust}~($\dr$) estimator~\cite{jiang16doublyrobust}, which decreases the variance in the IS estimate by using the estimate from a method like BRM. The $\dr$ and Weighted DR ($\wdr$) estimators are given by
%
\begin{equation}
\label{eq:dr}
\begin{aligned}
v_\text{DR} &= \frac{1}{N} \sum_{i=1}^N \sum_{t=0}^{T-1} \rho^i_{0:t} w_t^i + \frac{1}{N} \sum_{i=1}^N v_{\eta}(s_0^i),\\
v_\text{WDR} &= \sum_{i=1}^N \sum_{t=0}^{T-1} \frac{\rho^i_{0:t}}{\sum_{j=1}^N \rho^j_{0:t} } w_t^i + \frac{1}{N} \sum_{i=1}^N v_{\eta}(s_0^i),
\end{aligned}
\end{equation}
%
where $w_t^i= ( r^i_t -  q_{\eta}(s^i_t,a^i_t) + v_{\eta}(s^i_t))$ and $v_{\eta}(s^i_t)=\sum_{a\in\actions}\pi(s^i_t,a)\cdot q_{\eta}(s^i_t,a)$.
Here the parameters of the value function $q$ are estimated using Direct Methods like $\brm$. Because empirical studies show that there are no clear winners among the three methods \citep{voloshin2020empirical}, we investigate attacks on representative methods from each type.
\section{DOPE Framework} \label{sec:Framework}
We first present our attack framework called DOPE for \emph{D}ata poisoning attacks on \emph{O}ff-\emph{P}olicy \emph{E}valuation. Then we demonstrate how to use the framework to attack the three types of OPE methods discussed in Section~\ref{sec:background}.
The objective and scope of the attacks considered in DOPE are as follows.

\textbf{Scope}: We assume the setting of a white-box attack, i.e. the attacker has access to the batch data $D$, evaluation policy $\pi$, the value of the discount factor $\gamma$, and the attacker knows how the agent estimates the behavior policy and the state-action value function from the data. This kind of a setting is commonplace in the healthcare domains~\cite{gottesman2020interpretable,Ernst2006Clinical,Yu2021Healthcare} where models are typically benchmarked and often made available to the general public so that they can be independently vetted and validated before deployment. 
Further, for the attack to be unnoticeable, we allow the attacker to perturb at most an $\alpha$ fraction of the transitions in $D$ while conforming to some perturbation budget $\eps \ge 0$ to be defined later. 

\textbf{Objective}: The goal of the attacker is to add small adversarial perturbations to a subset of transitions in $D$ such that it maximizes the error in the value estimate of a given policy in the desired direction.
This means that the attacker may choose to decrease or increase its estimated value for the policy being evaluated such that a good evaluation policy is rejected or a bad evaluation policy is approved. 

\textbf{Components:} The DOPE framework for a given OPE method has four major components:
\emph{Features ($\Psi$):} the part of the transition tuples targeted by the attack;
\emph{Value estimation function ($\rho$):} function used by the OPE method for computing the value;
\emph{Estimated parameter ($\theta$): } model parameters learned by the OPE method from the data;
\emph{Loss function ($L$): } loss optimized by the OPE method for model-fitting.
We define each component in detail in \cref{sec:attack_methods}.
We can now formulate our attack model as a problem of finding the perturbation matrix $\Delta=(\delta_i)_{i=1}^n, \delta_i \in\real^{Q}$ that maximizes the difference between values found using the perturbed and the original data under constraints dictating that the perturbations are small:
\begin{subequations} \label{eq:bilevel}
\begin{align}
   \maximize{\Delta \in \real^{n \times Q}}  
    \rho{(\tpert,\Psi+ \Delta)} - \rho(\torg,\Psi)\label{eq:bilevel_obj} \\
    %
    \stc  \tpert \;\in\;  \argmin_{\theta \in \real^P} \, L(\theta, \Psi+ \Delta)\label{eq:bilevel_one}  \\
    & \torg \;\in\;  \argmin_{\theta \in \real^P} \, L(\theta, \Psi)\label{eq:bilevel_one_dash}  \\
    %
     & \| \delta_i \|_{p} \;\leq\; \eps, \quad i = 1,\ldots, n   \label{eq:bilevel_two} \\
    %
    & \sum_{i=1}^n \mathbf{1}_{\| \delta_{i} \| \neq 0} \;\leq\; \alpha \cdot n.
    %
    \label{eq:bilevel_three}
\end{align}
\end{subequations}

\begin{table*}[htbp!]
    \centering
    \begin{tabular}{l|llll}
 \toprule
 %
 Method & Parameters $\theta$ & Features $\Psi$ & Function $\rho(\Psi)$ &  Loss $ L(\theta,\Psi)$ \\
 \hline
 $\brm$ (\citet{Farahmand2008BRM}), Eq. \eqref{eq:brm} & $\eta$ in $q_\eta$ & $\Phi$ or $r$ &  $v_\brm$ & MSBR \\
 WIS (\citet{rubenstein1981MC}), Eq. \eqref{eq:wis} & $\theta_b$ in $\pi_b^{\theta_b}$ & $\Phi$ or $r$ & $v_\text{WIS}$ & CEL \\
 PDIS (\citet{Precup00temporalabstraction}), Eq. \eqref{eq:pdis} & $\theta_b$ & $\Phi$ or $r$ & $v_\text{PDIS}$ & CEL \\
 CPDIS (\citet{Thomas2015SafeRL}), Eq. \eqref{eq:cwpdis} & $\theta_b$ & $\Phi$ or $r$ & $v_\text{CPDIS}$ & CEL \\
WDR/DR (\citet{jiang16doublyrobust}), Eq. \eqref{eq:dr} & $\theta_b$, $\eta$ & $\Phi$ or $r$ & $v_\text{WDR}\ \text{or}\ v_\text{DR}$ & CEL + MSBR or MSBR \\
     \bottomrule
\end{tabular}
    \caption{Settings for the four components of the DOPE attack for five different OPE methods.    \label{table:attack_templates}}
\end{table*}

The DOPE objective in~\eqref{eq:bilevel_obj} increases the estimated value from the original value, thereby increasing the error. Alternatively, if the attacker wants to decrease the estimated value of the given policy, they may do so by simply changing the sign of the objective. The constraint~\eqref{eq:bilevel_one} estimates the optimal parameter $\tpert$ from $D$ after perturbing $\Psi$ to $\Psi+\Delta$. The constraint~\eqref{eq:bilevel_two} ensures that the perturbation added to each sample $\delta_i$, i.e. $i^\text{th}$ row of $\Delta$, is limited to the user-defined budget $\eps$ in $\ell_p$ norm. This prevents the attack framework from generating adversarial transitions that can be easily detected as anomalous. Further, the constraint~\eqref{eq:bilevel_three} limits the number of transitions that the attacker can perturb. Finally, note that $\torg$ is only computed once with the original features $\Psi$ and $\rho(\torg,\Psi)$ is a constant that can be ignored while solving the optimization problem. 

\subsection{Attacking OPE methods using DOPE}\label{sec:attack_methods}

We are now ready to formally define the four components of the DOPE framework.
\cref{table:attack_templates} summarizes the choice of these components for each OPE method we attack.\\ 
%
(a) \emph{Features}:
Let $\psi(s,a,r) \in \real^{Q}$ be an arbitrary component of the transition tuple $\langle s,a,r \rangle$ in $D$ that is perturbed by the attacker. For example,  $\psi(s,a,r)$ could either be the state features $\Phi$ or the reward vector $r$.
We will use $\Psi \in \real^{n \times Q}$
to represent the sample matrix of  $\psi(s,a,r)$ constructed from the $n$ samples in $D$. \\
(b) \emph{Parameters}:
The parameters $\theta(\Psi) \in \real^P$ are the parameters of interest for a given OPE method, written as a function of $\Psi$ to clarify that these are estimated from samples in $D$. In $\brm$, $\theta$ represents the parameters of the Q-value function $q_\eta(s,a)$, whereas in IS methods,  $\theta$ represents the parameters of the estimated behavior policy $\pi_b^{\theta_b}(a|s)$. \\
(c) \emph{Loss function}:
The loss function $L(\theta,\Psi)$ with $L\colon \real^{P} \times \real^{n \times Q} \to \real$ is the empirical loss optimized by the OPE method to derive the optimal parameter $\theta(\Psi) \in  \argmin_{\theta' \in \real^P} \, L(\theta', \Psi)$ from the data. As an example, $L$ in $\brm$ and $\dr$ is the MSBR error, whereas in IS methods, $L$ is the CEL loss optimized to estimate the behavior policy. \\
(d) \emph{Value estimation function}:
Finally, the value estimation function $\rho(\theta(\Psi),\Psi)$ with $\rho: \real^{P} \times \real^{n \times Q} \to \real $ is the function used by the OPE method to compute the mean value of $\pi$ at the initial states. For example, in $\brm$, $\rho$ represents $v_\brm$. We will use the shorthand $\rho(\Psi):= \rho(\theta(\Psi),\Psi)$. 

The loss function $L(\theta,\Psi)$ must be twice continuously differentiable and linearly separable with respect to the transitions in $D$. We provide some examples of such loss functions such as MSBR and CEL and show that they are twice continuously differentiable in~\cref{app:examples}. Further, the value estimation function $\rho(\theta,\Psi)$ also needs to be continuously differentiable with respect to $\theta$ and $\psi$. These assumptions, as \cref{sec:optimization} shows, are important for the influence functions to be well-defined~\cite{koh2017influence}.

\section{Optimization}\label{sec:optimization}

In this section, we discuss the challenges of optimizing the DOPE problem in \eqref{eq:bilevel} and propose an approximate scheme for finding the optimal adversarial perturbations. 

There are two major challenges in solving the optimization problem in~\cref{eq:bilevel}. First, the constraint~\eqref{eq:bilevel_three} is non-differentiable and requires the attacker to select a set of at most $\alpha n$ transitions, such that perturbing these transitions results in maximum change in the value of the policy in the desired direction. It is important to realize that finding this set requires perturbing all possible subsets of data $\Psi$ whose size is at most $\alpha n$ and re-estimating the optimal parameter $\theta$ for each perturbation. The number of such subsets is larger than $\binom{n}{\alpha n}$. Thus, finding the optimal set is computationally infeasible. Second, observe that \eqref{eq:bilevel} is a bilevel optimization problem where the inner-level problem \eqref{eq:bilevel_one} is non-linear in the case of OPE methods which makes it generally NP-Hard to solve~\cite{Wiesemann2013Pessimistic}. 

We address these two challenges by deriving an approximation to the bilevel optimization problem~\eqref{eq:bilevel} using the Taylor expansion of \cref{eq:bilevel_obj}. We show that the resulting problem is simpler to optimize and has a closed-form solution. In \cref{sec:experiments}, we empirically demonstrate the effectiveness of our approximate solution on several domains. 

\textbf{Approximation} We define the influence score of the $i^\text{th}$ data point, $I_{\Psi_i}=\nabla_{\Psi_i} \rho(\Psi)$, as the rate of change in the value estimate $\rho(\Psi)$ with respect to the data point $\Psi_i\equiv \psi(s_i,a_i,r_i)$. Then, using the first-order Taylor expansion of $\rho(\Psi+ \Delta)$, we can approximate the net error in the value-function estimate  $\rho(\Psi+\Delta) - \rho(\Psi)$ as the weighted sum of the influence scores of individual data points, 
\begin{equation}\label{first_order_approx}
     \begin{aligned}
 \rho(\Psi+\Delta) - \rho(\Psi) &  \approx
    \sum_{i=1}^n (\nabla_{\Psi_i}{\rho(\Psi)})\tr \delta_i.
    \end{aligned}
\end{equation}
Using Eq.~\eqref{first_order_approx} reduces the optimization in~\eqref{eq:bilevel} to 
\begin{equation} \label{bilevel:approx}
\begin{aligned}
  &\max_{s\in \{0,1\}^n} \max_{\{\delta_i\}_{i=1}^n} %
    \left\{\sum_{i=1}^n 
    s_iI_{\Psi_i} \tr \delta_i \Bigm\vert \|\delta_i \|_p \leq \eps, \forall i\right\} ,\\
   &\text{ subject to } \sum_{i=1}^n s_i = \alpha \cdot n~.
\end{aligned}
\end{equation}
Here, $s\in\{0,1\}^n$ is a vector of binary indicators such that $s_i=1$ indicates that the $i^\text{th}$ transition is amongst the $\alpha n$ transitions selected to be perturbed. We can now compute an approximately optimal set of perturbations in polynomial time as shown in \cref{prop_greedy} for norms $p=1,2,\infty$.

\begin{thm}\label{prop_greedy}
  Let $(s^*,\Delta^*)$ be an optimal solution to the optimization problem in~\eqref{bilevel:approx} and define the \emph{approximate influential set} as $S^*_{\alpha}=\{i \in \{1,\dots,n\} : s^*_i =1\}$. Then,
 \begin{enumerate}[nosep]
    \item $S^*_{\alpha}$ can be constructed by choosing the set of $\alpha n$ transitions with the largest $q$-norm of their influence scores $I_{\Psi_i}$. Here, $q$-norm is the dual of $p$-norm used in \eqref{bilevel:approx}, i.e. $1/p+1/q=1$.
    \item For all $i \in [1,\dots n]$, the optimal $\delta^*_i$ for $p=1,2,\infty$ can be computed in closed-form as
    \begin{align*}
            \begin{aligned}
        & \text{ If } p=\infty, \text{ then } \, \delta^*_{i} = \eps \cdot  \operatorname{sign}(I_{\Psi_i}) \\
         & \text{ If } p=2, \text{ then } \, \delta^*_{i} =  \eps \cdot \frac{I_{\Psi_i}}{\|I_{\Psi_i}\|_2}. \\
         & \text{ If } p=1, \text{ then } \, \forall j \in [1,Q], \,  \\ 
        &  \delta^*_{i,j} =  \begin{cases}
         \eps \cdot \operatorname{sign}(I_{\Psi_i}(j))  &\,  \text{ if } \, j \in {\displaystyle\argmax_{m \in [1,Q]}} \lvert I_{\Psi_i}(m) \rvert \\
        0 &\, \text{ otherwise }
        \end{cases}
    \end{aligned}
        \end{align*}
\end{enumerate}
\end{thm}

\begin{rem}[Relation to optimal solution]
Solving the approximate problem~\eqref{bilevel:approx} gives us a lower bound to the optimal solution of the original problem \eqref{eq:bilevel}. Suppose $\Delta^*$ is the optimal solution for~\eqref{bilevel:approx}  that we get from \cref{prop_greedy} while $\Delta^{**}$ is the (intractable) optimal solution for \eqref{eq:bilevel}. Then, the maximum error in the value function is at least as much as what we get, 
\begin{align*}\label{eq:lower_bound}
    \rho(\Psi+\Delta^{**}) - \rho(\Psi) & = \max_{\Delta \in \real^{n \times Q}}  \rho(\Psi+\Delta) - \rho(\Psi)  \\ & \geq \rho(\Psi+\Delta^{*}) - \rho(\Psi)~.
\end{align*}
\end{rem}

\textbf{Influence scores} Finally, it remains to discuss how to compute the influence scores of each transition in $D$: $I_{\Psi_i}= \nabla_{\Psi_i} \rho(\Psi)$. Recall that $\rho(\Psi)$ is not only a function of $\Psi$ but also $\theta(\Psi)$ which is also a function of $\Psi_i$.
Hence, using the chain rule, we get for each $i \in [1 \dots n]$ that 
\begin{equation}\label{chain_rule}
    \begin{aligned}
I_{\Psi_i} \approx   \frac{\partial{\rho(\theta,\Psi)}}{\partial \Psi_i}\bigg|_{\torg(\Psi)}  +   \frac{\partial{\rho(\theta,\Psi)}}{\partial \theta}\bigg|_{{\torg}(\Psi)} \frac{\partial {\theta}(\Psi) }{\partial \Psi_i}.
\end{aligned}
\end{equation}

\begin{algorithm}[htbp!] 
\SetAlgoLined
 \SetKwFunction{merge}{merge}
  \KwIn{Features $\Psi$, attack budget $\eps$, $\%$ of corrupt transitions $\alpha$, norm-type $p$}
  Compute $\torg \gets \argmin_{\theta \in \real^P} L(\theta,\Psi)$ \;
  Compute $\|I_{\Psi_{{i}}}\|_q$ for all $i = 1, \dots, n$ using~\eqref{chain_rule} \;
  $S^*_{\alpha} \gets \alpha \cdot n$ indices $i$ with largest $\|I_{\Psi_i}\|_q$ \;
  \For{$k \in S^*_{\alpha}$}{
    Let $\delta^{*}_{{k}} \in {\displaystyle \argmax_{\delta \in \real^Q}} \{I_{\Psi_k}^\top \delta \mid \|\delta \|_p \leq \eps\}$ using Item 2 in \cref{prop_greedy}\;
  }
  Use line search to find the largest step-size $\beta \in [0,1]$ $\st$ the value estimate increases: $\rho(\theta,  \Psi + \beta \cdot \delta^{*}) - \rho(\theta,\Psi) > 0$\;
    %
    \Return{$\Psi =   \Psi + \beta\cdot\delta^{*}$} \;
    \caption{OPE Attack Algorithm \label{sec:algorithm}}
    \label{algorithm1}
\end{algorithm}

The computation of the partial derivative $\nicefrac{\partial \theta(\Psi)}{\partial \Psi_i}$ is not straightforward. However, we can approximately compute it as $\nicefrac{\partial  \theta(\Psi)}{\partial \Psi_i}=H_{\torg(\Psi)}^{-1} \nicefrac{\partial^2 L(\theta,\Psi_i)}{\partial \theta \partial \Psi_i}\big|_{\torg(\Psi)}$ where $H_{\torg(\Psi)}  = \nicefrac{\partial^2 L(\theta,\Psi)}{\partial \theta^2}\big|_{\torg(\Psi)}$~\citep[][Section 2.2]{koh2017influence}. See \cref{apps:preliminary} for more details.

To compute $I_{\Psi_i}$ in \eqref{chain_rule}, we require that $L(\theta,\Psi)$ is twice continuously differentiable and linearly separable with respect to the transitions in $D$, and $\rho(\theta,\Psi)$ is continuously differentiable with respect to $\theta$ and $\Psi$.
Although these conditions may seem restrictive, they hold true for the OPE methods we have studied.

The derivatives in~\eqref{chain_rule} can be easily computed using automatic-differentiation software like PyTorch~\cite{torch2019}. Computing the influence score $I_{\Psi_i}$ can be very expensive due to the Hessian-inverse term $H_{\torg(\Psi)}^{-1}$, which requires $\mathcal{O}(P^3)$ operations to compute. Fortunately, as shown by~\citet{koh2017influence}, we can avoid the computation of the Hessian-inverse term while computing $I_{\Psi_i}$ by instead first approximately computing the Hessian-inverse vector product
%
\[
  c_\text{prod}=H_{\torg(\Psi)}^{-1} \frac{\partial{\rho(\theta,\Psi)}}{\partial \theta}\bigg|_{{\torg}(\Psi)}
\]
%
in $\mathcal{O}(nP)$ time using Pearlmutter's method~\cite{pearmutter1994fast} and a first-order Taylor approximation of the Hessian-inverse matrix, and then applying Pearlmutter's method again to compute the Hessian-vector product $c_\text{prod} \cdot \nicefrac{\partial^2 L(\theta,\Psi_i)}{\partial \theta \partial \Psi_i}\big|_{\torg(\Psi)}$ in $\mathcal{O}(P)$ time.

\textbf{Algorithm outline} We outline how to approximately solve the DOPE optimization~\eqref{eq:bilevel} in Algorithm~\ref{sec:algorithm}, which consists of two main steps. In the first step, we compute an approximation of the optimal set of transitions to perturb $S^*_{\alpha}$ by choosing $\alpha n$ points in $\Psi$ with the largest $q$-norm of their influence scores $\|I_{\Psi_{\cdot}}\|_q$. In the second step, we compute $\Delta$ for all points in $S^*_{\alpha}$ using \cref{prop_greedy} and use line search to find the optimal step size that guarantees an increase in the error of the value estimate. The second step may be repeated until no further perturbation to data points in $S^*_{\alpha}$ results in an increase in the error in the value estimate.

The main computational bottleneck is in computing the influence score for each data point. In some cases, this cost can be reduced. We derive closed-form expressions for the influence score in the case of the linear $\brm$ method under two settings a) when the adversary perturbs only the state features or b) only the reward features.
\begin{prop}\label{prop:BRM_rewards} Suppose the attacker only perturbs the reward vector $r$ constructed from the batch of transition tuples $D$. Then, the influence score of the $i^\text{th}$ data point $I_{r_i,\theta,\Psi}$ for the $\brm$ method can be computed as
\begin{equation}\label{BRM_reward}
I_{r,\theta,\Psi} = 4 p_0^T \Phi_{0} \left( ({\Phi} - \gamma  \cdot{\Phi}_p)^2 \right)^{-1} \left( \Phi - \gamma  \cdot \Phi_p\right)\,,
\end{equation}
where $\Phi_{0}$ is a sample matrix of initial state features constructed from $D_0$.
\end{prop}
\begin{prop}\label{prop:BRM_features}
Suppose the attacker only perturbs the state feature matrix $\Phi$. Then, the influence score of the $i^\text{th}$ data point $I_{\phi(s_i,a_i),\theta,\Psi}$ for the $\brm$ method can be computed as
\begin{equation}\label{BRM_features}
\begin{aligned}
I_{\Psi,\theta,\Psi} &= 4 \cdot p_0^T \cdot \Phi_{0} \cdot \left( ({\Phi} - \gamma \cdot {\Phi}_p)^2 \right)^{-1}\cdot \\ & \left(2 \cdot w \otimes \Phi - 2\cdot \gamma \cdot w \otimes \Phi_p \right. \\ & \left. + 2 \cdot \mathbf{I} \otimes (\Phi \cdot w - \gamma \cdot \Phi_p \cdot w - r) \right),
\end{aligned}
\end{equation}
where $\otimes$ denotes the Kronecker product between matrices.
\end{prop}
\Cref{prop:BRM_rewards,prop:BRM_features} follow from the chain rule and basic properties of the gradient operator for matrices.

\section{Experiments}\label{sec:experiments}
In this section, we investigate the strengths and weaknesses of the DOPE attack. First, we evaluate the effectiveness of the DOPE attack on OPE methods for different values of attack budget and identify which methods are most vulnerable to adversarial contamination. Second, we compare the performance of DOPE with three custom baselines: Random DOPE, FGSM-based Attack, and Random Attack. 

\subsection{Domains and Experimental Setup}

We first describe the five domains used in the experiments.
\emph{Cancer:} This domain~\cite{gottesman2020interpretable} models the growth of tumors in cancer patients. It consists of 4-dimensional states which represent the growth dynamics of the tumor in the patient, and two actions that indicate if a given patient is to be administered chemotherapy or not at a given time step. \newline
\emph{HIV:} The HIV domain \citep{hiv} has 6-dimensional states representing the state of the patient, and four actions that represent four different types of treatments.  \newline
\emph{Mountain Car:} In the Mountain Car~\cite{brockman2016openai} domain, the task is to drive a car positioned between two mountains to the top of the mountain on the right in the shortest time possible. The 2-dimensional state represents the car's current position and the current time-step, and the three actions represent: drive forward, drive backward, and do not move.  \newline
\emph{Cartpole:}
The Cartpole domain~\cite{brockman2016openai} models a simple control problem where the goal is to apply a force of $+1$ or $-1$ to keep a pole attached to a moving cart from falling. The 2-dimensional state represents the cartpole dynamics, and the two actions represent the force applied to the pole.  \newline
\emph{Continuous Gridworld:}
The Continuous Gridworld is a custom domain that consists of a 2-dimensional state space that represents the coordinates of the agent, and two actions $a_0,a_1$ that determine the direction and step size of the agent. The agent begins at coordinate $(1,1)$ and moves towards coordinates $(50,50)$ to maximize its rewards.

\paragraph{Implementation details}
For each domain, we apply Deep Q-learning (DQN) to a randomly initialized neural network policy and obtain partially optimized deterministic policies.
We fix the deterministic policy obtained for each domain as the evaluation policy and use an $\epsilon$-greedy version of the evaluation policy as our behavior policy~\cite{gottesman2020interpretable}. We set $\epsilon = 0.1$ for the HIV domain and $\epsilon = 0.05$ for other domains. 
We use the behavior policy to generate five datasets, each containing $N$ trajectories of length $T$ (see \cref{app:experiments} for the values of $N$ and $T$), and use them to evaluate the value of the evaluation policy. Our code is made available in the supplementary materials.


For any given OPE method that learns the Q-value function of the evaluation policy from data, we use linear function approximators to represent these Q-value functions and optimize the squared Bellman residual with $L_2$ regularization to learn it.
Note that we consider linear function approximations in line with the precedent set by other recent works in the off-policy evaluation literature~\cite{gottesman2020interpretable,Jin2020Provably}. Linear function approximations are commonly employed in the off-policy evaluation literature due to their simplicity, low computational complexity, and convergence guarantees~\cite{gottesman2020interpretable}. While our framework is general enough to accommodate any differentiable function approximations including deep learning models, computing the influence functions for non-linear function approximations is computationally expensive, and the time complexity grows as the square of the number of model parameters $\theta$. Hence, we resort to linear function approximators. Note, however, that we try to offset the limitations in the expressive power of linear function approximations by leveraging complex state representations obtained from the penultimate layer of a trained deep Q-network as input features to the linear function approximations in our experimentation (see~\cref{app:experiments}). 

%
For OPE methods that require learning the behavior policy from the data, we train a multinomial logistic regression model to predict the behavior policy's action probabilities using maximum likelihood estimation. Following standard practice in RL, we clip the behavior probabilities from below at $0.01$ to prevent the importance sampling weights from exploding. Note that although clipping the behavior probabilities prohibits the attacker from making individual behavior policy action probabilities too small, an attacker can still leverage the fact that the importance sampling weights are a function of the product of behavior policy action probabilities, and thus, the importance weights can be made very large by simply pushing the behavior policy action probabilities of as many points as possible close to the clipping threshold. 

In all our experiments, we perturb only state features. Finally, the values of the hyperparameters used in our experiments are discussed in \cref{app:experiments}.

\begin{table*}[h!]
\centering
\begin{tabular}{|p{2cm}||rr|rr|rr|rr|rr|} 
 \toprule
%
%
%
  \multirow[t]{2}{*}{Domain}    & \multicolumn{2}{|c|}{BRM} & \multicolumn{2}{|c|}{WIS} & \multicolumn{2}{|c|}{PDIS} & \multicolumn{2}{|c|}{CPDIS}& \multicolumn{2}{|c|}{$\wdr$} \\
 \cline{2-11}
    & $lb$ & $ub$ & $lb$ & $ub$ & $lb$ & $ub$ & $lb$ & $ub$ & $lb$ & $ub$ \\
 \hline
 Cancer & 0.85 & 0.97 & 0.69& 0.69 & \textbf{8.95} & \textbf{10.69} & 0.48 & 0.58 & 3.36 & 3.72 \\
HIV & \textbf{343.35} & \textbf{440.92} & 0.0 & 0.1 & 1.4& 2.42 & 0.09 & 0.24 & \textbf{139.71} & \textbf{893.31} \\
Gridworld & \textbf{94.76} & \textbf{98.35} & 0.0 & 0.0 & \textbf{97.15} & \textbf{98.25} & 0.0 & 0.0 & \textbf{25.5} & \textbf{27.31} \\
Cartpole & 0.0 & 0.0 & 0.02 & 0.05  & \textbf{4.46e9} & \textbf{4.08e10} & 0.0 & 0.0 & 0.0 & 0.0 \\
MountainCar & 0.05& 0.07 & \textbf{100.0} & \textbf{100.0} & \textbf{98.37} & \textbf{99.62} & \textbf{47.38} & \textbf{98.68} &  0.02 & 0.03\\
 \bottomrule
\end{tabular}
\caption{\label{table:summary} Summary of the errors achieved by data poisoning across domains and OPE algorithms at $\eps=0.5\sigma$ and $\alpha=1.0$ and $p=1$. Here $lb$ and $ub$ denote the lower limit and upper limit of 95\% bootstrap confidence intervals of interquartile mean of percentage error in the value estimates, over 10 runs. We observe that the attack is successful on most of the methods across domains. CPDIS and WIS are usually the most resilient OPE methods.
%
} 
%
\end{table*}

 \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/cancersubset_experiment88_budget_0.png}
    %
     \caption{Cancer    \label{fig:R1}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/hivsubset_experiment88_budget_0.png}
    %
     \caption{HIV    \label{fig:R2}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/customsubset_experiment88_budget_0.png}
    %
     \caption{Gridworld \label{fig:R3} }
    \end{subfigure}
    \caption{\label{exp:effectiveness1}\cref{fig:R1,fig:R2,fig:R3}
  compare the effect of the DOPE attack on BRM, WIS, PDIS, CPDIS, and WDR methods in the Cancer, HIV, and Continuous Gridworld domains (left to right) 
  for different values of attacker's budget $\eps= \text{frac} \cdot \sigma$ and $p=1$ ($\ell_1$ norm). 
  The larger the value of $\text{frac}$, the larger the perturbations added by the DOPE attack, and accordingly, we observe larger errors in the value estimates. 
%
  } 
     \end{figure*}
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/cancersubset_experiment88_corrupt_0.png}
    %
     \caption{Cancer    \label{fig:R11}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/hivsubset_experiment88_corrupt_0.png}
    %
     \caption{HIV   \label{fig:R21}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/customsubset_experiment88_corrupt_0.png}
    %
     \caption{Gridworld \label{fig:R31} }
    \end{subfigure}
    \caption{ \label{exp:effectiveness2}\cref{fig:R11,fig:R21,fig:R31}
   compare the effect of the DOPE attack on BRM, WIS, PDIS, CPDIS, and WDR methods in the Cancer, HIV, and Continuous Gridworld domains (left to right) for different percentages of corruption $\alpha$ at $\eps=1.0\sigma$ and $p=1$ ($\ell_1$ norm). 
  The larger the value of $\alpha$, the larger the number of points perturbed by the DOPE attack, and accordingly, we observe larger errors in the value estimates. 
%
   }
     \end{figure*}
   
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.75\linewidth]{plots/experiments12/FQE-hiv-influenceexperiment12_0.png}
    %
     \caption{BRM    \label{fig:R13}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.75\linewidth]{plots/experiments12/IS_pdis-hiv-influenceexperiment12_0.png}
    %
     \caption{IS    \label{fig:R23}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.75\linewidth]{plots/experiments12/WDR_wdr-hiv-influenceexperiment12_0.png}
    %
     \caption{WDR \label{fig:R33} }
    \end{subfigure}
    %
    %
    %
    %
    %
    %
    \caption{ \label{exp:baselines1}\cref{fig:R13,fig:R23,fig:R33} compare the effects of Random attack, Random DOPE attack (an ablated version of DOPE), FGSM-based attack, and DOPE attack on the error in the value function estimates of BRM, IS, and WDR methods (left to right) in the HIV domain. The percentage error in the Random attack and FGSM-based attack is small relative to the percentage error due to DOPE and Random DOPE attack, and hence their curves lie close to the x-axis. 
    DOPE attack outperforms both the Random DOPE and Random attacks at nearly all values of the attacker's budget.
    }
     \end{figure*}
     
We evaluate the effectiveness of our attack model by computing the percentage error in the value function estimate relative to the initial value estimate. We report the $95\%$ bootstrap confidence intervals of the interquartile mean (IQM) of percentage error using our results from the 10 runs (10 datasets) since the IQM confidence intervals are found to be more reliable in practice~\cite{agarwal2021deep}. In this setting, a large percentage error indicates that the OPE method is less robust to adversarial contamination.


\subsection{Effectiveness of DOPE Attack}
Here we evaluate the effectiveness of the DOPE attack on five OPE methods for a range of attack budgets. In our first experiment, we fix the percentage of corrupt data points $\alpha = 0.05$ and vary the budget $\eps$ as $\text{frac} \cdot \sigma$, where $\text{frac}$ varies from $0.0$ to $0.5$ in steps of $0.05$ and $\sigma$ is the standard deviation of all pairwise distances between the state features in the dataset, computed via $\sigma^2=\frac{2}{N\cdot(N-1)}\sum_{i=1}^N\sum_{j=i+1}^N \|\xi(s_i)-\xi(s_j)\|_p^2$. \Cref{exp:effectiveness1,exp:effectiveness3} compare the percentage error in the value estimate of the OPE methods in all domains. Our results show that even when corrupting only $5\%$ of the data points, the attacker need not perturb the state features significantly to achieve large errors in the value estimate. In fact, with a perturbation budget as small as $\eps=0.5\sigma$, DOPE can result in a substantial error in the policy's value in HIV, Cancer, Mountain Car, and Continuous Gridworld domains. Further, a larger attacker's budget means the DOPE model has more leeway on the perturbations that it can add to the dataset, and hence, we observe larger errors for larger budget values. Note that the percentage errors of CPDIS and WIS in~\cref{fig:R2,fig:R3} are too small to be visible in the plots. 

In the second experiment, we vary the percentage of corrupt data points between $0.0$ and $0.10$ with a step size of $0.02$ for all the domains (Figure~\ref{exp:effectiveness2}). We fix the perturbation budget $\eps$ to $1.0\sigma$. Our experimental results in \cref{exp:effectiveness1,exp:effectiveness2,exp:effectiveness3} demonstrate that corrupting only $5\%$ of the data points using DOPE is sufficient to observe a significant error in the value estimate of a given policy. It is important to realize that the attacker's budget $\eps$ is local to each data point and is not impacted by the number of points perturbed. Hence, we see that a larger percentage of corrupt data points yields a larger percentage error in the value estimates. Note that the percentage errors of CPDIS and WIS in~\cref{fig:R21,fig:R31} are too small relative to BRM and WDR and therefore are not clearly visible in the plots. 

Finally, we summarize the impact of DOPE attack ($\eps=0.5\sigma$ and $\alpha=1.0$, $p=1$) on all OPE methods and domains in \cref{table:summary}. 
It can be seen that the DOPE attack has a very high impact on BRM, PDIS, and WDR methods and an almost negligible impact on CPDIS and WIS methods. We hypothesize that CPDIS and WIS methods may be more robust because the weight normalization that they employ potentially minimizes the importance of any individual data point, especially when the rewards are uniformly distributed throughout the trajectory. On the other hand, the weights in PDIS are not normalized, and therefore, it appears that in Cartpole and HIV domains, the DOPE attack model is able to significantly impact the importance sampling weights and result in significant errors in the value estimates. In WDR, the attacker can introduce errors through both the Q-value function learned from the data and the importance sampling weights, and therefore, we observe significant errors in the value estimates of the $\wdr$ method in HIV and Gridworld domains. 

\subsection{Comparison with Baselines}
Here we compare the DOPE attack to three custom baselines: Random Attack, Random DOPE Attack (an ablation of the DOPE attack), and FGSM-based Attack. In Random Attack, we choose $\alpha n$ random points to perturb and sample perturbations for these points uniformly from the $\ell_1$-norm ball with a radius equal to the perturbation budget $\eps$. For more details on the sampling algorithm, see Algorithm 4.1 in~\cite{Calafiore1998samplinguniform}. In Random DOPE Attack, we select points randomly and update them using \cref{prop_greedy}. The purpose of using this ablation is to investigate the benefit of selecting data points to perturb based on their influence scores as suggested in \cref{prop_greedy}.
The third baseline is an FGSM-based OPE attack which is a variant of the Fast Gradient Sign Method (FGSM)~\cite{GoodfellowSS2015Explaining}, a popular test-time attack designed to elicit misclassification errors from supervised learning models. Note that FGSM has never been used to attack OPE methods in prior literature, and we are the first to introduce and leverage a variant of it as a baseline in this context. Our FGSM-based OPE attack baseline modifies the transition tuples (features) $\psi(s,a,r)$ to maximize the (supervised learning) loss ($L(\theta)$) optimized by the OPE method, thus resulting in sub-optimal estimates of $\theta$. Note that the FGSM-based OPE attack baseline does not directly maximize the error in the value function estimates, unlike our proposed framework. Given these baselines, we fix the value of $\alpha$ to $0.05$ and vary the budget $\eps$ from $0.0$ to $0.25$ with step size $0.04$.

For each dataset and each value of the budget $\eps$, we average the percentage change in the value estimate for Random DOPE attack and Random attack over 50 trials. Results for the HIV domain are shown in Figure~\ref{exp:baselines1}. See \cref{exp:baselines:cancer,exp:baselines:gridworld,exp:baselines:hiv,exp:baselines:mountaincar} in~\cref{app:experiments} for results on other datasets. 

The experimental results in \cref{exp:baselines1,exp:baselines:cancer,exp:baselines:gridworld,exp:baselines:hiv,exp:baselines:mountaincar} demonstrate that in contrast to the DOPE attack, the Random attack and FGSM-based attack fail to introduce any significant error in the value-function estimate and, therefore, cannot be used as an alternative to the DOPE attack model.
Further, it can be seen that when the points to perturb are randomly selected (Random DOPE), it is likely to result in a smaller adversarial impact than when influential data points are chosen for perturbations (DOPE). These results are not surprising as we would expect the value function to be highly dependent on the influential data points. In some domains like Cancer and HIV, there is very little difference between the performance of DOPE and Random DOPE attacks. We hypothesize that this is due to all data points having similar influence scores.

\section{Conclusion}

We proposed a novel data poisoning framework to analyze the sensitivity of OPE methods to adversarial contamination at train time. We formulated the data poisoning problem as a bilevel optimization problem and proposed a computationally tractable solution that leverages the notion of influence functions from robust statistics literature. Using the proposed framework, we analyzed the sensitivity of five popular OPE methods on multiple datasets from medical and control domains. %
Our experimental results on various medical and control domains demonstrated that existing OPE methods are highly vulnerable to adversarial contamination, %
thus highlighting the need for developing OPE methods that are statistically robust to train-time data poisoning attacks.
%

%
%
%
%
%

%
%
%
%
%
\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
The authors would like to thank the anonymous reviewers for their helpful feedback and all the funding agencies listed below for supporting this work. This work is supported in part by the NSF awards \#IIS-2008461 and \#IIS-2040989, and research awards from Amazon, Harvard Data Science Institute, Bayer, and Google. HL would like to thank Sujatha and Mohan Lakkaraju for their continued support and encouragement. The views expressed here are those of the authors and do not reflect the official policy or position of the funding agencies.
\end{acknowledgements}
%
\bibliography{lobo_674}
%

%

\clearpage
%
%
%
%
  
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%


%
%
%
%



%

%


%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%


%
%

%


%
%

%
%
%
%

%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%

%
%
%
%
%
    
%
    
%
%





%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%


%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%


%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%


%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%


%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%

%
%
%
%
%
%
%
%
%
%

%
%

%
%
%
%
%
%
%
%
%

%

%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%

%
%

%

%
%
%
%
%

%

%
%

%

    
%
%

%


%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%

%
%

%
%

%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%



%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
   
%
%
   
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
    
    
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
    
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
    
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
    
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
    
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
    
    
    
    
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%



\end{document}
