% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

% Core typography, graphics, table, math and cross-referencing packages.
% graphicx is loaded exactly once (it used to be requested twice).
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs} % for professional tables
\usepackage{nicefrac}
\usepackage{amsfonts,amsmath,amssymb,mathtools,amsthm}
\usepackage{multirow}
\usepackage{multicol}
\usepackage[capitalise,noabbrev]{cleveref}
\usepackage{color,caption,subcaption}
% Render all hyperlinks (citations, internal links, URLs) in black.
% NOTE(review): \hypersetup is called here before the explicit
% \usepackage{hyperref} further down, so hyperref is presumably already
% loaded by the document class -- confirm.
\hypersetup{colorlinks=true,citecolor=black,linkcolor=black,urlcolor=black}

% \DeclareMathOperator{\cvar}{CVaR}
% \DeclareMathOperator{\var}{VaR}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
% Bibliography: plainnat style, with the reference list introduced by an
% unnumbered subsubsection heading titled "References".
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
% Make plain \cite produce a parenthetical natbib citation (\citep).
% The redefinition takes exactly one mandatory argument, so optional
% pre/post notes given to \cite are not forwarded to \citep.
\renewcommand{\cite}[1]{\citep{#1}}

    
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
\usepackage{derivative}
% --- Notation shorthands (math mode) ---
% \norm{x}: x wrapped in auto-sized double-bar norm delimiters.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% Real numbers and the state / action sets.
\newcommand{\real}{\mathbb{R}}
\newcommand{\states}{\mathcal{S}}
\newcommand{\actions}{\mathcal{A}}
% Superscript decorations: each expands to a leading ^, so it must be placed
% directly after the symbol it decorates (e.g. v\opt, A\tr).
\newcommand{\opt}{^\star}
\newcommand{\popt}{^{\pi\opt}}
\newcommand{\pI}{^{\pi}}
\newcommand{\tr}{^\top}
\newcommand{\data}{\Psi}
% Helpers for typesetting optimization programs. Each one emits an alignment
% tab (&), so they only work inside an alignment environment.
\newcommand{\cs}{\\[1ex] & }
\newcommand{\minimize}[1]{\operatorname*{minimize}_{#1} \quad &}
\newcommand{\maximize}[1]{\operatorname*{maximize}_{#1} \quad &}
\newcommand{\R}{\mathcal{R}}
% Bold 1 and 0 symbols.
% NOTE(review): \bm comes from the bm package, and no \usepackage{bm} is
% visible in this file; presumably the class provides it -- confirm.
\newcommand{\one}{\bm{1}}
\newcommand{\zero}{\bm{0}}
\newcommand{\p}{\mathbb{P}}
\newcommand{\PP}{\mathcal{P}}
% Conditioning bar with thin spaces on both sides. This overrides the
% existing \ss command (hence \renewcommand).
\renewcommand{\ss}{\,\mid\,}
\newcommand{\rw}{\mathfrak{r}}
% rho with S / D / R / RA superscripts, and v^pi with SA / S subscripts.
\newcommand{\srob}{\rho^S}
\newcommand{\drob}{\rho^D}
\newcommand{\rrob}{\rho^R}
\newcommand{\sarob}{\rho^{RA}}
\newcommand{\saV}{v_{SA}^{\pi}}
\newcommand{\sV}{v_{S}^{\pi}}
\newcommand{\sr}{R}				% soft-robust identifier
\usepackage[font=small,labelfont=bf]{caption}
% \usepackage{algpseudocode}% http://ctan.org/pkg/algorithmicx
% \usepackage{algorithmic}
% set notation
% Blackboard-bold number sets; the *Plus variants append a "+" subscript.
\newcommand{\Real}{\mathbb{R}}
\newcommand{\RealPlus}{\Real_{+}}
\newcommand{\Int}{\mathbb{Z}}
\newcommand{\Natural}{\mathbb{N}}
\newcommand{\NaturalPlus}{\Natural_{+}}

\newcommand{\eps}{\varepsilon}
\newcommand{\To}{\longrightarrow}
\newcommand{\BX}{\bm{B}(X)}
% additional functions
% \E{x} / \Ex{sub}{x}: expectation with auto-sized square brackets, without
% and with a subscript. \Exp below has the same replacement text as \Ex up
% to spacing.
\newcommand{\Pow}[1]{\mathcal{P}\left(#1\right)}  % Power set
\newcommand{\E}[1]{\mathbb{E}\left[ #1 \right]}
\newcommand{\Ex}[2]{\mathbb{E}_{#1}\left[ #2 \right]}
% \P{x}: probability of x in square brackets; overrides the existing \P.
\renewcommand{\P}[1]{\mathbb{P}\left[ #1 \right]}
% NOTE: this two-argument \Pr is replaced by the zero-argument
% \renewcommand{\Pr} at the end of this group, so it never takes effect.
\renewcommand{\Pr}[2]{\mathbb{P}_{#1}\left[ #2 \right]}
\newcommand{\Var}[1]{\bm{Var}\left[ #1 \right]}
\newcommand{\Proj}[1]{\bm{P}_{#1}}
\newcommand{\cond}{\textrm{if }}
% \dotp{a}{b}: a and b in angle brackets, separated by a comma.
\newcommand{\dotp}[2]{\langle #1, #2 \rangle}
\newcommand{\eye}{\bm{I}}
\newcommand{\setp}{\mathsf{P}^s}
\newcommand{\Pf}{f}
% "s.t." as an operator, and a "subject to" row label that emits an
% alignment tab (&) for use in aligned optimization programs.
\newcommand{\st}{\operatorname{s.t.}}
\newcommand{\stc}{\operatorname{subject\,to} \quad &}
\newcommand{\Exp}[2]{\mathbb{E}_{#1} \left[ #2 \right] }
% Final definition of \Pr in this group: a bare blackboard-bold P, no arguments.
\renewcommand{\Pr}{\mathbb{P}}
% theta with upright "org" / "pert" subscripts.
\newcommand{\torg}{\theta_\text{org}}
\newcommand{\tpert}{\theta_\text{pert}}

% Fraktur operator symbols. \Bell and \Topt expand to the same symbol;
% \T is \Bell with a pi superscript.
\newcommand{\Bell}{\mathfrak{T}}
\newcommand{\T}{\Bell^{\pi}}
\newcommand{\Topt}{\mathfrak{T}}
% Fraktur L and B with S / SA superscripts.
\newcommand{\SRopt}{\mathfrak{L}^S}
\newcommand{\SARopt}{\mathfrak{L}^{SA}}
\newcommand{\SBopt}{\mathfrak{B}^S}
\newcommand{\SABopt}{\mathfrak{B}^{SA}}
% Sigma with a wide tilde and mu with a tilde.
\newcommand{\SigmaQ}{\widetilde{\Sigma}}
\newcommand{\muQ}{\tilde{\mu}}
\newcommand{\w}{w}
% Upright names of the OPE objective and methods. \msbr is set as a math
% operator; the others are set with \text.
\newcommand{\msbr}{\operatorname{MSBR}}
\newcommand{\brm}{\text{BRM}}
\newcommand{\wis}{\text{WIS}}
\newcommand{\pdis}{\text{PDIS}}
\newcommand{\cpdis}{\text{CPDIS}}
\newcommand{\dr}{\text{DR}}
\newcommand{\wdr}{\text{WDR}}

% Theorem-like environments (amsthm). `thm` is numbered within sections, and
% every other environment below shares the `thm` counter, so all of them are
% numbered in one common sequence.
% Style "plain": theorem, corollary, lemma, proposition, example.
\theoremstyle{plain}
\newtheorem{thm}{Theorem}[section]
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{exm}[thm]{Example}
% Style "definition": definition, assumption.
\theoremstyle{definition}
\newtheorem{defn}[thm]{Definition}
\newtheorem{asm}[thm]{Assumption}
% Style "remark": remark, fact.
\theoremstyle{remark}
\newtheorem{rem}[thm]{Remark}
\newtheorem{fact}[thm]{Fact}
\usepackage[ruled]{algorithm2e}
% Various math operators. The starred declarations (\argmax, \argmin) place
% their subscripts underneath the operator in display style; the unstarred
% ones set subscripts to the side.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\dst}{dst}
\usepackage{nameref}

\usepackage{xr-hyper}
\usepackage{hyperref}
\usepackage{xcite}


% Helper code for cross-referencing labels defined in another document.
% \addFileDependency{<file>} writes the file name to the log, adds it to the
% \listfiles list, and logs "No file <file>." when the file does not exist.
% It sits between \makeatletter/\makeatother because it calls the internal
% \@addtofilelist.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% latexmk will find this if $recorder=0 (however, in that case, it will ignore #1 if it is a .aux or .pdf file etc and it exists! if it doesn't exist, it will appear in the list of dependents regardless)
  \@addtofilelist{#1}% if you want it to appear in \listfiles, not really necessary and latexmk doesn't use this
  \IfFileExists{#1}{}{\typeout{No file #1.}}% latexmk will find this message if #1 doesn't exist (yet)
}
\makeatother

% \myexternaldocument{<basename>} calls \externaldocument on <basename> and
% registers both <basename>.tex and <basename>.aux as file dependencies.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% put all the external documents here!
% lobo_674-supp is presumably the supplementary-material document -- confirm.
\myexternaldocument{lobo_674-supp}




% editing
% \Pr is set to a bare blackboard-bold P, the same replacement text as the
% last \renewcommand{\Pr} earlier in the preamble.
\renewcommand{\Pr}{\mathbb{P}}
% Per-author coloured text for edits: \elita in orange, \hs in magenta.
\newcommand{\elita}[1]{\textcolor{orange}{#1}}
\newcommand{\hs}[1]{\textcolor{magenta}{#1}}

% marek's edit command
% \mm[old]{new}: if the optional argument is non-empty, print it struck out in
% red square brackets; then print the mandatory argument in green.
\usepackage{xifthen}
\usepackage[normalem]{ulem}
\newcommand{\mm}[2][]{\ifthenelse{\isempty{#1}}{}{\textcolor{red}{[\sout{#1}]}}\textcolor{green}{#2}}
% Inline reviewer notes: both print their argument in red square brackets.
\newcommand{\marek}[1]{\textcolor{red}{[#1]}}
\newcommand{\hima}[1]{\textcolor{red}{[#1]}}

% cleveref format for referencing several `thm` labels at once: the list is
% prefixed with "Theorems", a pair is joined by " and", and longer lists use
% ", " between middle items and " and" before the last one.
\crefmultiformat{thm}{Theorems~#2#1#3}{ and~#2#1#3}{, #2#1#3}{ and~#2#1#3}


% mprog: an array with three display-style columns (right, left, left),
% meant for typesetting mathematical programs inside math mode.
% NOTE(review): the >{...} column prefix is array-package syntax, and no
% explicit \usepackage{array} is visible here; presumably it is loaded
% indirectly -- confirm.
\newenvironment{mprog}{\begin{array}{>{\displaystyle}r>{\displaystyle}l>{\displaystyle}l}}{\end{array}}
% Comment style passed to \SetCommentSty: footnote-size typewriter text in magenta.
\newcommand\commentfont[1]{\footnotesize\ttfamily\textcolor{magenta}{#1}}
\SetCommentSty{commentfont}

\title{Data Poisoning Attacks on Off-Policy Policy Evaluation Methods}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% The bracketed number ties each author to the \affil entry with that number.
% The mailto target is a bare address: angle brackets around it would become
% part of the link URI and produce a malformed mailto link.
\author[1]{\href{mailto:loboelita@gmail.com}{Elita Lobo}{}}
\author[2]{Harvineet Singh}
\author[1]{Marek Petrik}
\author[3]{Cynthia Rudin}
\author[4]{Himabindu Lakkaraju}
% Add affiliations after the authors
\affil[1]{%
    University of New Hampshire\\
    Durham, NH, USA
}
\affil[2]{%
    New York University\\
    New York, NY, USA
}
\affil[3]{%
    Duke University\\
    Durham, NC, USA
}
\affil[4]{%
    Harvard University\\
    Boston, MA, USA
}
\begin{document}
\maketitle

\begin{abstract}%
Off-policy Evaluation (OPE) methods are a crucial tool for evaluating policies in high-stakes domains such as healthcare, where exploration is often infeasible, unethical, or expensive. However, the extent to which such methods can be trusted under adversarial threats to data quality is largely unexplored. In this work, we make the first attempt at investigating the sensitivity of OPE methods to marginal adversarial perturbations to the data.
We design a generic data poisoning attack framework leveraging influence functions from robust statistics to carefully construct perturbations that maximize error in the policy value estimates. We carry out extensive experimentation with multiple healthcare and control datasets. Our results demonstrate that many existing OPE methods are highly prone to generating value estimates with large errors when subject to data poisoning attacks, even for small adversarial perturbations. These findings question the reliability of policy values derived using OPE methods and motivate the need for developing OPE methods that are statistically robust to train-time data poisoning attacks.
% To combat this problem, we suggest ways to identify and improve the robustness of OPE methods.
\end{abstract}

\section{Introduction}
\label{sec:intro}

In reinforcement learning (RL), off-policy evaluation~(OPE) methods are popularly used to estimate the value of a policy from previously collected data~\cite{Thomas2015Highconfidence,voloshin2020empirical,levine2020offline}. These methods are instrumental in high-stakes decision problems such as in medicine and finance, where deploying a policy directly is often infeasible, unethical, or expensive~\cite{gottesman2020interpretable,hiv}. In such cases, one must estimate the value solely from a batch of data collected using a different and possibly unknown policy. Only if the OPE methods estimate the value of a policy to be sufficiently high will stakeholders deploy it. Otherwise, the policy will be rejected. It is therefore essential that OPE methods do not severely overestimate the values of bad policies or underestimate the values of good policies~\cite{gottesman2020interpretable}.

Despite the importance of OPE methods, their sensitivity to adversarial contamination of logged data is not well understood. The complexity of OPE methods offers ample opportunities for attackers to introduce significant errors in OPE estimates with only small perturbations to the input data. For example, some OPE methods compute the value of a policy in a given state as a function of its value in future states. Therefore, even small errors introduced in the value estimates of these future states can accumulate and result in significant errors in the value estimates at the initial states, where critical strategic decisions are often made.
This property could be exploited by attackers. Another possible avenue for an attack is the \textit{importance sampling weights}. Popular OPE methods, such as the Doubly Robust and the Importance Sampling methods~\cite{jiang16doublyrobust,voloshin2020empirical} use importance sampling weights to correct for dataset mismatch when evaluating the given policy with logged data from a different policy. The weights depend on the estimate of the policy used for the logged data. Attackers could perturb the data in a way that forces the agent to wrongly estimate the policy used to collect data and consequently introduce significant errors in the value estimates. Such vulnerabilities motivate the need for a thorough analysis of the effect of data poisoning attacks on OPE methods. 


While some prior works have studied adversarial attacks in the context of policy learning in online and batch RL settings~\cite{rakhsha2020policy,Ma2019PolicyPI,Chen2019adversarial}, they mainly focus on teaching an agent to learn an adversarial policy or driving the agent to an adversarial state~\cite{rakhsha2020policy,zhang2021corruptionrobust}, and do not specifically investigate the effect of these attacks on OPE methods. In this work, we address this gap and study the effect of data poisoning attacks on OPE methods. More specifically, our work answers the following question: \textit{Can we construct small perturbations to the training data that significantly change a given OPE method's estimate of the value of a given policy?} To this end, we propose a novel data poisoning framework to analyze the sensitivity of model-free OPE methods to adversarial data contamination at train time. We formulate the data poisoning problem as a bilevel optimization problem and show that it can be adapted to diverse model-free OPE methods, namely, Bellman Residual Minimization (\brm)~\cite{Farahmand2008BRM}, Weighted Importance Sampling~(\wis), Per-Decision Importance Sampling (\pdis)~\cite{Precup00temporalabstraction,powell1966wis,rubenstein1981MC}, Consistent Weighted Per-Decision Importance Sampling (\cpdis)~\cite{Thomas2015SafeRL}, and Weighted Doubly Robust methods (\wdr)~\cite{jiang16doublyrobust}. To solve the aforementioned bilevel optimization problem in a computationally tractable manner, we derive an approximate algorithm using influence functions from robust statistics~\cite{koh2018stronger,koh2017influence,diakonikolas2019recent,broderick2021automatic}. To the best of our knowledge, our work is the first to study the sensitivity of a wide range of OPE methods to train-time data poisoning attacks.

We evaluate our framework using five different datasets spanning medical (Cancer and HIV) and control (Mountain Car, Cartpole, Continuous Gridworld) domains. Our experiments show that corrupting only $3\%$--$5\%$ of the observed states achieves more than $340\%$ and $100\%$ error in the estimate of the value function of the optimal policy in the HIV and MountainCar domains, respectively. Through our experimental results, we show that out of the five OPE methods, WDR, PDIS, and BRM are generally the least statistically robust, and CPDIS and WIS are relatively more statistically robust to such adversarial contamination. Finally, our findings question the reliability of policy values derived using OPE methods and motivate the need for developing OPE methods that are statistically robust to train-time data poisoning attacks.


\section{Preliminaries}\label{sec:background}

We model a sequential decision-making problem as a Markov Decision Process~(MDP). An MDP is a tuple of the form $\langle \states,\actions, R, P,p_0,\gamma \rangle$ representing the set of states, set of actions, reward function, transition probability model, initial state distribution, and discount factor, respectively.
When taking an action $a\in \actions$ in a state $s\in \states$ and transitioning to state $s'\in \states$, the scalar $R(s,a,s')$ denotes the reward received by the agent and $P(s,a,s')$ denotes the probability of transitioning to a state $s'$ on taking an action $a$ in a state $s$.

A randomized policy $\pi\colon \states \to \Delta^{|\actions|}$ prescribes the probability of taking each action in each state. The value function $v^{\pi}\colon \mathcal{S} \to \real$ of a policy $\pi$ at a state $s$ is the expected discounted return of the policy starting from state $s$ and is given by
\[
  v^{\pi}(s) = \Exp{}{\sum_{t=0}^{\infty} \gamma^t R(S_t,A_t,S_{t+1}) \mid \pi,S_0 = s} ~.
\]
The value of a policy is computed as $p_0\tr v^{\pi}$. The state-action value function (also known as the Q-value function) $q^{\pi}\colon \states\times \actions \to \real$ of a policy $\pi$ at a state $s$ and an action $a$ is the expected discounted return obtained by taking action $a$ in state $s$ and following policy $\pi$ thereafter. The state-action value function $q^{\pi}$ for a policy $\pi$ is the unique fixed point of the \emph{Bellman operator} $\mathcal{T}^{\pi}\colon \real^{\states \times \actions} \to \real^{\states \times \actions}$ defined as
%
\begin{gather}\label{eq:bellman}
(\mathcal{T}^{\pi} q) (s,a) =  \\
\nonumber
\sum_{s'\in \mathcal{S}} P(s,a,s') \Bigl( R(s,a,s') + \gamma \sum_{a'\in \mathcal{A}} \pi(s',a') q(s',a') \Bigr)\,.
\end{gather}

We assume the standard batch RL setting~\citep[e.g.,][]{levine2020offline}, in which the agent is given a batch of $n{=}N\times T$ transition tuples  $D {=} ((s^i_j, a^i_j, r^i_j)_{j=1}^{T})_{i=1}^{N}$, generated by a behavior policy $\pi_b$ for $N$ episodes of length $T$. The \emph{goal} of OPE is to use $D$ to estimate the value of the evaluation policy $\pi$. Let $D_0$ be a set of initial states sampled from the distribution $p_0$.

The value function is approximated using features $\xi\colon \mathcal{S} \to \real^d$. As is standard in linear value function approximation, we assume also that the state-action value function $q^{\pi}$ is approximated as a linear combination of state-action features $\phi\colon \mathcal{S} \times \mathcal{A} \to \real^{|\actions|\cdot d}$. The state-action features for a given state-action pair $(s,a)$ are constructed by using the state features $\xi(s)$ at the indices corresponding to $a$ and zero elsewhere, i.e., $\phi(s,a)[ad: (a{+}1)d]\leftarrow\xi(s)$. Because the value function is estimated from data, we need to define a sample feature matrix $\Phi \in \real^{n \times |\actions| d}$ where the rows correspond to the state-action features $\phi(s,a)$ for the $n$ state-action pairs in $D$.  Similarly, $\Phi_p\in \real^{n \times |\actions| d}$ denotes the sample feature matrix for the \emph{next states} such that each row corresponds to $\phi(s'_i,\pi(s'_i))$ for the next states $s'_i$ in $D$. The sample reward matrix $r \in \real^{n \times 1}$ is constructed such that the $i^\text{th}$ row corresponds to the reward $r_i$ in $D$. More details on the construction of the sample feature matrices $\Phi$, $\Phi_p$ and reward matrix $r$ can be found in Section 4 in~\cite{Lagoudakis2003LSPI}.

OPE methods are broadly classified into three categories: Direct, Importance Sampling, and Hybrid Methods~\cite{voloshin2020empirical}. 
\emph{Direct Methods} estimate the value of the evaluation policy by solving for the fixed point of the Bellman Equation~\eqref{eq:bellman} with an assumed model for the state-action value function $q$ or the transition model $P$. We illustrate our attack on one of the most popular Direct Methods, namely the \emph{Bellman Residual Minimization}~(BRM) method~\cite{voloshin2020empirical,Farahmand2008BRM}. BRM solves a sequence of supervised learning problems with state-action features $\phi(s, a)$ as the predictor and the 1-step Bellman update $\mathcal{T}^{\pi}q=r + \gamma Pq$ as the target response. 
% $\mathcal{T}^{\pi} : \real^S \to \real^S$ is commonly referred to as the Bellman operator.
The objective optimized in BRM is the Mean Squared Bellman residual (MSBR), defined as a weighted $L_2$ norm:
%
\begin{equation} \label{BRM_projectedbellman}
  \msbr(\eta) \;=\;  \|q_{\eta}-\mathcal{T}^{\pi} q_{\eta}\|^2_W~.
\end{equation}
%
Here, the linear Q-value function $q_\eta$ is parameterized by $\eta$ as $q_\eta=\Phi\eta$. The weight matrix is computed as $W=\diag[\mu^{\pi}]$ where $\mu^\pi\in[0,1]^S$ represents the stationary state distribution of policy $\pi$. The value of a policy is then computed as
%
\begin{equation}
\label{eq:brm}
  v_\text{BRM} \;=\;  \sum_{s\in D_0} \sum_{a\in\actions} p_0(s) \cdot \pi(s,a) \cdot q_{\eta}(s,a)~.
\end{equation}

\emph{Importance Sampling Methods}~(IS)~\cite{kahn1953Methods} are based on Monte-Carlo techniques and compute unbiased but high-variance value estimates.
The key idea is to compute the value of policy $\pi$ as the weighted average of the returns of the trajectories in $D$, where each trajectory is re-weighted by its probability of being observed under evaluation policy $\pi$. We focus on attacking three popular variants of importance sampling methods, namely the \emph{Per-Decision, Consistent Weighted Per-Decision}, and \emph{Weighted} IS methods (PDIS, CPDIS, WIS)~\cite{Precup00temporalabstraction,Thomas2015SafeRL,rubenstein1981MC}.
Let $g^i_{T} = \sum_{t=0}^{T} \gamma^t r^i_t$ represent the returns observed for the $i^\text{th}$ trajectory in the dataset $D$ and assume that the behavior policy is parameterized by $\theta_b$ and estimated from data $D$ using maximum likelihood estimation (MLE)~\cite{vaart1998Asymptotic}. In this setting, the MLE method effectively minimizes the Cross Entropy Loss (CEL) on the predictions of the behavior policy. In order to define the OPE estimates of the value functions, we need the importance sampling weights $\rho^i_{0:t}$ for time step $t$ defined as
%
\[
  \rho^i_{0:t} = \prod_{t'=0}^t \frac{\pi(s^i_{t'}, a^i_{t'} )}{\pi_b^{\theta_b}(a^i_{t'}| s^i_{t'})}~.
\]
Here, the estimate of the behavior policy is defined as $\pi^{\theta_b}_{b}(a |s) = \exp( \phi(s,a) \theta_b ) (\sum_{a'\in\actions}\exp(\phi(s,a') \theta_b))^{-1}$ for each $s\in\states$ and $a\in\actions$.  Then
the WIS, PDIS, and CPDIS value function estimates are  defined as
\begin{align}
   v_\text{WIS}&= \left(\sum_{i=1}^N \rho^i_{0:T}\right)^{-1} \sum_{i=1}^N \rho^i_{0:T} g^i_T, \label{eq:wis}\\ 
  v_\text{PDIS} &= \frac{1}{N} \sum_{i=1}^N \sum_{t=1}^{T} \gamma^{t-1} \rho^i_{0:t} r^i_t, \label{eq:pdis}\\
  v_\text{CPDIS} &= \sum_{t=1}^{T} \gamma^{t-1} \frac{ \sum_{i=1}^N \rho^i_{0:t} r^i_t}{\sum_{i=1}^N \rho^i_{0:t}} \label{eq:cwpdis}.
 \end{align}

\emph{Hybrid Methods} combine both Direct and IS methods to generate value estimates with low bias and variance. An important hybrid method is the \emph{Doubly Robust}~($\dr$) estimator~\cite{jiang16doublyrobust}, which decreases the variance in the IS estimate by using the estimate from a method like BRM. The $\dr$ and Weighted DR ($\wdr$) estimators are given by
% \begin{small}
\begin{equation}
\label{eq:dr}
\begin{aligned}
v_\text{DR} &= \frac{1}{N} \sum_{i=1}^N \sum_{t=0}^{T-1} \gamma^t \rho^i_{0:t} w_t^i + \frac{1}{N} \sum_{i=1}^N v_{\eta}(s_0^i),\\
v_\text{WDR} &= \sum_{i=1}^N \sum_{t=0}^{T-1} \gamma^t \frac{\rho^i_{0:t}}{\sum_{k=1}^N \rho^k_{0:t} } w_t^i + \frac{1}{N} \sum_{i=1}^N v_{\eta}(s_0^i),
\end{aligned}
\end{equation}
% \end{small}

where $w_t^i= ( r^i_t -  q_{\eta}(s^i_t,a^i_t) + \gamma v_{\eta}(s^i_{t+1}))$ and $v_{\eta}(s^i_t)=\sum_{a\in\actions}\pi(s^i_t,a)\cdot q_{\eta}(s^i_t,a)$.
Here the parameters of the value function $q$ are estimated using Direct Methods like $\brm$. Because empirical studies show that there are no clear winners among the three methods \citep{voloshin2020empirical}, we investigate attacks on representative methods from each type.
\section{DOPE Framework} \label{sec:Framework}
We first present our attack framework called DOPE for \emph{D}ata poisoning attacks on \emph{O}ff-\emph{P}olicy \emph{E}valuation. Then we demonstrate how to use the framework to attack the three types of OPE methods discussed in Section~\ref{sec:background}.
The objective and scope of the attacks considered in DOPE are as follows.

\textbf{Scope}: We assume the setting of a white-box attack, i.e., the attacker has access to the batch data $D$, the evaluation policy $\pi$, and the value of the discount factor $\gamma$, and knows how the agent estimates the behavior policy and the state-action value function from the data. This kind of setting is commonplace in healthcare domains~\cite{gottesman2020interpretable,Ernst2006Clinical,Yu2021Healthcare}, where models are typically benchmarked and often made available to the general public so that they can be independently vetted and validated before deployment.
Further, for the attack to be unnoticeable, we allow the attacker to only perturb at most $\alpha$ fraction of the transitions in $D$ while conforming to some perturbation budget $\eps \ge 0$ to be defined later. 

\textbf{Objective}: The goal of the attacker is to add small adversarial perturbations to a subset of transitions in $D$ such that it maximizes the error in the value estimate of a given policy in the desired direction.
This means that the attacker may choose to decrease or increase its estimated value for the policy being evaluated such that a good evaluation policy is rejected or a bad evaluation policy is approved. 

\textbf{Components:} The DOPE framework for a given OPE method has four major components:
\emph{Features ($\Psi$):} the part of the transition tuples targeted by the attack;
\emph{Value estimation function ($\rho$):} function used by the OPE method for computing the value;
\emph{Estimated parameter ($\theta$): } model parameters learned by the OPE method from the data;
\emph{Loss function ($L$): } loss optimized by the OPE method for model-fitting.
We define each component in detail in \cref{sec:attack_methods}.
We can now formulate our attack model as a problem of finding the perturbation matrix $\Delta=(\delta_i)_{i=1}^n, \delta_i \in\real^{Q}$ that maximizes the difference between values found using the perturbed and the original data under constraints dictating that the perturbations are small:
\begin{subequations} \label{eq:bilevel}
\begin{align}
   \maximize{\Delta \in \real^{n \times Q}}  
    \rho{(\tpert,\Psi+ \Delta)} - \rho(\torg,\Psi)\label{eq:bilevel_obj} \\
    % &&  \text{maximize error in value estimate }
    \stc  \tpert \;\in\;  \argmin_{\theta \in \real^P} \, L(\theta, \Psi+ \Delta)\label{eq:bilevel_one}  \\
    & \torg \;\in\;  \argmin_{\theta \in \real^P} \, L(\theta, \Psi)\label{eq:bilevel_one_dash}  \\
    % && \text{compute optimal $\theta$ on corrupt data } 
     & \| \delta_i \|_{p} \;\leq\; \eps, \quad i = 1,\ldots, n   \label{eq:bilevel_two} \\
    %  &&  \text{perturbation should conform to budget $\epsilon$}
    & \sum_{i=1}^n \mathbf{1}_{\| \delta_{i} \| \neq 0} \;\leq\; \alpha \cdot n.
    % &&  \text{ only perturb $\alpha$ n transitions}
    \label{eq:bilevel_three}
\end{align}
\end{subequations}

\begin{table*}[htbp!]
    \centering
    \begin{tabular}{l|llll}
 \toprule
 %\multicolumn{5}{|c|}{DOPE Attack Templates} \\
 Method & Parameters $\theta$ & Features $\Psi$ & Function $\rho(\Psi)$ &  Loss $ L(\theta,\Psi)$ \\
 \hline
 $\brm$ (\citet{Farahmand2008BRM}), Eq. \eqref{eq:brm} & $\eta$ in $q_\eta$ & $\Phi$ or $r$ &  $v_\brm$ & MSBR \\
 WIS (\citet{rubenstein1981MC}), Eq. \eqref{eq:wis} & $\theta_b$ in $\pi_b^{\theta_b}$ & $\Phi$ or $r$ & $v_\text{WIS}$ & CEL \\
 PDIS (\citet{Precup00temporalabstraction}), Eq. \eqref{eq:pdis} & $\theta_b$ & $\Phi$ or $r$ & $v_\text{PDIS}$ & CEL \\
 CPDIS (\citet{Thomas2015SafeRL}), Eq. \eqref{eq:cwpdis} & $\theta_b$ & $\Phi$ or $r$ & $v_\text{CPDIS}$ & CEL \\
WDR/DR (\citet{jiang16doublyrobust}), Eq. \eqref{eq:dr} & $\theta_b$, $\eta$ & $\Phi$ or $r$ & $v_\text{WDR}\ \text{or}\ v_\text{DR}$ & CEL + MSBR or MSBR \\
     \bottomrule
\end{tabular}
    \caption{Settings for the four components of the DOPE attack for five different OPE methods.    \label{table:attack_templates}}
\end{table*}

The DOPE objective in~\eqref{eq:bilevel_obj} increases the estimated value from the original value, thereby increasing the error. Alternatively, if the attacker wants to decrease the estimated value of the given policy, they may do so by simply changing the sign of the objective. The constraint~\eqref{eq:bilevel_one} estimates the optimal parameter $\tpert$ from $D$ after perturbing $\Psi$ to $\Psi+\Delta$. The constraint~\eqref{eq:bilevel_two} ensures that the perturbation $\delta_i$ added to each sample, i.e., the $i^\text{th}$ row of $\Delta$, is limited to the user-defined budget $\eps$ in $\ell_p$ norm. This prevents the attack framework from generating adversarial transitions that can be easily detected as anomalous. Further, the constraint~\eqref{eq:bilevel_three} limits the number of transitions that the attacker can perturb. Finally, note that $\torg$ is computed only once with the original features $\Psi$, so $\rho(\torg,\Psi)$ is a constant that can be ignored while solving the optimization problem.

\subsection{Attacking OPE methods using DOPE}\label{sec:attack_methods}

We are now ready to formally define the four components of the DOPE framework.
\cref{table:attack_templates} summarizes the choice of these components for each OPE method we attack.\\ 
%
(a) \emph{Features}:
Let $\psi(s,a,r) \in \real^{Q}$ be an arbitrary component of the transition tuple $\langle s,a,r \rangle$ in $D$ that is perturbed by the attacker. For example,  $\psi(s,a,r)$ could be either the state-action features $\phi(s,a)$ or the reward $r$.
We will use $\Psi \in \real^{n \times Q}$
to represent the sample matrix of  $\psi(s,a,r)$ constructed from the $n$ samples in $D$. \\
(b) \emph{Parameters}:
The parameters $\theta(\Psi) \in \real^P$ are the parameters of interest for a given OPE method, written as a function of $\Psi$ to clarify that these are estimated from samples in $D$. In $\brm$, $\theta$ represents the parameters of the Q-value function $q_\eta(s,a)$, whereas in IS methods,  $\theta$ represents the parameters of the estimated behavior policy $\pi_b^{\theta_b}(a|s)$. \\
(c) \emph{Loss function}:
The loss function $L(\theta,\Psi)$ with $L\colon \real^{P} \times \real^{n \times Q} \to \real$ is the empirical loss optimized by the OPE method to derive the optimal parameter $\theta(\Psi) \in  \argmin_{\theta' \in \real^P} \, L(\theta', \Psi)$ from the data. As an example, $L$ in $\brm$ and $\dr$ is the MSBR error, whereas in IS methods, $L$ is the CEL loss optimized to estimate the behavior policy. \\
(d) \emph{Value estimation function}:
Finally, the value estimation function $\rho(\theta(\Psi),\Psi)$ with $\rho: \real^{P} \times \real^{n \times Q} \to \real $ is the function used by the OPE method to compute the mean value of $\pi$ at the initial states. For example, in $\brm$, $\rho$ represents $v_\brm$. We will use the shorthand $\rho(\Psi):= \rho(\theta(\Psi),\Psi)$. 

The loss function $L(\theta,\Psi)$ must be twice continuously differentiable and linearly separable with respect to the transitions in $D$. We provide some examples of such loss functions such as MSBR and CEL and show that they are twice continuously differentiable in~\cref{app:examples}. Further, the value estimation function $\rho(\theta,\Psi)$ also needs to be continuously differentiable with respect to $\theta$ and $\psi$. These assumptions, as \cref{sec:optimization} shows, are important for the influence functions to be well-defined~\cite{koh2017influence}.

\section{Optimization}\label{sec:optimization}

In this section, we discuss the challenges of optimizing the DOPE problem in \eqref{eq:bilevel} and propose an approximate scheme for finding the optimal adversarial perturbations. 

There are two major challenges in solving the optimization problem in~\cref{eq:bilevel}. First, the constraint~\eqref{eq:bilevel_three} is non-differentiable and requires the attacker to select a set of at most $\alpha n$ transitions, such that perturbing these transitions results in maximum change in the value of the policy in the desired direction. It is important to realize that finding this set requires perturbing all possible subsets of data $\Psi$ whose size is at most $\alpha n$ and re-estimating the optimal parameter $\theta$ for each perturbation. The number of such subsets is larger than $\binom{n}{\alpha n}$. Thus, finding the optimal set is computationally infeasible. Second, observe that \eqref{eq:bilevel} is a bilevel optimization problem where the inner-level problem \eqref{eq:bilevel_one} is non-linear in the case of OPE methods which makes it generally NP-Hard to solve~\cite{Wiesemann2013Pessimistic}. 

We address these two challenges by deriving an approximation to the bilevel optimization problem~\eqref{eq:bilevel} using the Taylor expansion of \cref{eq:bilevel_obj}. We show that the resulting problem is simpler to optimize and has a closed-form solution. In \cref{sec:experiments}, we empirically demonstrate the effectiveness of our approximate solution on several domains.

\textbf{Approximation} We define the influence score of the $i^\text{th}$ data point, $I_{\Psi_i}=\nabla_{\Psi_i} \rho(\Psi)$, as the rate of change in the value estimate $\rho(\Psi)$ with respect to the data point $\Psi_i\equiv \psi(s_i,a_i,r_i)$. Then, using the first-order Taylor expansion of $\rho(\Psi+ \Delta)$, we can approximate the net error in the value-function estimate  $\rho(\Psi+\Delta) - \rho(\Psi)$ as the weighted sum of the influence scores of individual data points,
\begin{equation}\label{first_order_approx}
     \begin{aligned}
 \rho(\Psi+\Delta) - \rho(\Psi) &  \approx
    \sum_{i=1}^n (\nabla_{\Psi_i}{\rho(\Psi)})\tr \delta_i.
    \end{aligned}
\end{equation}
Using Eq.~\eqref{first_order_approx} reduces the optimization in~\eqref{eq:bilevel} to 
\begin{equation} \label{bilevel:approx}
\begin{aligned}
  &\max_{s\in \{0,1\}^n} \max_{\{\delta_i\}_{i=1}^n} %\in \Real^{n \times Q}}
    \left\{\sum_{i=1}^n 
    s_iI_{\Psi_i} \tr \delta_i \Bigm\vert \|\delta_i \|_p \leq \eps, \forall i\right\} ,\\
   &\text{ subject to } \sum_{i=1}^n s_i = \alpha \cdot n~.
\end{aligned}
\end{equation}
Here, $s\in\{0,1\}^n$ is a vector of binary indicators such that $s_i=1$ indicates that the $i^\text{th}$ transition is amongst the $\alpha n$ transitions selected to be perturbed. We can now compute an approximately optimal set of perturbations in polynomial time as shown in \cref{prop_greedy} for norms $p=1,2,\infty$.

\begin{thm}\label{prop_greedy}
  Let $(s^*,\Delta^*)$ be an optimal solution to the optimization problem in~\eqref{bilevel:approx} and define the \emph{approximate influential set} as $S^*_{\alpha}=\{i \in \{1,\dots,n\} : s^*_i =1\}$. Then,
 \begin{enumerate}[nosep]
    \item $S^*_{\alpha}$ can be constructed by choosing the set of $\alpha n$ transitions with the largest $q$-norm of their influence scores $I_{\Psi_i}$. Here, $q$-norm is the dual of $p$-norm used in \eqref{bilevel:approx}, i.e. $1/p+1/q=1$.
    \item For all $i \in [1,\dots n]$, the optimal $\delta^*_i$ for $p=1,2,\infty$ can be computed in closed-form as
    \begin{align*}
            \begin{aligned}
        & \text{ If } p=\infty, \text{ then } \, \delta^*_{i} = \eps \cdot  \operatorname{sign}(I_{\Psi_i}) \\
         & \text{ If } p=2, \text{ then } \, \delta^*_{i} =  \eps \cdot \frac{I_{\Psi_i}}{\|I_{\Psi_i}\|_2}. \\
         & \text{ If } p=1, \text{ then } \, \forall j \in [1,Q], \,  \\ 
        &  \delta^*_{i,j} =  \begin{cases}
         \eps \cdot \operatorname{sign}(I_{\Psi_i}(j))  &\,  \text{ if } \, j \in {\displaystyle\argmax_{m \in [1,Q]}} \lvert I_{\Psi_i}(m) \rvert \\
        0 &\, \text{ otherwise }
        \end{cases}
    \end{aligned}
        \end{align*}
\end{enumerate}
\end{thm}

\begin{rem}[Relation to optimal solution]
Solving the approximate problem~\eqref{bilevel:approx} gives us a lower bound on the optimal value of the original problem~\eqref{eq:bilevel}. Suppose $\Delta^*$ is the optimal solution for~\eqref{bilevel:approx}  that we get from \cref{prop_greedy} while $\Delta^{**}$ is the (intractable) optimal solution for~\eqref{eq:bilevel}. Then, the maximum achievable error in the value function is at least as large as the error induced by $\Delta^*$, 
\begin{align*}\label{eq:lower_bound}
    \rho(\Psi+\Delta^{**}) - \rho(\Psi) & = \max_{\Delta \in \real^{n \times Q}}  \rho(\Psi+\Delta) - \rho(\Psi)  \\ & \geq \rho(\Psi+\Delta^{*}) - \rho(\Psi)~.
\end{align*}
\end{rem}

\textbf{Influence scores} Finally, it remains to discuss how to compute the influence scores of each transition in $D$: $I_{\Psi_i}= \nabla_{\Psi_i} \rho(\Psi)$. Recall that $\rho(\Psi)$ is not only a function of $\Psi$ but also $\theta(\Psi)$ which is also a function of $\Psi_i$.
Hence, using the chain rule, we get for each $i \in [1 \dots n]$ that 
\begin{equation}\label{chain_rule}
    \begin{aligned}
I_{\Psi_i} \approx   \frac{\partial{\rho(\theta,\Psi)}}{\partial \Psi_i}\bigg|_{\torg(\Psi)}  +   \frac{\partial{\rho(\theta,\Psi)}}{\partial \theta}\bigg|_{{\torg}(\Psi)} \frac{\partial {\theta}(\Psi) }{\partial \Psi_i}.
\end{aligned}
\end{equation}

\begin{algorithm}[htbp!] 
\SetAlgoLined
 \SetKwFunction{merge}{merge}
  \KwIn{Features $\Psi$, attack budget $\eps$, $\%$ of corrupt transitions $\alpha$, norm-type $p$}
  Compute $\torg \gets \argmin_{\theta \in \real^P} L(\theta,\Psi)$ \;
  Compute $\|I_{\Psi_{{i}}}\|_q$ for all $i = 1, \dots, n$ using~\eqref{chain_rule} \;
  $S^*_{\alpha} \gets \alpha \cdot n$ indices $i$ with largest $\|I_{\Psi_i}\|_q$ \;
  \For{$k \in S^*_{\alpha}$}{
    Let $\delta^{*}_{{k}} \in {\displaystyle \argmax_{\delta \in \real^Q}} \{I_{\Psi_k}^\top \delta \mid \|\delta \|_p \leq \eps\}$ using Item 2 in \cref{prop_greedy}\;
  }
  Use line search to find the largest step-size $\beta \in [0,1]$ $\st$ the value estimate increases: $\rho(\theta,  \Psi + \beta \cdot \delta^{*}) - \rho(\theta,\Psi) > 0$\;
    %Set $\Psi \gets \beta\cdot  (\Psi + \delta^{*})$ \;
    \Return{$\Psi =   \Psi + \beta\cdot\delta^{*}$} \;
    \caption{OPE Attack Algorithm \label{sec:algorithm}}
    \label{algorithm1}
\end{algorithm}

The computation of the partial derivative $\nicefrac{\partial \theta(\Psi)}{\partial \Psi_i}$ is not straightforward. However, we can approximately compute it as $\nicefrac{\partial  \theta(\Psi)}{\partial \Psi_i}=H_{\torg(\Psi)}^{-1} \nicefrac{\partial^2 L(\theta,\Psi_i)}{\partial \theta \partial \Psi_i}\big|_{\torg(\Psi)}$ where $H_{\torg(\Psi)}  = \nicefrac{\partial^2 L(\theta,\Psi)}{\partial \theta^2}\big|_{\torg(\Psi)}$~\citep[][Section 2.2]{koh2017influence}. See \cref{apps:preliminary} for more details.

To compute $I_{\Psi_i}$ in \eqref{chain_rule}, we require that $L(\theta,\Psi)$ is twice continuously differentiable and linearly separable with respect to the transitions in $D$, and $\rho(\theta,\Psi)$ is continuously differentiable with respect to $\theta$ and $\Psi$.
Although these conditions may seem restrictive, they hold true for the OPE methods we have studied.

The derivatives in~\eqref{chain_rule} can be easily computed using automatic-differentiation software like PyTorch~\cite{torch2019}. Computing the influence score $I_{\Psi_i}$ can be very expensive due to the Hessian-inverse term $H^{-1}_{\torg(\Psi)}$ which requires $\mathcal{O}(P^3)$ operations to compute. Fortunately, as shown by~\citet{koh2017influence}, we can avoid the computation of the Hessian-inverse term while computing $I_{\Psi_i}$ by instead first approximately computing the Hessian-inverse vector product
%
\[
  c_\text{prod}=H_{\torg(\Psi)}^{-1} \frac{\partial{\rho(\theta,\Psi)}}{\partial \theta}\bigg|_{{\torg}(\Psi)}
\]
%
in $\mathcal{O}(nP)$ time using Pearlmutter's method~\cite{pearmutter1994fast} and a first-order Taylor approximation of the Hessian-inverse matrix, and then applying Pearlmutter's method again to compute the Hessian-vector product $c_\text{prod} \cdot \nicefrac{\partial^2 L(\theta,\Psi_i)}{\partial \theta \partial \Psi_i}\big|_{\torg(\Psi)}$ in $\mathcal{O}(P)$ time.

\textbf{Algorithm outline} We outline how to approximately solve the DOPE optimization~\eqref{eq:bilevel} in Algorithm~\ref{sec:algorithm}, which consists of two main steps. In the first step, we compute an approximation of the optimal set of transitions to perturb $S^*_{\alpha}$ by choosing $\alpha n$ points in $\Psi$ with the largest $q$-norm of their influence scores $\|I_{\Psi_{\cdot}}\|_q$. In the second step, we compute $\Delta$ for all points in $S^*_{\alpha}$ using \cref{prop_greedy} and use line search to find the optimal step size that guarantees an increase in the error of the value estimate. The second step may be repeated until no further perturbation to data points in $S^*_{\alpha}$ results in an increase in the error in the value estimate.

The main computational bottleneck is in computing the influence score for each data point. In some cases, this cost can be reduced. We derive closed-form expressions for the influence score in the case of the linear $\brm$ method under two settings a) when the adversary perturbs only the state features or b) only the reward features.
\begin{prop}\label{prop:BRM_rewards} If the attacker only perturbs the reward vector $r$ constructed from the batch of transition tuples $D$, then the influence score of the $i^\text{th}$ data point $I_{r_i,\theta,\Psi}$ for the $\brm$ method can be computed as
\begin{equation}\label{BRM_reward}
I_{r,\theta,\Psi} = 4 p_0^T \Phi_{0} \left( ({\Phi} - \gamma  \cdot{\Phi}_p)^2 \right)^{-1} \left( \Phi - \gamma  \cdot \Phi_p\right)\,,
\end{equation}
where $\Phi_{0}$ is a sample matrix of initial state features constructed from $D_0$.
\end{prop}
\begin{prop}\label{prop:BRM_features}
If the attacker only perturbs the state feature matrix $\Phi$, then the influence score of the $i^\text{th}$ data point $I_{\phi(s_i,a_i),\theta,\Psi}$ for the $\brm$ method can be computed as
\begin{equation}\label{BRM_features}
\begin{aligned}
I_{\Phi,\theta,\Psi} &= 4 \cdot p_0^T \cdot \Phi_{0} \cdot \left( ({\Phi} - \gamma \cdot {\Phi}_p)^2 \right)^{-1}\cdot \\ & \left(2 \cdot w \bigotimes \Phi - 2\cdot \gamma \cdot w \bigotimes \Phi_p \right. \\ & \left. + 2 \cdot \mathbf{I} \bigotimes (\Phi \cdot w - \gamma \cdot \Phi_p \cdot w - r) \right),
\end{aligned}
\end{equation}
where $\bigotimes$ denotes the Kronecker product between matrices.
\end{prop}
\Cref{prop:BRM_rewards,prop:BRM_features} follow from the chain rule and basic properties of the gradient operator for matrices.

\section{Experiments}\label{sec:experiments}
In this section, we investigate the strengths and weaknesses of the DOPE attack. First, we evaluate the effectiveness of the DOPE attack on OPE methods for different values of attack budget and identify which methods are most vulnerable to adversarial contamination. Second, we compare the performance of DOPE with three custom baselines: Random DOPE, FGSM-based Attack, and Random Attack. 

\subsection{Domains and Experimental Setup}

We first describe the five domains used in the experiments.
\emph{Cancer:} This domain~\cite{gottesman2020interpretable} models the growth of tumors in cancer patients. It consists of 4-dimensional states which represent the growth dynamics of the tumor in the patient, and two actions that indicate if a given patient is to be administered chemotherapy or not at a given time step. \newline
\emph{HIV:} The HIV domain \citep{hiv} has 6-dimensional states representing the state of the patient, and four actions that represent four different types of treatments.  \newline
\emph{Mountain Car:} In the Mountain Car~\cite{brockman2016openai} domain, the task is to drive a car positioned between two mountains to the top of the mountain on the right in the shortest time possible. The 2-dimensional state represents the car's current position and the current time-step, and the three actions represent: drive forward, drive backward, and do not move.  \newline
\emph{Cartpole:}
The Cartpole domain~\cite{brockman2016openai} models a simple control problem where the goal is to apply +1/-1 force to keep a pole attached to a moving cart from falling. The 2-dimensional state represents the cartpole dynamics, and the two actions represent the force applied to the pole.  \newline
\emph{Continuous Gridworld:}
The Continuous Gridworld is a custom domain that consists of a 2-dimensional state space that represents the coordinates of the agent, and two actions $a_0,a_1$ that determine the direction and step size of the agent. The agent begins at coordinate $(1,1)$ and moves towards coordinates $(50,50)$ to maximize its rewards.

\paragraph{Implementation details}
For each domain, we apply Deep Q-learning (DQN) to a randomly initialized neural network policy and obtain partially optimized deterministic policies.
We fix the deterministic policy obtained for each domain as the evaluation policy and use an $\epsilon$-greedy version of the evaluation policy as our behavior policy~\cite{gottesman2020interpretable}. We set $\epsilon = 0.1$ for the HIV domain and $\epsilon = 0.05$ for other domains. 
We use the behavior policy to generate ten datasets, each containing $N$ trajectories of length $T$ (see  \cref{app:experiments} for the values of $N$ and $T$), and use them to evaluate the value of the evaluation policy. Our code is made available in the supplementary materials.


For any given OPE method that learns the Q-value function of the evaluation policy from data, we use linear function approximators to represent these Q-value functions and optimize the squared Bellman residual with $L_2$ regularization to learn it.
Note that we consider linear function approximations in line with the precedent set by other recent works in the off-policy evaluation literature~\cite{gottesman2020interpretable,Jin2020Provably}. Linear function approximations are commonly employed in the off-policy evaluation literature due to their simplicity, low computational complexity, and convergence guarantees~\cite{gottesman2020interpretable}. While our framework is general enough to accommodate any differentiable function approximations including deep learning models, computing the influence functions for non-linear function approximations is computationally expensive, and the time complexity grows as the square of the number of model parameters $\theta$. Hence, we resort to linear function approximators. Note, however, that we try to offset the limitations in the expressive power of linear function approximations by leveraging complex state representations obtained from the second last layer of a trained deep Q-network as input features to the linear function approximations in our experimentation (See~\cref{app:experiments}). 

% Furthermore, while the impreciseness of the first-order approximations may reduce the effectiveness of the attacks on more complex nonlinear functions, we can address this challenge by incorporating higher-order influence functions or group influence functions~\cite{Basu2020SecondOrderGI} into the framework. 
For OPE methods that require learning behavior policy from the data, we train a multinomial logistic regression model to predict the behavior policy's action probabilities using maximum likelihood estimation. Following standard practice in RL, we clip the behavior probabilities from below at $0.01$ to prevent the importance sampling weights from exploding. Note that although clipping the behavior probabilities prohibits the attacker from making individual behavior policy action probabilities too small, an attacker can still leverage the fact that the importance sampling weights are a function of the product of behavior policy action probabilities, and thus, the importance weights can be made very large by simply pushing the behavior policy action probabilities of as many points as possible close to the clipping threshold. 

In all our experiments, we perturb only state features. Finally, the values of the hyperparameters used in our experiments are discussed in \cref{app:experiments}.

\begin{table*}[h!]
\centering
\begin{tabular}{|p{2cm}||rr|rr|rr|rr|rr|} 
 \toprule
% \multirow[t]{2}{*}{Domain} 
% & \multicolumn{10}{|c|}{OPE Methods} \\
%  \cline{2-11}
  \multirow[t]{2}{*}{Domain}    & \multicolumn{2}{|c|}{BRM} & \multicolumn{2}{|c|}{WIS} & \multicolumn{2}{|c|}{PDIS} & \multicolumn{2}{|c|}{CPDIS}& \multicolumn{2}{|c|}{$\wdr$} \\
 \cline{2-11}
    & $lb$ & $ub$ & $lb$ & $ub$ & $lb$ & $ub$ & $lb$ & $ub$ & $lb$ & $ub$ \\
 \hline
 Cancer & 0.85 & 0.97 & 0.69& 0.69 & \textbf{8.95} & \textbf{10.69} & 0.48 & 0.58 & 3.36 & 3.72 \\
HIV & \textbf{343.35} & \textbf{440.92} & 0.0 & 0.1 & 1.4& 2.42 & 0.09 & 0.24 & \textbf{139.71} & \textbf{893.31} \\
Gridworld & \textbf{94.76} & \textbf{98.35} & 0.0 & 0.0 & \textbf{97.15} & \textbf{98.25} & 0.0 & 0.0 & \textbf{25.5} & \textbf{27.31} \\
Cartpole & 0.0 & 0.0 & 0.02 & 0.05  & \textbf{4.46e9} & \textbf{4.08e10} & 0.0 & 0.0 & 0.0 & 0.0 \\
MountainCar & 0.05& 0.07 & \textbf{100.0} & \textbf{100.0} & \textbf{98.37} & \textbf{99.62} & \textbf{47.38} & \textbf{98.68} &  0.02 & 0.03\\
 \bottomrule
\end{tabular}
\caption{\label{table:summary} Summary of the errors achieved by data poisoning across domains and OPE algorithms at $\eps=0.5\sigma$ and $\alpha=1.0$ and $p=1$. Here $lb$ and $ub$ denote the lower limit and upper limit of 95\% bootstrap confidence intervals of interquartile mean of percentage error in the value estimates, over 10 runs. We observe that the attack is successful on most of the methods across domains. CPDIS and WIS are usually the most resilient OPE methods.
% Here $\alpha$, $\eps$ and p are set to $0.05$ and $1.0\sigma$ and $1$ respectively.
} 
% %\vpsace{-0.1in}
\end{table*}

 \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/cancersubset_experiment88_budget_0.png}
    %\vpsace{-0.3cm}
     \caption{Cancer    \label{fig:R1}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/hivsubset_experiment88_budget_0.png}
    %\vpsace{-0.3cm}
     \caption{HIV    \label{fig:R2}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/customsubset_experiment88_budget_0.png}
    %\vpsace{-0.3cm}
     \caption{Gridworld \label{fig:R3} }
    \end{subfigure}
    \caption{\label{exp:effectiveness1}\Cref{fig:R1,fig:R2,fig:R3}
  compare the effect of DOPE attack on BRM, WIS, PDIS, CPDIS and WDR methods in the  Cancer, HIV and Continuous Gridworld domains (left to right) 
  for different values of attacker's budget $\eps= \text{frac} \cdot \sigma$ and $p=1$ ($\ell_1$ norm). 
  The larger the value of frac, the larger the perturbations added by the DOPE attack, and accordingly we observe larger errors in the value estimates. 
%   Note that the percentage errors of CPDIS and WIS in~\cref{fig:R2,fig:R3} are too small to be visible in the plots. 
  } 
     \end{figure*}
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/cancersubset_experiment88_corrupt_0.png}
    %\vpsace{-0.3cm}
     \caption{Cancer    \label{fig:R11}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/hivsubset_experiment88_corrupt_0.png}
    %\vpsace{-0.3cm}
     \caption{HIV   \label{fig:R21}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/customsubset_experiment88_corrupt_0.png}
    %\vpsace{-0.3cm}
     \caption{Gridworld \label{fig:R31} }
    \end{subfigure}
    \caption{ \label{exp:effectiveness2}\Cref{fig:R11,fig:R21,fig:R31}
   compare the effect of DOPE attack on BRM, WIS, PDIS, CPDIS, and WDR methods in Cancer, HIV, and Continuous Gridworld domains (left to right) for different percentages of corruption $\alpha$ at $\eps=1.0\sigma$ and $p=1$ ($\ell_1$ norm). 
  The larger the value of $\alpha$, the larger the number of points perturbed by the DOPE attack, and accordingly, we observe larger errors in the value estimates. 
%   Note that the percentage errors of CPDIS and WIS in~\cref{fig:R21,fig:R31} are too small to be visible in the plots. 
   }
     \end{figure*}
   
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.75\linewidth]{plots/experiments12/FQE-hiv-influenceexperiment12_0.png}
    % %\vpsace{-0.3cm}
     \caption{BRM    \label{fig:R13}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.75\linewidth]{plots/experiments12/IS_pdis-hiv-influenceexperiment12_0.png}
    % %\vpsace{-0.3cm}
     \caption{IS    \label{fig:R23}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.75\linewidth]{plots/experiments12/WDR_wdr-hiv-influenceexperiment12_0.png}
    % %\vpsace{-0.3cm}
     \caption{WDR \label{fig:R33} }
    \end{subfigure}
    %  \begin{subfigure}[b]{0.33\textwidth}
    % \centering
    % \includegraphics[width=0.8\linewidth]{plots/experiments8/cartpoleexperiment8.png}
    % %\vpsace{-0.3cm}
    %  \caption{ \label{fig:R4} }
    % \end{subfigure}
    \caption{ \label{exp:baselines1}\Cref{fig:R13,fig:R23,fig:R33} compare the effects of Random attack, Random DOPE attack (an ablated version of DOPE), FGSM-based Attack and DOPE attack on the error in the value function estimates of BRM, IS, and WDR methods (left to right) in HIV domain. The percentage error in the Random attack and FGSM-based attack is small relative to the percentage error due to DOPE and Random DOPE attack, and hence their curves lie close to the $x$-axis. 
    DOPE attack outperforms both the Random DOPE and Random attacks at nearly all values of the attacker's budget.
    }
     \end{figure*}
     
We evaluate the effectiveness of our attack model by computing the percentage error in the value function estimate relative to the initial value estimate. We report the $95\%$ bootstrap confidence intervals of the interquartile mean (IQM) of percentage error using our results from the 10 runs (10 datasets) since the IQM confidence intervals are found to be more reliable in practice~\cite{agarwal2021deep}. In this setting, a large percentage error indicates that the OPE method is less robust to adversarial contamination.


\subsection{Effectiveness of DOPE Attack}
Here we evaluate the effectiveness of the DOPE attack on five OPE methods for a range of attack budgets. In our first experiment, we fix the percentage of corrupt data points $\alpha = 0.05$ and vary the budget $\eps$ as $\text{frac} \cdot \sigma$, where $\text{frac}$ varies from $0.0$ to $0.5$ in steps of $0.05$ and $\sigma$, given by $\sigma^2=\frac{2}{N\cdot(N-1)}\sum_{i=1}^N\sum_{j=i+1}^N \|\xi(s_i)-\xi(s_j)\|_p^2$, is the standard deviation of all pairwise distances between the state-features in the dataset. \Cref{exp:effectiveness1,exp:effectiveness3}  compare the percentage error in the value estimate of the OPE methods in all domains. Our results show that even when corrupting only $5\%$ of the data points, the attacker need not perturb the state features significantly to achieve large errors in the value estimate. In fact, with a perturbation budget as small as $\eps=0.5\sigma$, DOPE can result in a substantial error in the policy's value in HIV, Cancer, Mountain Car, and Continuous Gridworld domains. Further, a larger attacker's budget means the DOPE model has more leeway on the perturbations that it can add to the dataset, and hence, we observe larger errors for larger budget values. Note that the percentage errors of CPDIS and WIS in~\cref{fig:R2,fig:R3} are too small to be visible in the plots. 

In the second experiment, we vary the percentage of corrupt data points between 0.0 and  $0.10$ with a step size of 0.02 for all the domains (Figure~\ref{exp:effectiveness2}). We fix the perturbation budget $\eps$ to $1.0\sigma$. Our experimental results in \cref{exp:effectiveness1,exp:effectiveness2,exp:effectiveness3} demonstrate that corrupting only $5\%$ of the data points using DOPE is sufficient to observe a significant error in the value estimate of a given policy. It is important to realize that the attacker's budget $\eps$ is local to each data point and is not impacted by the number of points perturbed. Hence, we see that a larger percentage of corrupt data points yields a larger percentage error in the value estimates. Note that the percentage errors of CPDIS and WIS in~\cref{fig:R21,fig:R31} are too small relative to BRM and WDR and therefore are not clearly visible in the plots. 

Finally, we summarize the impact of DOPE attack ($\eps=0.5\sigma$ and $\alpha=1.0$, $p=1$) on all OPE methods and domains in \cref{table:summary}. 
It can be seen that the DOPE attack has a very high impact on BRM, PDIS, and WDR methods and an almost negligible impact on CPDIS and WIS methods. We hypothesize that CPDIS and WIS methods may be more robust because the weight normalization that they employ potentially minimizes the importance of any individual data point, especially when the rewards are uniformly distributed throughout the trajectory. On the other hand, the weights in PDIS are not normalized, and therefore, it appears that in Cartpole and HIV domains, the DOPE attack model is able to significantly impact the importance sampling weights and result in significant errors in the value estimates. In WDR, the attacker can introduce errors through both the Q-value function learned from the data and the importance sampling weights, and therefore, we observe significant errors in the value estimates of the $\wdr$ method in HIV and Gridworld domains. 

\subsection{Comparison with Baselines} 
Here we compare the DOPE attack to three custom baselines: Random Attack, Random DOPE Attack (an ablation of the DOPE attack), and FGSM-based Attack. In Random Attack, we choose $\alpha n$ random points to perturb and sample perturbations for these points uniformly from an $\ell_1$-norm ball with a radius equal to the perturbation budget $\eps$. For more details on the sampling algorithm, see Algorithm 4.1 in~\cite{Calafiore1998samplinguniform}. In Random DOPE Attack, we select points randomly and update them using \cref{prop_greedy}. The purpose of using this ablation is to investigate the benefit of selecting data points to perturb based on their influence scores as suggested in  \cref{prop_greedy}.
The third baseline is an FGSM-based OPE attack which is a variant of the Fast Gradient Sign Method (FGSM)~\cite{GoodfellowSS2015Explaining}, a popular test-time attack designed to elicit misclassification errors from supervised learning models. Note that FGSM has never been used to attack OPE methods in prior literature, and we are the first to introduce and leverage a variant of it as a baseline in this context. Our FGSM-based OPE attack baseline modifies the transition tuples (features) $\psi(s,a,r)$ to maximize the (supervised learning) loss ($L(\theta)$) optimized by the OPE method, thus resulting in sub-optimal estimates of $\theta$. Note that the FGSM-based OPE attack baseline does not directly maximize the error in the value function estimates, unlike our proposed framework. Given these baselines, we fix the value of $\alpha$ to $0.05$ and vary the budget $\eps$ from $0.0$ to $0.25$ with step size $0.04$.

For each dataset and each value of the budget $\eps$, we average the percentage change in the value estimate for Random DOPE attack and Random attack over 50 trials. Results with the HIV domain are shown in Figure~\ref{exp:baselines1}. See \cref{exp:baselines:cancer,exp:baselines:gridworld,exp:baselines:hiv,exp:baselines:mountaincar} in~\cref{app:experiments} for results on other datasets. 

The experimental results in \cref{exp:baselines1,exp:baselines:cancer,exp:baselines:gridworld,exp:baselines:hiv,exp:baselines:mountaincar} demonstrate that in contrast to the DOPE attack, the Random attack and FGSM-based attack fail to introduce any significant error in the value-function estimate and, therefore, cannot be used as an alternative to the DOPE attack model.
Further, it can be seen that when the points to perturb are randomly selected (Random DOPE), it is likely to result in a smaller adversarial impact than when influential data points are chosen for perturbations (DOPE). These results are not surprising as we would expect the value function to be highly dependent on the influential data points. In some domains like Cancer and HIV, there is very little difference between the performance of DOPE and Random DOPE attacks. We hypothesize that this is due to all data points having similar influence scores.

\section{Conclusion}

We proposed a novel data poisoning framework to analyze the sensitivity of OPE methods to adversarial contamination at train time. We formulated the data poisoning problem as a bilevel optimization problem and proposed a computationally tractable solution that leverages the notion of influence functions from robust statistics literature. Using the proposed framework, we analyzed the sensitivity of five popular OPE methods on multiple datasets from medical and control domains. %To the best of our knowledge, our work is the first to study the sensitivity of a wide range of OPE methods to train-time data-poisoning attacks. 
Our experimental results on various medical and control domains demonstrated that existing OPE methods are highly vulnerable to adversarial contamination, %. Our findings question the reliability of policy values derived using OPE methods
thus highlighting the need for developing OPE methods that are statistically robust to train-time data poisoning attacks.
% Future work includes extending the framework to broader classes of OPE methods and testing defense methods.

% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
% EL conceptualized the idea, implemented the experiments, and wrote the paper. HS helped in conceptualizing the idea and writing. HL, MP, and CR supervised the project and helped with writing.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
% % The authors would like to thank the anonymous reviewers for their helpful feedback and all the funding agencies listed below for supporting this work. This work is supported in part by the NSF awards \#IIS-2008461 and \#IIS-2040989, and research awards from Amazon, Harvard Data Science Institute, Bayer, and Google. HL would like to thank Sujatha and Mohan Lakkaraju for their continued support and encouragement. The views expressed here are those of the authors and do not reflect the official policy or position of the funding agencies.
% \end{acknowledgements}

%\clearpage
\bibliography{lobo_674}
%\bibliographystyle{icml2022}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\clearpage
% \appendix
% % \section{Notation}\label{app_notation}
% % \section{Experimental Results:}
% % \begin{figure*}[h!]
  
% % \begin{tabular}{ |p{3cm}|p{3cm}|p{3cm}|  }
% % \hline
% % \multicolumn{3}{|c|}{Country List} \\
% % \hline
% % Country Name or Area Name& ISO ALPHA 2 Code &ISO ALPHA 3 \\
% % \hline
% % Afghanistan & AF &AFG \\
% % Aland Islands & AX   & ALA \\
% % Albania &AL & ALB \\
% % Algeria    &DZ & DZA \\
% % American Samoa & AS & ASM \\
% % Andorra & AD & AND   \\
% % Angola & AO & AGO \\
% % \hline
% % \end{tabular}
% %     \caption{Caption}
% %     \label{fig:my_label}
% % \end{figure*}


% % \marek{you already define $D$ earlier. What are these transitions in the set $D$ and how do they relate to the original definition of the dataset? Can you define the dataset in terms of $\Psi_i$ from the outset? Or better explain what exactly these $\Psi_i$ are and why you are introducing them.} 
% % We will use $\Delta = (\delta_i)_{i=1}^{N}$  to denote the perturbation matrix where $\delta_i \in \real^Q$ is a vector of perturbations added to $\Psi_i$. 
% % Finally, we will $\Psi'=\Psi+ \Delta$ to denote the resultant corrupt matrix obtained by adding perturbations $\Delta$ to $\Psi$.
% % \section{Proofs}\label{app_proofs}



% %  \elita{add observations}

% % \section{Approximately Optimal Perturbations}


% % Hence, we can write the optimization problem in~\ref{approx_attack_2} as
% % \begin{equation}\label{approx_1}
% %     \begin{aligned}
% %     \max_{s\in \{0,1\}^N} & \sum_{k=1}^N \max_{\{\delta_k\} \in \Real^{Q}}  -   I_{\Psi_i} \tr \delta_k\\
% %   & \sum_k s_k = \alpha \cdot N \\
% %   &  \|\delta_k \|_p \leq \eps \cdot s_k, \; k = 1, \ldots N.
% % \end{aligned}\end{equation}
% % It thus follows that $\forall k \in [1, \dots N]$, the optimal perturbation $\delta^{*}_k$ can be independently computed by solving  $\delta^{*}_k \in \argmax_{x} {I_{\Psi_k,\theta,\Psi }^T x} \st \|x\|_p \leq \epsilon$. This is easy to compute for $p=1,2,\infty$ as we show in \cref{prop_greedy}.

% % Next, we need to solve for the optimal value of $s$. From the theory of convex optimization, we know that the p-norm of any vector $x \in \real^M$ $\|x\|_q$ can be expressed as $\|x\|_q= \max {z^Tx} \st \|z\|_p \leq 1$ where $\frac{1}{p} + \frac{1}{q}=1$. \hs{q might be confused with q function?} Hence, given the optimal-perturbation $\delta^{*}_k \, \forall k \in[1,\dots n]$, the problem in \cref{bilevel:approx} boils down to solving \hs{reference to be edited}
% % \begin{equation}
% %     \begin{aligned}
% %      \max_{s\in \{0,1\}^N} & \sum_{k=1}^n \|I_{\Psi_k,\theta,\Psi}\|_q  \\
% %       & \sum_k s_k = \alpha \cdot n. 
% %     \end{aligned}
% % \end{equation}
% % %\hima{lot of white space around equations needs to be fixed}
% % It is now easy to see that the optimal set of transitions for the approximate attack problem in~\ref{bilevel:approx} is simply the set of $\alpha n$ transitions with the largest value of the $q$-norm of their influence scores. We formally state these results in \cref{prop_greedy}.


% % \section{Implementation Details}
% % \paragraph{Implementation Details:}

% % For each domain, we generate Radial Basis features for each observed state $s$ and next-state $s'$ in the transition tuples $(s,a,r,s') \in D$. For simplicity, we assume that every $(s,s')$ pair in $(s,a,r,s') \in D$ is unique. We note that, in our experiments, the attacker only perturbs original features of observed states $s$ in the transition tuples and these perturbations do not affect the observed next-states $s'$ in $(s, a,r,s') \in D$. Further, we use Regularized Ridge Regression to estimate the parameters of the q-value function in $\brm$ and Regularized Logistic Regression to estimate the behavior policies in IS and DR methods. For simplicity, we assume that the behavior policy in DR is provided by the experts and not learned from $D$. In spite of this assumption, the attacker can still exploit the importance-sampling weights to construct We provide the details of the hyperparameter used, in the Appendix.


% % \section{Related Work}
% % \section{Additional Preliminaries}

% % \section{Takeaways}
% % \begin{itemize}
% %     \item With $\epsilon$ value as small as $0.5 std(x_1-x_2)$, we can 
% % \end{itemize}

% \section{Additional Preliminaries}\label{apps:preliminary}
% \paragraph{Influence functions}

% % \hima{can we just get rid of influence functions preliminaries? and just explain as we are proposing a solution?}
% The influence function is a popular tool used to quantify the change in an empirically learned estimator under small changes in the data. Consider a supervised learning problem with input space $\mathcal{X}$ and output space $\mathcal{Y}$, a batch of data $(z_i)_{i=1}^n$ where $z_i =(x_i,y_i) \in \mathcal{X} \times \mathcal{Y}$ and an unknown prediction function $f: \mathcal{X} \to \mathcal{Y}$ where $f$ is parameterized by $\theta \in\Theta$. Given a convex and twice-differentiable loss function $L(\theta,z)$ such that $L: \Theta \times (\mathcal{X} \times \mathcal{Y}) \to \real$ and $\theta \in \argmin_{\theta' \in\Theta} \frac{1}{n}\sum_{i=1}^n L(\theta',z_i)$ is the empirical risk minimizer,
% the effect $I_{z,\theta,D}$ of perturbing a data point $z$ $\to$ $z_{\delta}=(x+ \delta,y)$ on the parameter $\theta$ can be approximated via Taylor expansion as
% \begin{equation}\label{eq:influence_fn}
%     \begin{aligned} 
%     & I_{z, \theta,D} = \frac{\theta_{z,\delta} - \theta}{\delta} \approx \frac{\partial  \theta} {\partial x}    \\ & \approx  \left(- H_{\theta}^{-1} \frac{\partial^2 L(\theta,z)}{\partial \theta \partial x}\right)
%      \text{ where } H_{\theta} = \frac{\partial^2 L(\theta,D)}{\partial \theta^2}
% \end{aligned}
% \end{equation}
% where $\theta_{z,\delta}$ are the new optimal parameters learned from the training data point after replacing $z$ by $z_\delta$.
% % To construct an adversarial attack the attacker simply needs to perturb the transitions in the direction that maximizes its influence on the prediction error. We will adopt the influence-function to determine the direction and magnitude of perturbation that needs to be added to attack OPE methods.
% We refer the readers to \cite{koh2018stronger} for more details.
% \section{Proofs:}\label{app:proofs}
% \begin{proof}[Proof of \cref{prop_greedy}]\label{proof:prop_greedy}
% Recall the optimization problem in~\eqref{bilevel:approx}:
% % \begin{equation} \label{bilevel:approx_app}
% % \begin{aligned}
% %     & \max_{s\in \{0,1\}^n} \max_{\{\delta_k\}_{k=1}^N \in \Real^{n \times Q}}   \sum_{k=1}^n s_k \cdot I_{\Psi_i} \tr \delta_k  \\
% %    & \text{ subject to } \sum_{k=1}^n s_k = \alpha \cdot n, \\ &
% %    \qquad  \|\delta_k \|_p \leq \eps \cdot s_k, \; k = 1, \ldots n.
% % \end{aligned}
% % \end{equation}
% %
% \begin{equation} \label{bilevel:approx_app}  
% \begin{aligned}
%   &\max_{s\in \{0,1\}^n} \max_{\{\delta_i\}_{i=1}^N} %\in \Real^{n \times Q}}
%     \left\{\sum_{i=1}^n 
%     s_iI_{\Psi_i} \tr \delta_i \mid \|\delta_i \|_p \leq \eps, \forall i\right\} ,\\
%   &\text{ subject to } \sum_{i=1}^n s_i = \alpha \cdot n~.
% \end{aligned}
% \end{equation}
% Notice that in~\eqref{bilevel:approx_app}, for every $k \in \{1, \dots, n\}$, the influence score $I_{\Psi_k,\theta,\Psi}$ is independent of $\delta_k$, and so the optimal perturbation $\delta^{*}_k$ can be computed independently by solving $\delta^{*}_k \in \argmax_{x} \{I_{\Psi_k,\theta,\Psi }^T x \mid \|x\|_p \leq \eps\}$. The $q$-norm $\|x\|_q$ of any vector $x \in \real^M$ can be expressed using its dual norm as $\|x\|_q= \max \left\{ z^T x \mid \|z\|_p \leq 1 \right\}$ where $\nicefrac{1}{p} + \nicefrac{1}{q}=1$~\cite{boyd2004convex}, so the optimal value of the inner problem is $\eps \|I_{\Psi_k,\theta,\Psi}\|_q$. Thus, given the optimal perturbation $\delta^*_k$ for each $k \in \{1,\dots, n\}$, the problem in~\eqref{bilevel:approx} boils down to solving
% \begin{equation}
%     \begin{aligned}
%      \max_{s\in \{0,1\}^n} \; & \eps \sum_{k=1}^n s_k \|I_{\Psi_k,\theta,\Psi}\|_q  \\
%      \text{subject to } & \sum_{k=1}^n s_k = \alpha \cdot n.
%     \end{aligned}
% \end{equation}

% It is now easy to see that the optimal set of transitions for the approximate attack problem in~\eqref{bilevel:approx} is simply the set of $\alpha n$ transitions with the largest value of the $q$-norm of their influence scores. The closed-form solution for $\delta^{*}_k$ at $p=1,2,\infty$ follows from standard convex optimization results for dual norms~\cite{boyd2004convex}.
% \end{proof}

% \section{Experimental Details:}\label{app:experiments}
% \subsection{Additional Optimization Tricks used in experiments:}
% \begin{enumerate}
%     \item Recall that we use the DQN algorithm to learn the optimal Q-value function using a neural network, from which we derive the evaluation policy. In the case of the Cartpole and Mountain Car domains, we use this Q-value network to transform the state features into features $\phi(s, a)$. Specifically, we use the output of the second last layer of the Q-value network as the transformed state features. We do this to get a more accurate feature representation for linear function approximators which in turn would result in a more accurate initial value function estimate.
%     \item In all our experiments, we use line-search to find the optimal step size to update the state features with the perturbations derived using~\cref{prop_greedy}. If for a given attacker's budget, we have access to the error in the value-function estimate for a lower value of the attacker's budget, then, we use it as the minimum threshold error to achieve while applying the line search. Applying this method enables us to achieve a monotonic trend in the percentage error in the value estimate with respect to the perturbation budget. The monotonic trend is otherwise difficult to achieve especially when the Loss function is non-convex.
    
%     \item To optimize the DOPE objective for any given OPE method, we need the evaluation policy's action probabilities to be differentiable. In the case where the evaluation policy is a deterministic Q-learning policy, we obtain differentiable action probabilities by applying a softmax to the q-values with a very small temperature.
    
%     \item Link to code: \href{https://github.com/elitalobo/DOPE}{https://github.com/elitalobo/DOPE}
% \end{enumerate}





% \begin{figure*}[h!]
%     \centering
%     \begin{tabular}{ |p{5cm}||p{5cm}|  }
%  \hline
%  \multicolumn{2}{|c|}{Hyperparameter values for Cancer domain} \\
%  \hline
%  Hyperparameter & Value\\
%  \hline
% Number of trajectories & 500 \\
% Policy Network layers &  $64\times 28$ \\
% Normalize rewards & No \\
% Regularization for $\pi_b$ &  1e-2 \\
% Regularization for $q_{\eta}$ & 1e-2\\
% Discount factor & 0.95 \\
% Trajectory Length (T) & 30 \\
% Direction of Attack &  +1\\
% Num. Epochs for CEL & 5000 \\
%  \hline
% \end{tabular}
% \end{figure*}


% \begin{figure*}[h!]
%     \centering
%     \begin{tabular}{ |p{5cm}||p{5cm}|  }
%  \hline
%  \multicolumn{2}{|c|}{Hyperparameter values for HIV domain} \\
%  \hline
%  Hyperparameter & Value\\
%  \hline
% Number of trajectories & 1000 \\
% Policy Network layers &  $300\times 50$ \\
% Normalize rewards & Yes \\
% Regularization for $\pi_b$ &  1e-2 \\
% Regularization for $q_{\eta}$ & 1e-2\\
% Discount factor & 0.98 \\
% Trajectory Length (T) & 50 \\
% Direction of Attack &  -1\\
% Num. Epochs for CEL & 5000 \\
%  \hline
% \end{tabular}
% \end{figure*}


% \begin{figure*}[h!]
%     \centering
%     \begin{tabular}{ |p{5cm}||p{5cm}|  }
%  \hline
%  \multicolumn{2}{|c|}{Hyperparameter values for Continuous Gridworld domain} \\
%  \hline
%  Hyperparameter & Value\\
%  \hline
% Number of trajectories & 500 \\
% Policy Network layers &  $24$ \\
% Normalize rewards & No \\
% Regularization for $\pi_b$ &  1e-2 \\
% Regularization for $q_{\eta}$ & 1e-2\\
% Discount factor & 0.95 \\
% Trajectory Length (T) & 50 \\
% Direction of Attack &  -1\\
% Num. Epochs for CEL & 5000 \\
%  \hline
% \end{tabular}
% \end{figure*}


% \begin{figure*}[h!]
%     \centering
%     \begin{tabular}{ |p{5cm}||p{5cm}|  }
%  \hline
%  \multicolumn{2}{|c|}{Hyperparameter values for MountainCar domain} \\
%  \hline
%  Hyperparameter & Value\\
%  \hline
% Number of trajectories & 250 \\
% Policy Network layers &  $60$ \\
% Normalize rewards & No \\
% Regularization for $\pi_b$ &  1e-2 \\
% Regularization for $q_{\eta}$ & 1e-2\\
% Discount factor & 0.99 \\
% Trajectory Length (T) & 150 \\
% Direction of Attack &  +1\\
% Num. Epochs for CEL & 5000 \\
%  \hline
% \end{tabular}
% \end{figure*}


% \begin{figure*}[h!]
%     \centering
%     \begin{tabular}{ |p{5cm}||p{5cm}|  }
%  \hline
%  \multicolumn{2}{|c|}{Hyperparameter values for Cartpole domain} \\
%  \hline
%  Hyperparameter & Value\\
%  \hline
% Number of trajectories & 1000 \\
% Policy Network layers &  $100\times 24$ \\
% Normalize rewards & No \\
% Regularization for $\pi_b$ &  1e-2 \\
% Regularization for $q_{\eta}$ & 1e-2\\
% Discount factor & 0.98 \\
% Trajectory Length (T) & 100 \\
% Direction of Attack &  +1\\
% Num. Epochs for CEL & 5000 \\
%  \hline
% \end{tabular}
% \end{figure*}

% \subsection{Additional Domain Details:}

% \emph{Cancer:} This domain~\cite{gottesman2020interpretable} models the growth of tumors in cancer patients. It consists of 4-dimensional states which represent the growth dynamics of the tumor in the patient, and two actions that indicate if a given patient is to be administered chemotherapy or not at a given time step. \newline
% \emph{HIV:} The HIV domain has 6-dimensional states representing the state of the patient, and four actions that represent four different types of treatments.  \newline
% \emph{MountainCar:} In the Mountain Car~\cite{brockman2016openai} domain, the task is to drive a car positioned between two mountains to the top of the mountain on the right in the shortest time possible. The 2-dimensional state represents the car's current position and the current time-step, and the three actions represent: drive forward, drive backward, and do not move.  \newline
% \emph{Cartpole:}
% The Cartpole domain~\cite{brockman2016openai} models a simple control problem where the goal is to apply +1/-1 force to keep a pole attached to a moving cart from falling. The 2-dimensional state represents the cartpole dynamics, and the two actions represent the force applied to the pole.  \newline
% % \emph{Continuous Gridworld:}
% % The Continuous Gridworld is a custom domain that consists of a 2-dimensional state space that represents the coordinates of the agent, and two actions $a_0,a_1$ that determine the direction and step size of the agent. The agent begins at coordinate $(1,1)$ and moves towards coordinates $(50,50)$ to maximize its rewards.
% \emph{Continuous Gridworld:}
% The gridworld domain consists of a 2-dimensional state space that represents the coordinates of the agent and two actions $(a_0,a_1)$ that determine the direction and step size of the agent. The task is to begin at coordinate $(1,1)$ and move towards coordinates $(50,50)$. Taking action $a_0$ at $(x,y)$ transitions the agent to $(x+0.2,y+0.45)$ with probability 1.0. On the other hand, taking action $a_1$ transitions the agent to $(x+0.3,y+0.5)$ with probability 0.95 and to $(1,1)$ with probability 0.05.
% If the agent transitions to $(x',y')$, the agent receives a reward of $(x+ 0.5y)$. We set the maximum length of the episode to 50 and collected 500 trajectories using the behavior policy.

% \section{Examples of twice continuously differentiable loss functions for DOPE Framework :}\label{app:examples}
% All the loss functions ($L$) that we leverage in this work such as Mean Squared Bellman residual (MSBR) for learning the Q-value function, and the Cross-Entropy Loss (referred to as CEL in the paper) for fitting the multinomial logistic regression model are twice continuously differentiable with respect to the parameters $\theta$. Below, we show that these loss functions are twice continuously differentiable. 

% In BRM and WDR, $\theta=\eta$ represents the parameters of the q-value function $q_\eta$. The parameters $\eta$ are estimated from the data by minimizing the Mean Squared Bellman Residual (MSBR). We compute the derivative of MSBR below to show that this loss function is twice differentiable and satisfies the assumption of our attack framework.
% \begin{equation}
%     \begin{aligned}
%       \text{MSBR}(\eta) \;&=\;  \|q_{\eta}-\mathcal{T}^{\pi} q_{\eta}\|^2_W \\
%      &= \|\Phi \eta - (r + \gamma \Phi_p \eta)\|_2^2 \\
%     &\frac{\partial \text{MSBR}(\eta)}{\partial \eta} = 2 \cdot (\Phi - \gamma \Phi_p)^T (\Phi \eta - (r + \gamma \Phi_p \eta)) \\
%     &\frac{\partial^2 \text{MSBR}(\eta)}{\partial \eta^2} = 2 \cdot (\Phi - \gamma \Phi_p)^T (\Phi - \gamma \Phi_p) 
%     \end{aligned}
% \end{equation}

% In the case of Importance Sampling-based OPE methods such as WIS, PDIS, and CPDIS, the behavior policy parameters ($\theta=\theta_b \in \real^{A \cdot d}$ ) are estimated from the data using a multinomial logistic regression model. Hence, we compute below the second-order derivative of the cross-entropy loss of the multinomial logistic regression model and show that this loss function is twice differentiable as well and satisfies the assumption of our attack framework.

% The cross-entropy loss for $\theta=\theta_b$ is given by
% \begin{equation}
%     \begin{aligned}
%      \text{CEL}(\theta) &= \log\left(\prod_{l=1}^n  \frac{\exp(\theta_{a_l}^T \xi(s_l))}{\sum_{i=1}^A \exp(\theta_i^T \xi(s_l)) } \right) \\
%      &= \sum_{l=1}^n  \log\left( \frac{\exp(\theta_{a_l}^T \xi(s_l))}{\sum_{i=1}^A \exp(\theta_i^T \xi(s_l)) } \right) \label{eq:1} \\
%      &= \sum_{l=1}^n \left(\theta_{a_l}^T \xi(s_l) - \log\left(\sum_{j=1}^A \exp(\theta_j^T \xi(s_l))\right)\right).
%     \end{aligned}
% \end{equation}

% We can compute the second order derivative of the cross entropy loss as follows: 
% \begin{equation}
%     \begin{aligned}
%      \frac{\partial  \text{CEL}(\theta)}{\partial \theta_{a_l}} &= \sum_{l=1}^n \left(\xi(s_l) - \frac{\exp(\theta_{a_l}^T \xi(s_l))\xi(s_l)}{\sum_{j=1}^A \exp(\theta_j^T \xi(s_l)) }\right) \\
%   \frac{\partial^2  \text{CEL}(\theta)}{\partial \theta_{a_l} \partial \theta_k \, (k\neq a_l)} &= \sum_{l=1}^n \frac{\exp(\theta_{k}^T \xi(s_l))\exp(\theta_{a_l}^T\xi(s_l))\xi(s_l)^T\xi(s_l)}{(\sum_{j=1}^A \exp(\theta_j^T \xi(s_l)))^2} \\
%   \frac{\partial^2  \text{CEL}(\theta)}{\partial \theta_{a_l}^2} &= \sum_{l=1}^n \left(-\frac{\exp(\theta_{a_l}^T \xi(s_l))  \xi(s_l)^T \xi(s_l)}{\sum_{j=1}^A \exp(\theta_j^T\xi(s_l))} + \right. \\ & 
%   \left. \frac{\exp(\theta_{a_l}^T \xi(s_l))^2 \xi(s_l)^T \xi(s_l)}{(\sum_{j=1}^A \exp(\theta_j^T\xi(s_l)))^2} \right).
%     \end{aligned}
% \end{equation}

% \section{Related Work}~\label{app:relatedworks}
% Adversarial attacks have been extensively studied in Reinforcement Learning~\cite{gleave2021adversarial,Wu2021AttackInfluence,lin2019tactics,Zhang2020Adaptive,zhang2019online,Kiourti2019TrojDRLTA,Chen2019adversarial}. These attacks can be broadly classified into two main categories---train-time attacks (data-poisoning attacks) and test-time attacks.

% \textbf{Test-time attacks:} In test-time attacks in RL~\cite{lin2019tactics,gleave2021adversarial,Behzadan2017VulnerabilityOD,kos2017delving,Wu2021Adversarial,Chen2019adversarial,huang2017adversarial}, the attacker manipulates test-time observations to fool the agent to take target malicious actions, without directly changing the agent's policy. In this setting, the noise added to the test-time observations at any time step does not directly impact the agent's future decisions. A large majority of the work that focuses on test-time attacks aims to either minimize the agent's rewards~\cite{huang2017adversarial,Behzadan2017VulnerabilityOD} or lead the agent to adversarial states~\cite{lin2019tactics}, which differs from our goal of perturbing train-time observations to maximize error in the value estimate of a given policy for a given OPE method.

% \textbf{Train-time attacks:} 
% In train-time or data-poisoning attacks, the adversary perturbs the training data by a small margin to facilitate erroneous learning of decision models. Prior work on data poisoning has mainly targeted supervised learning models in Machine Learning~\cite{koh2018stronger,koh2017influence,fang2020influence,Wu2021AttackInfluence,steinhardt2017certified}. However, recently there has been emerging interest in data-poisoning attacks on Batch RL agents~\cite{zhang2021corruptionrobust,Ma2019PolicyPI,rakhsha2020policy} and Online RL Agents~\cite{zhang2019online,Zhang2020Adaptive,rakhsha2020policy,Zhang2008PolicyTeaching,Zhang2009policyteaching}. In a pioneering research work,~\cite{zhang2019online} proposed a framework that perturbs rewards such that a batch RL agent learns an adversarial target policy. In follow-up work,~\cite{rakhsha2020policy} proposed a framework for poisoning rewards and transition dynamics to force a Batch agent to learn an adversarial target policy. In~\cite{wu2022copa}, the authors propose methods to certify the robustness of a policy learned from offline data after a poisoning attack. It outputs the least cumulative reward that can be attained by a poisoned policy. \cite{zhang2019online} develops fast adaptive data-poisoning attacks on online RL agents where rewards must be perturbed in real time. Nonetheless, these data-poisoning works differ from our work in two main aspects: (a) they target learning of optimal adversarial policies, whereas our work targets learning erroneous value-function estimates for any given policy and OPE method, and (b) our main goal is to analyze the sensitivity of different OPE algorithms to train-time attacks, which has not been explored in any of these previous works.
% %  and multi-armed bandits~\cite{ma2018data,LiuS19a,Jun2018AdversarialAO}
% % A growing body of works~\cite{Ng99policyinvariance,asmuth2008RewardShaping} has studied reward-shaping in Reinforcement Learning, which is similar in mechanism to data-poisoning attacks. However, in reward shaping, the rewards in the training data are manipulated to speed up the convergence of learning algorithms while guaranteeing that the optimal policy remains unchanged.
% % In contrast, data-poisoning attacks perturb rewards and/or observations to achieve adversarial outcomes.

% Finally, our work is similar in vein to the bilevel-optimization framework proposed by~\cite{koh2018stronger} for data-poisoning attacks on supervised learning algorithms with data sanitization defense mechanisms. However, in contrast to this work, we exploit specific properties of OPE algorithms to construct stronger data-poisoning attacks as well as compare the sensitivity of different OPE algorithms in RL.

% \textbf{Influence functions:}
% The influence function was originally introduced in robust statistics~\cite{cook1980influence,hampel1974influence} to understand the effect of perturbing or removing a training data point on small linear models estimated from the data. In more recent work, influence functions have been used as a diagnostic tool in deep learning and reinforcement learning algorithms to detect adversarial training data points~\cite{broderick2021automatic,koh2018stronger,koh2017influence,gottesman2020interpretable,cohen2020detecting}, for optimal sub-sampling~\cite{Ting2018optimalsubsampling}, and to aid decision-policy optimization~\cite{Munos02variableresolution}. A few works have also proposed influence-function-based data-poisoning attacks on supervised learning algorithms~\cite{koh2018stronger,koh2017influence,Wu2021AttackInfluence,fang2020influence}. However, our work differs from theirs in terms of context (reinforcement learning) and objectives optimized.

% \section{Experimental Results:}\label{app:experimental_results}

    
% \subsection{Effect of increasing randomness of the behavior policy on DOPE Attack:}
% In all our experiments, we chose small values of $\epsilon$ for the behavior policy to examine the cases where the OPE methods are difficult to attack. A larger value of epsilon would result in a larger state-action distribution mismatch between the datasets collected using the behavior policy and the datasets that would have been collected with the evaluation policy. This distribution mismatch would result in large importance sampling weights and out-of-distribution estimation errors and increase the variance in the value function estimates. As a result, the OPE methods would become more brittle and thus, more vulnerable to data poisoning attacks. 

% To illustrate this effect, we compare the percentage error in the value function estimates of a near-optimal policy in the HIV domain for two different values of $\epsilon$, 0.05 and 0.25. For this experiment, we set the perturbation budget to $\eps=0.5\sigma$ and the percentage of corrupt points to $\alpha=0.05$. We report the interquartile mean of the percentage error in the value function estimates observed across 5 trials in~\cref{table:summary3}. Our results in~\cref{table:summary3} indicate that OPE methods like BRM and WDR are more vulnerable to the data poisoning attack for larger values of $\epsilon$.


% \begin{table*}[h!]
% \centering
% \begin{tabular}{|p{2cm}||rr|rr|rr|rr|rr|rr|} 
%  \hline
%  Methods & BRM & WIS & PDIS	& CPDIS &	WDR\\
%  \hline
% epsilon=0.05 &	334.2 &	5.83e-3	& 1.61 &	0.22 &	118.35 \\
% \hline
% epsilon=0.25 & 427.15 & 0.0 & 2.59 & 0.06 & 1489.22\\
% \hline
% \end{tabular}
% \caption{\label{table:summary3} Percentage errors in the value function estimates observed for different values of the behavior-policy randomness $\epsilon$ on the HIV domain.}
% \end{table*}

% \subsection{Anomaly Detection Methods}
% In this experiment, we investigate if standard anomaly detection methods can identify the poisoned data points from the dataset.
% % We would like to note that the budget constraint embedded in our optimization problem (Equation 8d in the main paper) ensures that the original data instances are perturbed in a manner that cannot be easily detected by naive anomaly detection techniques. Specifically, we set the perturbation budget to be a constant time of the *standard deviation* of the original data instances (e.g., $\varepsilon=0.5\sigma$ as discussed in Section 5.2 in the main paper). This ensures that the perturbed data points are not too far off from the original instances. 

% For this purpose, we use two popular state-of-the-art anomaly detection methods~\cite{Emmott2013Systematic}, namely, the Isolation Forests~\cite{Liu2008IsolationF} and the Local Outlier Factor~\cite{Breunig2000LOF} method. We set the perturbation budget $\eps$ to be $0.5 \sigma$ and the percentage of corrupt points to be $\alpha=0.05$. We report the True Positive Rate (Fraction of perturbed data points tagged as outliers) and the False Positive Rate (Fraction of original data instances tagged as outliers). Our experimental results with the aforementioned anomaly detection methods, and the WDR OPE method across Cancer, HIV, and Gridworld domains are shown in~\cref{table:summary1} and~\cref{table:summary2}. While the Isolation Forests method has a high true positive rate, it also has a high false-positive rate indicating that several original data instances are being tagged as outliers. On the other hand, the Local Outlier Factor method exhibits low true positive and false-positive rates. The following results suggest that the perturbed data points are not readily distinguishable from the original data instances. 
% These results are not surprising as the budget constraint embedded in our optimization problem~\cref{eq:bilevel_three} ensures that the original data instances are perturbed in a manner that cannot be easily detected by naive anomaly detection techniques. 

% % | OPE Method= WDR 	| True Positive Rate (Fraction of<br> perturbed data points tagged as outliers) 	| False Positive Rate (Fraction of original <br>data instances tagged as outliers) 	|
% % |---	|---	|---	|

% % | Cancer 	| 1.0 	| 0.26 	|
% % | HIV 	| 0.4 	| 0.15 	|
% % | Gridworld 	| 1.0 	| 0.78 	|

% \begin{table*}[h!]
% \centering
% \begin{tabular}{|p{3cm}||rr|rr|rr|} 
%  \hline
%  OPE Method = WDR & True Positive Rate &  False Positive Rate \\
%  \hline
%  Cancer & 1.0 & 0.26 \\
%  \hline
%  HIV & 0.47 & 0.16 \\
%  \hline
%  Gridworld & 1.0 & 0.9 \\
%   \hline
% \end{tabular}
% \caption{\label{table:summary1}Results with Isolation Forests anomaly detection method and WDR Method.}
% \end{table*}

% \begin{table*}[h!]
% \centering
% \begin{tabular}{|p{3cm}||rr|rr|rr|} 
%  \hline
%  OPE Method = WDR & True Positive Rate  & False Positive Rate \\
%  \hline
%   Cancer 	&  0.02 &  0.05 	\\
%   \hline
%  HIV 	&  0.08 	& 0.07 	\\
%  \hline
%  Gridworld 	& 0.01 	& 0.07 	\\
%  \hline
% \end{tabular}
% \caption{\label{table:summary2}Results with Local Outlier Factor anomaly detection method and WDR Method.}
% \end{table*}

% \begin{table*}[h!]
% \centering
% \begin{tabular}{|p{3cm}||rr|rr|rr|} 
%  \hline
%  OPE Method = PDIS & True Positive Rate &  False Positive Rate \\
%  \hline
%  Cancer & 1.0 & 0.31 \\
%  \hline
%  HIV & 0.17 & 0.17 \\
%  \hline
%  Gridworld & 1.0 & 0.5 \\
%   \hline
% \end{tabular}
% \caption{\label{table:summary1a}Results with Isolation Forests anomaly detection method and PDIS method.}
% \end{table*}

% \begin{table*}[h!]
% \centering
% \begin{tabular}{|p{3cm}||rr|rr|rr|} 
%  \hline
%  OPE Method = PDIS & True Positive Rate  & False Positive Rate \\
%  \hline
%   Cancer 	&  0.0 &  0.05 	\\
%   \hline
%  HIV 	&  0.32 	& 0.07 	\\
%  \hline
%  Gridworld 	& 0.03 	& 0.06 	\\
%  \hline
% \end{tabular}
% \caption{\label{table:summary2a}Results with Local Outlier Factor anomaly detection method and PDIS Method.}
% \end{table*}



% % | OPE Method =WDR 	| True Positive Rate(Fraction of perturbed <br>data points detected) 	| False Positive Rate (Fraction of unperturbed <br>data points detected as outliers) 	|
% % |---	|---	|---	|
% % | Cancer 	| 0.02 	| 0.05 	|
% % | HIV 	| 0.09 	| 0.07 	|
% % | Gridworld 	| 0.02 	| 0.07 	|

% \subsection{Effectiveness of DOPE Attack}
% % \begin{figure*}[h!]
% %     \centering
% %     \begin{subfigure}[b]{0.33\textwidth}
% %     \centering
% %     \includegraphics[width=0.8\linewidth]{plots/experiments8/cartpolesubset_experiment88_budget.png}
% %     %\vpsace{-0.3cm}
% %      \caption{    \label{fig:R14}}
% %     \end{subfigure}
% %      \begin{subfigure}[b]{0.33\textwidth}
% %     \centering
% %     \includegraphics[width=0.8\linewidth]{plots/experiments8/mountaincarsubset_experiment88_budget.png}
% %     %\vpsace{-0.3cm}
% %      \caption{    \label{fig:R24}}
% %     \end{subfigure}
% %     \caption{\label{exp:effectiveness3} \cref{fig:R14,fig:R24}
% %   compares the effect of DOPE attack on BRM, IS, PDIS and CWPDIS methods in Cartpole and MountainCar domains (left to right) for different values of attacker's budget $\epsilon$.}
% %      \end{figure*}
%       \begin{figure}[h!]
%     \centering
%     \begin{subfigure}[b]{0.5\linewidth}
%     \centering
%     \includegraphics[width=\linewidth]{plots/experiments8/cartpolesubset_experiment88_budget_0.png}
%     %\vpsace{-0.3cm}
%      \caption{Cartpole    \label{fig:R14}}
%     \end{subfigure}%
%      \begin{subfigure}[b]{0.5\linewidth}
%     \centering
%     \includegraphics[width=\linewidth]{plots/experiments8/mountaincarsubset_experiment88_budget_0.png}
%     %\vpsace{-0.3cm}
%      \caption{Mountain Car    \label{fig:R24}}
%     \end{subfigure}
%     %\vpsace{-0.3cm}
%     \caption{\label{exp:effectiveness3} \cref{fig:R14,fig:R24}
%   compare the effect of DOPE attack on BRM, WIS, PDIS, CPDIS and WDR methods in Cartpole and Mountain Car domains for different values of attacker's budget $\eps=\mathit{frac} \cdot \sigma$ and $p=1$.}
%   %\vpsace{-0.5cm}
%      \end{figure}
%       \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments8/cartpolesubset_experiment88_corrupt_0.png}
%     %\vpsace{-0.3cm}
%      \caption{  Cartpole  \label{fig:R15}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments8/mountaincarsubset_experiment88_corrupt_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   MountainCar \label{fig:R25}}
%     \end{subfigure}
%     \caption{\label{exp:effectiveness4} \cref{fig:R15,fig:R25}
%   compare the effect of DOPE attack on BRM, WIS, PDIS, CPDIS and WDR methods in Cartpole and MountainCar domains (left to right) for different percentages of corruption $\alpha$ and $p=1$.}
%      \end{figure*}
   
% %   \subsection{Comparison with Baselines}
%   %\vpsace{1cm}
   
%       \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-cancer-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{  BRM  \label{fig:b11}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-cancer-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:b12}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-cancer-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:b13} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-cancer-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:b14} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-cancer-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:b15} }
%     \end{subfigure}
%     \caption{\label{exp:baselines:cancer} \cref{fig:b11,fig:b12,fig:b13,fig:b14,fig:b15} compare the effect of random attack, Random DOPE attack, FGSM-based Attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in Cancer domain.}
%     \end{figure*}
    
    
%       \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-cartpole-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{  BRM  \label{fig:bb11}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-cartpole-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:bb12}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-cartpole-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:bb13} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-cartpole-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:bb14} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-cartpole-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:bb15} }
%     \end{subfigure}
%     \caption{\label{exp:baselines:cartpole} \cref{fig:bb11,fig:bb12,fig:bb13,fig:bb14,fig:bb15} compare the effect of random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cartpole domain.}
%     \end{figure*}
    
%      \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-hiv-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   BRM \label{fig:b31}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-hiv-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:b32}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.30\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-hiv-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:b33} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.30\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-hiv-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:b34} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.30\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-hiv-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:b35} }
%     \end{subfigure}
%     \caption{\label{exp:baselines:hiv} \cref{fig:b31,fig:b32,fig:b33,fig:b34,fig:b35} compare the effect of random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the HIV domain.}
%     \end{figure*}
%         \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-custom-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   BRM  \label{fig:b41}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-custom-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:b42}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-custom-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:b43} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-custom-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:b44} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-custom-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:b45} }
%     \end{subfigure}
%     \caption{ \label{exp:baselines:gridworld}\cref{fig:b41,fig:b42,fig:b43,fig:b44,fig:b45} compare the effect of random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Continuous Gridworld domain.}
%     \end{figure*}
    
%         \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-mountaincar-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{  BRM  \label{fig:b51}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-mountaincar-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:b52}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-mountaincar-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:b53} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-mountaincar-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:b54} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-mountaincar-influenceexperiment12_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:b55} }
%     \end{subfigure}
%     \caption{ \label{exp:baselines:mountaincar} \cref{fig:b51,fig:b52,fig:b53,fig:b54,fig:b55} compare the effect of random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the MountainCar domain.}
%     \end{figure*}
% \newpage
% \subsection{Comparison with Projected DOPE Attack Method}
% Here we compare the DOPE attack to the Projected DOPE attack. In the Projected DOPE attack, we first compute the set of the top $\alpha n$ influential points and their influences. Next, we set the optimal perturbations for the most influential points to the projection of their influences onto the constrained space defined by the attack budget constraints.
% We fix the value of $\alpha$ to $0.05$ and vary the budget $\eps$ from $0.0$ to $0.25$ with step size $0.04$.

%  Results for all the domains are shown in \cref{exp:baselines3:cancer,exp:baselines3:gridworld,exp:baselines3:hiv,exp:baselines3:mountaincar,exp:baselines3:cartpole}. These results indicate that there is no clear winner between DOPE and Projected DOPE as they both can perform well depending on the environment and the datasets collected.

%   \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-cancer-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{    BRM \label{fig:f11}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-cancer-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:f12}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-cancer-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:f13} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-cancer-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:f14} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-cancer-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:f15} }
%     \end{subfigure}
%     \caption{\label{exp:baselines3:cancer} \cref{fig:f11,fig:f12,fig:f13,fig:f14,fig:f15} compare the effect of Projected DOPE attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cancer domain.}
%     \end{figure*}
    
%      \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-hiv-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   BRM  \label{fig:g31}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-hiv-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:g32}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.30\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-hiv-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:g33} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.30\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-hiv-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:g34} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.30\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-hiv-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:g35} }
%     \end{subfigure}
%     \caption{\label{exp:baselines3:hiv} \cref{fig:g31,fig:g32,fig:g33,fig:g34,fig:g35} compare the effect of Projected DOPE attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the HIV domain.}
%     \end{figure*}
%         \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-custom-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   BRM \label{fig:h41}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-custom-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{  WIS  \label{fig:h42}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-custom-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:h43} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-custom-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:h44} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-custom-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:h45} }
%     \end{subfigure}
%     \caption{ \label{exp:baselines3:gridworld}\cref{fig:h41,fig:h42,fig:h43,fig:h44,fig:h45} compare the effect of Projected DOPE attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Continuous Gridworld domain.}
%     \end{figure*}
    
%         \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-mountaincar-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{    BRM \label{fig:i51}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-mountaincar-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:i52}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-mountaincar-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:i53} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-mountaincar-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:i54} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-mountaincar-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:i55} }
%     \end{subfigure}
%     \caption{ \label{exp:baselines3:mountaincar} \cref{fig:i51,fig:i52,fig:i53,fig:i54,fig:i55} compare the effect of Projected DOPE attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the MountainCar domain.}
%     \end{figure*}
    
    
    
    
%      \begin{figure*}[h!]
%     \centering
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-cartpole-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{    BRM \label{fig:j51}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-cartpole-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{   WIS \label{fig:j52}}
%     \end{subfigure}
%      \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-cartpole-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ PDIS \label{fig:j53} }
%     \end{subfigure}
%   \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-cartpole-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ CPDIS \label{fig:j54} }
%     \end{subfigure}
%     \begin{subfigure}[b]{0.33\textwidth}
%     \centering
%     \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-cartpole-influenceexperiment13_0.png}
%     %\vpsace{-0.3cm}
%      \caption{ WDR \label{fig:j55} }
%     \end{subfigure}
%     \caption{ \label{exp:baselines3:cartpole} \cref{fig:j51,fig:j52,fig:j53,fig:j54,fig:j55} compare the effect of Projected DOPE attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cartpole domain.}
%     \end{figure*}



\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
