%
\documentclass[accepted]{uai2022} %
                                    %
                                    %
                                    %
%
%
                                         %
%
                                          %
%
%
%

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs} %
\usepackage{nicefrac}
\usepackage{amsfonts,amsmath,amssymb,mathtools,amsthm}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{multicol}
\usepackage[capitalise,noabbrev]{cleveref}
\usepackage{color,caption,subcaption}
\hypersetup{colorlinks=true,citecolor=black,linkcolor=black,urlcolor=black}

%
%

%
\usepackage{natbib} %
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\renewcommand{\cite}[1]{\citep{#1}}

    
%
%
%
%
%
\usepackage{derivative}
% ---- Notation shorthands -------------------------------------------------
% \norm{x}: auto-sized double-bar norm delimiters around its argument.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% Blackboard-bold R and calligraphic S / A symbols.
\newcommand{\real}{\mathbb{R}}
\newcommand{\states}{\mathcal{S}}
\newcommand{\actions}{\mathcal{A}}
% Superscript decorations; these expand to a bare `^...`, so they must follow
% the symbol they decorate (e.g. `v\opt`). \popt expands to `^{\pi^\star}`.
\newcommand{\opt}{^\star}
\newcommand{\popt}{^{\pi\opt}}
\newcommand{\pI}{^{\pi}}
\newcommand{\tr}{^\top}
\newcommand{\data}{\Psi}
% Layout helpers for aligned optimization problems: \cs ends the current row
% with 1ex of extra space and moves to the next alignment column; \minimize /
% \maximize typeset the operator with its argument as a limit, then a \quad
% and an alignment tab.
\newcommand{\cs}{\\[1ex] & }
\newcommand{\minimize}[1]{\operatorname*{minimize}_{#1} \quad &}
\newcommand{\maximize}[1]{\operatorname*{maximize}_{#1} \quad &}
\newcommand{\R}{\mathcal{R}}
% Bold 1 and 0. NOTE(review): \bm is used here but no \usepackage{bm} is
% visible in this file -- presumably supplied by the class; confirm.
\newcommand{\one}{\bm{1}}
\newcommand{\zero}{\bm{0}}
\newcommand{\p}{\mathbb{P}}
\newcommand{\PP}{\mathcal{P}}
% Replaces the existing \ss (the sharp-s text command in standard LaTeX) with
% a thin-spaced \mid; the text command is therefore unavailable afterwards.
\renewcommand{\ss}{\,\mid\,}
\newcommand{\rw}{\mathfrak{r}}
% Superscripted rho symbols and subscripted v^pi symbols.
\newcommand{\srob}{\rho^S}
\newcommand{\drob}{\rho^D}
\newcommand{\rrob}{\rho^R}
\newcommand{\sarob}{\rho^{RA}}
\newcommand{\saV}{v_{SA}^{\pi}}
\newcommand{\sV}{v_{S}^{\pi}}
\newcommand{\sr}{R}				%
\usepackage[font=small,labelfont=bf]{caption}
%
%
%
% ---- Number sets and small symbols ----------------------------------------
% \Real duplicates \real defined earlier in this preamble (both expand to
% \mathbb{R}); the *Plus variants append a `+` subscript.
\newcommand{\Real}{\mathbb{R}}
\newcommand{\RealPlus}{\Real_{+}}
\newcommand{\Int}{\mathbb{Z}}
\newcommand{\Natural}{\mathbb{N}}
\newcommand{\NaturalPlus}{\Natural_{+}}

% \eps is \varepsilon. The body text of this file also uses plain \epsilon
% for a different quantity, so the two glyphs are not interchangeable.
\newcommand{\eps}{\varepsilon}
\newcommand{\To}{\longrightarrow}
% Bold B applied to X (uses \bm; see the review note on \bm availability).
\newcommand{\BX}{\bm{B}(X)}
%
% ---- Expectation / probability helpers ------------------------------------
% \Pow{X}: calligraphic P applied to X with auto-sized parentheses.
\newcommand{\Pow}[1]{\mathcal{P}\left(#1\right)}  %
% \E{X}: E[X];  \Ex{sub}{X}: E_sub[X], both with auto-sized brackets.
\newcommand{\E}[1]{\mathbb{E}\left[ #1 \right]}
\newcommand{\Ex}[2]{\mathbb{E}_{#1}\left[ #2 \right]}
% \P and \Pr already exist in standard LaTeX (pilcrow sign and the "Pr"
% operator), hence \renewcommand. The two-argument \Pr defined here is
% overridden by the zero-argument \renewcommand{\Pr} a few lines below (and
% again later in the preamble), so only the zero-argument form is in effect
% in the document body.
\renewcommand{\P}[1]{\mathbb{P}\left[ #1 \right]}
\renewcommand{\Pr}[2]{\mathbb{P}_{#1}\left[ #2 \right]}
\newcommand{\Var}[1]{\bm{Var}\left[ #1 \right]}
\newcommand{\Proj}[1]{\bm{P}_{#1}}
% Upright "if " (with trailing space) for use inside math.
\newcommand{\cond}{\textrm{if }}
% \dotp{a}{b}: angle-bracket pairing <a, b>.
\newcommand{\dotp}[2]{\langle #1, #2 \rangle}
\newcommand{\eye}{\bm{I}}
\newcommand{\setp}{\mathsf{P}^s}
\newcommand{\Pf}{f}
% "s.t." as an operator; \stc is the long form followed by \quad and an
% alignment tab, matching the \minimize / \maximize layout helpers.
\newcommand{\st}{\operatorname{s.t.}}
\newcommand{\stc}{\operatorname{subject\,to} \quad &}
% \Exp{sub}{X}: same output as \Ex up to spacing.
\newcommand{\Exp}[2]{\mathbb{E}_{#1} \left[ #2 \right] }
% Final, zero-argument meaning of \Pr: the bare blackboard-bold P.
\renewcommand{\Pr}{\mathbb{P}}
% theta with upright "org" / "pert" subscripts.
\newcommand{\torg}{\theta_\text{org}}
\newcommand{\tpert}{\theta_\text{pert}}

% ---- Operator symbols ------------------------------------------------------
% \Bell and \Topt expand to the same glyph (fraktur T); \T is \Bell with a
% pi superscript.
\newcommand{\Bell}{\mathfrak{T}}
\newcommand{\T}{\Bell^{\pi}}
\newcommand{\Topt}{\mathfrak{T}}
% Fraktur L / B with S or SA superscripts.
\newcommand{\SRopt}{\mathfrak{L}^S}
\newcommand{\SARopt}{\mathfrak{L}^{SA}}
\newcommand{\SBopt}{\mathfrak{B}^S}
\newcommand{\SABopt}{\mathfrak{B}^{SA}}
\newcommand{\SigmaQ}{\widetilde{\Sigma}}
\newcommand{\muQ}{\tilde{\mu}}
\newcommand{\w}{w}
% ---- Method-name abbreviations ---------------------------------------------
% \msbr is typeset as a math operator; the remaining names are plain \text.
\newcommand{\msbr}{\operatorname{MSBR}}
\newcommand{\brm}{\text{BRM}}
\newcommand{\wis}{\text{WIS}}
\newcommand{\pdis}{\text{PDIS}}
\newcommand{\cpdis}{\text{CPDIS}}
\newcommand{\dr}{\text{DR}}
\newcommand{\wdr}{\text{WDR}}

% ---- Theorem-like environments (amsthm) ------------------------------------
% `thm` owns the counter and is numbered within sections; every other
% environment below shares that counter via the [thm] optional argument, so
% theorems, lemmas, definitions, remarks, etc. are numbered in one sequence.
\theoremstyle{plain}
\newtheorem{thm}{Theorem}[section]
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{exm}[thm]{Example}
% Definition style for definitions and assumptions.
\theoremstyle{definition}
\newtheorem{defn}[thm]{Definition}
\newtheorem{asm}[thm]{Assumption}
% Remark style for remarks and facts.
\theoremstyle{remark}
\newtheorem{rem}[thm]{Remark}
\newtheorem{fact}[thm]{Fact}
\usepackage[ruled]{algorithm2e}
%
% ---- Math operators ---------------------------------------------------------
% Starred declarations (\argmax, \argmin) place sub/superscripts as limits in
% display style; the unstarred ones take ordinary subscripts.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator{\dst}{dst}
%
%
%
%
\usepackage{xr-hyper}
\usepackage{hyperref}
\usepackage{xcite}


% \addFileDependency{<file>}: announces <file> in the log/terminal as
% "(<file>)", appends it to LaTeX's file list via the internal
% \@addtofilelist, and, when the file does not exist, prints
% "No file <file>.".
% NOTE(review): presumably these messages exist so that a build tool such as
% latexmk treats <file> as a dependency -- confirm against the build setup.
% \makeatletter/\makeatother are needed because \@addtofilelist contains `@`.
\makeatletter
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<name>}: imports the labels of <name> through
% \externaldocument and registers both <name>.tex and <name>.aux with
% \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%

%
\myexternaldocument{lobo_674}

%
\renewcommand{\Pr}{\mathbb{P}}
% ---- Author annotation macros ----------------------------------------------
% \elita{..} / \hs{..}: typeset the argument in orange / magenta.
\newcommand{\elita}[1]{\textcolor{orange}{#1}}
\newcommand{\hs}[1]{\textcolor{magenta}{#1}}

%
\usepackage{xifthen}
\usepackage[normalem]{ulem}
% \mm[old]{new}: when the optional argument is non-empty, prints it struck
% out (\sout, from ulem) in red square brackets; then always prints the
% mandatory argument in green. \isempty comes from xifthen.
\newcommand{\mm}[2][]{\ifthenelse{\isempty{#1}}{}{\textcolor{red}{[\sout{#1}]}}\textcolor{green}{#2}}
% \marek{..} / \hima{..}: bracketed red inline notes (identical definitions).
\newcommand{\marek}[1]{\textcolor{red}{[#1]}}
\newcommand{\hima}[1]{\textcolor{red}{[#1]}}

% cleveref format for a \cref listing several `thm` labels: the first item is
% prefixed with "Theorems", a pair is joined with " and", and longer lists
% use ", " between middle items and " and" before the last item.
\crefmultiformat{thm}{Theorems~#2#1#3}{ and~#2#1#3}{, #2#1#3}{ and~#2#1#3}


% mprog: a three-column array (right, left, left) with every cell set in
% \displaystyle, for use inside math mode.
% NOTE(review): the `>{...}` column prefix is array-package syntax and no
% \usepackage{array} is visible in this file -- confirm it is loaded
% elsewhere before using this environment.
\newenvironment{mprog}{\begin{array}{>{\displaystyle}r>{\displaystyle}l>{\displaystyle}l}}{\end{array}}
% Comment style for algorithm2e listings: footnotesize, typewriter, magenta.
\newcommand\commentfont[1]{\footnotesize\ttfamily\textcolor{magenta}{#1}}
\SetCommentSty{commentfont}

\title{Data Poisoning Attacks on Off-Policy Policy Evaluation Methods (Supplementary Material)}

%
%
%
%
%
%
%
%
\author[1]{\href{mailto:<loboelita@gmail.com>}{Elita Lobo}{}}
\author[2]{Harvineet Singh}
\author[1]{Marek Petrik}
\author[3]{Cynthia Rudin}
\author[4]{Himabindu Lakkaraju}
%
\affil[1]{%
    University of New Hampshire\\
    Durham, NH, USA
}
\affil[2]{%
    New York University\\
    New York, NY, USA
}
\affil[3]{%
    Duke University\\
    Durham, NC, USA
  }
  
\affil[4]{%
    Harvard University\\
    Boston, MA, USA
  }
\begin{document}
\maketitle

%
%
%
%
%

%
%

%

%
%


%

%


%

%
%

%
%
%
%
%
%
%
%
%
%
%

%

%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%

%
%
%
%
%

%
%

%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%

%

%
%
%
%
%
%
%
%
%
%
%
%
%

%

%

%

%

%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%

%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%

%
%

%
%
%
%
%
%
%

%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%

%

%
%
%
%
%
%
%
%

%
%
%
%


%
%

%
%

%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
   
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
     
%


%
%

%

%
%

%
%
%

%

%
%

%

%
%
%
%

%
%
%
%
%

%
%
%
%
%

%
%
%

%

%
%
%
%
%
  
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%


%
%
%
%



%

%


%
%
%
%
%
%
%
%

%
%
%
%
%
%
%
%
%


%
%

%


%
%

%
%
%
%

\section{Additional Preliminaries}\label{apps:preliminary}
\paragraph{Influence functions}

%
The influence function is a popular tool for quantifying how an empirically learned estimator changes under small changes in the data. Consider a supervised learning problem with input space $\mathcal{X}$ and output space $\mathcal{Y}$, a batch of data $D = (z_i)_{i=1}^n$ where $z_i =(x_i,y_i) \in \mathcal{X} \times \mathcal{Y}$, and an unknown prediction function $f\colon \mathcal{X} \to \mathcal{Y}$ parameterized by $\theta \in\Theta$. Let $L\colon \Theta \times (\mathcal{X} \times \mathcal{Y}) \to \real$ be a convex and twice-differentiable loss function, and let $\theta \in \argmin_{\theta' \in\Theta} \frac{1}{n}\sum_{i=1}^n L(\theta',z_i)$ be the empirical risk minimizer. Then
the effect $\mathcal{I}_{z_{\delta},\theta,D}$ of perturbing a data point $z=(x,y)$ to $z_{\delta}=(x+ \delta,y)$ on the parameter $\theta$ can be approximated via a Taylor expansion as
\begin{equation}\label{eq:influence_fn}
    \begin{aligned} 
    & \mathcal{I}_{z_{\delta}, \theta,D} = \frac{\theta_{z,\delta} - \theta}{\delta} \approx \frac{\partial  \theta} {\partial x}    \\ & \approx  \left(- H_{\theta}^{-1} \frac{\partial^2 L(\theta,z)}{\partial \theta \partial x}\right)
     \text{ where } H_{\theta} = \frac{\partial^2 L(\theta,D)}{\partial \theta^2}
\end{aligned}
\end{equation}
where $\theta_{z,\delta}$ are the new optimal parameters learned from the training data after replacing $z$ by $z_\delta$.
%
We refer the reader to~\cite{koh2018stronger} for more details.
\section{Proofs:}\label{app:proofs}
\begin{proof}[Proof of \cref{prop_greedy}]\label{proof:prop_greedy}
Recall the optimization problem in~\eqref{bilevel:approx}:
%
%
%
%
%
%
%
%
\begin{equation} \label{bilevel:approx_app}  
\begin{aligned}
  &\max_{s\in \{0,1\}^n} \max_{\{\delta_i\}_{i=1}^n} %
    \left\{\sum_{i=1}^n 
    s_iI_{\Psi_i} \tr \delta_i \mid \|\delta_i \|_p \leq \eps, \forall i\right\} ,\\
   &\text{ subject to } \sum_{i=1}^n s_i = \alpha \cdot n~.
\end{aligned}
\end{equation}
Notice that in~\eqref{bilevel:approx_app}, for every $k \in \{1, \dots, n\}$, the influence score $I_{\Psi_k}$ does not depend on any of the perturbations $\delta_i$, and so the optimal perturbation $\delta^{*}_k$ can be computed independently by solving $\delta^{*}_k \in \argmax_{x} \{I_{\Psi_k,\theta,\Psi }^T x \mid \|x\|_p \leq \eps\}$. The $p$-norm $\|x\|_p$ of any vector $x \in \real^M$ can be expressed using its dual norm as $\|x\|_p= \max \left\{ z^T x \mid \|z\|_q \leq 1 \right\}$ where $\nicefrac{1}{p} + \nicefrac{1}{q}=1$~\cite{boyd2004convex}, so the optimal value of this inner problem is $\eps \|I_{\Psi_k,\theta,\Psi}\|_q$. Thus, given the optimal perturbation $\delta^*_k$ for each $k \in \{1,\dots, n\}$, the problem in~\eqref{bilevel:approx} boils down to solving
\begin{equation}
    \begin{aligned}
     \max_{s\in \{0,1\}^n} \quad & \sum_{k=1}^n s_k \, \eps \, \|I_{\Psi_k,\theta,\Psi}\|_q  \\
     \text{subject to} \quad & \sum_{k=1}^n s_k = \alpha \cdot n.
    \end{aligned}
\end{equation}

It is now easy to see that the optimal set of transitions for the approximate attack problem in~\eqref{bilevel:approx} is simply the set of $\alpha n$ transitions with the largest value of the $q$-norm of their influence scores. The closed-form solution for $\delta^{*}_k$ at $p=1,2,\infty$ follows from standard convex optimization results for dual norms~\cite{boyd2004convex}.
\end{proof}

\section{Experimental Details:}\label{app:experiments}
\subsection{Additional Optimization Tricks used in experiments:}
\begin{enumerate}
    \item Recall that we use the DQN algorithm to learn the optimal Q-value function using a neural network, from which we derive the evaluation policy. In the case of the Cartpole and Mountain Car domains, we use this Q-value network to transform the state features into features $\phi(s, a)$. Specifically, we use the output of the second last layer of the Q-value network as the transformed state features. We do this to get a more accurate feature representation for linear function approximators which in turn would result in a more accurate initial value function estimate.
    \item In all our experiments, we use line-search to find the optimal step size to update the state features with the perturbations derived using~\cref{prop_greedy}. If for a given attacker's budget, we have access to the error in the value-function estimate for a lower value of the attacker's budget, then, we use it as the minimum threshold error to achieve while applying the line search. Applying this method enables us to achieve a monotonic trend in the percentage error in the value estimate with respect to the perturbation budget. The monotonic trend is otherwise difficult to achieve especially when the Loss function is non-convex.
    
    \item To optimize the DOPE objective for any given OPE method, we need the action probabilities of the evaluation policy to be differentiable. In the case where the evaluation policy is a deterministic Q-learning policy, we obtain differentiable action probabilities by applying a softmax to the Q-values with a very small temperature.
    
    \item Link to code: \href{https://github.com/elitalobo/DOPE}{https://github.com/elitalobo/DOPE}
\end{enumerate}





\begin{figure*}[h!]
    \centering
    \begin{tabular}{ |p{5cm}||p{5cm}|  }
 \hline
 \multicolumn{2}{|c|}{Hyperparameter values for Cancer domain} \\
 \hline
 Hyperparameter & Value\\
 \hline
Number of trajectories & 500 \\
Policy Network layers &  $64\times 28$ \\
Normalize rewards & No \\
Regularization for $\pi_b$ &  1e-2 \\
Regularization for $q_{\eta}$ & 1e-2\\
Discount factor & 0.95 \\
Trajectory Length (T) & 30 \\
Direction of Attack &  +1\\
Num. Epochs for CEL & 5000 \\
 \hline
\end{tabular}
\end{figure*}


\begin{figure*}[h!]
    \centering
    \begin{tabular}{ |p{5cm}||p{5cm}|  }
 \hline
 \multicolumn{2}{|c|}{Hyperparameter values for HIV domain} \\
 \hline
 Hyperparameter & Value\\
 \hline
Number of trajectories & 1000 \\
Policy Network layers &  $300\times 50$ \\
Normalize rewards & Yes \\
Regularization for $\pi_b$ &  1e-2 \\
Regularization for $q_{\eta}$ & 1e-2\\
Discount factor & 0.98 \\
Trajectory Length (T) & 50 \\
Direction of Attack &  -1\\
Num. Epochs for CEL & 5000 \\
 \hline
\end{tabular}
\end{figure*}


\begin{figure*}[h!]
    \centering
    \begin{tabular}{ |p{5cm}||p{5cm}|  }
 \hline
 \multicolumn{2}{|c|}{Hyperparameter values for Continuous Gridworld domain} \\
 \hline
 Hyperparameter & Value\\
 \hline
Number of trajectories & 500 \\
Policy Network layers &  $24$ \\
Normalize rewards & No \\
Regularization for $\pi_b$ &  1e-2 \\
Regularization for $q_{\eta}$ & 1e-2\\
Discount factor & 0.95 \\
Trajectory Length (T) & 50 \\
Direction of Attack &  -1\\
Num. Epochs for CEL & 5000 \\
 \hline
\end{tabular}
\end{figure*}


\begin{figure*}[h!]
    \centering
    \begin{tabular}{ |p{5cm}||p{5cm}|  }
 \hline
 \multicolumn{2}{|c|}{Hyperparameter values for MountainCar domain} \\
 \hline
 Hyperparameter & Value\\
 \hline
Number of trajectories & 250 \\
Policy Network layers &  $60$ \\
Normalize rewards & No \\
Regularization for $\pi_b$ &  1e-2 \\
Regularization for $q_{\eta}$ & 1e-2\\
Discount factor & 0.99 \\
Trajectory Length (T) & 150 \\
Direction of Attack &  +1\\
Num. Epochs for CEL & 5000 \\
 \hline
\end{tabular}
\end{figure*}


\begin{figure*}[h!]
    \centering
    \begin{tabular}{ |p{5cm}||p{5cm}|  }
 \hline
 \multicolumn{2}{|c|}{Hyperparameter values for Cartpole domain} \\
 \hline
 Hyperparameter & Value\\
 \hline
Number of trajectories & 1000 \\
Policy Network layers &  $100\times 24$ \\
Normalize rewards & No \\
Regularization for $\pi_b$ &  1e-2 \\
Regularization for $q_{\eta}$ & 1e-2\\
Discount factor & 0.98 \\
Trajectory Length (T) & 100 \\
Direction of Attack &  +1\\
Num. Epochs for CEL & 5000 \\
 \hline
\end{tabular}
\end{figure*}

\subsection{Additional Domain Details:}

\emph{Cancer:} This domain~\cite{gottesman2020interpretable} models the growth of tumors in cancer patients. It consists of 4-dimensional states which represent the growth dynamics of the tumor in the patient, and two actions that indicate if a given patient is to be administered chemotherapy or not at a given time step. \newline
\emph{HIV:} The HIV domain has 6-dimensional states representing the state of the patient, and four actions that represent four different types of treatments.  \newline
\emph{MountainCar:} In the Mountain Car~\cite{brockman2016openai} domain, the task is to drive a car positioned between two mountains to the top of the mountain on the right in the shortest time possible. The 2-dimensional state represents the car's current position and the current time-step, and the three actions represent: drive forward, drive backward, and do not move.  \newline
\emph{Cartpole:}
The Cartpole domain~\cite{brockman2016openai} models a simple control problem where the goal is to apply +1/-1 force to keep a pole attached to a moving cart from falling. The 2-dimensional state represents the cartpole dynamics, and the two actions represent the force applied to the pole.  \newline
%
%
\emph{Continuous Gridworld:}
The gridworld domain consists of a 2-dimensional state space that represents the coordinates of the agent and two actions $(a_0,a_1)$ that determine the direction and step size of the agent. The task is to begin at coordinate $(1,1)$ and move towards coordinates $(50,50)$. Taking action $a_0$ at $(x,y)$ transitions the agent to $(x+0.2,y+0.45)$ with probability 1.0. On the other hand, taking action $a_1$ transitions the agent to $(x+0.3,y+0.5)$ with probability 0.95 and to $(1,1)$ with probability 0.05.
If the agent transitions to $(x',y')$, the agent receives a reward of $(x+ 0.5y)$. We set the maximum length of the episode to 50 and collected 500 trajectories using the behavior policy.

\section{Examples of twice continuously differentiable loss functions for DOPE Framework:}\label{app:examples}
All the loss functions ($L$) that we leverage in this work such as Mean Squared Bellman residual (MSBR) for learning the Q-value function, and the Cross-Entropy Loss (referred to as CEL in the paper) for fitting the multinomial logistic regression model are twice continuously differentiable with respect to the parameters $\theta$. Below, we show that these loss functions are twice continuously differentiable. 

In BRM and WDR, $\theta=\eta$ represents the parameters of the q-value function $q_\eta$. The parameters $\eta$ are estimated from the data by minimizing the Mean Squared Bellman Residual (MSBR). We compute the derivative of MSBR below to show that this loss function is twice differentiable and satisfies the assumption of our attack framework.
\begin{equation}
    \begin{aligned}
      \text{MSBR}(\eta) \;&=\;  \|q_{\eta}-\mathcal{T}^{\pi} q_{\eta}\|^2_W \\
     &= \|\Phi \eta - (r + \gamma \Phi_p \eta)\|_2^2 \\
    &\frac{\partial \text{MSBR}(\eta)}{\partial \eta} = 2 \cdot (\Phi - \gamma \Phi_p)^T (\Phi \eta - (r + \gamma \Phi_p \eta)) \\
    &\frac{\partial^2 \text{MSBR}(\eta)}{\partial \eta^2} = 2 \cdot (\Phi - \gamma \Phi_p)^T (\Phi - \gamma \Phi_p) 
    \end{aligned}
\end{equation}

In the case of Importance Sampling-based OPE methods such as WIS, PDIS, and CPDIS, the behavior policy parameters ($\theta=\theta_b \in \real^{A \cdot d}$ ) are estimated from the data using a multinomial logistic regression model. Hence, we compute below the second-order derivative of the cross-entropy loss of the multinomial logistic regression model and show that this loss function is twice differentiable as well and satisfies the assumption of our attack framework.

The cross-entropy loss for $\theta=\theta_b$ is given by
\begin{equation}
    \begin{aligned}
     \text{CEL}(\theta) &= \log\left(\prod_{l=1}^n  \frac{\exp(\theta_{a_l}^T \xi(s_l))}{\sum_{i=1}^A \exp(\theta_i^T \xi(s_l)) } \right) \\
     &= \sum_{l=1}^n  \log\left( \frac{\exp(\theta_{a_l}^T \xi(s_l))}{\sum_{i=1}^A \exp(\theta_i^T \xi(s_l)) } \right) \label{eq:1} \\
     &= \sum_{l=1}^n \left(\theta_{a_l}^T \xi(s_l) - \log\left(\sum_{j=1}^A \exp(\theta_j^T \xi(s_l))\right)\right).
    \end{aligned}
\end{equation}

We can compute the second order derivative of the cross entropy loss as follows: 
\begin{equation}
    \begin{aligned}
     \frac{\partial  \text{CEL}(\theta)}{\partial \theta_{a_l}} &= \sum_{l=1}^n \left(\xi(s_l) - \frac{\exp(\theta_{a_l}^T \xi(s_l))\xi(s_l)}{\sum_{j=1}^A \exp(\theta_j^T \xi(s_l)) }\right) \\
   \frac{\partial^2  \text{CEL}(\theta)}{\partial \theta_{a_l} \partial \theta_k \, (k\neq a_l)} &= \sum_{l=1}^n \frac{\exp(\theta_{k}^T \xi(s_l))\exp(\theta_{a_l}^T\xi(s_l))\xi(s_l)\xi(s_l)^T}{(\sum_{j=1}^A \exp(\theta_j^T \xi(s_l)))^2} \\
   \frac{\partial^2  \text{CEL}(\theta)}{\partial \theta_{a_l}^2} &= \sum_{l=1}^n \left(-\frac{\exp(\theta_{a_l}^T \xi(s_l))  \xi(s_l) \xi(s_l)^T}{\sum_{j=1}^A \exp(\theta_j^T\xi(s_l))} + \right. \\ & 
   \left. \frac{\exp(\theta_{a_l}^T \xi(s_l))^2 \xi(s_l) \xi(s_l)^T}{(\sum_{j=1}^A \exp(\theta_j^T\xi(s_l)))^2} \right).
    \end{aligned}
\end{equation}

\section{Related Work}\label{app:relatedworks}
Adversarial attacks have been extensively studied in Reinforcement Learning~\cite{gleave2021adversarial,Wu2021AttackInfluence,lin2019tactics,Zhang2020Adaptive,zhang2019online,Kiourti2019TrojDRLTA,Chen2019adversarial}. These attacks can be broadly classified into two main categories: train-time attacks (data-poisoning attacks) and test-time attacks.

\textbf{Test-time attacks:} In test-time attacks in RL~\cite{lin2019tactics,gleave2021adversarial,Behzadan2017VulnerabilityOD,kos2017delving,Wu2021Adversarial,Chen2019adversarial,huang2017adversarial}, the attacker manipulates test-time observations to fool the agent to take target malicious actions, without directly changing the agent's policy. In this setting, the noise added to the test-time observations at any time step does not directly impact the agent's future decisions. A large majority of the work that focuses on test-time attacks aims to either minimize the agent's rewards~\cite{huang2017adversarial,Behzadan2017VulnerabilityOD} or lead the agent to adversarial states~\cite{lin2019tactics}, which differs from our goal of perturbing train-time observations to maximize error in the value estimate of a given policy for a given OPE method.

\textbf{Train-time attacks:} 
In train-time or data-poisoning attacks, the adversary perturbs the training data by a small margin to facilitate erroneous learning of decision models. Prior work on data poisoning has mainly targeted supervised learning models in Machine Learning~\cite{koh2018stronger,koh2017influence,fang2020influence,Wu2021AttackInfluence,steinhardt2017certified}. However, recently there has been emerging interest in data-poisoning attacks on Batch RL agents~\cite{zhang2021corruptionrobust,Ma2019PolicyPI,rakhsha2020policy} and Online RL agents~\cite{zhang2019online,Zhang2020Adaptive,rakhsha2020policy,Zhang2008PolicyTeaching,Zhang2009policyteaching}. In a pioneering work, \citet{zhang2019online} proposed a framework that perturbs rewards such that a batch RL agent learns an adversarial target policy. In follow-up work, \citet{rakhsha2020policy} proposed a framework for poisoning rewards and transition dynamics to force a batch agent to learn an adversarial target policy. \citet{wu2022copa} propose methods to certify the robustness of a policy learned from offline data after a poisoning attack; their method outputs the least cumulative reward that can be attained by a poisoned policy. \citet{zhang2019online} develop fast adaptive data-poisoning attacks on online RL agents where rewards must be perturbed in real time. Nonetheless, these data-poisoning works differ from our work in two main aspects: a) they target learning of optimal adversarial policies, whereas our work targets learning erroneous value-function estimates for any given policy and OPE method, and b) our main goal is to analyze the sensitivity of different OPE algorithms to train-time attacks, which has not been explored in any of these previous works.
%
%
%

Finally, our work is similar in vein to the bilevel-optimization framework proposed by~\cite{koh2018stronger} for data-poisoning attacks on supervised learning algorithms with data sanitization defense mechanisms. However, in contrast to this work, we exploit specific properties of OPE algorithms to construct stronger data-poisoning attacks as well as compare the sensitivity of different OPE algorithms in RL.

\textbf{Influence functions:}
The influence function was originally introduced in robust statistics~\cite{cook1980influence,hampel1974influence} to understand the effect of perturbing or removing a training data point on small linear models estimated from the data. In more recent work, influence functions have been used as a diagnostic tool in deep learning and reinforcement learning algorithms to detect adversarial training data points~\cite{broderick2021automatic,koh2018stronger,koh2017influence,gottesman2020interpretable,cohen2020detecting}, for optimal sub-sampling~\cite{Ting2018optimalsubsampling}, and to aid decision-policy optimization~\cite{Munos02variableresolution}. A few works have also proposed influence-function-based data-poisoning attacks on supervised learning algorithms~\cite{koh2018stronger,koh2017influence,Wu2021AttackInfluence,fang2020influence}. However, our work differs from theirs in terms of context (reinforcement learning) and the objectives optimized.

\section{Experimental Results:}\label{app:experimental_results}

    
\subsection{Effect of increasing randomness of the behavior policy on DOPE Attack:}
In all our experiments, we chose small values of $\epsilon$ for the behavior policy to examine the cases where the OPE methods are difficult to attack. A larger value of epsilon would result in a larger state-action distribution mismatch between the datasets collected using the behavior policy and the datasets that would have been collected with the evaluation policy. This distribution mismatch would result in large importance sampling weights and out-of-distribution estimation errors and increase the variance in the value function estimates. As a result, the OPE methods would become more brittle and thus, more vulnerable to data poisoning attacks. 

To illustrate this effect, we compare the percentage error in the value function estimates of a near-optimal policy in the HIV domain for two different values of $\epsilon$, namely 0.05 and 0.25. For this experiment, we set the perturbation budget to $\eps=0.5\sigma$ and the percentage of corrupt points to $\alpha=0.05$. We report the interquartile mean of the percentage error in the value function estimates observed across 5 trials in~\cref{table:summary3}. Our results in~\cref{table:summary3} indicate that OPE methods such as BRM and WDR are more vulnerable to the data poisoning attack for larger values of $\epsilon$.


\begin{table*}[h!]
\centering
\begin{tabular}{|p{2cm}||r|r|r|r|r|} 
 \hline
 Methods & BRM & WIS & PDIS	& CPDIS &	WDR\\
 \hline
$\epsilon=0.05$ &	334.2 &	5.83e-3	& 1.61 &	0.22 &	118.35 \\
\hline
$\epsilon=0.25$ & 427.15 & 0.0 & 2.59 & 0.06 & 1489.22\\
\hline
\end{tabular}
\caption{\label{table:summary3} Percentage errors in the value function estimates observed for different values of the behavior-policy randomness $\epsilon$ on the HIV domain.}
\end{table*}

\subsection{Anomaly Detection Methods}
In this experiment, we investigate if standard anomaly detection methods can identify the poisoned data points from the dataset.
%

For this purpose, we use two popular state-of-the-art anomaly detection methods~\cite{Emmott2013Systematic}, namely, the Isolation Forests~\cite{Liu2008IsolationF} and the Local Outlier Factor~\cite{Breunig2000LOF} methods. We set the perturbation budget $\eps$ to be $0.5 \sigma$ and the percentage of corrupt points to be $\alpha=0.05$. We report the True Positive Rate (fraction of perturbed data points tagged as outliers) and the False Positive Rate (fraction of original data instances tagged as outliers). Our experimental results with the aforementioned anomaly detection methods and the WDR OPE method across the Cancer, HIV, and Gridworld domains are shown in~\cref{table:summary1} and~\cref{table:summary2}; the corresponding results for the PDIS method are shown in~\cref{table:summary1a} and~\cref{table:summary2a}. While the Isolation Forests method has a high true positive rate, it also has a high false positive rate, indicating that several original data instances are being tagged as outliers. On the other hand, the Local Outlier Factor method exhibits low true positive and false positive rates. These results suggest that the perturbed data points are not readily distinguishable from the original data instances.
These results are not surprising as the budget constraint embedded in our optimization problem~\cref{eq:bilevel_three} ensures that the original data instances are perturbed in a manner that cannot be easily detected by naive anomaly detection techniques. 

%
%

%
%
%

\begin{table*}[h!]
\centering
\begin{tabular}{|p{3cm}||r|r|} 
 \hline
 OPE Method = WDR & True Positive Rate &  False Positive Rate \\
 \hline
 Cancer & 1.0 & 0.26 \\
 \hline
 HIV & 0.47 & 0.16 \\
 \hline
 Gridworld & 1.0 & 0.9 \\
   \hline
\end{tabular}
\caption{\label{table:summary1}Results with Isolation Forests anomaly detection method and WDR Method.}
\end{table*}

\begin{table*}[h!]
\centering
\begin{tabular}{|p{3cm}||r|r|} 
 \hline
 OPE Method = WDR & True Positive Rate  & False Positive Rate \\
 \hline
  Cancer 	&  0.02 &  0.05 	\\
  \hline
 HIV 	&  0.08 	& 0.07 	\\
 \hline
 Gridworld 	& 0.01 	& 0.07 	\\
 \hline
\end{tabular}
\caption{\label{table:summary2}Results with Local Outlier Factor anomaly detection method and WDR Method.}
\end{table*}

\begin{table*}[h!]
\centering
\begin{tabular}{|p{3cm}||r|r|} 
 \hline
 OPE Method = PDIS & True Positive Rate &  False Positive Rate \\
 \hline
 Cancer & 1.0 & 0.31 \\
 \hline
 HIV & 0.17 & 0.17 \\
 \hline
 Gridworld & 1.0 & 0.5 \\
   \hline
\end{tabular}
\caption{\label{table:summary1a}Results with Isolation Forests anomaly detection method and PDIS method.}
\end{table*}

\begin{table*}[h!]
\centering
\begin{tabular}{|p{3cm}||r|r|} 
 \hline
 OPE Method = PDIS & True Positive Rate  & False Positive Rate \\
 \hline
  Cancer 	&  0.0 &  0.05 	\\
  \hline
 HIV 	&  0.32 	& 0.07 	\\
 \hline
 Gridworld 	& 0.03 	& 0.06 	\\
 \hline
\end{tabular}
\caption{\label{table:summary2a}Results with Local Outlier Factor anomaly detection method and PDIS Method.}
\end{table*}



%
%
%
%
%

\subsection{Effectiveness of DOPE Attack}
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
%
      \begin{figure}[h!]
    \centering
    \begin{subfigure}[b]{0.5\linewidth}
    \centering
    \includegraphics[width=\linewidth]{plots/experiments8/cartpolesubset_experiment88_budget_0.png}
    %
     \caption{Cartpole    \label{fig:R14}}
    \end{subfigure}%
     \begin{subfigure}[b]{0.5\linewidth}
    \centering
    \includegraphics[width=\linewidth]{plots/experiments8/mountaincarsubset_experiment88_budget_0.png}
    %
     \caption{Mountain Car    \label{fig:R24}}
    \end{subfigure}
    %
    \caption{\label{exp:effectiveness3} \cref{fig:R14,fig:R24}
  compare the effect of the DOPE attack on BRM, WIS, PDIS, CPDIS and WDR methods in Cartpole and Mountain Car domains for different values of the attacker's budget $\eps=\mathit{frac} \cdot \sigma$ and $p=1$.}
  %
     \end{figure}
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/cartpolesubset_experiment88_corrupt_0.png}
    %
     \caption{  Cartpole  \label{fig:R15}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments8/mountaincarsubset_experiment88_corrupt_0.png}
    %
     \caption{   MountainCar \label{fig:R25}}
    \end{subfigure}
    \caption{\label{exp:effectiveness4} \cref{fig:R15,fig:R25}
   compare the effect of the DOPE attack on BRM, WIS, PDIS, CPDIS and WDR methods in Cartpole and MountainCar domains (left to right) for different percentages of corruption $\alpha$ and $p=1$.}
     \end{figure*}
   
%
   %
   
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-cancer-influenceexperiment12_0.png}
    %
     \caption{  BRM  \label{fig:b11}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-cancer-influenceexperiment12_0.png}
    %
     \caption{   WIS \label{fig:b12}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-cancer-influenceexperiment12_0.png}
    %
     \caption{ PDIS \label{fig:b13} }
    \end{subfigure}
   \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-cancer-influenceexperiment12_0.png}
    %
     \caption{ CPDIS \label{fig:b14} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-cancer-influenceexperiment12_0.png}
    %
     \caption{ WDR \label{fig:b15} }
    \end{subfigure}
    \caption{\label{exp:baselines:cancer} \cref{fig:b11,fig:b12,fig:b13,fig:b14,fig:b15} compare the effect of the random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cancer domain.}
    \end{figure*}
    
    
      \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-cartpole-influenceexperiment12_0.png}
    %
     \caption{  BRM  \label{fig:bb11}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-cartpole-influenceexperiment12_0.png}
    %
     \caption{   WIS \label{fig:bb12}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-cartpole-influenceexperiment12_0.png}
    %
     \caption{ PDIS \label{fig:bb13} }
    \end{subfigure}
   \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-cartpole-influenceexperiment12_0.png}
    %
     \caption{ CPDIS \label{fig:bb14} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-cartpole-influenceexperiment12_0.png}
    %
     \caption{ WDR \label{fig:bb15} }
    \end{subfigure}
    \caption{\label{exp:baselines:cartpole} \cref{fig:bb11,fig:bb12,fig:bb13,fig:bb14,fig:bb15} compare the effect of the random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cartpole domain.}
    \end{figure*}
    
     \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-hiv-influenceexperiment12_0.png}
    %
     \caption{   BRM \label{fig:b31}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-hiv-influenceexperiment12_0.png}
    %
     \caption{   WIS \label{fig:b32}}
    \end{subfigure}
     \begin{subfigure}[b]{0.30\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-hiv-influenceexperiment12_0.png}
    %
     \caption{ PDIS \label{fig:b33} }
    \end{subfigure}
   \begin{subfigure}[b]{0.30\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-hiv-influenceexperiment12_0.png}
    %
     \caption{ CPDIS \label{fig:b34} }
    \end{subfigure}
    \begin{subfigure}[b]{0.30\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-hiv-influenceexperiment12_0.png}
    %
     \caption{ WDR \label{fig:b35} }
    \end{subfigure}
    \caption{\label{exp:baselines:hiv} \cref{fig:b31,fig:b32,fig:b33,fig:b34,fig:b35} compare the effect of the random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the HIV domain.}
    \end{figure*}
        \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-custom-influenceexperiment12_0.png}
    %
     \caption{   BRM  \label{fig:b41}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-custom-influenceexperiment12_0.png}
    %
     \caption{   WIS \label{fig:b42}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-custom-influenceexperiment12_0.png}
    %
     \caption{ PDIS \label{fig:b43} }
    \end{subfigure}
   \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-custom-influenceexperiment12_0.png}
    %
     \caption{ CPDIS \label{fig:b44} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-custom-influenceexperiment12_0.png}
    %
     \caption{ WDR \label{fig:b45} }
    \end{subfigure}
    \caption{ \label{exp:baselines:gridworld}\cref{fig:b41,fig:b42,fig:b43,fig:b44,fig:b45} compare the effect of the random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Continuous Gridworld domain.}
    \end{figure*}
    
        \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/FQE-mountaincar-influenceexperiment12_0.png}
    %
     \caption{  BRM  \label{fig:b51}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_is-mountaincar-influenceexperiment12_0.png}
    %
     \caption{   WIS \label{fig:b52}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_pdis-mountaincar-influenceexperiment12_0.png}
    %
     \caption{ PDIS \label{fig:b53} }
    \end{subfigure}
   \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/IS_cpdis-mountaincar-influenceexperiment12_0.png}
    %
     \caption{ CPDIS \label{fig:b54} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments12/WDR_wdr-mountaincar-influenceexperiment12_0.png}
    %
     \caption{ WDR \label{fig:b55} }
    \end{subfigure}
    \caption{ \label{exp:baselines:mountaincar} \cref{fig:b51,fig:b52,fig:b53,fig:b54,fig:b55} compare the effect of the random attack, Random DOPE attack, FGSM-based attack and DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the MountainCar domain.}
    \end{figure*}
\newpage
\subsection{Comparison with Projected DOPE Attack Method}
Here we compare the DOPE attack to the Projected DOPE attack. In the Projected DOPE attack, we first compute the set of the top $\alpha n$ influential points and their influences. Next, we set the perturbation of each of these most influential points to the projection of its influence onto the feasible set defined by the attack budget constraints.
We fix the value of $\alpha$ to $0.05$ and vary the budget $\eps$ from $0.0$ to $0.25$ with step size $0.04$.

 Results for all the domains are shown in \cref{exp:baselines3:cancer,exp:baselines3:gridworld,exp:baselines3:hiv,exp:baselines3:mountaincar,exp:baselines3:cartpole}. These results indicate that there is no clear winner between DOPE and Projected DOPE: either can perform well, depending on the environment and the dataset collected.

  \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-cancer-influenceexperiment13_0.png}
    %
     \caption{    BRM \label{fig:f11}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-cancer-influenceexperiment13_0.png}
    %
     \caption{   WIS \label{fig:f12}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-cancer-influenceexperiment13_0.png}
    %
     \caption{ PDIS \label{fig:f13} }
    \end{subfigure}
  \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-cancer-influenceexperiment13_0.png}
    %
     \caption{ CPDIS \label{fig:f14} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-cancer-influenceexperiment13_0.png}
    %
     \caption{ WDR \label{fig:f15} }
    \end{subfigure}
    \caption{\label{exp:baselines3:cancer} \cref{fig:f11,fig:f12,fig:f13,fig:f14,fig:f15} compare the effect of the Projected DOPE attack and the DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cancer domain.}
    \end{figure*}
    
     \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-hiv-influenceexperiment13_0.png}
    %
     \caption{   BRM  \label{fig:g31}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-hiv-influenceexperiment13_0.png}
    %
     \caption{   WIS \label{fig:g32}}
    \end{subfigure}
     \begin{subfigure}[b]{0.30\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-hiv-influenceexperiment13_0.png}
    %
     \caption{ PDIS \label{fig:g33} }
    \end{subfigure}
  \begin{subfigure}[b]{0.30\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-hiv-influenceexperiment13_0.png}
    %
     \caption{ CPDIS \label{fig:g34} }
    \end{subfigure}
    \begin{subfigure}[b]{0.30\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-hiv-influenceexperiment13_0.png}
    %
     \caption{ WDR \label{fig:g35} }
    \end{subfigure}
    \caption{\label{exp:baselines3:hiv} \cref{fig:g31,fig:g32,fig:g33,fig:g34,fig:g35} compare the effect of the Projected DOPE attack and the DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the HIV domain.}
    \end{figure*}
        \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-custom-influenceexperiment13_0.png}
    %
     \caption{   BRM \label{fig:h41}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-custom-influenceexperiment13_0.png}
    %
     \caption{  WIS  \label{fig:h42}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-custom-influenceexperiment13_0.png}
    %
     \caption{ PDIS \label{fig:h43} }
    \end{subfigure}
  \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-custom-influenceexperiment13_0.png}
    %
     \caption{ CPDIS \label{fig:h44} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-custom-influenceexperiment13_0.png}
    %
     \caption{ WDR \label{fig:h45} }
    \end{subfigure}
    \caption{ \label{exp:baselines3:gridworld}\cref{fig:h41,fig:h42,fig:h43,fig:h44,fig:h45} compare the effect of the Projected DOPE attack and the DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Continuous Gridworld domain.}
    \end{figure*}
    
        \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-mountaincar-influenceexperiment13_0.png}
    %
     \caption{    BRM \label{fig:i51}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-mountaincar-influenceexperiment13_0.png}
    %
     \caption{   WIS \label{fig:i52}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-mountaincar-influenceexperiment13_0.png}
    %
     \caption{ PDIS \label{fig:i53} }
    \end{subfigure}
  \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-mountaincar-influenceexperiment13_0.png}
    %
     \caption{ CPDIS \label{fig:i54} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-mountaincar-influenceexperiment13_0.png}
    %
     \caption{ WDR \label{fig:i55} }
    \end{subfigure}
    \caption{ \label{exp:baselines3:mountaincar} \cref{fig:i51,fig:i52,fig:i53,fig:i54,fig:i55} compare the effect of the Projected DOPE attack and the DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the MountainCar domain.}
    \end{figure*}
    
    
    
    
     \begin{figure*}[h!]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/FQE-cartpole-influenceexperiment13_0.png}
    %
     \caption{    BRM \label{fig:j51}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_is-cartpole-influenceexperiment13_0.png}
    %
     \caption{   WIS \label{fig:j52}}
    \end{subfigure}
     \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_pdis-cartpole-influenceexperiment13_0.png}
    %
     \caption{ PDIS \label{fig:j53} }
    \end{subfigure}
  \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/IS_cpdis-cartpole-influenceexperiment13_0.png}
    %
     \caption{ CPDIS \label{fig:j54} }
    \end{subfigure}
    \begin{subfigure}[b]{0.33\textwidth}
    \centering
    \includegraphics[width=0.8\linewidth]{plots/experiments13/WDR_wdr-cartpole-influenceexperiment13_0.png}
    %
     \caption{ WDR \label{fig:j55} }
    \end{subfigure}
    \caption{ \label{exp:baselines3:cartpole} \cref{fig:j51,fig:j52,fig:j53,fig:j54,fig:j55} compare the effect of the Projected DOPE attack and the DOPE attack on the error in the value function estimates of the BRM, WIS, PDIS, CPDIS and WDR methods (left to right) in the Cartpole domain.}
    \end{figure*}

\clearpage
\bibliography{lobo_674}


\end{document}
