\def\year{2022}\relax
%File: formatting-instructions-latex-2022.tex
%release 2022.1
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage{aaai22}  % DO NOT CHANGE THIS
\usepackage{times}  % DO NOT CHANGE THIS
\usepackage{helvet}  % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm}  % DO NOT CHANGE THIS
\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\frenchspacing  % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in}  % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in}  % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage[noend]{algorithmic}


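%packages from NeurIPS paper
% NOTE(review): fontenc and hyperref, loaded below, both appear in the DISALLOWED PACKAGES list later in this preamble; confirm they are removed before the final AAAI version.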
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\def\UrlBreaks{\do\/\do-}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{bm}
% \usepackage{subfig}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{thmtools}
\usepackage{thm-restate}
\usepackage{enumitem}
% \usepackage{subcaption}
% \usepackage[ruled,vlined,linesnumbered,lined,commentsnumbered,noend]{algorithm2e}
\usepackage{mathtools}


% Theorem-like environments. Everything declared before the \theoremstyle
% call below uses the default theorem style.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
% lemma is numbered on the theorem counter (optional [theorem] argument).
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}{Claim}
% example is also numbered on the theorem counter.
\newtheorem{example}[theorem]{Example}
\newtheorem{cond}{Condition}
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}
% Environments declared from here on use the "definition" style.
\theoremstyle{definition}
\newtheorem{definition}{Definition}%[section]


% Draft annotation macros.
% \jk{<text>}: typesets <text> in blue.
\newcommand{\jk}[1]{\textcolor{blue}{#1}}
% \lx{<text>}: typesets "[Lily: <text>]" in olive.
\newcommand{\lx}[1]{\textcolor{olive}{[Lily: #1]}}

%
% These are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
% Packages for code listings and for declaring the custom float type below.
\usepackage{newfloat}
\usepackage{listings}
% Global listings style: small monospace text, line numbers on the left,
% no extra space above/below, and long lines are wrapped.
\lstset{%
	basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
	numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
	aboveskip=0pt,belowskip=0pt,%
	showstringspaces=false,tabsize=2,breaklines=true}
% Declare a "listing" float in the ruled style, captioned as "Listing".
% NOTE(review): the {tb} and {lst} arguments look like the float placement and
% the auxiliary-file extension -- confirm against the float/newfloat docs.
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}
%
%\nocopyright
%
% PDF Info Is REQUIRED.
% For /Title, write your title in Mixed Case.
% Don't use accents or commands. Retain the parentheses.
% For /Author, add all authors within the parentheses,
% separated by commas. No accents, special characters
% or commands are allowed.
% Keep the /TemplateVersion tag as is
% PDF metadata for this paper. The template's placeholder title and AAAI staff
% author list are replaced by the paper's own title; the author field is kept
% anonymous to match the anonymous \author block of this submission.
% Replace /Author with the real, comma-separated author names (no accents or
% commands) for the camera-ready version.
\pdfinfo{
/Title (Your Bandit Model is Not Perfect: Introducing Robustness to Restless Bandits Enabled by Deep Reinforcement Learning)
/Author (Anonymous submission)
/TemplateVersion (2022.1)
}

% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \indentfirst} -- This package is specifically forbidden
% \layout} -- This package is specifically forbidden
% \multicol} -- This package is specifically forbidden
% \nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paper
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference

\setcounter{secnumdepth}{2} %May be changed to 1 or 2 if section numbers are desired.

% The file aaai22.sty is the style file for AAAI Press
% proceedings, working notes, and technical reports.
%

% Title
\title{Your Bandit Model is Not Perfect: Introducing Robustness to Restless Bandits Enabled by Deep Reinforcement Learning}
% \author {
%     % Authors
%     Jackson A.~Killian,\textsuperscript{\rm 1}
%     Lily Xu,\textsuperscript{\rm 1}
%     Arpita Biswas,\textsuperscript{\rm 1}
%     Milind Tambe\textsuperscript{\rm 1}
% }
% \affiliations {
%     % Affiliations
%     \textsuperscript{\rm 1} Harvard University, Cambridge, MA, USA\\
%     jkillian@g.harvard.edu, lily\_xu@g.harvard.edu, arpitabiswas@seas.harvard.edu, milind\_tambe@harvard.edu
% }
\author {
    % Authors
    Anonymous submission: 8738
}


\begin{document}

\maketitle
\begin{abstract}
Restless multi-armed bandits (RMABs) are receiving renewed attention for their potential to model real-world planning problems under resource constraints. However, few RMAB models have surpassed theoretical interest, since they make the limiting assumption that model parameters are perfectly known. In the real world, model parameters often must be estimated via historical data or expert input, introducing uncertainty. In this light, we introduce a new paradigm, \emph{Robust RMABs}, a challenging generalization of RMABs that incorporates interval uncertainty over parameters of the dynamic model of each arm. This uncovers several new challenges for RMABs and inspires new algorithmic techniques of general interest. Our contributions are:
(i)~We introduce the Robust Restless Bandit problem with interval uncertainty and solve a minimax regret objective;
(ii)~We tackle the complexity of the robust objective via a double oracle (DO) approach and analyze its convergence;
(iii)~To enable our DO approach, we introduce RMABPPO, a novel deep reinforcement learning (RL) algorithm for solving RMABs, of potential general interest. 
% RMABPPO learns an auxiliary ``$\lambda$-network'' in tandem with individual arm networks to reduce sample complexity while guaranteeing convergence. 
The procedure also generalizes to continuous-action settings, the first algorithm of its kind for RMAB, as well as multi-action settings, the first deep RL algorithm to do so;
(iv)~We design the first adversary algorithm for  RMABs, required to implement the notoriously difficult minimax regret adversary oracle and also of general interest, by formulating it as a multi-agent RL problem and solving with a multi-agent extension of RMABPPO.
\end{abstract}


\section{Introduction}

\input{1.2_new_intro}

\section{Related Work}

%Killian et al.~\cite{killian2021multiAction} proposed a method that leverages the convexity of an approximate Lagrangian version of the multi-action RMAB problem. 


%The \textit{restless multi-armed bandit} (RMAB) problem was introduced by Whittle~\cite{whittle1988restless} where he showed that a relaxed version of RMAB problem can be solved optimally using a heuristic called \textit{Whittle Index policy}. This policy is shown to be optimal when the RMAB instances satisfy \textit{indexability} property. Moreover, Papadimitriou and Tsitsiklis~\cite{papadimitriou1994complexity} established that solving RMAB is PSPACE-hard, even for the special case when the transition rules are deterministic. 

\textbf{RMAB: } \citet{whittle1988restless} introduced RMABs as a reward-maximizing binary-action problem in which a planner selects $k$ out of $N$ state-transitioning arms. He also gave a relaxation-based heuristic, now dubbed the Whittle index, which is optimal under \textit{indexability} \cite{weber1990index}. While the approach seeded a vast literature for the binary-action paradigm, it did not extend to multi-action settings, where each arm admits more than two actions. \citet{glazebrook2011general} and \citet{hodge2015asymptotic} extended index solutions to multi-action RMABs with special monotonic structure, while \citet{killian2021multiAction} proposed a more general method based on an approximate Lagrangian version of the multi-action RMAB problem. Also related are weakly coupled Markov decision processes (WCMDPs). \citet{hawkins2003langrangian} proposed a Lagrangian relaxation for WCMDPs and an LP for minimizing the Lagrange bound. \citet{adelman2008relaxations} and \citet{gocgun2012lagrangian} gave less scalable solutions that provide better approximations to WCMDPs. However, all assume knowledge of the state transition dynamics to compute policies. A few recent works have studied online RMABs with unknown dynamics but make the limiting assumptions that the planner acts on only one arm per round~\cite{gafni2020learning} or that states periodically reset~\cite{jung2019regret}. None consider robust planning under environment uncertainty, which we address. 

\textbf{RL for RMAB:} A few recent works learn Whittle indices for indexable binary-action RMABs using (i)~DRL \cite{nakhleh2020neurwin} and (ii)~tabular Q-learning~\cite{biswas2021learn,fu2019towards,avrachenkov2020whittle}. \citet{killian2021Q} take tabular Q-learning to the multi-action setting. In contrast, our DRL approach provides a more general solution to binary and multi-action RMAB domains, not requiring indexability or problem structure, and is far more scalable than tabular methods. We are also the first to handle continuous-action RMABs, a key component of the nature oracle. 

\textbf{Robust planning:} A large body of work on robust planning in RL mainly focuses on maximin reward through robust adversarial RL \cite{pinto2017robust} or MARL \cite{lanctot2017unified,li2019robust}, but they may return conservative policies overly sensitive to the worst case. The minimax regret criterion \cite{braziunas2007minimax} avoids this pitfall, but computing minimax regret-optimal strategies is hard, often involving very large or continuous strategy spaces. Accordingly, \citet{mcmahan2003planning} proposed the double oracle approach to explore a small subset of strategies while still guaranteeing optimal convergence \cite{nguyen2014regret,gilbert2017double}. Subsequently, double oracle has been extended to optimize MARL problems with multiple selfish agents \cite{lanctot2017unified}. Further, recently, \citet{xu2021robust} used double oracle to solve a single Markov Decision Process (MDP) minimax regret planning problem and used RL to design the oracles. However, their RL algorithms are incapable of learning good policies over the inherently combinatorial state and action spaces of the $N$ resource-constraint-linked MDPs of RMABs. Additionally, their approach is designed only for continuous state/action spaces, whereas our approach is capable of finding robust policies for any combination of discrete/continuous state/action spaces. We accomplish this jointly by (1)~our novel formulation of the nature oracle as a MARL problem, which decomposes the non-stationarities to be learned with separate networks, and (2)~building off of the flexible PPO training procedure \cite{schulman2017proximal}.

% Robust planning has been identified as a critical concern in domains such as healthcare \cite{begoli2019need,ghassemi2019practical,wilder2017uncharted}, environmental conservation \cite{moilanen2006planning,regan2005robust,visconti2015building}, and urban planning \cite{shortridge2017robust,yao2009evacuation}, underlining the need to develop effective policies that are robust to uncertainty for these urgent real-world settings. 

% Robustness objectives have been considered for bandit applications in the two-action stochastic setting, where each pull of an arm draws a reward sampled from an unknown Bernoulli distribution. \citet{wei2021nonstationary} address minimax regret of time-varying reward shifts using heuristics to trade off remembering vs.\ forgetting, and \citet{garivier2016maximin} consider a maximin objective to guide exploration in Monte Carlo Tree Search. However, RMABs are significantly harder than the stochastic settings since, in RMABs, the rewards are dependent on the current state which in turn depends on the actions taken on the arms. 


% TODO: adversarial stochastic bandits - find one or two papers that talk about maxmin in stochastic bandits. which motivate why maxmin objectives are well-studied. but stochastic bandits are much simpler than restless bandits, so we're looking to generalize those results

\begin{figure*}[t]
\centering
\includegraphics[width=0.80\linewidth]{img/concept_fig.pdf} 
\caption{(a) Proposed framework for solving the Robust RMAB problem. The main loop follows a double oracle approach to iteratively compute a minimax regret optimal RMAB policy where each oracle is a novel DRL algorithm for RMABs. 
% The set of agent RMAB policies map states to actions for the arms of the RMAB and the set of nature model parameter settings control the RMAB transition dynamics. Each loop $e$, we compute the optimal mixed strategy over agent and nature policies, then pass each mixed strategy to the opposing oracle which return best responses to add to their respective sets. Designing each oracle are two of our key contributions, requiring novel DRL algorithms for RMABs. 
(b) The nature oracle: a novel multi-agent RL formulation of RMABs that tackles non-stationarity with a centralized critic.}
% In particular, we tackle the non-stationarity of the regret-maximizing nature oracle by formulating it as a MARL problem. $\pi^{(A)}$ is a ``helper'' which learns the optimal policy, and thus the maximum return, while $\pi^{(B)}$ learns environment parameters which maximize regret of the current (fixed) agent mixed strategy, and is returned by the oracle.} 
\label{fig:concept} 
\end{figure*} 


\section{Problem Statement}
We consider the multi-action RMAB setting with $N$ arms. Though our approaches generalize to continuous state and action spaces, our exposition will consider the discrete cases. Each arm $n\in [N]$ follows an MDP $(\mathcal{S}_n, \mathcal{A}_n, \mathcal{C}_n, T_n, R_n, \beta)$, where $\mathcal{S}_n$ is a set of finite, discrete states, $\mathcal{A}_n$ is a set of finite, discrete actions, $\mathcal{C}_n : \mathcal{A}_n \xrightarrow[]{} \mathbb{R}$ corresponds to action costs, $T_n: \mathcal{S}_n\times \mathcal{A}_n \times \mathcal{S}_n \xrightarrow[]{} [0,1]$ gives the probability of transitioning from one state to another given an action,  $R_n:\mathcal{S}_n\times \mathcal{A}_n \times \mathcal{S}_n \xrightarrow[]{} \mathbb{R}$ is a reward function, and $\beta \in [0, 1)$ is the discount parameter. Each round, the agent must select actions for each arm such that the sum cost of actions does not exceed a per-round budget $B$. The aim of multi-action RMABs is to maximize total reward over a fixed number of $H$ rounds, subject to this budget constraint, generalizing the well-studied binary-action RMAB. 

In this work, we extend multi-action RMABs to the robust setting in which the exact transition probabilities are unknown. Instead, the transitions of each arm are determined by a set of parameters $\mathcal{P}_n$. 
%that are known with \emph{interval uncertainty}. The relationship between $\mathcal{P}_n$ and $T_n$ is a function of the user's model, e.g., (1) $\mathcal{P}_n$ could correspond to exact entries of $T_n$ or (2) $\mathcal{P}_n$ could represent a measure such as ``infectivity'' in an epidemic model, which would influence many entries of $T_n$. 
For a given arm $n \in [N]$ and parameter $p_n \in \mathcal{P}_n$, let  $\langle{\omega_{n,p_n}}\rangle:=[\underline{\omega}_{n,p_n}, \overline{\omega}_{n,p_n}]$ represent the range, i.e., uncertainty over the model parameter. We ask the following question: \textit{how to find a solution to the multi-action RMAB problem that is robust to such uncertainties?}


% For a given tuple $e := (s,a,s') \in \mathcal{S}_n\times \mathcal{A}_n\times \mathcal{S}_n$, let  $\langle{\omega_{n,e}}\rangle:=[\underline{\omega}_{n,e}, \overline{\omega}_{n,e}]$ represent the range. Let $\hat{\omega}$ be a given realization of the transition probabilities such that $\hat{\omega_{n,e}} \in \langle{\omega_e}\rangle$ for all $e \in \mathcal{S}_n\times \mathcal{A}_n\times \mathcal{S}_n$ and for all $n\in N$. We ask the following question: \textit{how to find a solution to the RMAB problem that is robust to such uncertainties?}

%Let $[\theta^{(a)}_n(s_n, a_n, s_n^\prime), \theta^{(b)}_n(s_n, a_n, s_n^\prime)]$ represent the range for a given transition probability and let $\bm{\theta}$ represent a given realization of the transition probabilities such that $\bm{\theta}_n(s_n, a_n, s_n^\prime) \in [\theta^{(a)}_n(s_n, a_n, s_n^\prime), \theta^{(b)}_n(s_n, a_n, s_n^\prime)] \forall n, s_n, a_n, s_n^\prime$.  All other parts of the MDP for each arm will remain the same and known. Note that we follow the conventions of bounded MDPs (BMDPs) \cite{givan2000bounded} and will refer to Multi-Action RMABs whose arm follow BMDPs as Multi-Action BRMABs. 


% In the absence of perfect knowledge of transition probabilities, it is not possible to compute the expected reward, and thus, the objective of maximizing expected reward is not appropriate.
%[Move to related work]  \citet{givan2000bounded} consider pessimistic (worst-case) and optimistic (best-case) objectives for RMAB problem when the transitions are uncertain.  consider the minimax regret criterion \cite{ahmed2017sampling,kwak2012towards} -- we consider the same and define it below. The above papers give methods for finding policies that optimize the corresponding objectives. However, their approaches cannot be used directly to solve Multi-action BRMABs because the tools are developed for standard BMDPs. Multi-action BRMABs can be represented as BMDPs, but the conversion requires exponentially-sized state and action spaces, making it intractable to use existing tools. Our challenge then, is to develop techniques for finding robust policies for Multi-Action BRMABs, since no such techniques exist.
We consider an objective that incorporates uncertainty over $\mathcal{P}_n$. Let $\omega$ be a given realization of the model parameters such that $\omega_{n,p_n} \in \langle{\omega_{n,p_n}}\rangle$ for all $n \in [N]$ and $p_n \in \mathcal{P}_n$. Let $G(\pi,\omega)$ be the planner's expected reward under policy $\pi$ and a realization~$\omega$ of the uncertainty. Regret is defined as
\begin{align}
L(\pi,\omega) = G(\pi^*_{\omega},\omega) - G(\pi,\omega) \ ,
\label{eq:regret}
\end{align}
where $\pi^*_{\omega}$ is the optimal policy under $\omega$. In our robust planning formulation, our objective is to compute a policy~$\pi$ that minimizes the maximum regret~$L$ possible for any realization of $\omega$, leading to the following minimax objective:
\begin{align}
    \min_{\pi}\max_{{\omega}}{L(\pi,{\omega})} \ .
    \label{eq:minimax}
\end{align}
This problem is computationally expensive to solve since simply computing a policy~$\pi$ that maximizes the expected reward $G(\pi,\omega)$ is PSPACE-hard \cite{papadimitriou1994complexity} even when the transition rules are known, i.e., $\omega$ is given. This challenge is likely a key reason why the robust formulation has not yet been addressed. 
%However, we handle this complexity via a Lagrangian relaxation of the relevant underlying objective of maximizing expected reward (detailed in Section \ref{sec:lagrangian_relaxation}). 
To overcome the complexity of the minimax optimization, we take a double oracle approach \cite{mcmahan2003planning}, which requires key innovations to work in the RMAB setting. 



\section{Preliminaries}
\label{sec:preliminaries}

The double oracle approach achieves the minimax regret objective in Eq.~\ref{eq:minimax} by casting the optimization problem as a zero-sum game between two players, the \textit{agent} and \textit{nature}, visualized in Fig.~\ref{fig:concept}(a). The agent selects an RMAB policy that minimizes regret for some realization of the model parameters. Nature then adversarially selects the values of $\mathcal{P}_n$ that maximize regret for a given policy of the RMAB planner. This framework is desirable since it converges to an $\varepsilon$-optimal solution \cite{adam2021double,xu2021robust}, assuming the oracles return the best response for both players. The key technical contributions of this paper arise from designing the agent and nature oracles (right boxes of Fig.~\ref{fig:concept}(a)). 

For the agent, minimizing regret with respect to a fixed nature strategy is equivalent to maximizing reward w.r.t.\ that strategy, so the agent objective is the same as solving a multi-action RMAB to find the best policy. 
%bsection{Lagrangian Relaxation} 
% \label{sec:lagrangian_relaxation}
%Let $\pi^*_{\omega}$ be the optimal policy for a given multi-action RMAB defined by $\hat{\omega}$ parameter. Formally, 
A policy~$\pi$ maps states to decision matrices $\bm{A} \in \{0,1\}^{N\times |\mathcal{A}|}$ where the total number of active actions is constrained by a budget for each round. 
% \begin{align}
%     &\sum_{j=1}^{|\mathcal{A}|} \bm{A}_{nj} = 1 \hspace{3mm} \forall n \in [N] \label{eq:single_action_constraint} \\
%     &\sum_{n=1}^{N}\sum_{j=1}^{|\mathcal{A}|} \bm{A}_{nj}c_j \le B \label{eq:budget_constraint}
% \end{align}
%Let $\overline{\mathcal{A}}$ be the set of feasible decision matrices and l
Let $\bm{s} = (s_1, \ldots, s_N)$ represent the initial state of each arm. Then, for a given parameter $\omega$, the optimal policy $\pi^*_{\omega}$ maximizes the expected discounted sum of rewards of all arms as given by the constrained Bellman equation:
\begin{equation}
\begin{aligned}\label{eq:combined_value_function}
    J(\bm{s}) &= \max_{\bm{A}}\left\{\sum_{n=1}^{N} R_n(s_n, a_{nj}, s_n^\prime) + \beta \mathbb{E}_{\omega}[J(\bm{s}^\prime) \mid \bm{s}, \bm{A}]\right\} \\
    \text{s.t. } & \sum_{j=1}^{|\mathcal{A}|} a_{nj} = 1 \hspace{3mm} \forall n \in [N] 
    \qquad
    \sum_{n=1}^{N}\sum_{j=1}^{|\mathcal{A}|} a_{nj}c_{nj} \le B
\end{aligned}
\end{equation}
%However, this is an optimization problem with exponentially many states and actions, making it at least PSPACE-Hard to solve \cite{papadimitriou1994complexity}. 
where $a_{nj} \in \bm{A}_n$ and $c_{nj}$ are the corresponding action costs in $\mathcal{C}_n$. We then relax the problem by taking the Lagrangian relaxation of the budget constraint~\cite{hawkins2003langrangian}, giving:
\begin{align}
    &J(\bm{s}, \lambda^*) = \min_{\lambda} \left( \frac{\lambda B}{1-\beta} + \sum_{n=1}^{N}\max_{a_{nj}\in\mathcal{A}_n}Q_n(s_n, a_{nj}, \lambda) \right) \label{eq:decoupled_value_func} \\
    &\quad \text{where }\hspace{1mm} Q_n(s_n, a_{nj}, \lambda) =
    R_n(s_n, a_{nj}, s_n^\prime) - \lambda c_{nj} + \nonumber \\ 
    &\quad \qquad \beta \mathbb{E}_{\omega} \left[ Q_n(s_n^{\prime}, a_{nj}, \lambda) \mid \pi^{La}_{\omega}(\lambda) \right] \ . \label{eq:arm_value_func_lagrange}
\end{align}
Here, $Q$ is the value function and  $\pi^{La}_{\omega}(\lambda)$ is the optimal policy for a given $\lambda$. 
% See \citet{adelman2008relaxations} for a detailed derivation. 
Note that for a given value of $\lambda$, Eq.~\ref{eq:decoupled_value_func} could be solved using $N$ individual value iterations. However, setting $\lambda := \lambda^*$ is critical to finding good policies for multi-action RMABs and is asymptotically optimal in the binary-action case \cite{weber1990index}, i.e., $\pi^{La}_{\omega}(\lambda^*) \xrightarrow[]{} \pi^{*}_{\omega}$. Given this relationship, in the remainder of the paper, we focus on computing $\pi^{La}_{\omega}(\lambda^*)$ and denote it as $\pi^{*}_{\omega}$ for convenience. 

% Much effort has been invested in finding fast methods for computing $\pi^{La}_{\omega}(\lambda^*)$ in the binary-action case \cite{glazebrook2006some,Sombabu2020,Liu2010} and recently in the multi-action case \cite{glazebrook2011general,hodge2015asymptotic,killian2021beyond}. \textbf{However, the best general method still relies on solving linear programs for each arm, which does not scale well to problems with very large state or action spaces. Therefore in this work, we will investigate methods that can solve Eq.~\ref{eq:decoupled_value_func} via deep reinforcement learning (RL), which has recently seen major success in finding optimal policies for large-scale MDPs. The key challenge to developing such RL techniques for Multi-Action RMABs will be in deriving a gradient update rule for $\lambda$ that allows us to converge to $\pi^{La}_{\omega}(\lambda^*)$.}

\section{Solving Robust Restless Bandits}
\label{sec:robust-rmab}

We now build our approach for finding \emph{robust} RMAB policies, visualized in Fig.~\ref{fig:concept}(a). 
%The approach pitches the robustness problem as a two player zero-sum game with an \textit{agent} that aims to minimize regret and \textit{nature} that aims to maximize regret \cite{xu2021robust}. Double oracle is an iterative technique for finding an optimal solution \cite{mcmahan2003planning}, i.e., a mixed equilibrium strategy for both the agent and nature. 
The \textit{pure strategy} space for the agent is the set of all feasible RMAB policies $\pi: \mathcal{S}_1\times\cdots\times \mathcal{S}_N \to \mathcal{A}_1\times\cdots\times \mathcal{A}_N$. The pure strategy space for nature is a closed set of parameters $\omega$ within the given uncertainty intervals. The algorithm maintains a pure strategy set for the agent and nature (Fig.~\ref{fig:concept}(a) left boxes); each iteration, these are used to compute optimal mixed strategies in a regret game (Fig.~\ref{fig:concept}(a) center), then passed to the opposing oracles (Fig.~\ref{fig:concept}(a) right boxes) which return best responses to add to the strategy sets. 

The agent oracle's goal is to find an RMAB policy $\pi$, or pure strategy, to minimize regret (Eq.~\ref{eq:regret}) given a \emph{mixed strategy} $\tilde{\omega}$, where a mixed strategy is a probability distribution over a set of pure strategies. That is, the agent minimizes $L(\pi,\tilde{\omega})$ w.r.t.\ $\pi$, while $\tilde{\omega}$ is constant. Recall from Eq.~\ref{eq:regret} that
$L(\pi,\tilde{\omega}) = G(\pi^{*}_{\tilde{\omega}},\tilde{\omega}) - G(\pi,\tilde{\omega})$.
Since $\tilde{\omega}$ and $\pi^{*}_{\tilde{\omega}}$ are constant, the first term $G(\pi^{*}_{\tilde{\omega}},\tilde{\omega})$ is also constant. Thus minimizing $L(\pi,\tilde{\omega})$ is equivalent to maximizing the second term $G(\pi,\tilde{\omega})$, which is maximal at $\pi=\pi^{*}_{\tilde{\omega}}$. In other words, the agent oracle must compute an optimal reward-maximizing policy w.r.t.\ $\tilde{\omega}$. Such a reward-maximizing objective aligns with existing RL techniques, allowing us to build on them, but still leaves the challenge of learning in the combinatorial state and action spaces of the RMAB. 

Conversely, the nature oracle's goal is to find a parameter setting (pure strategy) $\omega$ that maximizes the agent's regret given a mixed strategy $\tilde{\pi}$, i.e., maximize $L(\tilde{\pi},\omega)$ with respect to $\omega$, while $\tilde{\pi}$ is fixed. This objective is far more challenging because both $G(\pi^*_{\omega},\omega)$ and $G(\tilde{\pi},\omega)$ are functions of $\omega$. Most critically, computing $G(\pi^*_{\omega},\omega)$ requires obtaining an optimal policy $\pi^*_{\omega}$ as $\omega$ changes in the optimization---this amounts to a planning problem in which an agent must learn an optimal policy while the environment, controlled by $\omega$, is changing over time, making the nature oracle notoriously difficult to implement. Moreover, in the interval uncertainty setting we consider, $\omega$ is defined by a space of continuous values; thus nature's pure strategy space is infinite, making the problem even more complex, since it cannot be exhaustively searched. 

\emph{To tackle all of this complexity, as one of our main contributions, we propose a novel method for implementing the regret-maximizing nature oracle by casting it as a MARL problem}. The approach, visualized in Fig.~\ref{fig:concept}(b), trains one auxiliary agent to solve for a policy $\pi^*_{\omega}$ ($\pi^A$ in Fig.~\ref{fig:concept}(b)), needed to compute $G(\pi^*_{\omega},\omega)$ in the regret term, and simultaneously trains a second agent to learn worst-case parameters $\omega$ ($\pi^B$ in Fig.~\ref{fig:concept}(b)) that minimize $G(\tilde{\pi},\omega)$---together, these will maximize the regret  $L(\tilde{\pi},\omega)$. The non-stationarity is mitigated in this MARL setup by centralized critic networks which expose the actions of each agent to the other, allowing them to include the other's actions in their learned state space.
Ultimately, solving a MARL problem requires an RL algorithm to optimize the underlying policy; hence we first introduce our novel RL approach, RMABPPO, to solve RMABs (Sec.~\ref{sec:rl-rmab}) as a part of our agent oracle and then use the algorithm as the backbone of our nature oracle (Sec.~\ref{sec:marl-rmab}).
%our multi-agent RL approach relies on the RMABPPO algorithm described above as the underlying technique. 
%RMABPPO will be the backbone of both the agent and nature oracles, so we describe it first, then provide our full double oracle algorithm.

%However, to solve a multi-agent reinforcement learning problem requires an algorithm for deep reinforcement learning the underlying problem. To the best of the authors' knowledge, no such algorithms exist for multi-action RMABs. Therefore, as our second main contribution we introduce RMAB Proximal Policy Optimization (RMABPPO), a novel deep reinforcement learning algorithm for computing Lagrange policies for multi-action RMABs.

\subsection{Agent Oracle: Deep RL for RMAB}
\label{sec:rl-rmab}


% Algorithm float: pseudocode for RMABPPO.
% Structure: each epoch samples one lambda from the lambda-network; each
% subepoch collects n_steps transitions and runs a PPO update of the per-arm
% policy networks; the lambda-network is updated once at the end of the epoch.
\begin{algorithm}[t]
\caption{RMABPPO}
\label{alg:rmabppo}
\begin{flushleft}
\textbf{Input}: Initial state $\bm{s}_0$, nature mixed strategy $\tilde{\omega}$ \\
\textbf{Parameters}: \texttt{n\_epochs}, \texttt{n\_subepochs}, \texttt{n\_steps}

\end{flushleft}
\begin{algorithmic}[1] %[1] enables line numbers
\STATE Randomly initialize a policy network $\pi_{\theta_n}$ for each arm $n \in [N]$
\STATE Randomly initialize $\lambda$-network $\Lambda$
\STATE Initialize an empty \texttt{buffer}
% Outer loop: one sampled lambda per epoch.
\FOR{$\textit{epoch} = 1, 2, \ldots, \texttt{n\_epochs}$}
% \STATE Sample $\lambda$ from \textsc{Lambda} \\
\STATE Sample $\lambda = \Lambda(\bm{s})$
% Middle loop: collect experience, then update the arm policies.
\FOR {$\textit{subepoch} = 1, \ldots, \texttt{n\_subepochs}$}
% Inner loop: every arm acts from its own state and the shared lambda.
\FOR {timestep $t = 1, \ldots, \texttt{n\_steps}$}
\STATE Sample action $a_n = \pi_{\theta_n}(s_n, \lambda)$ for all $n \in [N]$
\STATE Add trajectories $(\bm{s}, \bm{a}, r, \bm{s}^\prime, \lambda)$ to \texttt{buffer} % NOTE: here we do not impose a budget constraint
\ENDFOR
\STATE Update arm policy networks $\pi_{\theta_n}$ via PPO, using tuples in \texttt{buffer} %, i.e., learn $Q(s, a, \lambda)$ and $\pi(s, \lambda)$ for a given $\lambda$
% \STATE \textbf{else if} $j \bmod \kappa = 0$ \textbf{then} \hspace{.5em} Freeze $\altpolicy$ parameters \\
% \STATE \textbf{else} \hspace{.5em}  Freeze $\attract$ parameters \label{line:wake-end} \\
% \STATE Update $\altpolicy$ and $\attract$ using gradient ascent to maximize regret: $\reward(\altpolicy, \attract) - \reward(\defpolicy, \attract)$
\ENDFOR
% The lambda-network update happens once per epoch, after all subepochs.
\STATE Update $\Lambda$ w/ sum discounted costs of final subepoch
\ENDFOR
% \STATE \textbf{return} $\attract$,  $\altpolicy$
\STATE \textbf{return} $\pi_{\theta_1}, \ldots, \pi_{\theta_N}$ and $\Lambda$
\end{algorithmic}
\end{algorithm}


% Existing index-based approaches for solving restless bandits are computationally infeasible for large problem sizes (many arms, many actions, or both) and cannot be extended to continuous states or actions. We present the first application of reinforcement learning to solve RMABs, which enables us to better scale while simultaneously accommodating continuous states and actions. Our RL implementation then becomes the foundation of our robust planning approach. 

Existing DRL approaches can be applied to the objective in Eq.~\ref{eq:combined_value_function}, but they fail to scale past trivially sized RMAB problems since the action and state spaces grow exponentially in $N$. For example, for a binary-action RMAB with $N=50$ and $B=20$, the action space would be of size $\binom{50}{20}\approx10^{12}$, which is not feasible to learn, even with a neural network. To overcome this, we develop a novel DRL algorithm that instead solves the decoupled problem (Eq.~\ref{eq:decoupled_value_func}). The key benefit of decoupling is to render policies and $Q$ values of each arm independent, allowing us to learn $N$ independent networks with linearly sized state and action spaces, relieving the combinatorial burden of the learning problem
--- the above example would now only have $N \times 2 = 100$ actions. 
However, this approach introduces a new technical challenge in solving the dual objective which maximizes over policies but minimizes over $\lambda$, as discussed in Sec.~\ref{sec:preliminaries}.

To solve this, we derive a dual gradient update procedure that iteratively optimizes each objective as follows: (1)~holding $\lambda$ constant, learn $N$ independent policy networks via a policy gradient procedure, augmenting the state space to include $\lambda$ as input, as in Eq.~\ref{eq:decoupled_value_func}; (2)~use sampled trajectories from those learned policies as an estimate to update $\lambda$ towards its minimizing value via a novel gradient update rule. Another challenge is that $\lambda^*$ of Eq.~\ref{eq:decoupled_value_func} depends on the current state of each arm --- therefore, a key element of our approach is to learn this function $\lambda^*(\bm{s})$ concurrently with our iterative optimization, using a neural network we call the $\lambda$-network that is parameterized by $\Lambda$. To train the $\lambda$-network, we use the following gradient update rule.
\begin{restatable}[]{proposition}{lambdaUpdate}
\label{thm:lambda_update}
A gradient rule for updating the $\lambda$-network, parameterized by $\Lambda$, such that for a state $\bm{s}$, the $\lambda$-network predicts the value $\lambda$ that minimizes Eq.~\ref{eq:decoupled_value_func} is as follows:
\begin{equation}
\begin{aligned}
    \Lambda_t = \Lambda_{t-1} - \alpha \left( \frac{B}{1-\beta} + \sum_{n=1}^{N}D_n(s_n, \lambda_{t-1}(\bm{s})) \right) 
\end{aligned}
\end{equation}
where $\alpha$ is the learning rate and $D_n(s_n, \lambda)$ is the negative of the expected $\beta$-discounted sum of action costs for arm $n$ starting at state $s_n$ under the optimal policy for arm $n$ for a given value of $\lambda$.
\end{restatable}

% \begin{theorem}
% \label{thm:lambda-update}
%     \Lambda_t = \Lambda_{t-1} -  g(\bm{s}, \lambda_{t-1}(\bm{s})) = \Lambda_{t-1} - \alpha \left( \frac{B}{1-\beta} + \sum_{n=1}^{N}D_n(s_n, \lambda_{t-1}(\bm{s})) \right) 
% \end{theorem}
Although $D_n$ cannot be computed exactly as we do not know the optimal policy, it can be estimated from samples of multiple rollouts of the policy during training. As long as arm policies are trained for adequate time on the given value of $\lambda$, the gradient estimate will be accurate, i.e., $D_n(s_n, \lambda_{t-1}(\bm{s})) \approx -\sum_{k=0}^{K-1} \beta^k c_{nk}$ where $K$ is the number of samples collected in an epoch and $c_{nk}$ is the action cost of arm $n$ in round $k$. Moreover, this procedure will converge to the optimal parameters $\Lambda$ if the arm policies are optimal.
% \begin{equation}
% \begin{aligned}\label{eq:estimation_of_discounted_cost}
%     D_n(s_n, \lambda_{t-1}(\bm{s})) \approx -\sum_{z=0}^{T-1} \beta^z c_{nz}
% \end{aligned}
% \end{equation}

\begin{restatable}[]{proposition}{lambdaConvergence}
\label{thm:lambda_convergence}
Given arm policies corresponding to optimal $Q$-functions, 
% the gradient update rule of 
Prop.~\ref{thm:lambda_update} will lead $\Lambda$ to converge to the optimum as both the number of training epochs and $K$ tend to infinity.
\end{restatable}

Proofs of both propositions are given in the appendix. Note that to collect samples that reflect the proper gradient, the RMAB budget must not be imposed at training time --- rather, the policy networks and $\lambda$-network must learn to play the Lagrange policy of Eq.~\ref{eq:decoupled_value_func} which spends the correct budget in expectation. At training time, actions are sampled randomly according to the actor network distributions. At test time, actions are taken deterministically by greedily selecting the highest probability actions until the budget is spent.
% When $g$ is negative, it will indicate \textit{overspending}, meaning $\Lambda$ should increase it's prediction for $\lambda$ at state $\bm{s}$ to make acting more expensive, encouraging the policy to make more effective use of the budget across all arms. When $g$ is positive, it will indicate that the full budget is not being spent in expectation, meaning $\Lambda$ should decrease its prediction of $\lambda$ at $\bm{s}$ to encourage more actions. 

In theory, the policy networks could be trained via any DRL procedure that ensures the above characteristics for training the $\lambda$-network. In practice, we train with proximal policy optimization (PPO)~\cite{schulman2017proximal}, a policy gradient approach which has demonstrated state-of-the-art performance while being relatively simple to implement. Importantly, PPO is also flexible enough to handle both discrete and continuous actions, which is necessary in the nature oracle. 

Finally, to enable our iterative, dual-update procedure in practice, we need a mechanism to encourage both (1) exploring new arm policy actions after an update to $\Lambda$, then (2) later exploiting learned policy actions to develop good gradient estimates for $\Lambda$. We navigate this important trade-off by adding an entropy regularization term to the policy networks' losses, controlled via a cyclical temperature parameter. %Importantly, PPO applies to both discrete and continuous action networks, which is necessary to extend our algorithm to the nature oracle. 
We call our algorithm RMABPPO, give pseudocode in Algorithm~\ref{alg:rmabppo}, and give more implementation details in the appendix.
% As long as we pick learning rates and update timings appropriately, it should be easy enough to show that this procedure converges to the optimal solution, i.e., Eq.~\ref{eq:decoupled_value_func}. In theory, this update procedure also should plug and play nicely with the multi-agent RL procedure proposed for the nature oracle.

% our solution is to train $n$ separate RL agents 
%Why combined RL approach doesn't work: doesn't scale, esp in discrete settings. So we need the lambda approach. [describe lambda network]







\subsection{Nature Oracle: Multi-Agent RL for RMAB}
\label{sec:marl-rmab}

Armed with a DRL procedure for learning RMAB policies, we can create the MARL procedure to implement the nature oracle. Recall the challenge of the nature oracle is to jointly optimize a policy $\pi^*_{\omega}$ and model parameters~$\omega$. We propose to treat this optimization as a MARL problem designed to handle this form of non-stationarity \cite{lowe2017multi} via centralized critics. The procedure is visualized in Fig.~\ref{fig:concept}(b).

To implement the nature oracle, we introduce two agents $A$ and $B$, where $A$'s goal is to optimize the RMAB policy $\pi^*_{\omega}$ and $B$'s goal is to find parameters $\omega$ that maximize regret of the current agent mixed strategy $\tilde{\pi}$. We define a shared transition function $T: \mathcal{S} \times \mathcal{A}_A \times \mathcal{A}_B \xrightarrow[]{} \mathcal{S}$. Here, $\mathcal{A}_A$ is the action space of the underlying multi-action RMAB. At a given state $\bm{s}$, the action space $\mathcal{A}_B$ lets agent $B$ select $\omega$ which, in general, depends on $\bm{s}$. That is, at each step, agent $B$ will select environment parameters $\omega$, and thus state/action transition probabilities that will determine the outcome of agent $A$'s actions. We adapt the centralized critic idea from multi-agent PPO~\cite{yu2021surprising} to our RMAB setting to create MA-RMABPPO. 
Since the policy space of agent $A$ is discrete while that of agent $B$ is continuous, a notable benefit to PPO is that it offers a convenient way to train both policies. % with minimal differences in implementation. 


% Formally, a multi-agent RL problem involves $Z$~agents, each with action space $\mathcal{A}_z$, a shared environment with states $\mathcal{S}$, and a shared environment transition function, $T:\mathcal{S} \times \mathcal{A}_1 \times \cdots \times \mathcal{A}_Z \xrightarrow[]{} \mathcal{S}$. Each agent can have arbitrary reward functions. A seminal paper introduced multi-agent deep deterministic policyhttps://www.overleaf.com/project/60624fdfafdb4bfe7133e86b gradient (MADDPG) \cite{lowe2017multi} for solving such tasks, where the core idea is to train centralized \textit{critics} which learn agent-specific Q-functions \textit{that have knowledge of all the other agents' actions}, and decentralized \textit{actors} which learn agent-specific policies. This setup is specifically designed to handle the non-stationarity induced when multiple agents influence one environment --- the centralized critic with knowledge of all agent actions allows each individual agent to learn as if the environment is stable by conditioning its value functions on the actions of the other agents, making learning itself more stable. We will build off of multi-agent PPO (MAPPO), a recent algorithm that uses the same centralized critic idea, but with a PPO training procedure \cite{yu2021surprising}.

% We use this formalism to implement the nature oracle as follows. There are two agents $A$ and $B$, where $A$'s goal is to optimize the RMAB policy $\pi^*$  and $B$'s goal is to find parameters $\hat{\omega}$ that maximize regret of the current best agent strategy $\pi^\prime$. The transition function is $T: \mathcal{S} \times \mathcal{A}_A \times \mathcal{A}_B \xrightarrow[]{} \mathcal{S}$. The action space $\mathcal{A}_A$ will be the same as the action space of multi-action RMAB. At a given state $\bm{s}$, the action space $\mathcal{A}_B$ will allow agent $B$ to select $\hat{\omega}$ which, in general, may depend on $\bm{s}$. That is, at each step, agent $B$ will select environment parameters $\hat{\omega}$, and thus state/action transition probabilities that will determine the outcome of agent $A$'s actions. We adopt the centralized critic idea from MAPPO to our RMAB setting to create MA-RMABPPO. Again, it is important that we use a PPO-based training procedure since agent $A$ has a discrete policy space but agent $B$ has a continuous policy space, and PPO offers a convenient way to train both policies with minimal differences in implementation. 

% Since RMABPPO requires a cyclical procedure for updating the $\lambda$-network, the same will be true of RMABPPO. 

% After training both agents simultaneously with MADDPG, each network will converge to a deterministic optimal policy -- most importantly, agent $B$'s policy will represent a deterministic setting of $\bm{\theta}$, i.e., a pure strategy for nature.

The most important step is to define the rewards for agents $A$ and $B$ to match their objectives. Since agent $A$'s objective is to find $\pi^*_{\omega}$, it adopts the reward defined by the underlying RMAB, i.e., ${R}^{(A)} = \sum_{n=1}^N R_n$. However, agent $B$'s objective is to learn the regret-maximizing parameters $\omega$. This objective is challenging because it requires %maximizing regret of an input policy $\tilde{\pi}$, which relies on 
computing and optimizing over the returns of the fixed input policy $\tilde{\pi}$ with respect to all possible $\omega$, which is in general non-convex. In practice, to estimate the returns of $\tilde{\pi}_\omega$, we execute a series of roll-outs against agent $B$'s current action. %, but multi-step returns could be used where greater accuracy is required.
That is, given $\bm{s}$ and $\bm{a}$ at a given round, we sample the next state $\bm{s}^\prime$ and define the reward of agent~$B$ as ${R}^{(B)} = \sum_{n=1}^N R_n(s_n,a_n,s^\prime_n) - \frac{1}{Y}\sum_{y=1}^{Y}r_y^{\tilde{\pi}}$, where $r_y^{\tilde{\pi}}$ is the reward obtained from each of $Y$ one-step Monte Carlo simulations of the mixed strategy $\tilde{\pi}$ from state $\bm{s}$.
 %That is, for every $(\bm{s},\bm{a},r,\bm{s}^\prime)$ tuple sampled from the environment, we define regret here as $R_B = \sum_{n=1}^N R_n(s_n,a_n,s^\prime_n) - \frac{1}{Y}\sum_{y=1}^{Y}r_y^{\tilde{\pi}}$, where $r_y^{\tilde{\pi}}$ is the reward obtained from each of $Y$ random 1-step Monte Carlo simulations of mixed strategy $\tilde{\pi}$ from state $\bm{s}$.

To train the networks, agent $A$ has the same policy network architecture as RMABPPO, i.e., $N$ discrete policy networks and one $\lambda$-network, and the agent $B$ actor network is a single continuous-action policy network. Since agents $A$ and $B$ have separate reward functions, they have their own critic networks, but they are ``centralized'' in that they both take the actions of the other as input. Other than this, the training for agent $A$ follows the same procedure as in RMABPPO, and agent $B$ is trained in a standard PPO fashion. In practice, to ensure good gradient estimates for agent $A$'s $\lambda$-network in MA-RMABPPO, we keep agent $B$'s network --- and thus the environment --- constant between $\Lambda$ updates, updating $B$'s network at the same frequency as the $\lambda$-network updates. Pseudocode for MA-RMABPPO and further details of its implementation are given in the appendix. %Algorithm~\ref{alg:marmabppo}.


\begin{algorithm}[t]
\caption{RMABDO}
\label{alg:full-alg}
\textbf{Input}: Environment simulator and parameter uncertainty interval $\langle \omega_{n, p_n} \rangle$ for all $n \in [N]$ \\
\textbf{Parameters}: Convergence threshold $\varepsilon$%, number of perturbations $O$
\\
\textbf{Output}: Agent mixed strategy $\tilde{\pi}$
\begin{algorithmic}[1] %[1] enables line numbers
\STATE $\Omega_0 = \{\omega_0\}$, with $\omega_0$ selected at random
\STATE $\Pi_0 = \{\pi_{B_1}, \pi_{B_2}, \ldots\}$, where $\pi_{B_i}$ are baseline and heuristic strategies \\
\FOR{epoch $e = 1, 2, \ldots$}
\STATE Solve for $(\tilde{\pi}_e, \tilde{\omega}_e)$, mixed Nash equilibrium of regret game with strategy sets $\Omega_{e-1}$ and $\Pi_{e-1}$ \\
\STATE $\pi_e = \textsc{RMABPPO}(\tilde{\omega}_e)$ \\
\STATE $\omega_e = \textsc{MA-RMABPPO}(\tilde{\pi}_e)$ \\
\STATE $\Omega_e = \Omega_{e-1} \cup \{\omega_e\}, \Pi_e = \Pi_{e-1} \cup \{\pi_e\}$
\IF{$L(\tilde{\pi}_e, \omega_e) - L(\tilde{\pi}_{e-1}, \tilde{\omega}_{e-1}) \leq \varepsilon$ and $L(\pi_e, \tilde{\omega}_e) - L(\tilde{\pi}_{e-1}, \tilde{\omega}_{e-1}) \leq \varepsilon$}
\STATE \textbf{break}
\ENDIF
\ENDFOR
\STATE \textbf{return} $\tilde{\pi}_e$
% \STATE \textbf{return} $\tilde{\pi}_e$
\end{algorithmic}
\end{algorithm}



\subsection{Minimax Regret-Robust RMAB Double Oracle}
We now have all the pieces we need to present our robust algorithm RMAB Double Oracle (RMABDO), visualized in Fig.~\ref{fig:concept}(a), with pseudocode presented in Algorithm~\ref{alg:full-alg}, adapted from the MIRROR framework \cite{xu2021robust}. 
%We cast the robust planning problem as a zero-sum game between an agent (who plans actions to take on the arms) and nature (who sets worst-case instantiations of the environment parameters within the uncertainty set). 
We use RMABPPO to instantiate the agent oracle and MA-RMABPPO for the nature oracle.
%The agent oracle, RMABPPO, is given above in Section~\ref{sec:rl-rmab}. The nature oracle, MA-RMABPPO, requires the greatest innovation, as we provide in Section~\ref{sec:marl-rmab}. 
The double oracle approach proceeds as follows. 
%Each agent maintains a finite set of pure strategies, $\Pi$ for the agent and $\Omega$ for nature. Given each player's strategy set, we first compute a Nash equilibrium over the sets, giving an optimal mixed strategy for each player to play against its opponent. Next, each oracle is queried to produce a new pure strategy as a best response against the opponent's current mixed strategy. If each best response strategy is already in the players' strategy sets, then we terminate, having provably converged. Else we continue iterating.
The agent maintains strategy set $\Pi$, initialized with baseline and heuristic strategies, and nature maintains strategy set $\Omega$, initialized with an arbitrary parameter setting. In each iteration, we solve for a mixed Nash equilibrium in the regret game between the agent and nature to learn a mixed strategy ($\tilde{\pi}, \tilde{\omega}$) for each player. We then call the agent and nature oracles to compute best responses $\pi$ and $\omega$ to their opponent's strategy, which get added to their respective strategy sets $\Pi$ and $\Omega$. We repeat this process until the improvement in value for each player is within the tolerance~$\varepsilon$ or until a set number of iterations is reached.

\begin{restatable}[]{proposition}{rmabdoConvergenceProposition}
\label{thm:rmabdo_convergence}
RMABDO converges in a finite number of steps to within $\varepsilon$ value of the minimax regret-optimal policy.
% Any non-robust reward-maximizing approach can achieve arbitrarily bad performance when evaluated in terms of regret.
\end{restatable}

\noindent Additionally, we show that a policy that maximizes reward assuming a fixed parameter set can incur arbitrarily large regret when the parameters are changed (proofs in appendix). 

\begin{restatable}[]{proposition}{regretProposition}
\label{thm:regret}
In the Robust RMAB problem with interval uncertainty, the max regret of a reward-maximizing policy can be arbitrarily large compared to a minimax regret-optimal policy.
% Any non-robust reward-maximizing approach can achieve arbitrarily bad performance when evaluated in terms of regret.
\end{restatable}


% \subsection{Agent oracle}

% \subsubsection{Description}


% \subsubsection{Theory}



% \subsection{Nature oracle}

% \subsubsection{Description}

% MARL approach to solving nature oracle

% Motivation for using a policy gradient--based RL algorithm for the agent oracle is that we can build upon it to enable the nature oracle to differentiate directly through to optimize the environment parameters.

% \subsection{Combined double oracle}




% \begin{theorem}
% \label{thm:convergence}
% finite convergence? or stronger guarantee of convergence?
% \end{theorem}

% \begin{proof}[Proof sketch]
% TODO Lily: proof sketch here. chexck if it's just all the same as UAI result
% \end{proof}

% \begin{theorem}
% \label{thm:pure-strategy}
% TODO Arpita:

% ultimately reducing to a pure strategy from the mixed strategy: 
% sampling a pure strategy from mixed strategy is lower complexity than solving a pure strategy (and pure strategy mixed Nash equilibrium may not even exist). additionally, if there are any pure strategies that have probability 0 in the mixed strategy, then we know an optimal pure strategy exists without considering those strategies with probability 0

% potentially make a hardness claim for pure strategies? why we can't work with pure strategies
% \end{theorem}



% The main challenge in implementing the approach will be in adjusting the MADDPG algorithm to interact with the RL approach that we develop for solving the RMAB problem. For instance, if we went even with the simple approach proposed in section \ref{section:approach_agent_oracle}, we would have to adjust MADDPG to handle the fact that we need to sample longer-term reward trajectories, rather than one-step rewards. \textbf{One key question -- how did Xu et al., define rewards using their DDPG approach to solve for $\pi^*$ and $z$ simultaneously? How to define a one-step reward that involves estimating the outcome of $\pi^\prime$?}


% Firstly, computing $\pi^*_{\bm{\theta}}$ is PSPACE-Hard in general, so even evaluating $L$ is difficult in practice. One possible approach is to consider the binary-action BRMAB case, and replace $\pi^*_{\bm{\theta}}$ with $\pi^W_{\bm{\theta}}$ where $\pi^W_{\bm{\theta}}$ is the \textit{Whittle index} policy and is known to be asymptotically optimal under the technical condition \textit{indexability}. General algorithms that are polynomial in $N$ exist for computing the Whittle index policy to arbitrary precision. This would give us a tractable method for computing $L$. In the Multi-action setting, an alternative option would be to use the techniques from \cite{killian2021beyond} to compute $\pi^{La}_{\bm{\theta}}$ where $La$ denotes the \textit{Lagrange policy} (see \cite{killian2021beyond}) and is not known to have guarantees, but has good performance. Another option would be to use reinforcement learning (RL) to compute $\pi^*_{\bm{\theta}}$ directly. However, doing this will be challenging because of the exponentially large state and action spaces over which the RL algorithm would have to learn. Another option would be to develop some novel RL techniques for computing $\pi^{La}_{\bm{\theta}}$, which has a more tractable state space (not clear if this would be more efficient than using existing techniques for solving for $\pi^{La}_{\bm{\theta}}$).


% Now, the planner must take decisions for all arms jointly, subject to two constraints each round: (1)~select one action for each arm and (2)~the sum of action costs over all arms must not exceed a given budget $B$. Formally, the planner must choose a decision matrix $\bm{A} \in \{0,1\}^{N\times M}$ such that:
% \begin{align}
%     &\sum_{j=1}^{M} \bm{A}_{ij} = 1 \hspace{3mm} \forall i \in [N] \label{eq:single_action_constraint} \\
%     &\sum_{i=1}^{N}\sum_{j=1}^{M} \bm{A}_{ij}c_j \le B \label{eq:budget_constraint}
% \end{align}
% Let $\overline{\mathcal{A}}$ be the set of decision matrices respecting constraints \ref{eq:single_action_constraint} and \ref{eq:budget_constraint} and let $\bm{s} = (s^1, ..., s^{N})$ represents the initial state of each arm. The planner's goal is to maximize the total discounted reward of all arms over time, subject to constraints \ref{eq:single_action_constraint} and \ref{eq:budget_constraint}, as given by the constrained Bellman equation:
% \begin{equation}
% \begin{aligned}\label{eq:combined_value_function}
%     J(\bm{s}) = \max_{\bm{A}\in \overline{\mathcal{A}}}\left\{\sum_{i=1}^{N} r^i(s^i) + \beta E[J(\bm{s}^\prime) | \bm{s}, \bm{A}]\right\}
% \end{aligned}
% \end{equation}
% However, this corresponds to an optimization problem with exponentially many states and combinatorially many actions, making it PSPACE-Hard to solve directly \cite{papadimitriou1999complexity}. To circumvent this, we take the Lagrangian relaxation of constraint \ref{eq:budget_constraint}

\section{Experimental Evaluation}
\label{sec:experiments}
We first experimentally demonstrate the importance of robust planning in the presence of uncertainty using a hand-crafted synthetic domain (inspired by Prop.~\ref{thm:regret}). We then evaluate our algorithm on two challenging real-world-inspired public health planning scenarios which demonstrate the new real-world capability of the robust RMAB framework. We compare our algorithm against five baselines. Namely, we compare against three variations of the approach from \citet{hawkins2003langrangian}, which computes a reward-maximizing Lagrange policy for each step of a multi-action RMAB problem for fixed model parameters. The three variations are pessimistic (\textbf{HP}), mean (\textbf{HM}), and optimistic (\textbf{HO}), which assume the model parameters are set to the lower bound, mean, and upper bound of the intervals, respectively. We also implement \textbf{RLvMid}, which learns a policy via RMABPPO assuming \textit{mean} parameters, and \textbf{Rand}, which acts randomly within budget. All results are averaged over 50 random seeds and were executed on a cluster running CentOS with Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.1 GHz with 8GB of RAM using Python 3.7.10. Our RMABPPO implementation builds on OpenAI Spinning Up~\cite{SpinningUp2018} and RMABDO builds on the MIRROR implementation~\cite{xu2021robust}, computing Nash equilibria using Nashpy 0.0.21~\cite{Knight2018}. Code is available in the supplement and hyperparameter settings are in the appendix.
% We show that RMABPPO provides an effective RL solution to solving RMABs, particularly as problem sizes increase, and that MA-RMABPPO offers max regret--minimizing solutions with reasonable runtime for realistic problems. 
The experimental domains are as follows. 

\textbf{Synthetic} demonstrates that reward-maximizing policies (RLvMid, HP, HM, HO)
%corresponding to either pessimistic (\textbf{HP}), mean (\textbf{HM}), or optimistic (\textbf{HO}) 
may incur large regret in the presence of uncertainty. There are three binary-action arm types $\{U,V,W\}$, each with $\mathcal{C} = \{0, 1\}$, $\mathcal{S}=\{0,1\}$, $R(s)=s$, and the following transition matrix, with rows and columns corresponding to actions and next states, respectively:
\[T^n_{s=0}=
\begin{bmatrix}
    0.5  &  0.5 \\
    0.5  &  0.5
\end{bmatrix}, \hspace{2mm}
T^n_{s=1}=
\begin{bmatrix}
    1.0  &  0.0 \\
    1-p_n  &  p_n
\end{bmatrix}
\]
\[
p_U \in [0.00, 1.00],\hspace{0.5mm}
    p_V \in [0.05, 0.90],\hspace{0.5mm}
    p_W \in [0.10, 0.95]
% \begin{matrix}
%     p_U \in [0.00, 1.00] \\
%     p_V \in [0.05, 0.90] \\
%     p_W \in [0.10, 0.95]
% \end{matrix} \ .
\]
When an arm is at $s=0$, each action has equal impact on the state transition. When the arms are at $s=1$, selecting arms with high $p_n$ is optimal. This implies that policies can be specified by the order in which arms would be acted on, when they are in state $s=1$. Accordingly, $\pi_\textit{HP} = [W,V,U]$, $\pi_\textit{HM} = [W,U,V]$, and $\pi_\textit{HO} = [U,W,V]$. However, observe that there exist values of $p_n$ that can make each of the reward-maximizing policies incur large regret, e.g., $[p_U, p_V, p_W]=[0.0, 0.9, 0.1]$ for $\pi_\textit{HM}$, which would induce an optimal policy $[V,W,U]$ that acts first on the arm that $\pi_\textit{HM}$ ranks last. 

\textbf{ARMMAN} is a real-world \emph{maternal healthcare intervention problem} modeled as a binary-action RMAB~\cite{biswas2021learn}. The objective is to select a subset of mothers each week to intervene on with tailored maternal health messaging to encourage engagement. The behavior of each enrolled woman is modeled by an MDP with three states: Self-motivated, Persuadable, and Lost Cause. We use the summary statistics mentioned in their paper and assume uncertainty intervals of $0.5$ centered around the transition parameters, resulting in 6 uncertain parameters per arm (details in appendix). Similar to the setup by \citet{biswas2021learn}, we assume a 1:1:3 split of arms with high, medium, and low probability of increasing their engagement upon intervention. In our experiments, we scale the value of $N$ in multiples of $5$ to keep the same split of arm categories of 1:1:3. 


\begin{figure*}[t]
    \centering
    \includegraphics[width=0.95\textwidth]{img/all_experiments.pdf}
    \caption{\textbf{(a--f)} Maximum policy regret in robust setting for Synthetic (a,b), ARMMAN (c,d) and SIS (e,f) domains. Lower is better. Synthetic is scaled by 3 and ARMMAN by 5 to maintain the distributions of arm types specified in Section~\ref{sec:experiments}. (e) uses $S=50$ and (f) uses $N=5,B=4$. RMABDO beats all baselines by a large margin across various parameter settings. \textbf{(g--l)}~Policy returns for reward-maximizing setting (agent oracle) for Synthetic (g,h), ARMMAN (i,j), and SIS (k,l) domains. Higher is better. (k) uses $S=50$ and (l) uses $N=5,B=4$. RMABPPO is competitive across parameter settings.}
    \label{fig:all_experiments}
\end{figure*}

\begin{figure}[t]
    \centering
    \includegraphics[width=0.58\linewidth]{img/hawkins_slow_runtime_sis.pdf}
 \caption{The poor scaling of query time of the Hawkins baseline compared to RMABPPO, as discussed in Section~\ref{sec:experiments}, even for relatively small problem sizes ($N = 10, B = 2$).}
    \label{fig:hawkins_bad_runtime}
\end{figure}

\textbf{SIS Epidemic Model} is a discrete-state model in which arms represent distinct geographic regions and each member of an arm's population of size $N_{\textit{p}}$ is either (\textbf{S})usceptible to or (\textbf{I})nfected with an infectious disease. Such models have been the subject of increased interest following the COVID-19 pandemic \cite{hinch2021openabm,kerr2021covasim}, and will represent a large-state and multi-action experimental domain. In our model, the count of \textbf{S} members is the state of each arm. Each arm's SIS model is defined by parameters $\lambda_{\textit{c}}$, the average number of contacts per round, and $r_{\textit{infect}}$, the probability of infection given contact with an \textbf{I} member. Details on computing discrete state transition probabilities from these parameters are derived from \citet{yaesoubi2011generalized} and given in the appendix. We introduce three intervention actions $\{a_0, a_1, a_2\}$ with costs $c=\{0, 1, 2\}$. Action $a_0$ represents no action, $a_1$ represents messaging about physical distancing (divides $\lambda_{\textit{c}}$ by $a^{\textit{eff}}_1$), and $a_2$ represents distributing face masks (divides $r_{\textit{infect}}$ by $a^{\textit{eff}}_2$). We impose the following uncertainty intervals: $\lambda_{\textit{c}} \in [1, 10]$, $r_{\textit{infect}} \in [0.5, 0.99]$, $a^{\textit{eff}}_{\{1,2\}} \in [1, 10]$.



% \noindent\textbf{Robust Double Oracle}
% \label{sec:experiments-do}





% \begin{figure}[t]
%     \centering
%     \begin{subfigure}[t]{0.325\linewidth}
%         \centering
%         \includegraphics[width=\textwidth]{example-image-a}
%         \caption{Counterexample domain, varying $n$}
%         \label{fig:counterexample-n}
%     \end{subfigure}
%     \hfill
%     \begin{subfigure}[t]{0.325\linewidth}
%         \centering
%         \includegraphics[width=\textwidth]{example-image-b}
%         \caption{ARMMAN domain, varying $n$}
%         \label{fig:armman-n}
%     \end{subfigure}
%     \hfill
%     \begin{subfigure}[t]{0.325\linewidth}
%         \centering
%         \includegraphics[width=\linewidth]{example-image-c}
%         \vfill
%         \caption{Domain C, varying $n$}
%         \label{fig:van-n}
%     \end{subfigure} \\
    
%     \begin{subfigure}[t]{0.325\linewidth}
%         \centering
%         \includegraphics[width=\textwidth]{example-image-a}
%         \caption{Counterexample domain, varying budget}
%         \label{fig:counterexample-budget}
%     \end{subfigure}
%     \hfill
%     \begin{subfigure}[t]{0.325\linewidth}
%         \centering
%         \includegraphics[width=\textwidth]{example-image-b}
%         \caption{ARMMAN domain, varying budget}
%         \label{fig:armman-budget}
%     \end{subfigure}
%     \hfill
%     \begin{subfigure}[t]{0.325\linewidth}
%         \centering
%         \includegraphics[width=\linewidth]{example-image-c}
%         \vfill
%         \caption{Domain C, varying budget}
%         \label{fig:van-budget}
%     \end{subfigure}
%     \caption{Performance of algorithms across all settings, evaluated in terms of regret}
%     \label{fig:performance}
% \end{figure}
\paragraph{Robust RMAB}
First we evaluate the performance of the algorithms in uncertain environments. We compute the regret of an agent's pure strategy $\pi$ against a nature pure strategy $\omega$ as the difference between the average reward obtained by the agent's strategy on $\omega$ and the average reward of the best strategy in the experiment against $\omega$. The average reward is the discounted sum of rewards over all arms for a horizon of length $10$, over $25$ simulations. In each setting, double oracle runs for $6$ iterations, using $100$ rollout steps and $100$ training epochs for each oracle. After completion, each baseline strategy is evaluated by querying the nature oracle for the best response against that strategy, then computing max regret against all $\omega$. The regret of RMABDO is computed as the utility of the agent mixed strategy returned by the double oracle over the two-player regret game.
% Hawkins is an online algorithm; converting it to offline would take exponential space in our problem setting

Fig.~\ref{fig:all_experiments}(a--f) shows RMABDO has the lowest regret and beats the baselines in all domains. Panels (a,b) show results on the synthetic domain, demonstrating that our approach can reduce regret by \char`\~$50\%$ against existing benchmarks, across various values of $N$ and $B$.
% This is expected because the domain is designed to ensure large regret for HP, HM, and HO baselines. 
% Here, a benefit of $50\%$ in the regret corresponds to, in the worst case, keeping $1/3$ of the arms in the good state for an extra round compared to the baseline. 
Moreover, as $B$ increases, the regret incurred may increase, since higher budget implies better reward potential for the optimal policy; however, the regret for RMABDO remains small even with an increasing $B$. 
Similarly, for the ARMMAN domain (c,d), a challenging domain adapted from a real-world dataset, our algorithm performs consistently better than the baselines, achieving regret that is around $50\%$ lower than the best baselines. In the SIS domain (e,f), another real-world planning setting with a larger state space and multiple actions, our results are robust across parameter settings. Importantly, this holds even as we increase the state space from $100$ to $500$ (f), in which running the Hawkins baseline becomes prohibitively intensive.

\emph{Finally, we run sensitivity analyses of the algorithms against $H$ and the size of the uncertainty sets.} As expected, when $H$ varies from 10 to 100, RMABDO maintains very low regret, while competitor regret as much as doubles, increasing RMABDO's relative improvement to as high as \char`\~60\%. Similar results are obtained when varying the uncertainty intervals between 0.25, 0.5 and 1.0 times their widths from the experiments in Fig.~\ref{fig:all_experiments}, with RMABDO always dominating. The full sensitivity analysis is given in the appendix.

\paragraph{RMABPPO} We also evaluate the performance of RMABPPO, our novel DRL approach to find reward-maximizing policies for multi-action RMABs, which implements our agent oracle. We compare against \textbf{No Action} and \textbf{Random} baselines as well as the computationally intensive solution by Hawkins which computes the Lagrange policy, but which requires exact model parameters and discrete states/actions. Hawkins upper bounds RMABPPO for small discrete problems since it is exact whereas RMABPPO learns the Lagrange policy from samples. Each experiment is a traditional reward-maximizing RMAB instantiated with a random sample of valid parameter settings for each seed. Fig.~\ref{fig:all_experiments}(g--l) shows the reward of RMABPPO is comparable to the Hawkins algorithm and significantly better than random, providing insight into the success of our RMABDO approach which RMABPPO enables, and showing promise for RMABPPO as an algorithm of general interest. In the synthetic domain (g,h), RMABPPO learns to act on the $33\%$ of arms that belong to category $W$. The mean reward of RMABPPO almost matches that of the Hawkins algorithm as $N$ scales with a commensurate budget (g). As we fix $N$ and vary the budget (h), the optimal policy accumulates more reward, and RMABPPO almost equals the optimal. We observe similar results on the ARMMAN domain (i,j), where it is optimal to act on $20\%$ of arms (of type $A$; see appendix). On the SIS domain (k,l), the strong performance of RMABPPO holds in a multi-action setting even as we increase the number of states from 50 to 500 (l). %The bottom-right plot shows that RMABPPO achieves reasonable reward. %Additional results for larger settings are included in the appendix.
Moreover, RMABPPO beats Hawkins computationally: in Fig.~\ref{fig:hawkins_bad_runtime}, a single rollout ($10$ rounds) of Hawkins takes \char`\~$100$ seconds when there are $500$ states, scaling quadratically in general. This demonstrates that it would be prohibitive to run Hawkins in the loop of RMABDO, since agent policies are evaluated thousands of times to compute the regret matrices. For just $25$ simulations, computation would take \char`\~$42$ minutes to evaluate one cell in the regret matrix, where the matrix has size $|\Pi| \times |\Omega|$. %The key computational bottleneck with Hawkins is that it requires solving a linear program for the current state profile, that would subsequently change in the next timestep. Additional results for larger settings are included in the appendix. 

% \paragraph{Limitations}
% \label{sec:limitations}
% We believe these advancements have the potential to improve resource allocation in low-resource settings, but acknowledge they are not without tradeoffs. For example, the baseline methods we compare against, while less robust, can provide interpretable `index' policies that capture the value for acting on an arm, whereas our solution's output can be difficult for a domain expert to interpret. Further, such optimization tools have the risk of amplifying underlying biases in the data and translating that to unfair resource allocation. However, by addressing the robust version of the problem, we directly address this concern by providing a flexible tool for \textit{mitigating} biases, by allowing users to tune their uncertainties and thus, providing natural ways to develop good policies even when data availability is skewed.
%Baselines: combined RL, Hawkins, random, no action. 
% some myopic strategy?

%evaluate against Hawkins here; we might not beat Hawkins w.r.t. reward but we should beat w.r.t. regret. perhaps all we need is to show that our computation speed is better than Hawkins even if performance is not better. (but Milind also said that we don't want to worry about the RL vs. MIP debate; we shouldn't have to worry about having to justify our decision to use RL)



\section{Conclusion}
% We make a key advancement in Restless Bandit modeling by introducing the robust setting and providing robust planning tools for the common real-world scenario where available data or expert knowledge is limited. To enable our approach, we develop a novel deep RL framework for learning RMAB policies, RMABPPO, which demonstrates promising performance independent of our robust policy framework. While we believe these advancements have the potential to improve resource allocation in low-resource settings, they are not without tradeoffs --- for example the baseline methods we compare against, while less robust, can provide interpretable `index' policies that capture the value for acting on an arm, whereas our solution's output is not calibrated in such a way and would be difficult for a domain expert to interpret. Further, such optimization tools have the risk of amplifying underlying biases in the data and translating that to unfair resource allocation. However, by addressing the robust version of the problem, we directly address this concern by providing a flexible tool for \textit{mitigating} biases, by allowing users to tune their uncertainties and thus natural ways to develop good policies even when data availability is skewed. Ultimately, we hope these contributions bring us a step in the direction of deploying restless bandits in the real world for positive and robust impact.
We address a key limitation blocking RMABs from use in most real-world planning problems: that environment dynamics are not known precisely. To plan effective and safe policies, it is essential to take a robust approach to account for model uncertainty, which we provide in RMABDO, enabled by RMABPPO, a novel DRL algorithm for RMABs that is of general interest. We hope these contributions bring us closer to deploying RMABs for real-world impact.

\bibliography{9_bibliography}

% \section{Acknowledgments}

\appendix
\input{2_appendix}

\end{document}
