\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent



\usepackage[american]{babel}
% \usepackage[british]{babel}

% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%packages from NeurIPS paper
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\def\UrlBreaks{\do\/\do-}
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{bm}
% \usepackage{subfig}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{thmtools}
% \usepackage{thm-restate}
\usepackage{enumitem}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
\usepackage{mathtools}
\usepackage{xr-hyper}
\usepackage{hyperref}       % hyperlinks
\usepackage{etoolbox}
\robustify\setcounter
\robustify\addtocounter
\robustify\setlength
\robustify\addtolength


% Theorem-like environments (amsthm).
% `lemma' and `example' take [theorem] as their optional argument and are
% therefore numbered on the same counter as `theorem'; every other
% environment below gets its own independent counter.
% All environments up to `proposition' are declared before the
% \theoremstyle{definition} switch, so only `definition' uses that style.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{example}[theorem]{Example}
\newtheorem{cond}{Condition}
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}
% From here on, newly declared environments use the `definition' style.
\theoremstyle{definition}
\newtheorem{definition}{Definition}%[section]

% Appendix-style numbering for this supplementary document:
% sections are lettered (A, B, C, ...), and figures, tables and algorithms
% are printed with an "A" prefix (A1, A2, ...).
\renewcommand\thesection{\Alph{section}}
\renewcommand{\thefigure}{A\arabic{figure}}
\renewcommand{\thetable}{A\arabic{table}}
\renewcommand{\thealgorithm}{A\arabic{algorithm}}

% Style of \COMMENT{...} inside algorithmic environments: pushed to the right
% margin, blue, italic, and prefixed with "//".
% \textcolor is used rather than \color because \color is a declaration that
% stays in force until the end of the enclosing group; braces placed after it
% do not limit its scope, so the colour would spill onto every subsequent
% line of the algorithm.
\renewcommand{\algorithmiccomment}[1]{\hfill\textcolor{blue}{\textit{// #1}}}

% \addto\languageXYZ{\renewcommand\proofname{Proof Sketch}}
% \renewcommand*{\proofname}{Proof Sketch}


% \newcommand\jk[1]{\textcolor{blue}{#1}}
% \newcommand\lx[1]{\textcolor{olive}{[Lily: #1]}}


% In your preamble

\makeatletter
% \addFileDependency{<file>}
%   Registers <file> as a dependency of this document:
%     1. writes "(<file>)" to the terminal/log,
%     2. appends <file> to the list printed by \listfiles,
%     3. writes "No file <file>." if the file cannot be found.
%   NOTE(review): presumably the log lines let build tools that scan the log
%   pick up the dependency -- confirm against the build setup.
%   Line ends inside the definition are commented out so that the macro does
%   not insert space tokens when it is expanded.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myappendixexternaldocument{<basename>}
%   Makes the labels of the external document <basename> available to
%   \ref in this file (via \externaldocument) and registers both
%   <basename>.tex and <basename>.aux as file dependencies.
%   <basename> is given without extension.
%   Line ends inside the definition are commented out so that the macro does
%   not insert space tokens when it is expanded.
\newcommand*{\myappendixexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% put all the external documents here!
\myappendixexternaldocument{killian_313}

% just to see what's happening
\listfiles


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<first>}{<second>} typesets <second><sep><first>, i.e. the two
% mandatory arguments in reversed order; <sep> is optional and defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \title{Your Bandit Model is Not Perfect: Introducing Robustness to Restless Bandits Enabled by Deep Reinforcement Learning}
\title{Restless and Uncertain: Robust Policies for Restless Bandits \\ via Deep Multi-Agent Reinforcement Learning (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jkillian@g.harvard.edu>?Subject=Your UAI 2022 paper}{Jackson~A.~Killian}{}}
\author[1]{\href{mailto:<lily_xu@g.harvard.edu>?Subject=Your UAI 2022 paper}{Lily~Xu}{}}
\author[1,2]{\href{mailto:<arpitabiswas@seas.harvard.edu>?Subject=Your UAI 2022 paper}{Arpita~Biswas}{}}
\author[1,2]{\href{mailto:<milind_tambe@harvard.edu>?Subject=Your UAI 2022 paper}{Milind~Tambe}{}}
% Add affiliations after the authors
\affil[1]{%
    Computer Science,
    Harvard University,
    Cambridge, MA, USA
}
\affil[2]{%
    Center for Research on Computation and Society,
    Harvard University,
    Cambridge, MA, USA
}
  
\begin{document}
\maketitle


\section{Proofs}
\label{sec:appendix:proofs}

\subsection{Proof of Proposition~\ref{thm:lambda_update}}
% \lambdaUpdate*
\begin{proposition}
To learn the value $\lambda$ that minimizes Eq.~\ref{eq:decoupled_value_func} given a state $\bm{s}$, the $\lambda$-network, parameterized by $\Lambda$, should be updated with the following gradient rule:
%A gradient rule for updating the $\lambda$-network, parameterized by $\Lambda$, such that for a state $\bm{s}$, the $\lambda$-network predicts the value $\lambda$ that minimizes Eq.~\ref{eq:decoupled_value_func} is as follows:
\begin{equation}
\begin{aligned}
    \Lambda_t = \Lambda_{t-1} - \alpha \left( \frac{B}{1-\beta} + \sum_{n=1}^{N}D_n(s_n, \lambda_{t-1}(\bm{s})) \right) 
\end{aligned}
\end{equation}
where $\alpha$ is the learning rate and $D_n(s_n, \lambda)$ is the negative of the expected $\beta$-discounted sum of action costs for arm $n$ starting at state $s_n$ under the optimal policy for arm $n$ for a given value of $\lambda$.
\end{proposition}

\begin{proof}
The gradient update rule is derived by taking the gradient of Eq.~\ref{eq:decoupled_value_func} with respect to $\lambda$, which has two main terms, $\lambda B / (1-\beta)$, and the sum over $Q_n$, the Q-functions with respect to $\lambda$. Looking more closely at $Q_n$, the only terms which are a function of $\lambda$ are the costs of actions taken by the policy that $Q_n$ implies, i.e., terms $-\lambda c_j$. Thus, the gradient of $Q_n$ is the negative expected discounted sum of costs taken by the optimal policy at the given value of $\lambda$, i.e., $\frac{dQ_n}{d\lambda} = -\mathbb{E}[\sum_{t=0}^{H} \beta^t c_{n,t}]$, where $c_{n,t}$ is the cost of the action taken on arm $n$ in round $t$.
\end{proof}

\subsection{Proof of Proposition~\ref{thm:lambda_convergence}}
\begin{proposition}
Given arm policies corresponding to optimal $Q$-functions, 
% the gradient update rule of 
Prop.~\ref{thm:lambda_update} will lead $\Lambda$ to converge to the optimum as the number of training epochs and $K \to \infty$.
\end{proposition}

\begin{proof}
Eq.~\ref{eq:decoupled_value_func} is convex in $\lambda$, which follows from the definition of $Q_n$, i.e., the max over piecewise-linear functions of $\lambda$ is also a convex function of $\lambda$. Thus the learning task of $\Lambda$ is also convex. Therefore, all that is required for asymptotic convergence of $\Lambda$ is that (1) the gradients we estimate via Prop.~\ref{thm:lambda_update} are accurate, and that (2) all inputs, i.e., all states $\bm{s}$, are seen infinitely often in the limit. (1) is achieved by the assumption that optimal $Q$-functions are given, an analytic condition that is achieved in practice by allowing the arm-networks to train for a reasonable number of rounds under a given output of the $\lambda$-network, before updating $\Lambda$. Specifically, given optimal $Q$-functions and their corresponding optimal policies, the sampled sums of spent budget from those optimal policies represent an unbiased estimator of each $D_n$. Note, though, that to be an \textit{unbiased} estimator, this relies on not imposing the budget constraint \textit{at training time}, a procedure we carry out in practice.\footnote{It is critical to note that at test time, \textit{we always impose the budget constraint} --- i.e., all of our methods solve the original constrained RMAB problem --- they only use the Lagrangian relaxation as a tool to find good policies for the original constrained problem.} Thus (1) is achieved. (2) is achieved by following a training procedure that uniformly randomly samples start states $\bm{s}$ for each round of training until convergence. Thus the proposition is established.
\end{proof}

\subsection{Proof of Proposition~\ref{thm:rrdpo_convergence}}
\begin{proposition}
RR-DPO converges in a finite number of steps to the minimax regret-optimal policy.
\end{proposition}



\begin{proof}
A common strategy for establishing optimal convergence of the double oracle is to show that the pure strategy sets of both players can be exhausted. We can achieve this in our setting under the conditions (1) that each player has a finite strategy set, i.e., one that can be exhausted, and (2) that each oracle gives an optimal best response. Since the agent pure strategy set is already finite, we can achieve (1) by discretizing the nature oracle---in effect by rounding the outputs of the policy network. For (2), for analytical purposes, we make the common assumption that our oracles internally converge to their optimal values, i.e., in our case, the arm-networks and $\lambda$-network converge optimally. However, since our networks learn the Lagrange-relaxed version of the problem, some additional tools are needed. Specifically, we must identify conditions in which DDLPO-Act gives policies which approach $\pi^*_\omega$.
% Specifically, to obtain a policy at test time that respects the original budget constraint, Q-values from the arm-networks are passed to the knapsack procedure (Sec.~\ref{sec:appendix:knapsack}). So to establish an optimal oracle, we also must identify conditions in which the knapsack procedure over the learned $Q(\lambda)$ arm networks goes to $\pi^*_\omega$. 
This can be achieved in the binary-action setting with $\alpha=$ `Whittle', which uses a binary search procedure to identify a value of $\lambda$ such that exactly $B$ arms have $Q_n(a=1,\lambda) > Q_n(a=0,\lambda)$, then acting on those arms. This procedure is equivalent to the Whittle index policy, which is asymptotically optimal for binary-action RMABs \citep{weber1990index}.
%\footnote{Note that this represents a slight modification to how the networks learned by DDLPO and MA-DDLPO are run at test time. In practice, we compute $\lambda$ via the $\lambda$-network rather than following a binary search procedure, the latter of which is less efficient, but polynomial time. Our analysis requires this slight modification since, even assuming (2), the $\lambda$-network may not give a value $\lambda$ which, when combined with the knapsack procedure, replicates the Whittle index policy; the converged $\lambda$-network will give $\lambda$ values which solve Eq.~\ref{eq:decoupled_value_func}, i.e., which induce $Q_n(\lambda)$-values that spend $\frac{B}{1-\beta}$ in expectation over the horizon, which can, in general, be different than the $\lambda$ required to induce the Whittle index policy. However, as shown in experiments, we find using the $\lambda$ that solves Eq.~\ref{eq:decoupled_value_func} via the learned $\lambda$-network performs well in binary-action settings.} 
%Thus, optimal convergence in finite iterations is established under the above conditions of a discretized nature, and binary-action RMAB.
\end{proof}
% This follows from Thm.~2 of \citet{xu2021robust}), which guarantees the same if the problem satisfies the conditions of a continuous game. It can be seen that the RR-DPO nature oracle satisfies these conditions since regret is a continuous function of the nature oracle's strategy space, i.e., the environment parameters. In the case of a continuous-action agent oracle, the same holds and the proof is complete. In the case where the action space is discrete for the agent oracle, the strategy space is exponentially large but finite, and so, in the worst case, the agent oracle is guaranteed to eventually exhaust all pure strategies, after which the same results for the continuous game would hold.

\subsection{Proof of Proposition~\ref{thm:regret}}
\begin{proposition}
In the Robust RMAB problem with interval uncertainty, the max regret of a reward-maximizing policy can be arbitrarily large compared to a minimax regret-optimal policy.
% Any non-robust reward-maximizing approach can achieve arbitrarily bad performance when evaluated in terms of regret.
\end{proposition}

\begin{proof}
Consider a binary-action RMAB problem with two arms A and~B. Let the reward from each arm be $R$ when the arm is in a \textit{good} state and $0$ in a \textit{bad} state. Our problem is to plan the best action with a budget of $1$ and horizon of $1$. Supposing the initial state is \textit{bad} for each arm, the transition matrix for each arm~$n$ is
\begin{footnotesize}
$\begin{bmatrix} 1 & 0 \\ 1 - p_n & p_n \end{bmatrix}$\end{footnotesize}
where the uncertain parameters satisfy $p_A, p_B \in [0, 1]$. Each value in the matrix corresponds to the probability of an arm at state \textit{bad} transitioning to \textit{bad} (column 1) or \textit{good} (column 2) if we take the \textit{passive} (row 1) or \textit{active} action (row 2). 


To compute a reward-maximizing policy that does not consider robustness to uncertainty, we must optimize for one instantiation of the uncertainty set, which requires making one of three assumptions.
\begin{itemize}
    \item \textit{Case~1:} If we assume $p_A = p_B$, then an optimal policy is to act with probability $a_A$ on arm A and $a_B$ on arm B as long as $a_A + a_B = 1$. W.l.o.g., suppose $a_A \geq a_B$; then nature would set $p_A = 0$ and $p_B = 1$, imposing regret at least $R/2$. 
    \item \textit{Case~2:} If $p_A > p_B$, then the optimal policy would be to always act on arm A with probability $a_A = 1$ and never act on B ($a_B = 0$). Nature would then set $p_A = 0$ and $p_B = 1$ to impose regret $R$. 
    \item \textit{Case~3:} If $p_A < p_B$, the case is symmetric to Case~2 and results in regret $R$. Clearly, max regret is minimized when our action is such that $a_A + a_B = 1$; in this setting, we learn this optimal policy only under Case~1. Following Case 2 or~3, the difference between our regret and the minimax regret is $R/2$, which grows arbitrarily higher as $R \to \infty$.
\end{itemize}

A slight modification to this problem renders Case~1 non-optimal. Let the reward be $R$ when arm A is in a \textit{good} state and $R-1$ for arm B, so the optimal policy learned under the assumption from Case~1 leads to $a_A = 1$ and $a_B = 0$. Then nature could respond with $p_A = 0$ and $p_B = 1$, yielding reward $0$ and regret $R-1$, while the minimax regret--optimal policy achieves a minimum reward of $(R-1)/2$ (by playing $a_A = 0.5$ and $a_B = 0.5$ where nature responds with $p_A = 0$ and $p_B = 1$). Thus, the gap again can grow arbitrarily high as $R \to \infty$ provided that $R > 1$. We therefore have that in all cases, any reward-maximizing policy can achieve arbitrarily bad performance in terms of regret.
%In all three cases, the regret incurred grows arbitrarily high as $R \to \infty$. \lx{what is minimax regret here? want to show regret of the reward-maximizing agent can be arbitrarily worse than the regret of the minimax regret--optimizing agent. show everything will be non-optimal except for assuming $p_A = p_B$, and then for that case we could change the reward to make regret high}
\end{proof}


\section{DDLPO-Act subroutines }
\label{sec:appendix:knapsack}
Here we provide the integer program which implements \texttt{QKnapsack}, one of the action-selection procedures used in Alg.~\ref{alg:ddlpo-act} to take actions at test time. \texttt{QKnapsack} takes $\lambda$ and $Q_n(s,a,\lambda)$ from the learned $\lambda$-network and arm networks, respectively, and returns the combination of actions that maximizes the sum of Q-values over all arms, subject to the costs of each action $\mathcal{C}$ and the budget constraint $B$.

\begin{align}
    &\max_{X} \sum_{n=1}^{N}\sum_{j=1}^{|\mathcal{A}|}x_{nj}Q_n(s_n, a_{nj}, \lambda) \\
    &\text{s.t. }\sum_{n=1}^{N}\sum_{j=1}^{|\mathcal{A}|} x_{nj}c_j \le B \\
    &\sum_{j=1}^{|\mathcal{A}|} x_{nj} = 1 \hspace{3mm} \forall n \in \{1,\dots,N\} \\
    &x_{nj} \in \{0,1\}
    \label{eq:knapsack}
\end{align}


In Alg.~\ref{alg:bina-search}, we give the procedure $\texttt{BinaSearch}$ which implements a binary search over the learned $Q(\lambda)$-values to find a charge $\lambda$ for which exactly $B$ arms prefer to act rather than not act. This mimics the Whittle index policy in binary-action settings.

\begin{algorithm}[t]
\caption{BinaSearch (for the Whittle Index Policy)}
\label{alg:bina-search}
\textbf{Input}: State $\bm{s}$, arm critic networks $\phi_1,\ldots,\phi_N$, budget $B$, tolerance $\epsilon$.
\begin{algorithmic}[1] %[1] enables line numbers
\STATE $q_{nj} = \phi_n(s_n,a_{nj},\lambda=0)$ \hspace{1mm} $\forall n \in [N], \forall j \in [|\mathcal{A}|]$
\STATE $lb = 0$
\STATE $ub = \max_{n\in [N], j \in [|\mathcal{A}|]}\{q_{nj}\}$
% \COMMENT{Now binary search for the Whittle index}
\color{black}\WHILE{$ub - lb > \epsilon$}
\STATE $\lambda = \frac{ub+lb}{2}$
\STATE $q_{nj} = \phi_n(s_n,a_{nj},\lambda)$ \hspace{1mm} $\forall n \in [N], \forall j \in [|\mathcal{A}|]$
\IF{fewer than $B$ arms have $q_{n,j=1} > q_{n,j=0}$}
\STATE $ub=\lambda$ \COMMENT{Charging too much, decrease}
\ELSIF{more than $B$ arms have $q_{n,j=1} > q_{n,j=0}$}
\STATE $lb=\lambda$ \COMMENT{Can charge more, increase}
\ELSIF{exactly $B$ arms have $q_{n,j=1} > q_{n,j=0}$}
\STATE break
\ENDIF
\ENDWHILE
\STATE $\bm{a} = \bm{0}$
\STATE $a_n = 1$ where $q_{n,j=1} > q_{n,j=0}$
\IF{$ub - lb \le \epsilon$}
\STATE break ties randomly s.t. $||\bm{a}||_1=B$
\ENDIF
\STATE \textbf{return} $\bm{a}$
\end{algorithmic}
\end{algorithm}



\section{Experimental Domain Details}
\label{sec:appendix:domains}
\subsection{ARMMAN}
\label{sec:appendix:domains:armman}
The MDPs in the ARMMAN domain \citep{biswas2021learn} have three ordered states representing the level of engagement of the beneficiaries in the previous week. Rewards are better for lower states, i.e., $R(0)=1, R(1)=0.5, R(2)=0$. At each step, the beneficiary may only change by one level, e.g., low-to-medium or high-to-medium but not low-to-high. \citet{biswas2021learn} also assume that beneficiaries follow one of three typical patterns, A, B, and C, resulting in three MDPs with different transition probabilities. 
\iffalse
as follows \citep{biswas2021learn} :

\[T^A_{s=0}=
\begin{bmatrix}
    0.1 & 0.9 & 0.0 \\
    0.1 & 0.9 & 0.0
\end{bmatrix}, \hspace{2mm}
T^A_{s=1}=
\begin{bmatrix}
    0.0 & 0.2 & 0.8 \\
    0.8 & 0.2 & 0.0
\end{bmatrix},\hspace{2mm}
T^A_{s=2}=
\begin{bmatrix}
    0.0 & 0.4 & 0.6 \\
    0.0 & 0.4 & 0.6
\end{bmatrix} \
\]

\[T^B_{s=0}=
\begin{bmatrix}
    0.9 & 0.1 & 0.0 \\
    0.9 & 0.1 & 0.0
\end{bmatrix}, \hspace{2mm}
T^B_{s=1}=
\begin{bmatrix}
    0.0 & 0.4 & 0.6 \\
    0.4 & 0.6 & 0.0
\end{bmatrix},\hspace{2mm}
T^B_{s=2}=
\begin{bmatrix}
    0.0 & 0.4 & 0.6 \\
    0.0 & 0.4 & 0.6
\end{bmatrix} \
\]

\[T^C_{s=0}=
\begin{bmatrix}
    0.1 & 0.9 & 0.0 \\
    0.1 & 0.9 & 0.0
\end{bmatrix}, \hspace{2mm}
T^C_{s=1}=
\begin{bmatrix}
    0.0 & 0.4 & 0.6 \\
    0.1 & 0.9 & 0.0
\end{bmatrix},\hspace{2mm}
T^C_{s=2}=
\begin{bmatrix}
    0.0 & 0.4 & 0.6 \\
    0.0 & 0.4 & 0.6
\end{bmatrix} \ ,
\]

where the rows correspond to actions (i.e., row 0 is action 0 and row 1 is action 1), and the columns correspond to the next state. 
\fi 
There are two patterns of effects present that differentiate the beneficiary types. (1)~For each of the above types, the planner can only make a difference when the patient is in state 1. Type A responds very positively to interventions, but regresses to low reward states in absence. Type B has a similar but less amplified effect, and type C is likely to stay in state 1, but can be prevented from regressing to state 2 when an action is taken. (2) Further, types A and C have only a 10\% chance of staying in the high reward state, while type B has a 90\% chance of staying there.

We converted these patient types to robust versions where the transition probabilities are uncertain as follows:

\[T^i_{s=0}=
\begin{bmatrix}
    p^i_{000} & 1 - p^i_{000} & 0.0 \\
    p^i_{010} & 1 - p^i_{010}  & 0.0
\end{bmatrix},
\]
\[T^i_{s=1}=
\begin{bmatrix}
    0.0 & 1 - p^i_{102} & p^i_{102} \\
    p^i_{110} & 1 - p^i_{110} & 0.0
\end{bmatrix}, \
\]
\[T^i_{s=2}=
\begin{bmatrix}
    0.0 & 1 - p^i_{202} & p^i_{202} \\
    0.0 & 1 - p^i_{212} & p^i_{212}
\end{bmatrix},
\]
where $i$ indexes the type (i.e., A, B or C).
We then set each $p^i_{sas^\prime}$ to be in a range of width 0.5 centered on the entries from each of the A, B, C beneficiary types for $s\in\{1,2\}$. To add additional heterogeneity to the experiments, for $s=0$, we set the range width to 1.0 so that any beneficiary type can be made to have some non-negligible chance of staying in the good state, rather than only type B beneficiaries. The full set of parameter ranges is given in Table~\ref{table:armman_robust_params} below.

\begin{table}[ht]
    \centering
    \begin{tabular}{@{}ccccccc@{}}
    \toprule
        & \multicolumn{2}{c}{Type A} & \multicolumn{2}{c}{Type B} & \multicolumn{2}{c}{Type C} \\
    \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
       Param &  L &  U & L &  U & L &  U \\
    \midrule
     $p^i_{000}$ &   .00 &   1 &    .00 &     1 &    .00 &     1 \\
     $p^i_{010}$ &   .00 &   1 &    .00 &     1 &    .00 &     1 \\
     $p^i_{102}$ &   .50 &   1 &    .35 &     .85 &    .35 &     .85 \\
     $p^i_{110}$ &   .50 &   1 &    .15 &     .65 &    .00 &     .50 \\
     $p^i_{202}$ &   .35 &   .85 &    .35 &     .85 &    .35 &     .85 \\
     $p^i_{212}$ &   .35 &   .85 &    .35 &     .85 &    .35 &     .85 \\
    \bottomrule
    \end{tabular}
    \caption{Upper (U) and lower (L) parameter ranges for the robust ARMMAN environment.}
    \label{table:armman_robust_params}
\end{table}

In all experiments, 20\% of arms were sampled from type A, 20\% from type B, and 60\% from type C. %In the robust double oracle experiments, 
To add additional heterogeneity, for each of the 50 random seeds we uniformly sample a sub-range contained within the ranges given in Table~\ref{table:armman_robust_params}. In the agent oracle experiments, for each of the 50 random seeds, since these require fully instantiated transition matrices, we uniformly sample each parameter value for each arm according to its type such that the values are contained in the ranges given in Table~\ref{table:armman_robust_params}.
% ARMMAN is a non-profit organization that provides a free call-based service to its beneficiaries (pregnant women and new mothers). The healthcare intervention problem of ARMMAN is to decide whom to intervene (in-person visit by a healthcare worker) on and when, in order to improve their overall engagement with the healthcare program. We use the summary statistics derived from the call records of the beneficiaries, provided by~\citet{biswas2021learn}, to simulate the healthcare intervention problem. Each beneficiary (arm) is assumed to be in one of the three states---Self Motivated (S), Persuadable (P), and Lost Cause (L), and are categorized into three groups depending on behavior. We additionally assume uncertainty intervals centered around the transition parameters, resulting in $6$ uncertain parameters per arm, as follows:---(i)~$20\%$ of arms are of category $A$ with uncertainty intervals as follows: $p^{PS}=[L, H]$ and $p^{PL}=[l, h]$, (B)~$20\%$ of arms are of category $B$: $p^{PS}=[l,h]$ and $p^{PL}=[l, h]$, and (C)~the rest $60\%$ of arms are of category $C$: $p^{PS}=[l,h]$ and $p^{PL}=[l,h]$. 

\subsection{SIS  Epidemic Model}
\label{sec:appendix:domains:sis}
In this domain, each arm follows its own compartmental SIS epidemic model. Each arm's SIS model tracks whether each of $N_p$ members of a population is in a susceptible (S) or infectious (I) state. This can be tracked with $N_p$ states, since the number of people in state I can be computed from the number of people in state S and the population size $N_p$.

To define a discrete SIS model, we instantiate the model given in \citet{yaesoubi2011generalized} section 4.1 with a $\Delta t$ of 1. We also augment the model to include action effects and rewards. Specifically, $R(N_S) = N_S/N_p$, where $N_S$ is the number of susceptible (non-infected) people. Further, there are three actions $\{a_0, a_1, a_2\}$ with costs $c=\{0, 1, 2\}$. Action $a_0$ represents no action, $a_1$ divides the contacts per day $\kappa$ ($\lambda$ in \citet{yaesoubi2011generalized}) by $a^{\textit{eff}}_1$, and $a_2$ divides the infectiousness $r_{\textit{infect}}$ ($r(t)$ in \citet{yaesoubi2011generalized}) by $a^{\textit{eff}}_2$. That is, taking action $a_1$ will \textit{reduce} the average number of contacts per day in a given arm, and taking action $a_2$ will reduce the probability of infection given contact in a given arm, thus reducing the expected number of people that will become infected in the next round. However, to make this a robust problem, the relative effect sizes of each action for each arm will not be known to the planner, nor will the $\kappa$ or $r_{\textit{infect}}$. We impose the following uncertainty intervals for all arms: $\kappa \in [1, 10]$, $r_{\textit{infect}} \in [0.5, 0.99]$, $a^{\textit{eff}}_1 \in [1, 10]$, and $a^{\textit{eff}}_2 \in [1, 10]$. 

In the robust double oracle experiments, to add additional heterogeneity, for each of the 50 random seeds we uniformly sample a sub-range contained within the ranges given above for each arm. In the agent oracle experiments, for each of the 50 random seeds, since these require fully instantiated transition matrices, we uniformly sample each parameter value for each arm such that the values are contained in the ranges given above.

\section{Hyperparameter Settings and Implementation Details}
\label{sec:appendix:hyperparams-and-details}
\textbf{Neural networks: }All neural networks in experiments are implemented using PyTorch 1.3.1 \citep{NEURIPS2019_9015} with 2 fully connected layers each with 16 units and tanh activation functions, and a final layer of appropriate size for the relevant output dimension with an identity activation function. The outputs of discrete actor networks (i.e., the policy network from the agent oracle, and the policy network of agent A in the nature oracle) pass through a categorical distribution from which actions are randomly sampled at training time, without a budget imposed. It is critical not to impose the budget at training time, so that the budget spent by the optimal policy under a given $\lambda$ will result in a meaningful gradient for updating the $\lambda$-network. The outputs of continuous actor networks (i.e., agent B in the nature oracle which selects environment parameter settings) are instead passed as the means of Gaussian distributions---with the log standard deviations learned as individual parameters separate from the network---from which continuous actions are sampled at training time. At test time, actions are selected from both types of networks deterministically. For categorical distributions, we greedily select the highest probability actions. For Gaussian distributions, we act according to the means. All discount factors were set to 0.9. The remaining hyperparameters that were constant for all experiments for the agent and nature oracles are indicated in Table~\ref{table:hyperparams}. For \textbf{Robust Double Oracle} experiments, all agent and nature oracles were run for 100 training epochs. For \textbf{Agent Oracle} experiments, DDLPO was run for 100 training epochs for the synthetic and ARMMAN domains and 200 epochs for the SIS domain.

\textbf{$\lambda$-network: } Critical to training the $\lambda$-network is cyclical control of the temperature parameter that weights the entropy term in the actor loss functions. Recall that the $\lambda$-network is only updated every \texttt{n\_subepochs}. In general, after each update to the $\lambda$-network, we want to encourage exploration so that actor networks explore the new part of the state space defined by updated predictions of $\lambda$. However, after \texttt{n\_subepochs} rounds, we will use the cost of the sampled actor policies as a gradient for updating the $\lambda$-network, and that gradient will only be accurate if the actor policy has converged to the optimal policies for the given $\lambda$ predictions. Therefore, we also want to have little or no exploration in the round before we update the $\lambda$-network. In general, we would also like the entropy of the policy network to reduce over time so that the actor networks and $\lambda$-networks eventually both converge.

To accomplish both of these tasks, the weight (temperature) of the entropy regularization term in the loss function of the actor network will decay/reset according to two processes. The first process will linearly decay the temperature from some positive, but time-decaying  starting value (see next process) $\tau_t$ immediately after each $\lambda$-network update, down to 0 after $\texttt{n\_subepochs}$. The second process will linearly decay the temperature from a maximum $\tau_0$ (\emph{start entropy coeff} in Table~\ref{table:hyperparams}) down to $\tau_{\min}$ (\emph{end entropy coeff} in Table~\ref{table:hyperparams}) by the end of training. 

We found that it also helps to train the actor network with no entropy and with the $\lambda$-network frozen for some number of rounds before training is stopped (\emph{lambda freeze epochs} in Table~\ref{table:hyperparams}).

\textbf{Double Oracle: }In all experiments in the main text, we initialize the agent strategy list with HO, HM, and HP, and the nature strategy list with pessimistic, mean, and optimistic nature strategies, then run RR-DPO for 6 iterations. This produces a set of 8 agent strategies, 8 nature strategies, a table where each entry represents the regret of each agent pure strategy (row) against each nature pure strategy (column), and an optimal mixed strategy over each set that represents a Nash equilibrium of the minimax regret game given in the table. The regret table is computed by first computing the returns of each agent/nature pure strategy combination, then subtracting the max value of each column from all entries in that column (i.e., the best agent strategy for a given nature strategy gets 0 regret). The regret of RR-DPO is reported as the expected utility corresponding to the Nash equilibrium of the regret game given by the table, once that regret table is normalized to account for the returns of baselines (see next paragraph). 

After this main loop completes, we then compute the regret of the baselines by evaluating each baseline policy against each pure strategy in the nature strategy list. Then, we also run the nature oracle against each baseline policy to find a nature strategy that should maximize the regret of that baseline. The regret for each baseline is reported as the max regret against this new nature strategy, as well as all pure nature strategies from the main RR-DPO loop. 

\textbf{Hawkins Baselines: }The Hawkins policies are implemented with gurobipy 9.1.2, a Python wrapper for Gurobi (9.0.3) \citep{gurobi} following the LP given in \citet{hawkins2003langrangian} equation 2.5 to compute $\lambda$ and $Q(s,a,\lambda)$ for each arm and the integer program in equation 2.12 to select actions.

\textbf{RLvMid Baseline: }We found that RLvMid found effective policies for the nature strategy it was trained against (as evidenced in Figure~\ref{fig:all_experiments}(a--f)), but that the learned policy could be brittle against other nature strategies. This is likely because different nature strategies produce different distributions of states, meaning RLvMid would fit policies well to states seen when planning against the mean nature strategy, but underfit its policies for states seen more often in different distributions. 
% Note that the RR-DPO is designed to correct for this brittleness by both (1) having the agent oracle learn against mixed nature strategies and (2) learning a mixed agent strategy. 
However, the lone RLvMid baseline policy can somewhat correct for this effect by training an ensemble of policies against slight perturbations of the mean nature strategy that adjust the parameter values output by nature by a small $\epsilon$. In all experiments we train 3 RLvMid policies against 3 random perturbations of the mean nature strategy, then report the regret of RLvMid as the minimum of the max regrets returned by any of the 3.

\begin{table}[ht]
    \centering
    \begin{tabular}{lr}
    \toprule
                 \textbf{parameter} &   \textbf{value} \\
    \midrule
                     \textit{\textbf{agent}} &         \\
                clip ratio & 2.0e+00 \\
      lambda freeze epochs & 2.0e+01 \\
       start entropy coeff & 5.0e-01 \\
         end entropy coeff & 0.0e+00 \\
       actor learning rate & 2.0e-03 \\
      critic learning rate & 2.0e-03 \\
      lambda learning rate & 2.0e-03 \\
          trains per epoch & 2.0e+01 \\
      n\_subepochs & 4.0e+00 \\
                           &         \\
                    \textit{\textbf{nature}} &         \\
                clip ratio & 2.0e+00 \\
      lambda freeze epochs & 2.0e+01 \\
       start entropy coeff & 5.0e-01 \\
         end entropy coeff & 0.0e+00 \\
      actorA learning rate & 1.0e-03 \\
     criticA learning rate & 1.0e-03 \\
      actorB learning rate & 5.0e-03 \\
     criticB learning rate & 5.0e-03 \\
      lambda learning rate & 2.0e-03 \\
          trains per epoch & 2.0e+01 \\
      n\_subepochs & 4.0e+00 \\
      n\_sims & 2.5e+01 \\
    \bottomrule
    \end{tabular}
    \caption{Hyperparameter settings for agent and nature oracles for all experiments.}
    \label{table:hyperparams}
\end{table}

% \section{Additional Experimental Results}
% Fig.~\ref{fig:do_performance_larger} shows regret results from additional robust experiments which scale up the number of arms for Synthetic (top left), ARMMAN (top right) and SIS (bottom left; $N_p=50$), as well as and the size of the state space of SIS (bottom right; $N=5, B=4$). RR-DPO continues to outperform all baselines by a large margin. When the size of the state space is scaled up for SIS, it becomes infeasible to run the Hawkins baselines due to its long query time which grows quadratically with the size of the state space. Because of this, we exclude the Hawkins baselines from the main double oracle loop in these experiments. Further, Hawkins cannot even be evaluated as a baseline as the state space increases since to compute its maximum regret, we must get one best response from the nature oracle against the baseline, which requires querying the Hawkins baseline policies tens of thousands of times, which is prohibitive when the query time takes even \char`\~ $1s$ to run.

% \begin{figure}[t]
%     \centering
%         \begin{subfigure}[t]{0.45\linewidth}
%         \centering
%         \includegraphics[width=\textwidth]{img/appendix/regret_final_loop_appendix_h10_epoch5_datacounterexample.pdf}
%         % \caption{Counterexample domain, varying budget}
%         \label{fig:counterexample_do_appendix}
%     \end{subfigure}
%     \qquad
%     \begin{subfigure}[t]{0.45\linewidth}
%         \centering
%         \includegraphics[width=\textwidth]{img/appendix/regret_final_loop_appendix_h10_epoch5_dataarmman.pdf}
%         % \caption{Counterexample domain, varying budget}
%         \label{fig:armman_do_appendix}
%     \end{subfigure}
%     \\
%     \begin{subfigure}[t]{\linewidth}
%         \centering
%         \includegraphics[width=.9\textwidth]{img/appendix/regret_final_loop_appendix_h10_epoch5_datasis.pdf}
%         % \caption{Counterexample domain, varying $n$}
%         \label{fig:sis_do_appendix}
%     \end{subfigure}
%     \caption{Additional experiments showing maximum policy regret in robust setting for Synthetic (top left), ARMMAN (top right) and SIS (bottom) domains, respectively. Synthetic is scaled by 3 and ARMMAN by 5 to maintain the distributions of arm types specified in Section \ref{sec:experiments}. RR-DPO beats all baselines by a large margin across various parameter settings. When the state space is scaled up (bottom right) Hawkins baselines become infeasible to run due to its long query time (see Section~\ref{sec:experiments-do} for a discussion), even for a small number of arms ($N = 5, B = 4$).}
%     \label{fig:do_performance_larger}
% \end{figure}



\begin{figure*}[t]
    \centering
    \includegraphics[width=0.9\linewidth]{img/appendix/um_and_h_sensitivity_analysis_aaai.png}
    \caption{(Left column) varies the uncertainty intervals to be 0.25, 0.5, and 1.0 times their original widths (UM = uncertainty multiplier). The gap between our robust RR-DPO method and non-robust methods becomes larger as the uncertainty interval increases, and our robust algorithm RR-DPO always provides the lowest regret policies. (Right column) varies the horizon $H \in \{10, 25, 50, 100\}$. As expected, the gap between RR-DPO and the baselines either stays the same or increases as $H$ is increased, further demonstrating the robustness of our algorithm to various parameters.}
    \label{fig:um_and_h_sensitivity_analysis}
\end{figure*}



\end{document}
