% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{times}
\usepackage{soul}
\usepackage{url}
%\usepackage[hidelinks]{hyperref}
\usepackage[utf8]{inputenc}
\usepackage{caption}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
\usepackage[switch]{lineno}
\usepackage{bm}
\renewcommand{\algorithmiccomment}[1]{//#1}
\usepackage{xcolor} 
\usepackage{amsfonts}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{mathtools}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}

\title{Group Fairness in Predict-Then-Optimize Settings for Restless Bandits}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Shresth Verma\thanks{Equal contribution}}{}}
\author[1]{Yunfan Zhao$^*$}
\author[1]{Sanket Shah}
\author[1]{Niclas Boehmer}
\author[2]{Aparna Taneja}
\author[1,2]{Milind Tambe}
% Add affiliations after the authors
\affil[1]{%
    Harvard University
}
\affil[2]{%
    Google Research India
}

  
  \begin{document}



  \maketitle


\begin{abstract} 
Restless multi-arm bandits (RMABs) are a model for sequentially allocating a limited number of resources to agents modeled as Markov Decision Processes. RMABs have applications in cellular networks, anti-poaching, and in particular, healthcare. For such high-stakes use cases, allocations are often required to treat different groups of agents (e.g., defined by sensitive attributes) fairly. In addition to the fairness challenge, agents' transition probabilities are often unknown and need to be learned in real-world problems.  
Thus, group fairness in RMABs requires us to simultaneously learn transition probabilities and how much budget we allocate to each group. Overcoming this key challenge ignored by previous work, we develop a decision-focused-learning pipeline to solve \emph{equitable RMABs}, using a novel budget allocation algorithm to prevent disparity between groups. Our results on both synthetic and real-world large-scale datasets demonstrate that incorporating fair planning into the learning step greatly improves equity with little sacrifice in utility.
\end{abstract}


\section{Introduction}
Restless multi-arm bandits (RMABs) are a model for sequentially distributing scarce resources to a set of agents. 
Concretely, we have a set of arms and a limited budget and face the question of deciding which arms to pull in each round. 
The state of arms evolves according to a Markov Decision Process where transition probabilities depend on whether the arm is pulled in this step. 
RMABs have a broad range of applications, including resource allocation in anti-poaching, machine maintenance, and cellular networks \citep{modi2019transfer,zhao2007myopic,bagheri2015restless,glazebrook2006some,qian2016restless,yu2018deadline,ruiz2020multi}. RMABs have especially been used in healthcare settings such as call scheduling in a maternal and child care program \citep{mate2022field,killian2023equitable}, screening patients at risk of cancer \citep{lee2019optimal}, and allocating hepatitis C treatment \citep{ayer2019prioritizing}.
%In particular, RMAB have been successfully used to solve high stake healthcare resource allocation problems \citep{mate2022field,killian2023equitable,lee2019optimal,ayer2019prioritizing}. Mate \textit{et al.}~\shortcite{mate2022field} deployed RMAB models in a maternal and child health care program that need to efficiently allocate service calls to maximize beneficiaries' engagement. Lee \textit{et al.}~\shortcite{lee2019optimal} used RMAB model to efficiently screen a population of patients who may have cancer and to maximize early-stage cancer detections. Ayer \textit{et al.}~\shortcite{ayer2019prioritizing} used RMABs to determine which inmates to have higher priority in receiving hepatitis C treatment in U.S. prisons.
% In particular, RMAB models have been successfully deployed in ARMMAN, an NGO in India working on maternal and child health care. ARMMAN operates a large scale health information program serving over one million beneficiaries with clear evidence of health benefits. Part of this program involves delivering automated voice messages weekly to expecting or new mothers over cell phones. ARMMAN initiates service calls to enrolled beneficiaries to ensure they do not stop listening to useful health information messages or drop off from the program. Given the limited number of service calls that can be made, a key question is how to allocate the service calls. 



In high-stakes resource allocation settings such as healthcare, authorities often want to give priority to agent groups, e.g., as defined by sensitive attributes, that are most in need. 
For instance, some governments require non-discrimination based on sensitive attributes
%such as religion 
\citep{amon2020ending} and non-profits commonly aim to prioritize low income groups \citep{verma2023increasing}. 
The standard RMAB objective of maximizing arms' summed reward falls short in these cases. For example, if one unit of budget could increase the utility of a marginalized group from 0.2 to 0.35 or increase that of a non-marginalized group from 0.5 to 0.7, a strictly utilitarian policy would favor the non-marginalized group, further widening the socio-economic gap. Importantly, equal allocation is often not sufficient, and equity (i.e., balanced outcomes) is preferred \citep{marsh1994equity,luss2012equitable}. %,braveman2003defining}.
%We develop algorithms for RMABs that maximize different group fairness objectives, e.g., the Nash fairness objective \citep{caragiannis2019unreasonable} that maximizes the product of groups' utilities.

While the literature \citep{herlihy2023planning,li2022efficient,killian2023equitable} has mainly focused on how to plan fairly, a main challenge in using RMABs in the real world is how to predict unknown transition probabilities of arms. The task of predicting transition probabilities that are later used in a planning problem falls within a well-studied predict-then-optimize framework \citep{elmachtoub2022smart}. A naive ``two-stage'' approach is to first learn transition probabilities minimizing prediction error and subsequently use them to decide on an allocation maximizing the fairness objective. However, it has been shown that such approaches suffer from objective mismatch \citep{wilder2019melding} and inaccurate predictions in our setting may even increase disparity among groups. 
%However, when using RMABs in the real world, the transition probabilities of arms are often unknown, a challenge ignored by existing works on fairness in RMABs \citep{herlihy2023planning,li2022efficient,killian2023equitable}.
% A standard two-stage approach to deal with this problem is to first learn transition probabilities minimizing prediction error and subsequently use them to decide on an allocation maximizing the fairness objective. 


Decision-focused-learning (DFL) approaches that incorporate the downstream optimization problem have proven to be both scalable and effective \citep{mandi2023decision,agrawal2019differentiable}, and have been studied for RMABs when maximizing the summed reward \citep{wang2023scalable}.
However, applying the DFL paradigm to optimize a group fairness objective presents us with novel challenges: 
The optimization objective becomes more complex and makes it necessary to simultaneously learn agents' transition probabilities and the allocation of budget to groups.
To incorporate how both transition probabilities and budget allocations affect the fairness objective, one needs to differentiate through the process of allocating budgets to different groups. %While previous works allocate budgets using expensive procedures that are non-differentiable \cite{killian2023equitable}, we provide a scalable and differentiable procedure based on a regularized optimal transport problem. 


Tackling this challenge, we provide a DFL pipeline for \textit{equitable} RMABs that simultaneously learns budget allocation and transition probabilities, in an offline learning setting. Our main contributions are:
\begin{itemize}
\item To the best of our knowledge, we are the first to develop a decision-focused-learning (DFL) method to solve \textit{equitable} RMABs, outperforming various baselines by 20--40\% in fairness objectives, on both synthetic and real-world large-scale datasets. 
\item We propose a novel differentiable budget allocation method such that the effect of changes in budget on the fairness objective is incorporated during training. Our theoretical results shed light on the feasibility and effectiveness of this method. 
\item Our method optimizes fairness objectives while achieving utility $>$90\% of a state-of-the-art algorithm \citep{wang2023scalable} that purely optimizes utility, on a real-world health information dataset.
% \item Empirical results suggest that our approach allocates more budgets to groups that suffer from poor outcomes under strictly utilitarian methods and thus prevents disparity among groups.
\item Our pipeline is compatible with a broad range of fairness objectives, including Maximin Reward, Gini Index, and Max Nash Welfare, and generally to a range of functions defined over utilities of groups. 
\end{itemize} 

% \newpage 
% \maketitle

% \begin{abstract}
% Restless multi-arm bandits, a class of sequential resource allocation problems involving multiple agents with a resource constraint, have applications in healthcare, cellular networks, and anti-poaching. For such high-stakes use cases, decisions must ensure equity among groups. In real-world problems, ground truth probabilities of MDPs in the RMABs are often unknown, and one must simultaneously learn group budget allocations and transition probabilities, a key challenge ignored by previous works on fairness in RMABs.  We develop a decision-focused-learning pipeline to solve equitable RMABs, using a novel budget allocation algorithm to prevent disparity between groups. Our results on both synthetic and real-world large-scale datasets demonstrate that our approach greatly improves equity with little sacrifice in utility. 
% \end{abstract}


% \section{Introduction}
% Restless Multi-arm bandits (RMABs) has a set of hetereogenous arms and a limited budget. The decision maker has to choose which arms to pull. Regardless of being pulled or not, the state of each arm can change. Each arm is modeled by a Markov Decision Process, and the transition probabilities are dependent on the state of the arm and the pulling decision. 
% %RMABs are shown to be PSPACE hard \citep{papadimitriou1994complexity} even when the transition dynamics are known, and Whittle index policy is a commonly used approximate solution method in RMAB problems \citep{zayas2019asymptotically,ghosh2023indexability,hodge2015asymptotic}. 
% RMABs have a broad range of applications, including resource allocations in anti-poaching, machine maintenance, and cellular network \citep{modi2019transfer,zhao2007myopic,bagheri2015restless,glazebrook2006some,qian2016restless,yu2018deadline,ruiz2020multi}. In particular, RMAB is used in healthcare settings such as call scheduling in a maternal and child care program \citep{mate2022field,killian2023equitable}, screening patients at risk of cancer \citep{lee2019optimal}, and allocating hepatitis C treatment \citep{ayer2019prioritizing}.
% %In particular, RMAB have been successfully used to solve high stake healthcare resource allocation problems \citep{mate2022field,killian2023equitable,lee2019optimal,ayer2019prioritizing}. Mate \textit{et al.}~\shortcite{mate2022field} deployed RMAB models in a maternal and child health care program that need to efficiently allocate service calls to maximize beneficiaries' engagement. Lee \textit{et al.}~\shortcite{lee2019optimal} used RMAB model to efficiently screen a population of patients who may have cancer and to maximize early-stage cancer detections. Ayer \textit{et al.}~\shortcite{ayer2019prioritizing} used RMABs to determine which inmates to have higher priority in receiving hepatitis C treatment in U.S. prisons.
% % In particular, RMAB models have been successfully deployed in ARMMAN, an NGO in India working on maternal and child health care. ARMMAN operates a large scale health information program serving over one million beneficiaries with clear evidence of health benefits. Part of this program involves delivering automated voice messages weekly to expecting or new mothers over cell phones. ARMMAN initiates service calls to enrolled beneficiaries to ensure they do not stop listening to useful health information messages or drop off from the program. Given the limited number of service calls that can be made, a key question is how to allocate the service calls. 



% In such high stake settings,  governments require non-discrimination based on sensitive attributes
% %such as religion 
% \citep{amon2020ending}, and non-profits aim to prioritize low income groups \citep{verma2023increasing}. Purely maximizing the summed social welfare may harm marginalized socio-economic groups. For example, if one unit budget could increase the (normalized) utility of a marginalized group from 0.2 to 0.35 or increase that of a non-marginalized group from 0.5 to 0.7, a strictly utilitarian policy would favor the non-marginalized group, further widening the socio-economic gap. Thus, to reduce disparity across groups, governments and NGOs prefer to prioritize groups more in need \citep{caragiannis2019unreasonable,moulin2004fair}. Importantly, equality or equal allocation is often not sufficient, and equity or balancing outcomes is preferred \citep{marsh1994equity,luss2012equitable} %,braveman2003defining}.



% In real-world problems, ground truth probabilities of MDPs in the RMAB are unknown, and one must simultaneously learn group budget allocations and transition probabilities, a challenge ignored by existing works on fairness in RMABs \citep{herlihy2023planning,li2022efficient,killian2023equitable}. Using inaccurate transition probability predictions to compute budgets may increase disparity among groups, and poor budget allocations could lead to poor decisions, subsequently affect transition probability learning. To tackle this challenge, we design a pipeline that simultaneously learns budget allocation and transition probability, incorporating how they affect the fairness objective. 
% %We design RMAB algorithms that balance group outcomes, and we focus on offline RMABs with unknown transition probabilities. 
% %Most existing works on fairness in RMABs study individual instead of group level fairness and solve a planning problem given true transition dynamics. \citet{herlihy2023planning} and \citet{li2022efficient} studied equality in that each arm be given a minimum amount of resources, which fails to consider each group's need and may lead to imbalanced outcomes. \citet{killian2023equitable} studied balancing group outcomes given true transition probabilities, ignoring the challenge of learning transition probabilities. 

% The task of predicting transition probabilities that are later used in a planning problem falls within a well-studied predict-then-optimize framework \citep{elmachtoub2022smart}. When training a prediction model, two-stage approaches that solely maximizes prediction accuracy suffer from objective mismatch \citep{wilder2019melding}, while decision-focused-learning (DFL) approaches that incorporate the downstream optimization problem are both scalable and effective \citep{wang2023scalable}. Notably, there has been great research interest in DFL for top-k ranking \citep{mandi2023decision,agrawal2019differentiable,wilder2019melding}, a problem similar to choosing arms to pull in RMABs, but these works fail to address sequential decision making. 

% We provide a DFL pipeline for \textit{equitable} RMABs, reducing disparities across groups. Our main contributions are:
% \begin{itemize}
% \item To the best of our knowledge, we are the first to develop a decision-focused-learning (DFL) method to solve \textit{equitable} RMABs, outperforming various baselines by 20-40\% in fairness objectives, on both synthetic and real-world large-scale datasets. 
% \item Our method optimizes fairness objectives, while achieving utility $>$90\% of a state-of-the-art algorithm \citep{wang2023scalable} that purely optimizes utility, on a real-world healthcare dataset.  
% \item We propose a novel differentiable budget allocation method such that during training we incorporate the effect of changes in budget on the fairness objective. Our theoretical results shed light on the feasibility and effectiveness of this method. 
% \item Our pipeline is compatible with a broad range of fairness objectives, including Maximin Reward, Gini Index, and Max Nash Welfare. 
% \item Empirical results suggest that our approach allocates more budgets to groups that suffer from poor outcomes under strictly utilitarian methods and thus prevents disparity among groups.
% \end{itemize} 



\section{Literature Review}
% could cut literature review if need more space
\textbf{RMAB} Restless multi-arm bandits are shown to be PSPACE-hard \citep{papadimitriou1994complexity}, and approximate algorithms have been proposed \citep{hawkins2003langrangian,whittle1988restless}. In particular, \citet{whittle1988restless} uses a Lagrangian relaxation to
decouple computations for arms and selects actions by computing Whittle indices of each arm. The resulting Whittle index policy is asymptotically optimal under an indexability assumption \citep{akbarzadeh2019restless,weber1990index}. 


\textbf{Fairness in RMABs} \citet{li2022efficient,herlihy2023planning} consider individual fairness constraints that ensure sufficient resources are given to each arm. 
%\citet{li2022efficient} considers an individual fairness constraint that upper bounds for each arm the number of epochs since the arm was pulled last. \citet{herlihy2023planning} solves RMABs with an individual fairness constraint that lower bounds the probability of each arm being pulled. 
\citet{li2023avoiding} studies fairness by always probabilistically favoring an arm that yields higher long-term cumulative reward. \citet{biswas2023fairness} studies fairness for workers who pull the arms, ensuring that we never overburden the workers. However, the above works do not consider group-level fairness, where balancing group outcomes is a key challenge. \citet{killian2023equitable} designs RMAB algorithms to achieve equal outcomes but fails to address practical settings with unknown transition dynamics. Notably, none of the above works deals with unknown transition probabilities. 

\textbf{Decision focused learning} The predict-then-optimize framework contains a prediction problem where predicted parameters are used in a downstream optimization problem that solves for a solution, and the overall goal is to obtain high-quality solutions \citep{elmachtoub2022smart}. Two-stage approaches that ignore the downstream optimization problem when solving the prediction problem result in a mismatch of the prediction loss and the solution quality \citep{lambert2020objective,wilder2019melding,mandi2020smart}.
Incorporating the downstream optimization task in the prediction problem is shown to improve model performance both theoretically \citep{grigas2021integrated,mandi2023decision,shah2022decision} and empirically \citep{amos2017optnet,huang2019addressing,agrawal2019differentiable,wang2021learning}. \citet{verma2023restless} and \citet{wang2023scalable} provide DFL methods tailored to RMABs but ignore fairness. 

\textbf{Fair top-$k$ ranking}  Choosing which arms to pull is conceptually similar to top-$k$ ranking. \citet{singh2019policy,yadav2021policy} provide policy gradient methods for learning fair ranking policies. \citet{celis2017ranking} constructs a mixed integer programming problem to solve for rankings and explicitly enforces fair representation between groups. \citet{zehlike2017fa} studies fairness up to a threshold and provides a greedy algorithm to achieve fair ranking. \citet{singh2018fairness,biega2018equity} use fairness constraints that link the exposure allocated to items to their relevance. Interestingly, top-$k$ selection can be formulated as an LP and solved using DFL \citep{kotary2022end,wilder2019melding}. However, existing works in fair top-$k$ ranking do not address MDPs, where actions affect future states.


\textbf{Equity and Group Fairness} Since groups have different needs,  allocating the same resources to each group is not sufficient, and balanced outcomes are preferred \citep{luss2012equitable,mcglaughlin2020improving}. Although much theory in social welfare studies individual level equity, equity is often measured on a group level in applications such as facility location \citep{marsh1994equity}, healthcare \citep{braveman2003defining}, and humanitarian logistics \citep{gutjahr2018equity}. Existing works on fair allocation extend equity to a group level \citep{barman2018groupwise,suksompong2018approximate} but fail to consider settings where resources are scarce and only a small subpopulation receives resources. % where a common assumption that everyone gets at least one resource.  Existing works fail to consider settings where resources are scarce and only a small porportion of people receive resources, which is models by RMABs.


\section{Preliminaries}

%Firstly, we define RMABs and a Whittle Index policy that prioritize the ``most efficient" arms. However, such policy may give little resources to marginalized socio-economic groups and lead to hugely unequal outcomes across groups. Next, we introduce fairness objectives including Max Nash Welfare, and we discuss key assumptions. 
%an equitable objective that is shown to lead to balanced outcomes across groups \citep{luss2012equitable,mcglaughlin2020improving}.

\subsection{RMABs and the Whittle Index Policy}
We consider a restless multi-arm bandit problem with $N$ arms and a budget of $B$. We focus on an offline learning setting where a historical dataset is available. Each arm has a discrete state space $\mathcal{S}_i$ and actions are binary $\{0,1\}$. We denote the transition probability of arm $i$ from state $s$ to state $s'$ under action $a$ as $P_i(s,a,s')$. The transition probabilities are unknown and arm features $x_i$ are available. We let $R_i(s)$ denote the per-arm reward for arm $i$ at state $s$. Since in practice, it is common that $\mathcal{S}_i$ and $R_i$ are the same for all arms \citep{herlihy2023planning,mate2022field}, we drop the subscript $i$ from them. Note that our methods also apply to the general setting. We let $M=|\mathcal{S}|$ denote the number of possible states. 
The vector of arm states is $\bm s\in\mathbb{R}^N$, and the binary vector of actions on arms is $\bm a \in\{0,1\}^N$. A policy $\pi$ maps arm states $\bm s$ to actions $\bm a$ while satisfying the budget constraint $\sum_{i=1}^N a_i\le B$ at each timestep $t\in[H]$, where $H$ is the time horizon. 
%At timestep $t\in[H]$, the vector of arm states is $\bm s^t\in\mathbb{R}^N$, and the one-hot encoding of actions on arms is $\bm a^t \in\{0,1\}^N$. A policy $\pi$ maps arm states $\bm s^t$ to  actions $\bm a^t$, while satisfying budget constraints $\sum_{i=1}^N a_i^t\le B, \forall t$.
For a reward-maximizing RMAB without fairness considerations, the optimal value function satisfies the following Bellman equation, and the optimal policy selects the maximizing actions: % with an action space defined by a set of inequalities:
\begin{align}
\label{eq:constrained_bellman_equation}
 V(\boldsymbol{s}, B)&=\max_{\bm a}\left\{\sum_{i=1}^N R\left(s_i\right)+\beta \mathbb{E}\left[V(\boldsymbol{s}^{\prime}, B) \mid \boldsymbol{s}, \bm a\right]\right\}, \nonumber\\
& \text { s.t. } \sum_{i=1}^N a_i \leq B,
\end{align}
% commented out writings from Jackson's paper
% \begin{align*}
% \underset{\boldsymbol{s}^{t+1} \sim P\left(\boldsymbol{s}^t, \pi\left(\boldsymbol{s}^t\right), \cdot\right)}{\mathbb{E}} \sum_{t=0}^{H-1} \sum_{n=1}^N R\left(s_n^t\right)
% \end{align*}
% A common approach to computing such policy $\pi$ is consider the value \citep{killian2023equitable}:
% \begin{align*}
% V^t\left(\boldsymbol{s}^t, b\right)=\max _{\boldsymbol{a}^t} & \left\{\sum_{n=1}^N R\left(s_n^t\right)+\mathbb{E}\left[V^{t+1}\left(\boldsymbol{s}^{t+1}\right) \mid \boldsymbol{s}^t, \boldsymbol{a}^t\right]\right\} \\
% \text { s.t. } & \sum_{n=1}^N a_n^k \leq b \quad \forall k \in t, \ldots, H
% \end{align*}
% The whittle index policy uses the Lagrangian relaxation\citep{whittle1988restless}:
% \begin{align*}
% &L^t\left(s^t, b\right)=\min _{\boldsymbol{\lambda}, V_{n \in[1, \ldots, N]}^{k \in[t, \ldots, H]}} \sum_{n=1}^N V_n^t\left(s_n^t, \lambda^t\right)+b \sum_{k=t}^H \lambda^k\\
% & \text { s.t. } V_n^k\left(s_n^k, \lambda\right) \geq R\left(s_n^k\right)-a_{n j}^k \lambda^k+ \\
% & \sum_{s^{\prime} \in \mathcal{S}} V_n^{k+1}\left(s^{\prime}, \lambda\right) P\left(s_n^k, a_{n j}^k, s^{\prime}\right)\\
% &\forall k \in t, \ldots, H-1, \forall j \in\{0,1\}, \forall s_n^k \in \mathcal{S}, \forall n \in 1, \ldots, N\\
% &V_n^H\left(s_n, \lambda\right)=0 \quad \forall s_n \in \mathcal{S}, \quad \forall n \in 1, \ldots, N
% \end{align*}
where $\beta\in (0,1)$ is a discount factor. 
%Solving Problem~\ref{eq:constrained_bellman_equation} is shown to be PSPACE hard \citep{papadimitriou1994complexity} since the state space and the action space grow exponentially in the number of arms
Note that the state space and the action space grow exponentially in $N$. To learn a policy, a scalable approach is to use the Whittle Index derived from the Lagrangian relaxation \citep{whittle1988restless}:
\begin{align}
\label{eq:lagrangian_relaxation}
& J\left(s, B\right)=\min _{\lambda \geq 0}\left(\frac{\lambda B}{1-\beta}+\sum_{i=1}^N \max _{a_i \in \{0,1\}}\left\{Q_i\left(s_i, a_i, \lambda\right)\right\}\right), \\
& \text { s.t. } Q_i(s_i, a_i, \lambda) = R(s_i) - \lambda a_i+\beta \mathbb{E}\left[Q_i(s_i^{\prime}, a_i, \lambda) \mid \pi(\lambda)\right] \nonumber.
\end{align}
The Whittle Index $W_i(s_i)$ is equal to the smallest action charge $m$ that makes pulling as rewarding as not pulling.
\begin{definition}[Whittle index]
The Whittle index associated to state $s_i$ is:
\begin{align*}
W_i(s_i):=\inf\left\{m : Q_i(s_i,a_i=0,m) = Q_i(s_i,a_i=1,m)\right\}.
\end{align*}
\end{definition}
Intuitively, given a unit budget, an arm with a higher Whittle index would benefit from a larger increase in discounted cumulative reward. 
%arms with high Whittle Index values require budget to produce reward but need only little budget. 
The Whittle Index policy $\pi^{\text{whittle}}$ pulls the $B$ arms with the highest Whittle indices. 

% commented out writings from Kai's DFL paper
% \begin{definition}[Whittle index]
% Given state $u \in \mathcal{S}$, we define the Whittle index associated to state $u$ by:
% $$
% W_i(u):=\inf _m\left\{Q_i^m(u ; a=0)=Q_i^m(u ; a=1)\right\}
% $$
% where the value functions are defined by the following Bellman equations, augmented with subsidy $m$ for action $a=0$.
% \begin{align*}
% V_i^m(s) & =\max _a Q_i^m(s ; a) \\
% Q_i^m(s ; a) & =m \mathbf{1}_{a=0}+R(s)+\gamma \sum_{s^{\prime}} P_i\left(s, a, s^{\prime}\right) V_i^m\left(s^{\prime}\right)
% \end{align*}
% \end{definition}




\subsection{Fairness Objectives and Group Budget Allocations}

The set of $N$ arms can be partitioned into a set of groups $\mathcal{G}$ according to known arm features (such as age, geographical location, and education; \citealp{mate2022field}).
%, i.e. the union of groups $g\in\mathcal{G}$ is $\{1,2,...,N\}$. 
Let $N_g$ denote the size of group $g$. We define the value function for a group: %, which are crucial in the Max Nash Welfare objective. 
\begin{align}
\label{eq:value_function_each_group}
V_g(s_g,b_g) = \frac{1}{N_g}V(s_g,b_g),
\end{align}
where $s_g\in\mathcal{S}^{N_g}$ is the vector of states of all arms in group $g$ and $b_g$ is the budget allocated to group $g$. We develop a differentiable pipeline that accommodates any objective that satisfies a key assumption:

\begin{assumption}\label{assp:obj_differentiable_in_group_values}
The fairness objective, or a proxy of it, is differentiable in the group values~\eqref{eq:value_function_each_group}.
\end{assumption}
A fairness objective that fails to satisfy Assumption~\ref{assp:obj_differentiable_in_group_values} has little hope of being compatible with a differentiable pipeline, where the gradient of the objective with respect to group budget allocations needs to be evaluated. Many popular fairness objectives, including Max Nash Welfare (MNW), Maximin Reward (MMR), and Gini Index, satisfy Assumption~\ref{assp:obj_differentiable_in_group_values}. Given fixed groups $g\in\mathcal{G}$, MNW optimizes the product of group values:
\begin{align}\label{eq:mnw_obj}
\text{MNW} (\bm s) := \max _{b_g, g\in\mathcal{G}} \prod_{g \in \mathcal{G}} V_g\left(s_g, b_g\right) \ \text{s.t. } \sum_{g \in \mathcal{G}} b_g=B.
\end{align}
By giving diminishing returns for any group's marginal increase in utility, MNW naturally trades off efficiency and equity \citep{caragiannis2019unreasonable,ramezani2009nash}. MMR optimizes the so-called egalitarian
social welfare \citep{asadpour2007approximation,caragiannis2012efficiency}:
\begin{align}\label{eq:maximin_obj}
\text{MMR} (\bm s) := \max _{b_g, g\in\mathcal{G}} \min_{g \in \mathcal{G}} V_g\left(s_g, b_g\right) \ \text{s.t. } \sum_{g \in \mathcal{G}} b_g=B.
\end{align}
By maximizing the utility of the worst-off group, MMR is shown to be favorable both theoretically and empirically \citep{brandt2016handbook,bonald2006queueing}. We will provide a differentiable proxy of MMR. 




\section{Equitable RMABs with Decision-Focused-Learning}

We begin by illustrating that naive approaches to address fairness fail. Next, we propose a differentiable pipeline compatible with a broad range of fairness objectives and provide theoretical results that shed light on the feasibility and effectiveness of our pipeline. After that, we discuss details on how to use the pipeline with various fairness objectives. 

\subsection{Need for a Differentiable Pipeline}

%In solving our problem of interest, we are not given any reasonably good budget allocation or transition probability prediction to start with, and we need to simultaneously optimize per group budget allocation (solve Equation~\ref{eq:mnw_obj}) and provide high quality transition probability predictions as measured in the fairness objective. 

One naive approach is to prioritize learning optimal transition probabilities to maximize utility, and then allocate budgets proportional to group size. This approach does not simultaneously learn budget allocations and transition probabilities, and thus it does not take into account groups' needs. We demonstrate that it fails to provide sufficient budgets to marginalized groups (see Figure~\ref{fig:budget}). 

A more principled approach is to first predict the transition probabilities by minimizing a prediction loss, and then plug the predictions into the budget allocation algorithm of \citet{killian2023equitable}, which maximizes the fairness objective. However, predictions from such a two-stage procedure can be of poor quality \citep{wang2023scalable}, which leads to poor budget allocations (see Figure~\ref{fig:synth_results}). 

%Having discussed why alternatives fail, we introduce our first algorithm, which is a greedy procedure, which follows a greedy procedure to compute budget allocations. Later, we will explain limitations of this first algorithm, which motivates a fully differentiable pipeline. 

\subsubsection{A Non-Differentiable Approach}

Having discussed why alternatives fail, we introduce our first DFL approach that simultaneously allocates budgets and predicts transition probabilities. We will first describe a budget allocation subroutine, and then discuss a main algorithm that uses the subroutine. We explain this approach using MNW, and later discuss other fairness objectives. 

%Observe that in Problem (\ref{eq:constrained_bellman_equation}), the feasible set increases with $B$ and thus the optimal objective value is monotone increasing in $B$. Consequently, $V_g(s_g,b_g)$ is monotone increasing in $b_g$ for all groups $g\in\mathcal{G}$. 

{\bf Budget Allocation } Problem (\ref{eq:mnw_obj}) can be rewritten as:
\begin{align}\label{eq:mnw_obj_take_log}
\max _{b_g} \sum_{g \in \mathcal{G}} \log\left(V_g\left(s_g, b_g\right) \right)\quad\text{s.t. } \sum_{g \in \mathcal{G}} b_g=B.
\end{align}
Since computing $V_g(s_g,b_g)$ 
%is intractable and 
is PSPACE-hard \citep{papadimitriou1994complexity}, we follow \citet{killian2023equitable} and replace $V_g(s_g,b_g)$ by its Lagrangian relaxation $J(s_g,b_g)$, which upper bounds it.
Since $\frac{dJ(s_g,b_g)}{db_g}=\frac{\lambda}{1-\beta}$, we have that $J(s_g,b_g)$ is increasing in $b_g$. In addition, when $b_g$ increases, an optimal policy would take more actions, which implies a lower action charge $\lambda$. Thus, $\frac{dJ(s_g,b_g)}{db_g}=\frac{\lambda}{1-\beta}$ is decreasing in $b_g$, and $J(s_g,b_g)$ is concave. Consequently, to maximize MNW, it suffices to greedily assign one additional unit of budget at a time to the group that achieves the maximum increase in $\log J(s_g,b_g)$. 
% $$g\in \underset{g\in\mathcal{G}}{\text{argmax}} \ V_g(s_g,b_g+1) - V_g(s_g,b_g).$$
Based on the above, Algorithm~\ref{alg:subroutine_budget_allo_greedy} gives budget to the group that provides the highest increase in log MNW. 
\begin{algorithm}[tb]
    \caption{Greedy Budget Allocation}
    \textbf{Input}: Sorted Whittle indices $W$, groups $g\in\mathcal{G}$, budget $B$, states $\bm s$, initial budgets $b_g=0, \forall g$
    \begin{algorithmic}[1] %[1] enables line numbers
    \STATE Compute for each group $g\in\mathcal{G}$, \\$J_{\Delta}\left(\boldsymbol{s}_g, b_g\right)=\log \left(J(\boldsymbol{s}_g, b_g+1)\right)-\log \left(J(\boldsymbol{s}_g, b_g)\right)$
    \WHILE{there is budget remaining}
    \STATE Give one more budget to the group with the highest $J_\Delta(s_g,b_g)$ and recompute $J_\Delta(s_g,b_g)$ for this group. 
    \ENDWHILE
    \STATE \textbf{return} budget allocation $b_g$
    \end{algorithmic}
    \label{alg:subroutine_budget_allo_greedy}
\end{algorithm}



\begin{algorithm}[tb]
    \caption{Equitable DFL Algorithm for RMABs}
    \textbf{Input}: offline dataset, groups $g\in\mathcal{G}$, learning rate $\alpha_w$, frequency $n_f$, warm-up epochs $n_{ini}$
    % \textbf{Parameter}: Optional list of parameters\\
    % \textbf{Output}: Your algorithm's output
    \begin{algorithmic}[1] %[1] enables line numbers
    \STATE Initialize a neural network with weights $w$ to predict transition probabilities
    \FOR{epoch = 1,2,...}
        \STATE Predict transition probabilities $P$
        \STATE Compute Whittle indices $W$.
        \IF{$epoch > n_{ini}$ and $(epoch-n_{ini})\% n_f==0$}
            \STATE Allocate budget $b_g$ using Algorithm~\ref{alg:subroutine_budget_allo_greedy} or \ref{alg:subroutine_budget_allo_neural_net_based}  
        \ENDIF
        \FOR{timestep t =1,2,...,H}
            \FOR{$g\in\mathcal{G}$}
                \STATE Given $b_g$, take actions according to the Whittle index policy $\pi^{\text{Whittle}}$ with soft-top-$b_g$ selection. % added explanation in the paragraph describing the algorithm
            \ENDFOR
        \ENDFOR
        \STATE For each group $g$, use importance sampling to compute group value $V_g(s_g,b_g)$.  
        \STATE Compute Max Nash Welfare objective \\$\operatorname{MNW}=\prod_g V_g(s_g,b_g)$
        \STATE Update $w\leftarrow w+\alpha_w \frac{d \operatorname{MNW}}{d \pi^{\text{Whittle}}} \frac{d \pi^{\text {Whittle }}}{d W} \frac{d W}{d P} \frac{d P}{d w}$
        % \IF {conditional}
        %     \STATE Perform task A.
        % \ELSE
        %     \STATE Perform task B.
        % \ENDIF
    \ENDFOR
    \STATE \textbf{return} trained neural network with weights $w$ 
    \end{algorithmic}
    \label{alg:equitable_rmab_main}
\end{algorithm}

{\bf Main Algorithm } Algorithm~\ref{alg:equitable_rmab_main} simultaneously predicts transition probabilities and allocates group budgets. At each epoch, we predict transition probabilities and compute Whittle indices (lines 3--4). After that, we allocate budgets (lines 5--6) and collect trajectories using the Whittle index policy $\pi^{\text{Whittle}}$ (lines 7--9). For each group, given the budget allocation, we choose arms to pull according to a soft version of the Whittle index policy: instead of pulling the $b_g$ arms with the highest Whittle indices, we use a soft-top-$K$ selection with $K=b_g$ \citep{xie2020differentiable}. Using the collected trajectories and importance sampling, we compute the Max Nash Welfare objective (lines 10--11). Finally, we update the weights using the gradient (line 12). See Appendix~\ref{sec:appendix_algorithmic_details} for additional details, including importance sampling (line 10) and gradient computations (line 12). 

%To stabilize training and to facilitate the convergence of per group budget allocation, we use a warm start from budgets $b_g$ proportional to group sizes, and we allow the budget updating frequency to be different from transition probability updating frequency (lines 5-7). Notice a warm start is crucial as it could avoid that poor transition probability predictions resulting in poor budget allocation, which then hurts the transition probability learning. 



%Unlike \citet{killian2023equitable}, which recomputes the optimal budget allocation at every timestep, we fix the budget allocation across all timesteps within an epoch. This choice is necessary because (1) frequently updating the budget allocations may cause instability during training, especially in the first few epochs where transition probability predictions are not accurate; (2) frequently updating the budget allocations is extremely computationally expensive as allocation requires iteratively computing group values, and it is sufficient to fix budget allocation as $J(s_g,b_g)$ already takes into account future rewards. 

\textbf{Limitations. } Our first approach already addresses limitations of simple alternatives and provides substantial improvement in the MNW objective in several settings (see Figure~\ref{fig:synth_results}). However, this approach does not consistently outperform, and it fails to allocate sufficient budgets to a marginalized group (see Figure~\ref{fig:budget}). A key limitation of Algorithm~\ref{alg:subroutine_budget_allo_greedy} is that the budget allocation is non-differentiable and relies heavily on estimates of $J_{\Delta}(\bm s_g,b_g)$. When transition probability predictions are poor, $J_{\Delta}(\bm s_g,b_g)$ estimates are inaccurate, and the resulting poor budget allocations in turn harm transition probability learning. 
In addition, the first approach requires different procedures tailored to distinct fairness objectives (e.g. for a fairness objective other than MNW, we need to provide another greedy procedure based on its properties). 

%Naively replacing the argmax operation (line 3, Algorithm~\ref{alg:subroutine_budget_allo_greedy}) with softmax sampling could lead to vanishing gradient problem, since Algorithm~\ref{alg:subroutine_budget_allo_greedy} requires calculating values in a sequential order. Thus, it motivates a procedure that naturally differentiates through budget allocations. 


\subsection{Our Differentiable Pipeline}
\label{sec:method_differentiable_pipeline}

To address limitations in the first approach, we propose to use a differentiable budget allocation procedure (Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based}), where we update budget allocations by taking gradient steps. To use this procedure, in the main algorithm (Algorithm~\ref{alg:equitable_rmab_main}), when allocating budgets (line 6), we call the differentiable procedure (Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based}). 

We explain the pipeline using MNW and later discuss other fairness objectives. The gradient of the MNW objective with respect to $b_g$ can be calculated as $
\frac{d \operatorname{MNW}}{d \pi^{\text{Whittle}}} \frac{d \pi^{\text{Whittle}}}{d b_g}$. To evaluate the first term, the gradient of the MNW objective with respect to the Whittle index policy, we use the policy gradient theorem \citep{sutton1998introduction}. To calculate the second term, we establish the following proposition: 

\begin{algorithm}[tb]
    \caption{Differentiable Budget Allocation}
    \textbf{Input}:  groups $g\in\mathcal{G}$, budget $B$, learning rate $\alpha_b$, (if not the first time this algorithm is called: previous $b_g$, $\pi^{\text{Whittle}}$, and $\operatorname{MNW}$)
    \begin{algorithmic}[1] %[1] enables line numbers
    \IF{this is the first time this algorithm is called}
        \STATE Initialize budgets $b_g$ proportional to group size.
    \ELSE
        \STATE Update $b_g \leftarrow b_g + \alpha_b \frac{d \operatorname{MNW}}{d \pi^{\text{Whittle}}} \frac{d \pi^{\text {Whittle }}}{d b_g} $
    \ENDIF
    \STATE \textbf{return} budget allocation $b_g$
    \end{algorithmic}
    \label{alg:subroutine_budget_allo_neural_net_based}
\end{algorithm}



% commented out the para below to avoid confusions. The proof of proposition 1 should better explain the details
% A key technical challenge in differentiating through budget allocations is to compute the gradient $\frac{d \pi^{\text {Whittle }}}{d b_g}$. This is not achieved by previous DFL algorithms for RMABs \citep{wang2023scalable}, which only calculates the gradient of the Whittle index policy with respect to the Whittle indices but not the budget. We address this challenge by leveraging differential properties of Sinkhorn approximation with respect to input distributions \citep{luise2018differential} and establishing the following proposition: 

\begin{proposition}
\label{prop:differentiate_budget_allocation}
For each group $g\in\mathcal{G}$, to compute an approximation of the gradient $\frac{d \pi^{\text{Whittle}}}{d b_g}$, it is sufficient to know the Whittle indices of the arms in $g$, the budget $b_g$, and the number of arms $N_g$ in group $g$. 
\end{proposition}

\begin{proof}[Proof of Proposition~\ref{prop:differentiate_budget_allocation}]
Pulling the top-$b_g$ arms from each group $g\in\mathcal{G}$ can be formulated as an optimal transport problem. Specifically, we let $\bm\mu:=\frac{\bm 1_{N_g}}{N_g}\in\mathbb{R}^{N_g}$ and $\bm v:=[\frac{b_g}{N_g}, \frac{N_g - b_g}{N_g}]^\top$. We let $y:=[0,1]^\top$ and define a cost matrix $M_{ij}:=|\bar{W}_i-y_j|^2$, where $\bar{W}_i$ is the normalized Whittle index of arm $i\in g$. The top-$b_g$ operator output can be obtained from a linear mapping of the optimal transport plan $T^*$ \citep{xie2020differentiable}:
\begin{align}
\label{eq:optimal_transport}
S(\bm\mu,\bm v)&:= \min_{T\in\Pi(\bm\mu, \bm v)} \langle T, M \rangle,
\end{align}
where $
\Pi(\bm\mu, \bm v):= \{T\in\mathbb{R}_+^{N_g\times 2} \mid T \bm 1_{2} = \bm\mu, \ T^\top \bm 1_{N_g} = \bm v \}.
$
Notice solving the optimization problem (\ref{eq:optimal_transport}) only requires that we know $\bar{W}_i$, $b_g$, and $N_g$. 

Solving the optimization problem (\ref{eq:optimal_transport}) is expensive and a regularized version is commonly used \citep{cuturi2013sinkhorn}:
\begin{align*}
%\label{eq:optimal_transport_regularized}
\tilde{S}_{\epsilon}(\bm\mu,\bm v)&:= \min_{T\in\Pi(\bm\mu, \bm v)} \langle T, M \rangle + \epsilon \sum_{i,j} T_{ij} (\log T_{ij} - 1).
\end{align*}
Using the regularized version, an approximate gradient of the objective in (\ref{eq:optimal_transport}) with respect to input $\bm v$ can be computed (Algorithm 1 in \citet{luise2018differential}). 
% Yunfan: update description here. \mu does not depend on b_g. 
\end{proof}

Note the proof of Proposition~\ref{prop:differentiate_budget_allocation} already provides detailed procedures on gradient computations. When performing gradient updates, we project the learned budget onto the feasible region (we apply softmax normalization in the last layer of the neural network learning per group budget allocations). Thus, the budget constraint is satisfied throughout.

Since our pipeline relies on gradient updates, it would be desirable if the optimization landscape is ``nice''. Specifically, we will argue that a proxy of MNW that extends to non-integer values is concave. The analysis, tailored to RMAB problems, requires arguments to address that budgets are usually integer-valued. 
Observe that solving MNW (Problem~\ref{eq:mnw_obj}) is equivalent to solving the log of MNW (Problem~\ref{eq:mnw_obj_take_log}). For ease of exposition, we define under a fixed budget $b_g$ of a given group $g$:
\begin{align*}
h_g^{MNW}(b_g):=\max_{\{b_{g'}\}_{g'\in\mathcal{G}\setminus \{g\}}} \sum_{g'\in \mathcal{G}} \log V_{g'}(s_{g'},b_{g'}) \ \text{s.t.} \ \sum_{g'\in\mathcal{G}} b_{g'}=B.
\end{align*}
Since we cannot pull a fraction of an arm, $h_g^{MNW}(b_g)$ is only defined at integer-valued points $b_g\in\mathbb{Z}$; we therefore construct a proxy of $h_g^{MNW}(\cdot)$ that is defined on continuous values $ b_g\in\mathbb{R}$:
\begin{align*}
&\hat{h}_g^{MNW}(b_g) := h_g^{MNW}(\floor*{b_g}) \\
&+ (b_g - \floor*{b_g}) \cdot (h_g^{MNW}(\ceil*{b_g}) - h_g^{MNW}(\floor*{b_g}))
\end{align*}
Observe that $\hat h_g^{MNW}(b_g)=h_g^{MNW}(b_g)$ for $ b_g\in\mathbb{Z}$, and for non-integer $b_g$ the function $\hat h_g^{MNW}(\cdot)$ is a linear interpolation between the two nearest integer points. Consequently, $\hat h_g^{MNW}(\cdot)$ is piecewise linear. We now prove a result on the concavity of $\hat h_g^{MNW}(\cdot)$.

\begin{theorem}
\label{thm:mnw_convex_in_budget}
Assume $V_{\Delta}(b_g):=V_g(b_g+1)-V_g(b_g)$ is a decreasing function of $b_g$ for each group $g$. For any group $g\in\mathcal{G}$, $\hat h_g^{MNW}(\cdot)$ is concave. 
\end{theorem}

To give an example, we consider a setting with two groups $\mathcal{G}=\{g_1,g_2\}$. Since the total budget is $B$, it suffices to choose $b_{g_1}$, since $b_{g_2}=B-b_{g_1}$ then follows directly. From Theorem~\ref{thm:mnw_convex_in_budget}, we have that $\hat h_{g_1}^{MNW}(\cdot)$ is concave, implying that any local optimum is a global optimum. Thus, it suffices to start from any feasible integer value of $b_{g_1}$ and then iteratively move $b_{g_1}$ by one unit in the direction of steepest ascent until convergence. Since $b_{g_1}\in\{0,1,\dots,B\}$ has at most $B+1$ possible values, following gradient updates (line 4, Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based}), our procedure terminates in a finite number of iterations.  

{\bf Complexity.} 
Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based} evaluates gradients separately for each group $g\in\mathcal{G}$; each evaluation has complexity $O(N)$ \citep{luise2018differential}, so the total cost is $O(N|\mathcal{G}|)$. In contrast, Algorithm~\ref{alg:subroutine_budget_allo_greedy} has complexity $O(N\log (N)|\mathcal{G}|BH)$, where $H$ is the length of the trajectory used to compute $J(s_g,b_g)$. Specifically, the factor $|\mathcal{G}|B$ arises because we first compute $J(s_g,b_g)$ for each group and then greedily allocate one unit of budget at a time. The factor $N\log (N)H$ is the cost of computing $J(s_g,b_g)$ \citep{killian2023equitable}. 

\subsection{Different Fairness Objectives} 

While the first approach uses Algorithm~\ref{alg:subroutine_budget_allo_greedy} that heavily relies on the special properties of MNW and is not compatible with other fairness objectives, our differentiable pipeline using Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based} accommodates various fairness objectives. 

The MNW objective is a product (Equation~\ref{eq:mnw_obj}), which is differentiable with respect to group value functions. The Maximin Reward (MMR) objective employs a minimum operator, which is not differentiable. To address that, we approximate the minimum operator with the Hölder mean:
$$
f_p(x_1,...,x_k):=\left(\frac{1}{k}\sum_{i} x_i^p\right)^{\frac{1}{p}}, \quad \text{for} \quad p\rightarrow -\infty
$$
It is well-known that 
$$
\lim_{p\rightarrow -\infty} f_p(x_1,...,x_k)=\min(x_1,...,x_k).
$$
Thus, replacing MNW in Algorithms~\ref{alg:equitable_rmab_main} and~\ref{alg:subroutine_budget_allo_neural_net_based} with $f_p(\cdot)$ applied to the group values $V_g(s_g,b_g)$, we obtain a differentiable pipeline for the MMR objective. 
Next, we analyze the optimization landscape for MMR and argue that a proxy of it that extends to non-integer values is concave. Although the results are similar to those for MNW, the analysis tailored to MMR requires different arguments. We define under a fixed budget $b_g$ of a given group $g$:
\begin{align*}
h_g^{MMR}(b_g):=\max_{\{b_{g'}\}_{g'\in\mathcal{G}\setminus \{g\}}} \min_{g'\in \mathcal{G}} V_{g'}(s_{g'},b_{g'}) \ \text{s.t.} \ \sum_{g'\in\mathcal{G}} b_{g'}=B,
\end{align*}
and its proxy on continuous values $b_g\in\mathbb{R}$:
\begin{align*}
&\hat{h}_g^{MMR}(b_g) := h_g^{MMR}(\floor*{b_g}) \\
&+ (b_g - \floor*{b_g}) \cdot (h_g^{MMR}(\ceil*{b_g}) - h_g^{MMR}(\floor*{b_g}))
\end{align*}
We have the following result:
\begin{theorem}
\label{thm:maximin_convex_in_budget}
Assume $V_{\Delta}(b_g):=V_g(b_g+1)-V_g(b_g)$ is a decreasing function of $b_g$ for each group $g$. For any group $g\in\mathcal{G}$, $\hat h_g^{MMR}(\cdot)$ is concave. 
\end{theorem}
We conduct experiments on MNW and MMR. Note our pipeline is compatible with any objective that satisfies Assumption~\ref{assp:obj_differentiable_in_group_values}, including Gini Index (see Appendix~\ref{sec:appendix_algorithmic_details}).

%In implementation, the absolute value can be replaced by $\max\{x_i-x_j, x_j-x_i\}$. 





%Algorithm~\ref{alg:subroutine_budget_allo_softmax} replaces the greedy choice in Algorithm~\ref{alg:subroutine_budget_allo_greedy} by a softmax function, thus allowing the budget allocation to be differentiated through. 


% \begin{algorithm}[tb]
%     \caption{Budget Allocation (softmax)}
%     \label{alg:algorithm}
%     \textbf{Input}: Sorted Whittle indices $W$, groups $g\in\mathcal{G}$, budget $B$, states $\bm s$, reserved budgets $b_g^{\text{reserved}}$ 
%     \begin{algorithmic}[1] %[1] enables line numbers
%     \STATE Set budget $b_g=b_g^{\text{reserved}}$
%     \STATE Compute for each group $g\in\mathcal{G}$, \\$J_{\Delta}\left(\boldsymbol{s}_g, b_g\right)=\log \left(J(\boldsymbol{s}_g, b_g+1\right))-\log \left(J(\boldsymbol{s}_g, b_g)\right)$
%     \WHILE{there is budget remaining}
%     \STATE Sample a group $g$ from a softmax function \begin{align*}\sigma(g) = \frac{J_\Delta(s_g,b_g)}{\sum_{g'} J_\Delta(s_{g'},b_{g'})}.\end{align*}
%     \STATE Give one more budget to the group chosen and recompute $J_\Delta(s_g,b_g)$ for this group. 
%     \ENDWHILE
%     \STATE \textbf{return} budget allocation $b_g$
%     \end{algorithmic}
%     \label{alg:subroutine_budget_allo_softmax}
% \end{algorithm}





% \subsection{Properties of Fairness Objectives}

%Since our differentiable pipeline relies on gradient updates, it would be desirable if the optimization landscape is ``nice''. Specifically, for fairness objectives considered in experiments (MNW, MMR), we will argue proxies of them that extends to non-integer values, is concave. The analysis, tailored to RMAB problems, requires refined arguments to address that budgets are usually integer valued. 
%Notice that Problem~\ref{eq:mnw_obj} is a constrained nonlinear integer programming problem, since the per group budgets are integer valued (we cannot pull 0.5 arm).  Integer programs are NP-complete \citep{cook1998combinatorial} and in general cannot be solved using LP relaxation and then rounding to nearest integer as that may cause infeasibility \citep{wolsey2020integer}. However, we will show that Problem \ref{eq:mnw_obj} can be solved via gradient descent, and thus updating budget allocations using gradient of MNW with respect to $b_g$ is valid. 

% removing below as it weakens 4.2
%On top of the concavity property shown in Theorem~\ref{thm:mnw_convex_in_budget}, we use a soft-top-K operator \cite{xie2020differentiable} that assigns fractional weights to arms and approximates pulling top-K arms by sampling arms based on the weights. Thus, by applying the soft-top-K operator, we accommodate gradient updates to budgets that result in fractional values.





\section{Experiments}

%\subsection{Setup}
We consider the following baselines. The parameters to learn are the transition probabilities of the RMAB problem. The decision-focused learning (DF) methods learn transition probabilities by maximizing the decision objective. The objective, evaluated using importance sampling (see Appendix~\ref{sec:appendix_algorithmic_details}), varies across methods.

\begin{enumerate}
    \item \textbf{\citet{killian2023equitable} (Two Stage Learning + Greedy Budget Allocation)}: Transition probabilities are learned by maximizing the predictive accuracy. Group budgets are computed greedily (Algorithm~\ref{alg:subroutine_budget_allo_greedy}) where the Whittle index policy is applied for every group. 
    %Further, the budgets are allocated to different groups using the Greedy Budget Allocation algorithm (Algorithm 3). The interventions are then allocated to beneficiaries with the highest Whittle index within each group.
    \item \textbf{DF-NoFair (DFL+No Fairness)}: The decision objective here is total utility. Group budgets are not computed and the Whittle index policy is applied to the entire population. This method is equivalent to that described in \citet{wang2023scalable}. 
    \item \textbf{DF-PropB (DFL+Proportional Budget)}: The DFL decision objective here is total utility. We set group budgets proportional to group sizes and then apply the Whittle index policy for every group.
    \item \textbf{DF-GreedyB (DFL+Greedy Budget Allocation)}: The DFL decision objective here is a fairness metric. Group budgets are computed greedily (Algorithm~\ref{alg:subroutine_budget_allo_greedy}), and the Whittle index policy is then applied to every group. 
    \item \textbf{DF-LearnB (DFL+Learnable Budget Allocation)}: This is our differentiable pipeline. The DFL decision objective here is a fairness metric. Group budgets are learned using Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based}. The Whittle index policy is applied to every group. 
    
\end{enumerate}

% Since we have two components in our problem - learning transition probabilities and planning the allocation of resources, we consider the baselines with different strategies in each of these components. In learning, we consider the following methods:
% \begin{enumerate}
%     \item \textbf{Two-Stage Learning (TS)} where transition probabilities are learned by maximizing the predictive accuracy of learned transition probabilities. 
%     \item \textbf{Decision Focused Learning (DFL)} where the downstream optimization objective (total utility in the case of No Fairness and MNW elsewhere) is maximized instead of the intermediate step of maximizing predictive accuracy.
% \end{enumerate}
% On the budget allocation, we consider the following variations 
% \begin{enumerate}
%     \item \textbf{No Fairness (NoFair)} setting, where beneficiary group assignments are ignored and top-K beneficiaries are chosen from the complete population.
%     \item \textbf{Fairness through Proportional Budget (PropB)} where we allocate budget $b_g$ to group $g$ proportional to the group size and then allocate interventions to top-$b_g$ beneficiaries with the highest Whittle index within group $g$. % made notations consistent with preliminaries/algorithm section
%     \item \textbf{Fairness through Equitable Budget (EqB)} where we calculate budget allocation through (Algorithm 3) the greedy algorithm proposed by \citep{killian2023equitable}.
%     \item \textbf{Fairness through Learnable Budget (LearnB))} where we learn the budget allocation by backpropagating the gradient of decision quality w.r.t. $b_g$ through the soft-topK layer (Algorithm 2).
% \end{enumerate}
% 



% We conduct experiments on synthetic datasets with different total number of arms ($N \in (100,200,500)$), number of groups ($|\mathcal{G}| \in (2,3,4)$, different budgets ($B \in (0.2N, 0.4N, 0.6N)$) and different distributions of disadvantaged and majority groups. We also conduct experiments on a real-world dataset about a maternal health care program collected by a non-profit organization named ARMMAN. 
We conduct experiments on both synthetic and real-world large-scale datasets, which we describe below.

% {\bf Synthetic} The synthetic dataset models an RMAB problem with 2 states $\{0,1\}$, time horizon $T=10$, and a reward discount rate of $\gamma=0.99$. For different settings, we present in Table~\ref{table:mean_MNW} the respective number of arms $N$, the budget $B$, the number of groups, and the size of each group. We provide results on different distributions of disadvantaged and majority groups. We collect a unit reward at state 1 and 0 reward otherwise. We randomly generate the transition probabilities, while enforcing a constraint that pulling is strictly better than not pulling, for all groups. For some groups, the benefit of pulling over not pulling is larger than that for other groups, and we discuss additional details in the Appendix. Having sampled the transition probabilities, we map them to features in $\mathbb{R}^{16}$ using a randomly initialized neural network. We study an offline problem with historical data collected by running a random behavioral policy.  

% {\bf Synthetic} The synthetic dataset models an RMAB problem with 2 states $\{0,1\}$ for $N$ arms, $B$ budget, time horizon $T=10$, and a reward discount rate of $\gamma=0.99$. Within the population, we consider groups of arms with similar transition probabilities. For some groups, the benefit of pulling over not pulling is larger than that for other groups. We consider two different problem settings based on different group characteristics - i) where we have two groups with first group as severely disadvantaged. The groups form [50\%, 50\%] of the population. ii) where have three groups with first group severely disadvantaged and the second group moderately disadvantaged. The groups form [33\%, 33\%, 34\%] of the population. The disadvantaged groups are defined to have a lower benefit of pulling over not pulling than other groups.
% % the disadvantaged group and the other group both constitute $50\%$ of population, ii) where we have three groups with increasing benefit of pulling over not pulling and each forming a third of the population. 
% % For different settings, we present in Table~\ref{table:mean_MNW} the respective number of arms $N$, the budget $B$, the number of groups, and the size of each group. 
% We collect a unit reward at state 1 and 0 reward otherwise. We instantiate the experiments by randomly generating the transition probabilities, while enforcing the group characteristics and the constraint that pulling is strictly better than not pulling, for all groups. More details on synthetic data generation are in the Appendix. Having sampled the transition probabilities, we map them to features in $\mathbb{R}^{16}$ using a randomly initialized neural network. We study an offline problem with historical data collected by running a random behavioral policy.  

{\bf Synthetic} The synthetic dataset models an RMAB problem with 2 states $\{0,1\}$ for $N$ arms, $B$ budget, time horizon $T=10$, and a reward discount rate of $\gamma=0.99$. 
For each arm and time step, we collect a unit reward at state 1 and 0 reward otherwise.
We consider two settings with different group characteristics. \textbf{i) Two Groups}: One group is severely disadvantaged and there is a [50\%, 50\%] split between groups. \textbf{ii) Three Groups}: The first group is severely disadvantaged and the second group is moderately disadvantaged. There is a [33\%, 33\%, 34\%] split between groups. 
Compared to arms in the advantaged group, arms in disadvantaged groups obtain lower reward when not pulled but also obtain a lower increase in reward when pulled. 
Transition probabilities are randomly generated while enforcing that pulling is always strictly better than not pulling as well as the group characteristics. Having sampled the transition probabilities, we map them to features in $\mathbb{R}^{16}$ using a randomly initialized neural network. We study an offline problem with historical data collected by running a random behavioral policy. 

{\bf Real-world data} The dataset is collected by ARMMAN~\citep{armman-mhealth}, an NGO in India working on improving health awareness for expectant and new mothers. The program has enrolled over one million mothers, and health workers periodically make service calls to boost mothers' engagement in ARMMAN's health information program. Allocating limited service calls has been modeled as an RMAB problem with two actions (a health worker initiates a service call to the mother or not) and two states (engaging or not)~\citep{mate2022field,verma2023increasing}. Each beneficiary is modeled as a Markov Decision Process, with unknown transition dynamics that can be inferred from known features. Each week, mothers are in the engaging state if they listen to a health information voice message sent by ARMMAN for more than 30 seconds. Using a service quality improvement study of 44K mothers conducted by ARMMAN in January 2022 (see Appendix~\ref{sec:appendix_consent_data_usage} for data usage and consent), we compute mothers' empirical transition probabilities. In discussion with ARMMAN, we define 4 groups based on mothers' education, income, and phone ownership status (see Table~\ref{tab: risk} in Appendix~\ref{sec:appendix_risk_attributes}). These groups are in proportions [26\%, 38\%, 29\%, 7\%], and Groups C and D need more resources. 
%{sec:appendix_consent_data_usage}
Instead of giving service calls to mothers who respond better to service calls, ARMMAN aims to not leave out groups who do not respond as well. Finally, using mothers' empirical transition probabilities and group mapping, we run a simulated RMAB experiment for a subpopulation with $N=10000$ and $B=300$. 

% We evaluate our method based on a real-world use case of optimizing limited intervention allocation in a mobile health program. The program is run by ARMMAN ~\cite{armman-mhealth}, an NGO in India that works on improving health literacy for expectant and new mothers. The program has enrolled over one million beneficiaires. However, to boost beneficiaires' engagement, health workers have to periodically make service calls. The problem of how to allocating limited service call interventions has previously been modeled as an RMAB with two actions (call or not) and two states (engaging or not) ~\cite{mate2022field,verma2023increasing}.
% % SHould we cite previous works who model this problem 
% Specifically, each beneficiary is modelled as a Markov Decision Process, with unknown transition dynamics that can be predicted from the beneficiary characteristics. A beneficiary is defined to be in engaging state if she listens to a health information voice message sent by the NGO for more than 30 seconds.  
% Using a service quality improvement study of 44K beneficiaries conducted by ARMMAN in January 2022, we compute beneficiaries' empirical transition probabilities. In discussion with the NGO, we also define 4 beneficiary groups based on beneficiaries' education level, income level and phone ownership status (see Table~\ref{tab: risk} in Appendix).
% Finally, using beneficiaires' empirical transition probabilities and group mapping, we run a simulated RMAB experiment for a sample of beneficiaries with N=10000 and B=300. 

% We evaluate our method on an anonymized dataset on maternal and child health care collected by ARMMAN, an NGO in India that runs a large scale health information program with over one million enrolled beneficiaries. The dataset contains each beneficiary's features such as age, income, education level, language preference, etc. 
% %ARMMAN sends automated voice messages weekly to expecting or new mothers over cell phones and calls enrolled beneficiaries to ensure they do not stop listening to useful health information messages. ARMMAN can only initiate a limited number of service calls. 
% The problem of how to allocate the service calls is modelled as an RMAB with two actions (call or not) and two states (engaging or not).
% We model each beneficiary as a Markov Decision Process, with unknown transition dynamics that can be predicted from the beneficiary's demographic information (available features).  We define a beneficiary as engaging if she listens to a health information voice message sent by ARRMAN for more than 30 seconds. We wish to select a subset of beneficiaries each week to call to improve engagement. The historical trajectories are collected by running Round-Robin policy for 7 weeks with consent from beneficiaries \citep{mate2022field}.  Following \citep{wang2023scalable}, we randomly split the beneficiaries into 12 subsets each of size $N=639$ and budget $B=18$. We use 8,1, and 3 subsets for training, validation, and testing respectively. 


% \begin{table}
%     \centering
%     \begin{tabular}{lll}
%         \hline
%         Scenario  & $\delta$ & Runtime \\
%         \hline
%         Paris     & 0.1s     & 13.65ms \\
%         Paris     & 0.2s     & 0.01ms  \\
%         New York  & 0.1s     & 92.50ms \\
%         Singapore & 0.1s     & 33.33ms \\
%         Singapore & 0.2s     & 23.01ms \\
%         \hline
%     \end{tabular}
%     \caption{Latex default table}
%     \label{tab:plain}
% \end{table}

% \begin{table}
%     \centering
%     \begin{tabular}{lrr}
%         \toprule
%         Scenario  & $\delta$ (s) & Runtime (ms) \\
%         \midrule
%         Paris     & 0.1          & 13.65        \\
%                   & 0.2          & 0.01         \\
%         New York  & 0.1          & 92.50        \\
%         Singapore & 0.1          & 33.33        \\
%                   & 0.2          & 23.01        \\
%         \bottomrule
%     \end{tabular}
%     \caption{Booktabs table}
%     \label{tab:booktabs}
% \end{table}


% Please add the following required packages to your document preamble:
% \usepackage{booktabs}
% \usepackage{multirow}


% \begin{table}[h]
% \centering
% \caption{Risk attributes and their definitions. Each risk attribute contributes 1 to the risk score, resulting in risk score values between 0 and 3}
% \def\arraystretch{1.2}%
% % \def\arraystretch{1.5}%
% \resizebox{0.95\columnwidth}{!}{%
% \begin{tabular}{|p{2.2cm}|p{5cm}|}
% \hline
% \textbf{Risk Attribute} & \textbf{Definition}                             \\ \hline
% Low Income              & Monthly Family Income \\
% & \textless INR 15,000 (180 USD approx)                          \\ \hline
% Low Education           & Highest Education Level\\
% & matriculation (Grade 10) or below                       \\ \hline
% Phone Owner        & Phone not owned by beneficiairy                                      \\ \hline
% \end{tabular}
% }
% \label{tab: risk}
% \end{table}



% \begin{figure*}
%      \centering
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/synth_A_mnw.png}
%     \caption{MNW metric synthetic problem setting A}
%     \label{fig:synth_a_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/synth_B_mnw.png}
%          \caption{MNW metric synthetic problem setting B}
%          \label{fig:synth_b_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/synth_C_mnw.png}
%          \caption{MNW metric synthetic problem setting C}
%          \label{fig:synth_c_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/armman_mnw.png}
%          \caption{MNW metric real-world health information data experiment}
%          \label{fig:armman_mnw}
%      \end{subfigure}

%      \centering
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/synth_A_util.png}
%     \caption{Utility for synthetic problem setting A}
%     \label{fig:synth_a_util}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/synth_B_util.png}
%          \caption{Utility for synthetic problem setting B}
%          \label{fig:fig:synth_b_util}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/synth_C_util.png}
%          \caption{Utility for synthetic problem setting C}
%          \label{fig:fig:synth_c_util}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.23\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/armman_util.png}
%          \caption{Utility for real-world health information data experiment}
%          \label{fig:fig:synth_d_util}
%      \end{subfigure}
     
%         \caption{MaxNashWelfare (MNW) metric (a-d) and Utility metric (e-h) for different methods in synthetic and real-world health information data experiments. Synthetic experiments have N=200, B=40. Real-world data experiment has N=10000, B=300. The \% reduction in utility is calculated with respect to DF-NoFair, which pure optimizes utility. In ARMMAN (h), DF-LearnB has less than 1\% reduction in utility (total listenership averaged over all arms.}
%         \label{fig:results}
% \end{figure*}


% \begin{figure*}
%      \centering
%      \begin{subfigure}[b]{0.28\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/new_figs/synth_2grps_mnw.png}
%     \caption{MNW metric for synthetic problem setting with N=200, |G| = 2, budget = 20\%}
%     \label{fig:synth_a_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.28\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/new_figs/synth_3grps_mnw.png}
%          \caption{MNW metric for synthetic problem setting with N=500, |G| = 3, budget = 20\%}
%          \label{fig:synth_b_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.28\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/new_figs/armman_4grps_mnw.png}
%          \caption{MNW metric for ARMMAN problem setting with N=10000, |G| = 4, budget = 3\%}
%          \label{fig:synth_c_MNW}
%      \end{subfigure}
%      \hfill

     
%      \begin{subfigure}[b]{0.28\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/new_figs/synth_2grps_ope.png}
%     \caption{Utility for synthetic problem setting with N=200, |G| = 2, budget = 20\%}
%     \label{fig:synth_a_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.28\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/new_figs/synth_3grps_ope.png}
%          \caption{Utility for synthetic problem setting with N=500, |G| = 3, budget = 20\%}
%          \label{fig:synth_b_MNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.28\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/new_figs/armman_4grps_ope.png}
%          \caption{Utility for ARMMAN problem  with N=10000, |G| = 4, budget = 3\%}
%          \label{fig:synth_c_MNW}
%      \end{subfigure}
%      \hfill
     
%         \caption{MaxNashWelfare (MNW) metric (a-d) and Utility metric (e-h) for different methods in synthetic and real-world health information data experiments. Synthetic experiments have N=200, B=40. Real-world data experiment has N=10000, B=300. The \% reduction in utility is calculated with respect to DF-NoFair, which pure optimizes utility. In ARMMAN (h), DF-LearnB has less than 1\% reduction in utility (total listenership averaged over all arms.}
%         \label{fig:results}
% \end{figure*}


\begin{figure*}[ht]
     \centering
     \begin{subfigure}[b]{\linewidth}
         \centering
         \includegraphics[width=0.7\textwidth]{figures_final/labels_big.png}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario0_mnw.png}
    \caption{MNW metric, $|G|=2$}
    \label{fig:synth_a_MNW}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario3_mnw.png}
         \caption{MNW metric, $|G|=3$}
         \label{fig:synth_b_MNW}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario0_mmr.png}
         \caption{MMR metric, $|G|=2$}
         \label{fig:synth_a_MMR}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario3_mmr.png}
         \caption{MMR metric, $|G|=3$}
         \label{fig:synth_b_MMR}
     \end{subfigure}

     \centering
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario0_mnw_ope.png}
    \caption{Utility when maximizing MNW objective, $|G|=2$}
    \label{fig:synth_a_util}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario3_mnw_ope.png}
         \caption{Utility when maximizing MNW objective, $|G|=3$}
         \label{fig:fig:synth_b_util}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario0_mmr_ope.png}
         \caption{Utility when maximizing MMR objective, $|G|=2$}
         \label{fig:fig:synth_a_util_MMR}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.24\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/synth_scenario3_mmr_ope.png}
         \caption{Utility when maximizing MMR objective, $|G|=3$}
         \label{fig:fig:synth_b_util_MMR}
     \end{subfigure}
     
        \caption{We report the min-max normalized~\citep{henderi2021comparison} fair objective metric (a--d) and Utility (e--h) across 20 different seeds in synthetic data experiments. Here $N=200$, $B=80$ (see Appendix~\ref{sec:appendix_additional_exp_synthetic} for more comprehensive results).}
        \label{fig:synth_results}
\end{figure*}

\subsection{Experimental Results}

We consider two Fairness objectives: \textit{Max Nash-Welfare (MNW)} and \textit{Maximin Reward (MMR)}. We report the respective Fairness Objective and the \textit{Utility}, which is the total sum of rewards of all arms. We report the fairness metrics across 20 different seeds. For each seed, we generate 10 instances of RMAB problems for each setting and split the instances into 70\% training, 10\% validation, and 20\% testing sets. Since every seed value results in different best and worst results, we use a popular min-max normalization~\citep{henderi2021comparison,gajera2016effective}.

%We consider two Fairness objectives: a) \textit{Max Nash-Welfare (MNW)} and b) \textit{Maximin Reward (MMR)}. Additionally, we consider \textit{Utility} of the system as total sum of rewards for all beneficiaries. In all experiments, we report the Fairness Objective and the Utility. We generate 10 instances of RMAB problems for each setting in the synthetic and real-world data experiments. We split the instances into 70\% training, 10\% validation, and 20\% testing sets. Additionally, we repeat all experiments with 10 different seeds. Since every seed value results in a different best and worst results, we report the normalized fairness metrics in all our experiments. 


%We will start with one illustrative example and later provide comprehensive evaluations across various problem settings.

% In Table~\ref{table:mean_MNW}, we provide comprehensive evaluations across various problem settings. We present results for number of arms $N\in\{100,200\}$, which is sufficiently large to model real-world RMAB problems \citep{wang2023scalable,verma2023restless}. We evaluate methods under budgets $B\in\{0.2N, 0.4N, 0.6N\}$. The results demonstrate that our proposed algorithm DF-LearnB achieves 20-50\% gains in MNW objective across different problem settings.  The results demonstrate that advantage of differentiating through the budget allocation in network updates, and the need for our fully differentiable pipeline. 




\begin{figure}
     \centering
     \begin{subfigure}[b]{\linewidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/labels_big.png}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.49\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_unequal_mmr.png}
         \caption{MMR metric}
         \label{fig:armman_mmr}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.49\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_unequal_mmr_ope.png}
         \caption{Utility}
         \label{fig:armman_ope_mmr}
     \end{subfigure}
        \caption{Fair objective metric and Utility metric when optimizing for the MMR objective in real-world data experiments. For all experiments, we have $N=10000$, $B=300$ (see Appendix~\ref{sec:appendix_armman_group_sizes_results}, Figure~\ref{fig:armman_results_equal_size}, for corresponding results on MNW).}
        \label{fig:armman_results}
\end{figure}


In Figure~\ref{fig:synth_results}, we show results on synthetic data with different numbers of groups and fairness objectives, while in Figure~\ref{fig:armman_results}, we show results from real-world health information data experiments. Across all problem scenarios, we observe that DF-LearnB outperforms baselines in terms of fairness metric, achieving 20--40\% gains (see Tables~\ref{tab:synth_mnw} and~\ref{tab:synth_mmr} in the Appendix for a more detailed comparison).
% The results demonstrate that our proposed algorithm DF-LearnB achieves 20-50\% gains in MNW objective across different experiment settings. 
This demonstrates the advantage of differentiating through the budget allocation in network updates, and the need for our fully differentiable pipeline. 
%We also observe that our algorithm optimizes the equitable objective without severe efficiency losses (see Figure~\ref{fig:synth_results}(e-h)). Notably, w
When optimizing for MNW, DF-LearnB achieves utility comparable to DF baselines, including a state-of-the-art DF algorithm \citep{wang2023scalable} that purely optimizes utility. When optimizing for MMR, we see a much higher gain in fairness but at the expense of a noticeable drop in Utility (see Figure~\ref{fig:synth_results}(c-d,g-h)). This suggests that different fairness metrics have varying levels of fairness-utility tradeoffs. While DF-LearnB can effectively optimize for the desired objective, the utility cost depends on the objective and so the choice of the objective can be application-dependent. In Appendix~\ref{sec:appendix_additional_experimental_results}, we present additional results showing similar trends across different values of $N\in\{100,200,500\}$ and $B\in\{0.2N, 0.4N, 0.6N\}$.





% In Table~\ref{table:mean_MNW}, we provide comprehensive evaluations across various problem settings. We present results for number of arms $N\in\{100,200\}$, which is sufficiently large to model real-world RMAB problems \citep{wang2023scalable,verma2023restless}. We evaluate methods under budgets $B\in\{0.2N, 0.4N, 0.6N\}$. The results demonstrate that our proposed algorithm DF-LearnB achieves 20-50\% gains in MNW objective across different problem settings.  The results demonstrate that advantage of differentiating through the budget allocation in network updates, and the need for our fully differentiable pipeline. 
% In Figure~\ref{fig:armman} we show the results from the real-world health information data experiment. In particular, our proposed approach outperforms all baselines under the MaxNashWelfare objective (see Figure~\ref{fig:armman_meanMNW}). 
% In Figure~\ref{fig:armman_utility}, we provide comparisons in utility achieved by each method, showing that our algorithm optimizes the equitable objective without sacrificing efficiency. Notably, for each budget planning scheme (NoFair, PropB, EqB), the DF method achieves utility higher than the corresponding TS method, demonstrating the benefit of decision-focused-learning. In addition, our DF-LearnB achieves utility comparable to DF baselines, including a state-of-the-art DF algorithm \citep{wang2023scalable} that purely optimizes utility.
% \begin{figure}[ht!]
%     \centering
%     \includegraphics[width=0.95\columnwidth]{figures_final/image.png}
%     \caption{Budget allocation to disadvantaged group, across training epochs in synthetic data experiment with $|G|=2$.}
% \begin{figure}
%         \centering
%         \includegraphics[width=1\linewidth]{figures_final/image.png}
%         \caption{Enter Caption}
%         \label{fig:enter-label}
%     \end{figure}
%     \label{fig:budget}
% \end{figure}
\begin{figure}[ht!]
    \centering
    \includegraphics[width=0.9\columnwidth]{figures_final/image.png}
    \caption{Budget allocation to disadvantaged group, across training epochs in synthetic data experiment with $|G|=2$.}
    \label{fig:budget}
\end{figure}

\textbf{Learning Budget Allocation} To further analyze how DF-LearnB produces high fairness objective values, we showcase the allocation of budget across training epochs. Figure~\ref{fig:budget} illustrates results on \textbf{Synthetic} with the MNW objective (see Appendix~\ref{sec:appendix_additional_exp_synthetic} Figure~\ref{fig:armman_budget_equal} for results on MMR). While DF-LearnB slowly adds up the budget to the disadvantaged group alongside simultaneous learning of transition probabilities, DF-GreedyB has unstable updates and fails to allocate sufficient resources to the disadvantaged group. This demonstrates the importance of a fully differentiable pipeline. The budget allocation of \citet{killian2023equitable} is completely separated from the learning of transition probabilities, which thus leads to suboptimal budget allocation. Finally, the proportional group allocation is a fixed strategy allocating $50\%$ of the budget to the disadvantaged group, while the DF-NoFair strategy completely starves the disadvantaged group of all resources.
We make similar observations on \textbf{real-world ARMMAN data}. In Figure~\ref{fig:armman_budget}, we show the budget distribution when the MMR objective is used (see Appendix~\ref{sec:appendix_armman_group_sizes_results} Figure~\ref{fig:armman_budget_equal} for corresponding results on MNW). Notice that group sizes are unequal in this real-world dataset, as reflected in DF-PropB. Due to the nature of the MMR objective, ideal budget allocations should be fully oriented towards groups that are worst off and improve their situation as much as possible. Here Groups C and D need more resources. Notice that our DF-LearnB method prioritizes these groups, which are more in need, while other DF baselines fail to provide sufficient resources to them.
% To further analyze how DF-LearnB achieves higher MNW objective while other approaches achieve suboptimal results, we present the allocation of budget to disadvantaged group across the training epochs for all the methods (see Figure~\ref{fig:budget}). In particular, DF-LearnB slowly adds up budget to the disadvantaged group while simultaneously learning transition probabilities through Decision-Focused Learning. On the other hand, DF-GreedyB has unstable updates in budget allocation. This demonstrates the importance of simultaneously learning budget allocation and transition probabilities in a fully differentiable pipeline.   
% %We argue that this is because DF-GreedyB uses Algorithm~\ref{alg:subroutine_budget_allo_greedy} which is not end-to-end differentiable. As budget is changing through a non-differentiable function, the objective becomes non-stationary thus making it hard to learn the MaxNashWelfare objective as a function of changing budgets. 
% \citet{killian2023equitable} baseline has similar budget allocation as DF-LearnB. However, since it is based on Two-Stage learning, even a correct budget allocation can lead to suboptimal fairness metric, because the objective mismatch could lead to poor transition probability predictions. Finally, the proportional group allocation is a fixed strategy allocating $50\%$ of the budget to disadvantaged group while the DF-NoFair strategy completely starves the disadvantaged group of all resources.


% In Figure~\ref{fig:armman_budget}, we show the budget distribution in the real-world health information data problem when the fairness objective is MaxNash Welfare. We also show the percentile distribution of Whittle Index for every group in Figure~\ref{fig:percentile}. As Group-D has a much higher Whittle index for all beneficiaires as compared to the other group, DF-NoFair allocates most of the budget to this group. On the other hand, \citet{killian2023equitable} and DF-GreedyB allocate budgets close to proportional budgets. DF-LearnB learns a budget allocation strategy between these two, thus balancing fairness and utility.



\begin{figure}[ht]
    \centering
    \includegraphics[width=0.9\columnwidth]{figures_final/armman_unequal_mmr_budgets.png}
    \caption{Budget allocation to different risk groups (see Appendix Table~\ref{tab: risk}) in real-world health information data experiment.}
    \label{fig:armman_budget}
\end{figure}

% moved to appendix
% \begin{figure}[h]
%      \centering
%      \begin{subfigure}[b]{0.49\columnwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figures_final/armman_wh_0.png}
%          \caption{Whittle Index for intervening in Non-Engagaing State }
%          \label{fig:percentile_non_engaging}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.49\columnwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figures_final/armman_wh_1.png}
%          \caption{Whittle Index for intervening in Engaging State}
%          \label{fig:percentile_engaging}
%      \end{subfigure}
%         \caption{Whittle Index for different risk groups over top-k percentile}
%         \label{fig:percentile}
% \end{figure}

% \subsection{Decision Focused Evaluation}
\textbf{Decision Focused Evaluation}
To evaluate how close a method is to an optimal top-$k$ selection, we analyze the rank correlation between Whittle indices computed from ground truth transition probabilities and those computed from learned probabilities. Since the top-$k$ selection is performed for every group, we compute rank correlations at a group level and report the Average Group Rank Correlation (AGRC) over all groups and states.
% In order to judge how close a method is to true top-k selection decision, we look at rank correlation between true Whittle indices and learnt Whittle indices. Additionally, since the top-k selection is performed for every group, we compute rank correlations at a group level and report the Average Group Rank Correlation (AGRC) over all groups and states. 
\begin{equation}
    \text{AGRC} = \frac{1}{|G||S|} \sum_{s\in S, g\in G} \textit{Spearmanr}(W_s^g, \hat{W_s^g})
\end{equation}
where $W_s^g$ and $\hat{W_s^g}$ are respectively the true and predicted Whittle index lists for group $g$ in state $s$, and $\textit{Spearmanr}$ is Spearman's rank correlation coefficient.

We find that for the synthetic data experiment optimizing the MNW metric with $|G| = 2$, the AGRC is 0.114 for \citet{killian2023equitable} and 0.307 for our DF-LearnB. The low correlation values indicate that it is a hard problem to retrieve true transition probabilities and thus the true Whittle index ordering for all arms within a group. However, the larger AGRC value for DF-LearnB clearly shows that DF-LearnB learns a better ordering of arms than \citet{killian2023equitable}, resulting in decisions that yield higher fairness objective values.
% We find that for the synthetic data experiment optimizing the MNW metric with $|G| = 2$, the AGRC for \citet{killian2023equitable} is 0.114, while for DF-LearnB, this value is 0.307. This clearly shows that DF-LearnB learns a better ordering of beneficiaries as compared to \citet{killian2023equitable} resulting in a top-k selection that yields higher fairness objective values. %utility per group.


\textbf{Runtime}
In Figure~\ref{fig:runtime_n} we demonstrate that our DF-LearnB has runtime substantially smaller than \citet{killian2023equitable} and comparable to baselines that ignore fairness. Specifically, DF-LearnB runtime is linear in the number of arms. In Figure~\ref{fig:runtime_groups}, we show DFL approaches that incorporate fairness have runtime linear in the number of groups, which aligns with our theoretical analysis (see Section~\ref{sec:method_differentiable_pipeline}).

%In Figure~\ref{fig:runtime_n} we demonstrate that our DF-LearnB has runtime substantially smaller than \citet{killian2023equitable} and comparable to alternative approaches that ignores fairness. Specifically, DF-LearnB runtime is linear in the number of arms. In Figure~\ref{fig:runtime_groups}, we show runtime with a changing number of groups. There is a linear increase runtime for all fair DFL approaches because, for computing Max Nash Welfare objective, one needs to compute utility per group and then take a product. This creates branches in the computation graph. Consequently, the step of computing the gradient through the \textit{soft-Top-K} layer is performed $|\mathcal{G}|$ times. 

% In Figure~\ref{fig:meanMNW_runtime}, we provide wall-clock time comparisons, demonstrate that our Algorithms achieve stronger performance than baselines with only moderate increase in runtime. The increase in runtime is because that one needs to compute utility per group and then take a product to get Max Nash Welfare objective. This creates branches in the computation graph. Specifically, an  expensive step of computing the gradient through softTopK layer is performed $|\mathcal{G}|$ times. Notice that our DF-LearnB algorithm has runtime only slightly higher than decision-focused-learning approaches that naively incorporate fairness (DF-PropB and DF-EqB), while achieving significantly higher equitable objective (see Figure~\ref{fig:meanMNW} and Table~\ref{table:mean_MNW}).

% \begin{figure}
%     \centering
%     \includegraphics[width=0.95\columnwidth]{figs/runtime.png}
%     \caption{Runtime for different DFL approaches.}
%     \label{fig:meanMNW_runtime}
% \end{figure}

% \begin{figure}
%   \centering
%   \begin{minipage}{0.4\columnwidth}
%     \includegraphics[width=\textwidth]{figs/runtime_n.png}
%     \caption{Flower one.}
%   \end{minipage}
%   % \hfill
%   \begin{minipage}{0.4\columnwidth}
%     \includegraphics[width=\textwidth]{figs/runtime_budget.png}
%     \caption{Flower two.}
%   \end{minipage}
% \end{figure}

\begin{figure}[h!]
     \centering
     \begin{subfigure}[b]{0.43\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/runtime_arms.png}
         \caption{Runtime per epoch with changing number of arms}
         \label{fig:runtime_n}
     \end{subfigure}
     \hfill
     \begin{subfigure}[b]{0.56\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/runtime_groups.png}
         \caption{Runtime per epoch with changing number of groups}
         \label{fig:runtime_groups}
     \end{subfigure}
        \caption{Runtime comparison of different algorithms}
        \label{fig:runtime}
\end{figure}


% In Figure~\ref{fig:meanMNW_utility}, we provide comparisons in utility achieved by each method, showing that our algorithm optimizes the equitable objective without sacrificing efficiency. Notably, for each budget planning scheme (NoFair, PropB, EqB), the DF method achieves utility higher than the corresponding TS method, demonstrating the benefit of decision-focused-learning. In addition, our DF-LearnB achieves utility comparable to DF baselines, including a state-of-the-art DF algorithm \citep{wang2023scalable} that purely optimizes utility.


% \begin{figure}
%     \centering
%     \includegraphics[width=0.95\columnwidth]{figs/normalized_utility.png}
%     \caption{Normalized population level OPE (utility) for different proposed methods. For each method, we report average and standard deviation over all problem settings listed in Table~\ref{table:mean_MNW}.}
%     \label{fig:meanMNW_utility}
% \end{figure}

% In Figure~\ref{fig:armman} we show the results from the real-world health information data experiment. In particular, our proposed approach outperforms all baselines under the MaxNashWelfare objective (see Figure~\ref{fig:armman_meanMNW}). 
% In Figure~\ref{fig:armman_utility}, we provide comparisons in utility achieved by each method, showing that our algorithm optimizes the equitable objective without sacrificing efficiency. Notably, for each budget planning scheme (NoFair, PropB, EqB), the DF method achieves utility higher than the corresponding TS method, demonstrating the benefit of decision-focused-learning. In addition, our DF-LearnB achieves utility comparable to DF baselines, including a state-of-the-art DF algorithm \citep{wang2023scalable} that purely optimizes utility.

% \begin{figure}
%     \centering
%     \includegraphics[width=0.95\columnwidth]{figs/armman_normalized_reward.png}
%     \caption{}
%     \label{fig:armman_meanMNW}
% \end{figure}

% \begin{figure}
%      \centering
%      \begin{subfigure}[b]{0.49\columnwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/armman_normalized_reward.png}
%     \caption{Normalized MaxNashWelfare objective for real-world health information data Experiment for different proposed methods}
%     \label{fig:armman_meanMNW}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[b]{0.49\columnwidth}
%          \centering
%          \includegraphics[width=\textwidth]{figs/armman_normalized_utility.png}
%          \caption{ Percentage reduction in population-level utility (OPE) as compared to maximum achieved utility.}
%          \label{fig:armman_utility}
%      \end{subfigure}
%         \caption{MaxNashWelfare objective and cost of fairness for different methods in the real-world health information data experiment}
%         \label{fig:armman}
% \end{figure}




\section{Conclusion}
We provide a novel decision-focused-learning pipeline for equitable RMABs, to prevent disparity between groups. Our algorithm simultaneously learns transition probabilities and per-group budget allocation. We propose a novel differentiable budget allocation method and provide theoretical results that shed light on the feasibility and effectiveness of this method. Notably, our techniques including the budget allocation method and the differentiable pipeline can be used to incorporate various fairness notions such as Max Nash Welfare, Maximin, and Gini Index. Our empirical results on both synthetic and real-world large-scale RMAB problems demonstrate that our method significantly improves performance as measured in an equitable objective, with little sacrifice in utility. 

\section{Acknowledgements}
This work was partially supported by the Harvard Data Science Initiative, ARO (W911NF-18-1-0208), and DARPA (HR001122C0182). The views and conclusions contained in this document are those of the authors and should not be interpreted as representing the official policies, either expressed or implied, of ARO, DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for Government purposes notwithstanding any copyright notation herein.
% References
\bibliography{uai2024-template}



\newpage

\onecolumn

\title{Group Fairness in Predict-Then-Optimize Settings for Restless Bandits\\(Supplementary Material)}
\maketitle

\appendix

\section{Additional Algorithm Details}
Figure~\ref{fig:dfl_pipeline} provides an overview of the pipeline, illustrating the difference between two-stage training and decision-focused-learning. 
\begin{figure*}[h]
    \centering
    \includegraphics[width=0.95\columnwidth]{figs/fair_dfl_pipeline.png}
    \caption{This figure illustrates different methods to train the predictive model $f_w(\cdot)$. Two-stage training uses the predictive loss to perform gradient updates. In contrast, equitable decision-focused-learning backpropagates through the entire pipeline of computing equitable budget allocation and formulating an equitable policy.}
    \label{fig:dfl_pipeline}
\end{figure*}

\label{sec:appendix_algorithmic_details}
\subsection{Gradients with respect to Whittle Indices}

We define the value function for an arm $i\in[N]$ as
\begin{align}
\label{eq:value_function_each_arm}
V_i^{\lambda}(s) = \max_{a_i\in\{0,1\}} Q_i(s,a_i,\lambda).
\end{align}
The value function captures the discounted cumulative reward an arm could receive, assuming all future actions are taken optimally. Recall the Whittle index associated to the state $s_i$ is:
\begin{align*}
W_i(s_i):=\inf_{m}\left\{Q_i(s_i,a_i=0,m) = Q_i(s_i,a_i=1,m)\right\}.
\end{align*}
We obtain for each arm $i\in[N]$ that 
\begin{align}
\label{eq:system_of_inequalities}
V_i^{\lambda}(s) \geq\left\{\begin{array}{l}
\lambda + R(s) + \beta \sum_{s^{\prime} \in \mathcal{S}} P_i\left(s, a=0, s^{\prime}\right) V_i^{\lambda}\left(s^{\prime}\right) \\
R(s) + \beta \sum_{s^{\prime} \in \mathcal{S}} P_i\left(s, a=1, s^{\prime}\right) V_i^{\lambda}\left(s^{\prime}\right)
\end{array}\right.
\end{align}

The above system of inequalities can be rewritten in matrix form \citep{wang2023scalable}. Picking only rows where equality holds, we obtain

\begin{equation}
\label{eq:system_of_equalities}
\resizebox{0.5\hsize}{!}{%
$A\left[\begin{array}{ll}
\mathbf{1}_M & \beta P_i(\mathcal{S}, a=0, \mathcal{S})-I_M \\
\mathbf{0}_M & \beta P_i(\mathcal{S}, a=1, \mathcal{S})-I_M
\end{array}\right]\left[\begin{array}{c}
\lambda \\
V_i^{\lambda}
\end{array}\right]=-A\left[\begin{array}{l}
r(S) \\
r(S)
\end{array}\right] $%
}
\end{equation}

where the binary matrix $A \in\{0,1\}^{(M+1) \times 2 M}$ has row sums all equal to one. The above system of equations is full-rank and its solution is the Whittle index \citep{wang2021learning}. Thus, equation~\ref{eq:system_of_equalities} allows us to compute $\frac{dW}{dP}$, the gradient of Whittle indices with respect to transition probability predictions. 

\subsection{Importance Sampling}
In offline learning, importance sampling is commonly used to evaluate a target policy distinct from a behavioral policy (or a mixture of behavioral policies) that generates the offline dataset \citep{horvitz1952generalization,tokdar2010importance}. Specifically, the inverse propensity scores are defined as 
\begin{align}
\label{eq:inverse_propensity_score_simple}
\frac{\mathbb{I}\{a_i=a\}}{\mu(a_i|s_i)},
\end{align}
where $\mu(a_i|s_i)$ denotes the probability we choose action $a_i$ on state $s_i$, following the behavioral policy. 

More generally, given a target policy $\pi(\cdot,\cdot)$, one may use the following version of inverse propensity score:
\begin{align}
\label{eq:inverse_propensity_score_general}
\frac{\pi(a_i|s_i)}{\mu(a_i|s_i)}.
\end{align}

Let $\hat{R}\left(s_i, a\right)$ denote the reward estimates obtained using inverse propensity scores. Let $\mathcal{A}$ denote the action space. Notably, $\hat{R}\left(s_i, a\right)$ is unbiased: 
\begin{align*}
\mathbb{E}\left[\hat{R}\left(s_i, a\right) \mid s_i, a\right] & =\sum_{a^{\prime} \in \mathcal{A}} \mu\left(a^{\prime} \mid s_i\right) \mathbb{E}\left[\hat R\left(s_i, a\right) \mid s_i, a, a_i=a^{\prime}\right]  = \sum_{a^{\prime} \in \mathcal{A}} \mu\left(a^{\prime} \mid s_i\right) r\left(s_i, a^{\prime}\right) \frac{\mathbf{1}\left(a^{\prime}=a\right)}{\mu\left(a^{\prime} \mid s_i\right)}  = r\left(s_i, a\right) .
\end{align*}
For simplicity, in the above derivations we used the simpler notion of inverse propensity scores \eqref{eq:inverse_propensity_score_simple}, and we assume that reward observations $r(s_i,a)$ are without noise. Similar results of unbiasedness can be obtained for the more general notion \eqref{eq:inverse_propensity_score_general} \citep{tokdar2010importance}. When there is noise in reward observations, one may substitute $r(\cdot,\cdot)$ above by the expected reward and obtain a similar result. 


\subsection{The Gini Index Objective}
\label{sec:appendix_other_fairness_objectives}

One could replace the MaxNashWelfare (MNW) objective in Algorithms~\ref{alg:equitable_rmab_main} and~\ref{alg:subroutine_budget_allo_neural_net_based} with the Gini Index objective
\[
\frac{\sum_{i=1}^N\sum_{j=1}^N |x_i-x_j|}{2N^2\bar{x}},
\]
where $\bar{x}=\frac{1}{N}\sum_i x_i$. In implementation, the absolute value can be replaced by $\max\{x_i-x_j, x_j-x_i\}$. For equity in RMABs, we apply the objective on group values $V(s_g,b_g)$. 


% {\bf Maximin}

% To use our pipeline with Maximin objective, one only needs to replace the MNW objective in Algorithm~\ref{alg:equitable_rmab_main} by a differentiable approximation of the minimum operator, and then take gradients (line 12 in Algorithm~\ref{alg:equitable_rmab_main} and line 4 in Algorithm~\ref{alg:subroutine_budget_allo_neural_net_based}) in the same fashion as before. One choice of a differentiable approximation of the minimum operator is the Hölder mean or generalized mean:
% $$
% f_p(x_1,...,x_n):=\left(\frac{1}{n}\sum_{i} x_i^p\right)^{\frac{1}{p}}, \quad \text{for} \quad p\rightarrow -\infty
% $$
% It is well-known that 
% $$
% \lim_{p\rightarrow -\infty} f_p(x_1,...,x_n)=\min(x_1,...,x_n).
% $$
% Here for equity in RMABs, we plug in $V(s_g,b_g)$ for $x_i$'s. 

% {\bf Gini index}

% Similar to changes needed to use the Maximin objective, to use our pipeline with the Gini index we replace the MNW objective with 

% $$
% \frac{\sum_{i=1}^N\sum_{j=1}^N |x_i-x_j|}{2N^2\bar{x}},
% $$
% where $\bar{x}=\frac{1}{N}\sum_i x_i$. In implementation, the absolute value can be replaced by $\max\{x_i-x_j, x_j-x_i\}$. Again, for equity in RMABs, we plug in $V(s_g,b_g)$ for $x_i$'s. 



\section{Additional Experimental Details and Results}
\label{sec:appendix_additional_experimental_results}

\subsection{Implementation and Hyperparameters}
When implementing the gradient computations described in the proof of Proposition~\ref{prop:differentiate_budget_allocation}, we used the Python Optimal Transport library 
\citep{flamary2021pot}. For soft top-$k$ selection, we use $\epsilon = 0.01$. We use 16-dimensional features that are correlated with the transition probabilities. For learning the mapping between features and transition probabilities, we use a three-layer fully connected neural network with dropout to prevent overfitting. We use learning rate $\text{lr} = 0.001$ for updating the weights of the neural network.
We run all experiments with time horizon $H=10$ and discount factor $\gamma = 0.99$. We run synthetic experiments for 20 epochs and ARMMAN experiments for 30 epochs.

\subsection{Synthetic Dataset}
\label{sec:appendix_additional_exp_synthetic}
% In the main paper, we reported the normalized fairness objective, which is obtained by dividing the MNW metric by the maximum value across all methods. 
Here we provide a more comprehensive set of results, and we report the mean and the standard deviation of fairness objectives for all methods in Tables \ref{tab:synth_mnw} and \ref{tab:synth_mmr}. We observe that our proposed method DF-LearnB substantially outperforms baselines, including both two-stage methods and decision-focused-learning methods. 

In synthetic experiments, we set up a disadvantaged group such that the benefit of pulling an arm over not pulling it is strictly lower for the disadvantaged group than for the advantaged group, and, if we do not act on any arms, the disadvantaged group obtains a much lower reward than the advantaged group. Hence, in this setting, a method purely maximizing utility would favor arms from the advantaged group because they have higher Whittle indices. However, this would induce a high fairness penalty, because not giving interventions to the disadvantaged group creates high inequality across groups. 

% with three two-stage baselines including:
% \begin{itemize}
% \item \textbf{\citet{killian2023equitable} (Two Stage Learning + Greedy Budget Allocation)}: transition probabilities are learned by maximizing the predictive accuracy. Group budgets are computed greedily (Algorithm~\ref{alg:subroutine_budget_allo_greedy}) then apply Whittle index policy for every group. 
% %Further, the budgets are allocated to different groups using the Greedy Budget Allocation algorithm (Algorithm 3). The interventions are then allocated to beneficiaries with the highest Whittle index within each group.
% \item \textbf{TS-NoFair (No Fairness)}: transition probabilities are learned by maximizing the predictive accuracy. Group budgets are not computed and Whittle Index policy is applied to the entire population.
% \item \textbf{TS-PropB (Proportional Budget)}: transition probabilities are learned by maximizing the predictive accuracy. We set group budgets proportional to group sizes and then apply Whittle index policy for every group
% \end{itemize}

% \begin{figure}[htb]
%     \centering
%     \includegraphics[width=0.5\columnwidth]{figs/normalized_reward.png}
%     \caption{Normalized MaxNashWelfare Objective for different proposed methods. For each method, we report average and standard deviation over all problem settings listed in Table~\ref{table:mean_MNW}.}
%     \label{fig:meanMNW}
% \end{figure}



% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{table}[]
\caption{A comparison of the MaxNashWelfare objective for different methods, under different numbers of arms $N$, numbers of groups $|G|$, and budgets $B$. The results illustrate that our DF-LearnB algorithm consistently outperforms baselines, often achieving 20--40\% gains in performance.} % UAI requires caption to be above table
\def\arraystretch{1.5}%
\resizebox{\columnwidth}{!}{%
% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{tabular}{|r|r|r|r|r|r|r|r|}
\hline
\textbf{N}           & $\bm{|G|}$         & \textbf{B} & \textbf{DF-GreedyB}    & \textbf{DF-LearnB}              & \textbf{DF-NoFair}            & \textbf{DF-PropB}      & \textbf{Killian et al. (2023)} \\ \hline
\multirow{6}{*}{100} & \multirow{3}{*}{2} & 20         & 1844.4395$\pm$26.58    & \textbf{2075.7222$\pm$14.98}    & 1855.7535$\pm$19.28           & 1964.265$\pm$13.66     & 1509.0015$\pm$27.26            \\ \cline{3-8} 
                     &                    & 40         & 3099.4233$\pm$31.0     & \textbf{3201.858$\pm$32.83}     & 2848.6445$\pm$35.56           & 2967.8445$\pm$24.53    & 2554.0105$\pm$22.52            \\ \cline{3-8} 
                     &                    & 60         & 4331.004$\pm$70.77     & \textbf{4899.134$\pm$35.32}     & 4543.9746$\pm$79.88           & 4311.2197$\pm$41.29    & 3781.4805$\pm$30.23            \\ \cline{2-8} 
                     & \multirow{3}{*}{3} & 20         & 9772.077$\pm$180.51    & \textbf{10345.396$\pm$113.55}   & 9554.029$\pm$136.11           & 9294.717$\pm$70.64     & 7410.884$\pm$262.82            \\ \cline{3-8} 
                     &                    & 40         & 18594.375$\pm$332.59   & 20877.344$\pm$350.93            & \textbf{20877.727$\pm$322.38} & 17690.48$\pm$244.01    & 14015.834$\pm$291.23           \\ \cline{3-8} 
                     &                    & 60         & 30537.754$\pm$518.93   & 33922.445$\pm$639.43            & \textbf{34160.22$\pm$628.87}  & 30166.115$\pm$487.45   & 24601.406$\pm$494.18           \\ \hline
\multirow{6}{*}{200} & \multirow{3}{*}{2} & 40         & 7598.952$\pm$94.18     & \textbf{8109.785$\pm$56.59}     & 7457.035$\pm$73.44            & 7907.203$\pm$56.18     & 6322.539$\pm$94.12             \\ \cline{3-8} 
                     &                    & 80         & 6121.2$\pm$982.21      & \textbf{6416.3003$\pm$1029.48}  & 4779.436$\pm$939.67           & 6033.884$\pm$967.59    & 5080.54$\pm$815.24             \\ \cline{3-8} 
                     &                    & 120        & 17639.025$\pm$133.53   & \textbf{19646.81$\pm$176.58}    & 18649.248$\pm$219.88          & 17160.963$\pm$119.82   & 15324.928$\pm$141.23           \\ \cline{2-8} 
                     & \multirow{3}{*}{3} & 40         & 76300.6$\pm$1025.35    & \textbf{82425.66$\pm$818.74}    & 77052.31$\pm$885.67           & 77905.2$\pm$841.42     & 59414.332$\pm$675.85           \\ \cline{3-8} 
                     &                    & 80         & 73718.67$\pm$11895.45  & \textbf{85024.91$\pm$13699.44}  & 61591.492$\pm$12132.29        & 71157.48$\pm$11506.2   & 56691.207$\pm$9164.64          \\ \cline{3-8} 
                     &                    & 120        & 241364.95$\pm$4926.81  & \textbf{284413.8$\pm$4135.75}   & 255120.45$\pm$4803.43         & 243100.05$\pm$2427.86  & 195661.33$\pm$1573.02          \\ \hline
\multirow{6}{*}{500} & \multirow{3}{*}{2} & 100        & 47710.535$\pm$335.0    & \textbf{51242.055$\pm$146.56}   & 46619.305$\pm$271.52          & 49934.93$\pm$210.22    & 39057.38$\pm$267.54            \\ \cline{3-8} 
                     &                    & 200        & 79310.58$\pm$571.92    & \textbf{81587.65$\pm$462.29}    & 73139.51$\pm$731.59           & 76560.4$\pm$609.23     & 63439.438$\pm$293.03           \\ \cline{3-8} 
                     &                    & 300        & 111593.96$\pm$824.02   & \textbf{122485.984$\pm$634.14}  & 111663.516$\pm$1183.26        & 107888.336$\pm$606.95  & 94599.58$\pm$286.63            \\ \cline{2-8} 
                     & \multirow{3}{*}{3} & 100        & 1185198.8$\pm$9542.08  & \textbf{1283090.5$\pm$7886.83}  & 1173831.1$\pm$7249.73         & 1235366.4$\pm$9119.11  & 919897.7$\pm$14505.66          \\ \cline{3-8} 
                     &                    & 200        & 2316630.8$\pm$22941.54 & \textbf{2569942.0$\pm$28927.02} & 2473116.5$\pm$17234.65        & 2297424.5$\pm$25828.66 & 1742517.0$\pm$12990.4          \\ \cline{3-8} 
                     &                    & 300        & 3892794.8$\pm$41722.41 & \textbf{4434235.0$\pm$40364.05} & 4183696.0$\pm$50922.18        & 3855896.0$\pm$20056.38 & 3067652.0$\pm$17616.07         \\ \hline
\end{tabular}
}
\label{tab:synth_mnw}
\end{table}


% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{table}[]
\caption{A comparison of the Maximin Reward objective for different methods, under different numbers of arms $N$, numbers of groups $|G|$, and budgets $B$. The results illustrate that our DF-LearnB algorithm consistently outperforms baselines, often achieving 20--40\% gains in performance.}
\centering
\def\arraystretch{1.5}%
\resizebox{0.8\columnwidth}{!}{%
\begin{tabular}{|r|r|r|r|r|r|r|r|}
\hline
\textbf{N}           & $\bm{|G|}$         & \textbf{B} & \textbf{DF-GreedyB} & \textbf{DF-LearnB}       & \textbf{DF-NoFair} & \textbf{DF-PropB} & \textbf{Killian et al. (2023)} \\ \hline
\multirow{6}{*}{100} & \multirow{3}{*}{2} & 20         & 94.25$\pm$2.14      & \textbf{102.5$\pm$0.76}  & 66.92$\pm$0.56     & 88.17$\pm$0.85    & 74.33$\pm$1.45                 \\ \cline{3-8} 
                     &                    & 40         & 103.67$\pm$4.17     & \textbf{145.17$\pm$1.15} & 96.0$\pm$2.32      & 108.25$\pm$0.55   & 108.17$\pm$0.79                \\ \cline{3-8} 
                     &                    & 60         & 137.75$\pm$4.24     & \textbf{171.58$\pm$1.02} & 164.75$\pm$1.72    & 139.25$\pm$1.73   & 133.75$\pm$1.43                \\ \cline{2-8} 
                     & \multirow{3}{*}{3} & 20         & 55.61$\pm$1.15      & \textbf{78.78$\pm$1.05}  & 51.22$\pm$0.86     & 61.78$\pm$0.61    & 61.39$\pm$1.26                 \\ \cline{3-8} 
                     &                    & 40         & 84.33$\pm$2.82      & \textbf{102.72$\pm$1.18} & 65.17$\pm$1.07     & 78.94$\pm$0.95    & 73.89$\pm$1.91                 \\ \cline{3-8} 
                     &                    & 60         & 94.72$\pm$2.72      & \textbf{114.06$\pm$1.24} & 107.61$\pm$0.94    & 96.5$\pm$1.53     & 87.0$\pm$2.1                   \\ \hline
\multirow{6}{*}{200} & \multirow{3}{*}{2} & 40         & 191.83$\pm$1.39     & \textbf{205.17$\pm$2.36} & 140.17$\pm$1.82    & 177.33$\pm$1.5    & 156.83$\pm$2.52                \\ \cline{3-8} 
                     &                    & 80         & 237.12$\pm$5.51     & \textbf{294.42$\pm$2.37} & 197.96$\pm$5.0     & 221.38$\pm$1.8    & 218.19$\pm$2.59                \\ \cline{3-8} 
                     &                    & 120        & 297.67$\pm$5.23     & \textbf{350.67$\pm$3.19} & 336.83$\pm$3.07    & 287.33$\pm$3.57   & 279.83$\pm$2.6                 \\ \cline{2-8} 
                     & \multirow{3}{*}{3} & 40         & 112.67$\pm$1.33     & \textbf{156.56$\pm$1.68} & 96.44$\pm$1.04     & 120.89$\pm$1.57   & 120.44$\pm$1.32                \\ \cline{3-8} 
                     &                    & 80         & 162.62$\pm$4.27     & \textbf{214.69$\pm$1.71} & 134.95$\pm$6.53    & 153.31$\pm$2.01   & 151.49$\pm$2.65                \\ \cline{3-8} 
                     &                    & 120        & 209.56$\pm$3.5      & \textbf{236.89$\pm$3.18} & 228.44$\pm$2.5     & 189.44$\pm$2.56   & 176.0$\pm$2.19                 \\ \hline
\multirow{6}{*}{500} & \multirow{3}{*}{2} & 100        & 495.83$\pm$2.05     & \textbf{527.08$\pm$2.04} & 340.83$\pm$2.57    & 445.0$\pm$2.09    & 395.0$\pm$3.5                  \\ \cline{3-8} 
                     &                    & 200        & 647.08$\pm$7.77     & \textbf{740.0$\pm$1.73}  & 476.25$\pm$13.8    & 550.83$\pm$3.27   & 550.83$\pm$4.63                \\ \cline{3-8} 
                     &                    & 300        & 749.58$\pm$3.96     & \textbf{863.75$\pm$3.06} & 830.0$\pm$2.89     & 712.92$\pm$2.82   & 657.08$\pm$3.23                \\ \cline{2-8} 
                     & \multirow{3}{*}{3} & 100        & 327.5$\pm$4.22      & \textbf{406.67$\pm$3.18} & 239.44$\pm$1.87    & 303.06$\pm$3.03   & 307.78$\pm$3.14                \\ \cline{3-8} 
                     &                    & 200        & 418.89$\pm$11.0     & \textbf{548.33$\pm$3.18} & 303.61$\pm$2.01    & 381.39$\pm$1.71   & 384.72$\pm$2.19                \\ \cline{3-8} 
                     &                    & 300        & 525.0$\pm$5.39      & \textbf{596.67$\pm$2.04} & 579.44$\pm$2.31    & 488.61$\pm$2.99   & 438.61$\pm$3.13                \\ \hline
\end{tabular}
}
\label{tab:synth_mmr}
\end{table}

\clearpage

\subsection{Real-world ARMMAN Dataset}
\label{sec:appendix_armman_results}
\subsubsection{Secondary Analysis}
Our experiment falls into the category of secondary analysis of the data shared by ARMMAN. 
This paper does not involve the deployment of the proposed algorithm or any other baselines to the service call program. As noted earlier, the experiments are secondary analysis with approval from the ARMMAN ethics board.

\subsubsection{Consent and Data Usage}
\label{sec:appendix_consent_data_usage}
Consent is obtained from every beneficiary enrolling in the NGO's mobile health program. The data collected through the program is owned by the NGO and only the NGO is allowed to
share data. In our experiments, we use anonymized call listenership logs to calculate empirical transition probabilities. No personally
identifiable information (PII) is available to us.
The data exchange and usage were regulated by clearly
defined exchange protocols including anonymization, read-access
only to researchers, restricted use of the data for research purposes
only, and approval by ARMMAN’s ethics review committee.

\subsubsection{Risk Attributes}
\label{sec:appendix_risk_attributes}
Table~\ref{tab: risk} shows the risk attributes in the real-world ARMMAN dataset. We also show the percentile distribution of the Whittle Index for every group in Figure~\ref{fig:percentile}. 

\begin{table}[h]
\centering
\def\arraystretch{1.2}%
% \def\arraystretch{1.5}%
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{|p{2.2cm}|p{4.6cm}|p{1.7cm}|}
\hline
\textbf{Risk Attribute} & \textbf{Definition}                            & \textbf{Population Proportion} \\ \hline
Low Income              & Monthly Family Income \textless INR 15,000 (180 USD approx)     & 55.5\%                         \\ \hline
Low Education           & Highest Education Level matriculation or below & 37.2\%                         \\ \hline
Phone Owner        & Phone not owned by beneficiary                & 23.1\%                                               \\ \hline
\end{tabular}
%}
\caption{Risk attributes, their definitions, and their prevalence in the real-world data. Each risk attribute contributes to the risk score by 1 resulting in a risk score value between 0 and 3}
\label{tab: risk}
\end{table}
Group A has mothers who do not have any of the risk attributes. Group B has mothers with 1 risk attribute, Group C with 2 risk attributes, and Group D with all 3 risk attributes. 

% \begin{table}[h]
% \centering
% \caption{Risk attributes and their definitions. Each risk attribute contributes to the risk score by 1 resulting in risk score value between 0 and 4}
% \def\arraystretch{1.2}%
% % \def\arraystretch{1.5}%
% %\resizebox{\columnwidth}{!}{%
% \begin{tabular}{|p{2.2cm}|p{8.6cm}|}
% \hline
% \textbf{Risk Attribute} & \textbf{Definition}                             \\ \hline
% Low Income              & Monthly Family Income \textless INR 15,000 (180 USD approx)                          \\ \hline
% Low Education           & Highest Education Level matriculation (Grade 10) or below                       \\ \hline
% Phone Owner        & Phone not owned by beneficiairy                                      \\ \hline
% \end{tabular}
% %}
% \label{tab: risk}
% \end{table}

\begin{figure}[h]
     \centering
     \begin{subfigure}[b]{0.29\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figs/figures_final_old/armman_wh_0.png}
         \caption{Whittle Index for intervening in Non-Engaging State }
         \label{fig:percentile_non_engaging}
     \end{subfigure}
     % \hfill
     \begin{subfigure}[b]{0.29\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figs/figures_final_old/armman_wh_1.png}
         \caption{Whittle Index for intervening in Engaging State}
         \label{fig:percentile_engaging}
     \end{subfigure}
        \caption{Whittle Index for different risk groups over top-k percentile}
        \label{fig:percentile}
\end{figure}

\subsubsection{Group Sizes}
\label{sec:appendix_armman_group_sizes_results}
In the main paper, we presented results on ARMMAN with unequal group sizes. The group sizes are unequal since the proportions of people in distinct groups in the real world are different. It is known that Max Nash Welfare favors groups of smaller sizes and may not be blindly used in settings with unequal group sizes \citep{killian2023equitable}. Thus, for results on the MNW objective, we sample a subset of the population such that the group sizes are equal. We present results on equal group sizes in Figures~\ref{fig:armman_budget_equal} and~\ref{fig:armman_results_equal_size}. 

When the MNW objective is used, we observe that DF-NoFair allocates nearly all of the budget to Group D, whose mothers have much higher Whittle indices (see Figure~\ref{fig:percentile}). \citet{killian2023equitable} and DF-GreedyB allocate budgets close to proportional budgets. While baselines give extreme budget allocations (either all to Group D or simply equal allocations), our DF-LearnB balances fairness and utility and prioritizes groups more in need.

Similar to our observations in the main paper, our DF-LearnB outperforms baselines by a wide margin in fairness objectives (see Figure~\ref{fig:armman_results_equal_size}). Notice that our DF-LearnB greatly improves fairness objective values while achieving utility close to that of a state-of-the-art algorithm purely maximizing utility (DF-NoFair) \citep{wang2023scalable}. 

Note our pipeline can be used with objectives such as Maximin, which naturally accommodates unequal group sizes. Even for MNW, our methods could be extended to accommodate unequal group sizes by adapting existing techniques, such as resampling arms, to balance group sizes \citep{killian2023equitable}. 


\begin{figure}[h]
     \centering
     \begin{subfigure}[b]{0.39\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_mnw_budgets.png}
         \caption{MNW objective}
         \label{fig:armman_budget_equal_mnw}
     \end{subfigure}
     % \hfill
     \begin{subfigure}[b]{0.39\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_mmr_budgets.png}
         \caption{MMR objective}
         \label{fig:armman_budget_equal_mmr}
     \end{subfigure}
        \caption{Budget allocation to different risk groups in real-world ARMMAN data experiment with equal group sizes.}
        \label{fig:armman_budget_equal}
\end{figure}

\begin{figure}
     \centering
     \begin{subfigure}[b]{0.7\linewidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/labels_big.png}
     \end{subfigure}
     \begin{subfigure}[b]{0.34\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_mnw.png}
         \caption{MNW metric}
         \label{fig:armman_mnw_equal_size}
     \end{subfigure}
     % \hfill
     \begin{subfigure}[b]{0.34\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_mmr.png}
         \caption{MMR metric}
         \label{fig:armman_mmr_equal_size}
     \end{subfigure}
     % \hfill
     \begin{subfigure}[b]{0.34\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_mnw_ope.png}
         \caption{Utility when maximizing MNW objective}
         \label{fig:armman_ope_mnw_equal_size}
     \end{subfigure}
     % \hfill
     \begin{subfigure}[b]{0.34\columnwidth}
         \centering
         \includegraphics[width=\textwidth]{figures_final/armman_mmr_ope.png}
         \caption{Utility when maximizing MMR objective}
         \label{fig:armman_ope_mmr_equal_size}
     \end{subfigure}
        \caption{Fair objective metric (a-b) and Utility metric (c-d) for different methods in real-world data experiments. For all experiments, we have $N=10000$ and $B=300$.}
        \label{fig:armman_results_equal_size}
\end{figure}



\clearpage
\section{Proof of Theorem~\ref{thm:mnw_convex_in_budget}}

\begin{proof}
When there are only two groups $g_1,g_2$, from the budget constraint we have $b_{g_1} = B - b_{g_2}$. If we can show $\hat h_{g_1}^{MNW}(\cdot)$ is concave, then by symmetry, the same argument can be applied to $g_2$ to show that $\hat h_{g_2}^{MNW}(\cdot)$ is concave. 

When there are three or more groups, we can view $\hat g=\mathcal{G}\setminus \{g_1\}$ as one artificial group. By the assumption that $V_g(b_g)$ has diminishing returns in $b_g$ for any $g\in\mathcal{G}$, we have that $V_{\hat g}(b_{\hat g})$ has diminishing returns. Consequently, the same argument as in the two-group case applies. By symmetry, the same argument can be applied to any group other than $g_1$.

Thus, it suffices to show $\hat h_{g_1}^{MNW}(\cdot)$ is concave in the two group case. 

Since the initial states $\bm s$ are given and fixed, for ease of notation, we drop the dependence of $V_g(s_g,b_g)$ on $s_g$ and write as $V_g(b_g)$. 

We will prove by contradiction. Suppose to the contrary that $\hat h_{g_1}^{MNW}(\cdot)$ is non-concave. By definition of concave functions, there must exist values $0\le x_1<x_2\le B$ and $\alpha\in[0,1]$ such that
\[
\alpha\cdot \hat h_{g_1}^{MNW}(x_1) + (1-\alpha)\cdot \hat h_{g_1}^{MNW}(x_2) > \hat h_{g_1}^{MNW}(\alpha x_1 + (1-\alpha) x_2).
\]
Since $\hat h_{g_1}^{MNW}(\cdot)$ is a piecewise linear function and is a linear extrapolation based on nearest integer points, there must exist an integer $k\in\{1,...,B-1\}$ such that
\begin{equation}
\label{eq:mnw_concave_eq1}
\frac{1}{2}h_{g_1}^{MNW}(k-1) + \frac{1}{2} h_{g_1}^{MNW}(k+1) > h_{g_1}^{MNW}(k).
\end{equation}
Figure~\ref{fig:non-concave} illustrates the landscape. 

\begin{figure}[h]
    \centering
    \includegraphics[width=0.5\columnwidth]{figs/non-concave.png}
    \caption{Illustration of a non-concave function in $b_{g_1}$}
    \label{fig:non-concave}
\end{figure}

Multiplying both sides of Equation~\ref{eq:mnw_concave_eq1} by 2 and rearranging terms, we obtain:
\begin{equation}
\label{eq:mnw_concave_expression_1}
h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k) > h_{g_1}^{MNW}(k) - h_{g_1}^{MNW}(k+1)
\end{equation}
We consider two cases : (a) $h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k)\ge 0$; (b) $h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k) < 0$. 

% We rewrite $h(b_{g_1})$ as 
% \begin{align}
% \label{eq:mnw_concave_expression_2}
% &h(b_{g_1})=\sum_{i=1,2} \log V_{g_i}(b_{g_i})\\
% =& \sum_{i=1,2} \left[\log V_{g_i}(0) + 
% \sum_{j=1}^{b_{g_i}}\left( \log V_{g_i}(j) - \log V_{g_i}(j-1) \right)\right]
% \end{align}

{\bf case (a)} 

Since $h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k)\ge 0$, we have $\epsilon:=h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k)\ge 0$. By definition of MNW, we can write $h_{g_1}^{MNW}(b_{g_1})=\log V_{g_1}(b_{g_1}) + \log V_{g_2}(B-b_{g_1})$. Thus,
\begin{align*}
&\log V_{g_1}(k-1) + \log V_{g_2}(B-k+1)\\
=&\log V_{g_1}(k) + \log V_{g_2}(B-k) + \epsilon 
\end{align*}
Rearranging terms, we have 
\begin{align*}
&\log V_{g_1}(k) - \log V_{g_1}(k-1) +\epsilon\\
= &\log V_{g_2}(B-k+1) - \log V_{g_2}(B-k),
\end{align*}
Together with the assumption that each group's value function $V_g(b_g)$ has diminishing returns in $b_g$, we have 
\begin{align*}
&\log V_{g_1}(k+1) - \log V_{g_1}(k) +\epsilon\\
\le &\log V_{g_1}(k) - \log V_{g_1}(k-1) +\epsilon \\
= &\log V_{g_2}(B-k+1) - \log V_{g_2}(B-k)\\
\le & \log V_{g_2}(B-k) - \log V_{g_2}(B-k-1)
\end{align*}
Taking the first and the last line above and rearranging terms, we obtain
\begin{align*}
&\log V_{g_1}(k+1) +\log V_{g_2}(B-k-1)+\epsilon \\
\le & \log V_{g_1}(k) + \log V_{g_2}(B-k).
\end{align*}
Thus, $h_{g_1}^{MNW}(k+1)+\epsilon\le  h_{g_1}^{MNW}(k)$. Consequently, 
$h_{g_1}^{MNW}(k)-h_{g_1}^{MNW}(k+1)\ge \epsilon=h_{g_1}^{MNW}(k-1)-h_{g_1}^{MNW}(k)$, contradicting Equation~\ref{eq:mnw_concave_expression_1}.

\textbf{Case (b).}

By Equation~\ref{eq:mnw_concave_expression_1} and the case assumption $h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k)< 0$, we have 
\begin{equation*}
 h_{g_1}^{MNW}(k) - h_{g_1}^{MNW}(k+1) < h_{g_1}^{MNW}(k-1) - h_{g_1}^{MNW}(k) < 0.
\end{equation*}
Let $\epsilon:=h_{g_1}^{MNW}(k+1)-h_{g_1}^{MNW}(k)> 0$. By definition of MNW, we can write $h_{g_1}^{MNW}(b_{g_1})=\log V_{g_1}(b_{g_1}) + \log V_{g_2}(B-b_{g_1})$. Thus,
\begin{align*}
&\log V_{g_1}(k+1) + \log V_{g_2}(B-k-1)\\
=&\log V_{g_1}(k) + \log V_{g_2}(B-k) + \epsilon 
\end{align*}
Rearranging terms, we have 
\begin{align*}
&\log V_{g_1}(k+1) - \log V_{g_1}(k) \\
= &\log V_{g_2}(B-k) - \log V_{g_2}(B-k-1) +\epsilon.
\end{align*}
Together with the assumption that each group's value function $V_g(b_g)$ has diminishing returns in $b_g$, we have 
\begin{align*}
&\log V_{g_1}(k) - \log V_{g_1}(k-1)\\
\ge &\log V_{g_1}(k+1) - \log V_{g_1}(k) \\
= &\log V_{g_2}(B-k) - \log V_{g_2}(B-k-1) +\epsilon\\
\ge &\log V_{g_2}(B-k+1) - \log V_{g_2}(B-k) +\epsilon
\end{align*}
Taking the first and the last line above and rearranging terms, we obtain
\begin{align*}
&\log V_{g_1}(k) +\log V_{g_2}(B-k) \\
\ge & \log V_{g_1}(k-1) + \log V_{g_2}(B-k+1) +\epsilon.
\end{align*}
Thus, $h_{g_1}^{MNW}(k)\ge  h_{g_1}^{MNW}(k-1)+\epsilon$. Consequently, 
$h_{g_1}^{MNW}(k)-h_{g_1}^{MNW}(k-1)\ge \epsilon=h_{g_1}^{MNW}(k+1)-h_{g_1}^{MNW}(k)$, contradicting Equation~\ref{eq:mnw_concave_expression_1}.

Thus, both case (a) and case (b) lead to a contradiction. We conclude that $\hat h_g^{MNW}(\cdot)$ is concave for any group $g\in\mathcal{G}$. 

\end{proof}



% \section{Proof of Proposition \ref{prop:differentiate_budget_allocation}}

% \begin{proof}[Proof of Proposition~\ref{prop:differentiate_budget_allocation}]
% Pulling top-$b_g$ arms from each group $g\in\mathcal{G}$ can be formulated as an optimal transport problem. Specifically, we let $\bm\mu:=\frac{\bm 1_{N_g}}{N_g}\in\mathbb{R}^{N_g}$ and $\bm v:=[\frac{b_g}{N_g}, \frac{N_g - b_g}{N_g}]$. We let $y:=[0,1]^\top$ and a cost matrix $M_{ij}:=|\bar{W}_i-y_j|^2$, where $\bar{W}_i$ is normalized Whittle index of arm $i\in{g}$. The top-$b_g$ operator output can be obtained from a linear mapping of the optimal transport plan $T^*$ \citep{xie2020differentiable}:
% \begin{align}
% \label{eq:optimal_transport}
% S(\bm\mu,\bm v)&:= \min_{T\in\Pi(\bm\mu, \bm v)} \langle T, M \rangle,
% \end{align}
% where 
% \[
% \Pi(\bm\mu, \bm v):= \{T\in\mathbb{R}_+^{N_g,2} | T \bm 1_{2} = \bm\mu, T \bm 1_{N_g} = \bm v \}.
% \]
% Notice solving the optimization problem (\ref{eq:optimal_transport}) only requires that we know $\bar{W}_i$, $b_g$, and $N_g$. 

% Solving the optimization problem (\ref{eq:optimal_transport}) is computationally expensive and a regularized version is commonly used \citep{cuturi2013sinkhorn}:
% \begin{align}
% \label{eq:optimal_transport_regularized}
% \tilde{S}_{\epsilon}(\bm\mu,\bm v)&:= \min_{T\in\Pi(\bm\mu, \bm v)} \langle T, M \rangle + \epsilon \sum_{i,j} T_{ij} (\log T_{ij} - 1).
% \end{align}

% Using the regularized version, an approximate gradient of the objective in (\ref{eq:optimal_transport}) with respect to input $\bm v$ can be computed. \citep{luise2018differential}
% %by considering a regularized version of Problem (\ref{eq:optimal_transport}) \citep{luise2018differential}. 
% % Yunfan: update description here. \mu does not depend on b_g. 
% \end{proof}

\section{Proof of Theorem~\ref{thm:maximin_convex_in_budget}}

\begin{proof}
We start by defining linear interpolations of the group value functions:
\[
\hat V_g(b_g):=V_g(\floor*{b_g})+(b_g-\floor*{b_g})\cdot (V_g(\ceil*{b_g}) - V_g(\floor*{b_g})), \quad \forall g\in\mathcal{G}.
\]
Observe that $\hat V_g(b_g)=V_g(b_g)$ for $b_g\in\mathbb{Z}$, and for non-integer $b_g$ the function $\hat V_g(b_g)$ linearly interpolates between the two nearest integer points. By the assumption that $V_g(b_g)$ has diminishing returns in $b_g$ for any $g\in\mathcal{G}$, we have that $\hat V_g(b_g)$ is concave in $b_g$.

When there are only two groups, $b_{g_1} = B - b_{g_2}$. Since $\hat V_{g_2}(\cdot)$ is concave, we have that $\hat V_{g_2}(B-b_{g_1})$ is a concave function of $b_{g_1}$. Since the minimum of concave functions is concave, we have that $\min(\hat V_{g_1}(b_{g_1}), \hat V_{g_2}(B-b_{g_1}))$ is a concave function of $b_{g_1}$. Thus, $\hat h_{g_1}^{MMR}(\cdot)$ is concave. By symmetry, the same argument can be applied to $b_{g_2}$ to show that $\hat h_{g_2}^{MMR}(\cdot)$ is concave. Thus, we have proved the statement in the two-group case. 

When there are three or more groups, we can view $\hat g=\mathcal{G}\setminus \{g_1\}$ as one artificial group. By the assumption that $V_g(b_g)$ has diminishing returns in $b_g$ for any $g\in\mathcal{G}$, we have that $V_{\hat g}(b_{\hat g})$ has diminishing returns. Consequently, the same argument as in the two-group case applies.
\end{proof}




\end{document}

