% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% added by author
\usepackage[ruled,vlined,linesnumbered,noend]{algorithm2e}

% \usepackage{scrextend}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{amsfonts}
\usepackage{mathrsfs}
\usepackage{amsmath}
\usepackage{setspace}
\newtheorem{lemma}{Lemma}
% \newtheorem{proof}{Proof}
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newenvironment{proof}{{\noindent\it Proof.}\quad}{\hfill $\square$ \par}
\newenvironment{proof_sketch}{{\noindent\it Proof Sketch.}\quad}{\hfill $\square$ \par}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Efficient Resource Allocation with Fairness Constraints in Restless Multi-Armed Bandits}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<dexunli.2019@phdcs.smu.edu.sg>?Subject=Your UAI 2022 paper}{Dexun Li}{}}
\author[1]{Pradeep Varakantham}
% Add affiliations after the authors
\affil[1]{%
    School of Computing and Information Systems\\
    Singapore Management University\\
    Singapore
}
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  \begin{document}
\maketitle

\appendix
\section{More information}
\subsection{Whittle Index Policy}
\label{app:Whittle}
The belief state in the next time step can be obtained by solving the following recursive equations:
\begin{equation}\label{eq:1}
\resizebox{\linewidth}{!}{$
    \displaystyle
\begin{aligned}
  \omega^i_s(u+1) &= 
    \begin{cases}
      \omega^i_s(u) P_{1,1}^{i,p}+(1-\omega^i_s(u))P_{0,1}^{i,p}  &  \text{ passive}\\
      P_{s^\prime, 1}^{i,a}&  \text{ active}\\
    \end{cases}
\end{aligned}
$}
\end{equation}

We take advantage of the Fast Whittle Index Computation algorithm introduced in~\citet{mate2020collapsing}. They derived a closed form for computing the Whittle index for both average reward and discounted reward criteria, where the objective could also be written as $\bar{R}_{\lambda}^{\pi}=\mathbb{E} \sum_{\omega} f^{\pi}(\omega) R_{a^{\pi}}(\omega)$, where $f^{\pi}(\omega)$ is defined as the fraction of time spent in each belief state $\omega$ induced by policy $\pi$ and $f^{\pi}(\omega)\in [0,1]$. Their proposed Whittle index computation algorithm can achieve a 3-order-of-magnitude speedup compared to~\citet{qian2016restless}. In the two-state setting ($s\in \{0,1\}$), they use a tuple $(B_0^{\omega_{th}}, B_1^{\omega_{th}})$ to denote the belief threshold, where $\omega_{th}\in [0,1]$, and $B_0^{\omega_{th}},B_1^{\omega_{th}} \in \{1,\dots,L\}$ are the indices of the first belief state in each chain where it is optimal to act (i.e., the belief is less than or equal to $\omega_{th}$). Each chain has length at most $L$ due to our fairness constraints. This is defined as the forward threshold policy, and \citet{mate2020collapsing} used the Markov chain structure to derive the occupancy frequencies for each belief state $\omega_s(t)$, which are as follows:
\begin{equation}
  f^{(B_0^{\omega_{th}},B_1^{\omega_{th}})}(\omega_s(t)) =
    \begin{cases}
      a & \text{if $s=0$, $t\leq B_0$}\\
      b & \text{if $s=1$, $t\leq B_1$}\\
      0 & \text{otherwise}
    \end{cases}       
\end{equation}
\begin{equation}
\resizebox{\linewidth}{!}{$
    a=\left(\frac{B_1 \omega_0(B_0)}{1-\omega_1(B_1)} + B_0\right)^{-1},b=\left(\frac{B_1\omega_0(B_0)}{1-\omega_1(B_1)}+B_0\right)^{-1}\frac{\omega_0(B_0)}{1-\omega_1(B_1)}
    $}
\end{equation}
These occupancy frequencies do not depend on the subsidy $\lambda$. For the forward threshold policy $(B_0^{\omega_{th}}, B_1^{\omega_{th}})$, they use $R_{\lambda}^{(B_0^{\omega_{th}}, B_1^{\omega_{th}})}$ to denote the average reward, which can then be decomposed into the contributions of the state reward and the subsidy $\lambda$:
\begin{equation}
\resizebox{\linewidth}{!}{$
\begin{aligned}
            R_{\lambda}^{(B_0^{\omega_{th}}, B_1^{\omega_{th}})} &= \sum_{\omega \in \mathscr{B}} \omega f^{(B_0^{\omega_{th}},B_1^{\omega_{th}})}(\omega)\\
            &+\lambda \left(1-f^{(B_0^{\omega_{th}},B_1^{\omega_{th}})}(\omega_1(B_1)) - f^{(B_0^{\omega_{th}},B_1^{\omega_{th}})}(\omega_0(B_0))\right)
\end{aligned}
$}
\end{equation}
Given the definition of the Whittle index $\lambda$, this can be interpreted as two corresponding threshold policies being equally optimal.
% which means that we have the adjacent belief states as thresholds. 
More specifically, for a belief state $\omega_0(B_0)$, the two adjacent threshold policies $\{ (B_0^{\omega_{th}},B_1^{\omega_{th}}),(B_0^{\omega_{th}}+1,B_1^{\omega_{th}}) \}$ would make it optimal to be active and passive, respectively.
Recall that the Whittle index is the smallest $\lambda$ for which the active and the passive actions are both optimal. Thus the subsidy which makes the average reward of those two adjacent policies equal in value must be the Whittle index for the belief state $\omega_0(B_0)$. Formally, this could be calculated through $R_{\lambda}^{(B_0^{\omega_{th}}, B_1^{\omega_{th}})}=R_{\lambda}^{(B_0^{\omega_{th}}+1, B_1^{\omega_{th}})}$. Similarly, we can obtain the Whittle index for the belief state $\omega_1(B_1)$ through $R_{\lambda}^{(B_0^{\omega_{th}}, B_1^{\omega_{th}})}=R_{\lambda}^{(B_0^{\omega_{th}}, B_1^{\omega_{th}}+1)}$. These computations are repeated for every belief state to find the minimum subsidy value while $B_s^{\omega_{th}}\leq L$. The main idea of their approach is shown in Fig.~\ref{fig:forward_reverse_policy}.

\begin{figure}[ht]
    \centering
    \includegraphics[width=0.9\linewidth]{fig_omega-cropped.pdf}
    \caption{The forward and reverse policy}
    \label{fig:forward_reverse_policy}
\end{figure}

\subsection{Discussion of Fairness choice}
It is natural to ask what makes the proposed notion of fairness in this paper the right one. Our proposed fairness constraint is driven by the flaw of SOTA that a substantial number of arms are never selected; such starvation of interventions creates a strong demand for fairness in the real world. Furthermore, while our proposed algorithm can still be used, our notion of fairness can also be extended to fairness on the group/type of arms (i.e., check if the group fairness requirement is violated, and if so, select the arm with the highest index value in that group/type).
One of the most widely used fairness notions is to define a minimum rate that is required when allocating resources to users, and our fairness constraint can be viewed as a variant of this~\citep{chen2020fair,li2019combinatorial,patil2020achieving}. Another form of fairness constraint is to require that the algorithm never prefers a worse action over a better one based on the expected immediate reward~\citep{joseph2016fairness}. This can be seen as a variant of the Myopic algorithm in our baselines, which fails to satisfy our fairness constraint and performs worse compared to our method. Meanwhile, our Whittle index-based method can naturally satisfy this form of fairness constraint most of the time, based on the long-term reward rather than the immediate reward. There might be many measures of fairness and it may be impossible to satisfy multiple types of fairness simultaneously (COMPAS Case Study). However, our proposed fairness constraint is one of the most appropriate forms in the real world. Namely, in the field of medical interventions, we can meet the requirement that everyone will receive medical treatment without significantly sacrificing overall performance, while SOTA/Myopic will only favor certain beneficiaries.

\subsection{Theoretical Analysis}
For ease of explanation, we have rewritten all theoretical parts here before the proof. 

\begin{theorem}\label{thm:FC_infinite}
For infinite time horizon ($T\rightarrow\infty$) RMAB with Fairness Constraints governed by parameters $\eta$ and $L$, Algorithm 1 ( i.e., activating arm $i$ at the end of the time period when its fairness constraint is violated) is optimal:
\begin{enumerate}[leftmargin=*]
    \item For $\omega^i \leq \omega^*$ (\emph{increasing belief process}), if
    \begin{align}
(P_{1,1}^{i,p}-P_{0,1}^{i,p})\left(1+\frac{\beta \Delta_3}{1-\beta} \right)&{\left(1-\beta(P_{1,1}^{i,a}-P_{0,1}^{i,a})\right)} \nonumber\\
& \leq (P_{1,1}^{i,a} - P_{0,1}^{i,a})
\end{align}
 $\Delta_3 = \min \{(P_{1,1}^{i,p}-P_{0,1}^{i,p}),(P_{1,1}^{i,a}-P_{0,1}^{i,a})\}$. 
\item For $\omega^i \geq \omega^\ast$ (\emph{non-increasing belief process}), if:
\begin{align}
(P_{1,1}^{i,p}-&P_{0,1}^{i,p})(1-\beta)\Delta_1 \geq \nonumber\\
&(P_{1,1}^{i,a} - P_{0,1}^{i,a})\left(1-\beta(P_{1,1}^{i,a} - P_{0,1}^{i,a})\right)
\end{align}
 $\Delta_1 = \min \{ 1, 1+\beta(P_{1,1}^{i,p}-P_{0,1}^{i,p})-\beta(P_{1,1}^{i,a}-P_{0,1}^{i,a}) \}$
\end{enumerate}
\end{theorem}

Then we have the Whittle index decay phenomenon:
\begin{theorem}\label{thm:decay}
For a finite horizon $T$, the Whittle index $\lambda_T$ is the value that satisfies the equation $V_{\lambda_T,T}(\omega,a=0)= V_{\lambda_T,T}(\omega,a=1)$ for the belief state $\omega$. Assuming indexability holds, the Whittle index will decay as the value of horizon $T$ decreases: $\forall T>1: \lambda_{T+1}>\lambda_{T}>\lambda_0 = 0$.
\end{theorem}

Similar to Theorem~\ref{thm:FC_infinite}, we have the condition for the optimality under the finite horizon:
\begin{theorem}\label{thm:FC_finite}
Consider the finite horizon RMAB problem with fairness constraint. Algorithm 1 (activating arm $i$ at the end of the time period when its fairness constraint is violated) is optimal:
\begin{enumerate}[leftmargin=*]
    \item When $\omega^i\leq \omega^\ast$ (increasing belief process), if
\begin{align}
%\resizebox{1\linewidth}{!}{
    (P_{1,1}^{i,p}-P_{0,1}^{i,p})&\left(\Delta_4\beta \sum_{t=0}^{T-2}[\beta^t]+1\right) \leq \nonumber\\
    &(P_{1,1}^{i,a} - P_{0,1}^{i,a})\sum_{t=0}^{T-2}[\beta^t(P_{1,1}^{i,a}-P_{0,1}^{i,a})^t]
\end{align}
 $\Delta_4= \min \{ (P_{1,1}^{i,p}-P_{0,1}^{i,p}), (P_{1,1}^{i,a}-P_{0,1}^{i,a})\}$, and $T$ is the residual horizon length.
\item When $\omega^i\geq \omega^\ast$ (non-increasing belief process), if
% (ii) When the belief state of arm $i$ is greater than $\omega^\ast$ ($\omega^i\geq \omega^\ast$), Algorithm~\ref{al:alg1} is optimal under the condition:
\begin{align}
%\resizebox{1\linewidth}{!}{$
    (P_{1,1}^{i,p}-P_{0,1}^{i,p})&\left(\Delta_2\beta\sum_{t=0}^{T-2}[\beta^t(P_{1,1}^{i,a}-P_{0,1}^{i,a})^t]+1\right) \geq \nonumber\\ 
    &(P_{1,1}^{i,a} - P_{0,1}^{i,a})\sum_{t=0}^{T-2}\beta^t
%    $}
\end{align}
 $\Delta_2= \min \{ (P_{1,1}^{i,p}-P_{0,1}^{i,p}), (P_{1,1}^{i,a}-P_{0,1}^{i,a})\}$.
\end{enumerate}
\end{theorem}

Then for our model-free approach, we have the following two theorems:
\begin{theorem}\label{thm:q_learning}
Selecting the highest-ranking arms according to the $Q_i^{\ast}(s,a=1,l)-Q_i^{\ast}(s,a=0,l)$ till the budget constraint is met is equivalent to maximizing $\left\{ \sum_{i=1}^N Q_i^\ast(s,a,l)\right\}$ over all possible action set $\{0,1\}^N$ such that $\sum_{i=1}^N a_i=k$.
\end{theorem}

\begin{theorem}\label{thm:Q_2}
{Stability and convergence}: The proposed approach converges to the optimal with probability $1$ under the following conditions:\\
\noindent {1.} The state space and action space are finite;\\
\noindent {2.} $\sum_{t=1}^{\infty}\alpha_t(s_t,a_t,l_t)=\infty$ and $\sum_{t=1}^{\infty}\alpha_t^2 (s_t,a_t,l_t) <\infty$.
\end{theorem}


\section{Fully Observable Setting}
In order to explain the key idea, we initially consider a fully observable environment and then extend to a partially observable setting. Algorithm 1 provides the algorithm at each iteration for selecting the arms to activate (action set) in the fully observable case (for both finite and infinite horizon cases).

     

\subsection{Infinite Horizon, Fully Observable}

We first provide the expression for $\lambda$ without the fairness constraints. 
We assume that $V_{\lambda, \infty}(s)$ denotes the value function which can be accrued from a single-armed bandit process with subsidy $\lambda$  over infinite horizon if the observed state is $s$.

Therefore, for the state 0, we have:\begin{equation}
\resizebox{\linewidth}{!}{$
    \displaystyle
\begin{aligned}
  V_{\lambda,\infty}(0) &= \max
    \begin{cases}
      \lambda +\beta (P_{0,1}^p V_{\lambda,\infty}(1)+P_{0,0}^p V_{\lambda,\infty}(0)) & \text{passive}\\
       \beta(P_{0,1}^a V_{\lambda,\infty}(1)+P_{0,0}^a V_{\lambda,\infty}(0)) & \text{active}
    \end{cases}\\
    &= \max
    \begin{cases}
      \lambda  +\beta (P_{0,1}^p (V_{\lambda,\infty}(1)- V_{\lambda,\infty}(0))+ V_{\lambda,\infty}(0))
    %   & \text{passive}
    \\
       \beta(P_{0,1}^a (V_{\lambda,\infty}(1)-V_{\lambda,\infty}(0))+V_{\lambda,\infty}(0))
    %   & \text{active}
    \end{cases}
\end{aligned}
$}
\end{equation}
Similarly for state 1, we have
\begin{equation} \label{eq:mdp2}
\resizebox{\linewidth}{!}{$
    \displaystyle
  V_{\lambda,\infty}(1) = \max
    \begin{cases}
      \lambda +  1 +\beta (P_{1,1}^p (V_{\lambda,\infty}(1)- V_{\lambda,\infty}(0))+ V_{\lambda,\infty}(0)) & \text{passive}\\
       1 + \beta(P_{1,1}^a (V_{\lambda,\infty}(1)-V_{\lambda,\infty}(0))+V_{\lambda,\infty}(0)) & \text{active}
    \end{cases}     
    $}
\end{equation}
Recall that the definition of the Whittle index $W_T(s)$ of state $s$ is the smallest $\lambda$ s.t. it is optimal to make the arm passive in the current state.
Therefore, we have $\lambda = \beta \left((P_{0,1}^a - P_{0,1}^p)(V_{\lambda,\infty}(1)-V_{\lambda,\infty}(0))\right)$ for $s=0$ and, 
$\lambda = \beta \left((P_{1,1}^a - P_{1,1}^p)(V_{\lambda,\infty}(1)-V_{\lambda,\infty}(0))\right)$ for $s=1$. 

As a result, the Whittle index based approach would rank the index value of all $N$ arms and select the top $k$ arms at each time step to activate. With fairness constraints, the change to the approach is minimal and intuitive. The optimal policy is still to choose the arms with the top $k$ index values until a fairness constraint is violated for an arm. In that time step, we replace the last arm in the top-$k$ with the arm for which the fairness constraint is violated. We show that this simple change works across the board for infinite and finite horizon, fully and partially observable settings.


\begin{theorem}\label{thm:FC_infinite_fully}
Algorithm 1
% ~\ref{al:alg1} 
is optimal for RMAB with Fairness Constraints governed by parameters $\eta$ and $L$ under certain conditions.
\end{theorem}
\begin{proof_sketch}
This can be viewed as a special case of the partially observable setting. Please refer to the detailed proof for the partially observable case.
\end{proof_sketch}

\begin{figure}
    \centering
    \includegraphics[width=0.98\linewidth]{fig_fully_ob_policy.pdf}
    \caption{The action is $a_t$ at time step $t$. Then we move the action $a^i$ that satisfies the fairness constraint to one slot earlier to replace the $k$-th action $a^j$, and add the action $a^l$ according to the index value at the end of the time interval.}
    % \label{fig:fully_infinite}
\end{figure}

\subsection{Finite Horizon, Fully Observable} 
Existing literature~\citep{glazebrook2006some, villar2016indexability} on RMAB deals with situations where the time horizon is infinite. In this part, we will demonstrate how the methodology discussed for the infinite horizon setting to deal with fairness can also be applied to the finite horizon setting. There are two key challenges: (i) computing the Whittle index under the finite horizon setting; and (ii) showing that the Whittle index value reduces as the residual lifetime decreases.

It is costly to compute the index under the finite horizon setting ($O(|S|^kT)$ time and space complexity~\citep{hu2017asymptotically}); instead, we can take advantage of the fact that the index value will converge as $T\rightarrow \infty$ and $\lambda_0=0$. We can use a sigmoid curve to approximate the index value. One common example of a sigmoid function is the logistic function. This form is also used by~\citet{mate2021efficient}. Specifically, we let
\begin{equation}\label{eq:finite_Whittle_fully}
    {\displaystyle  W_T(s)={\frac {A}{1+e^{-k T}}}+C,}
\end{equation}
where ${\displaystyle \frac{A}{2}+C}$ and ${\displaystyle A+C}$ are the curve's bounds (attained at $T=0$ and as $T\rightarrow\infty$, respectively); ${\displaystyle k}$ is the logistic growth rate or steepness of the curve. We have $W_0(s)=0$ and $W_1(s)=\beta(s(P_{1,1}^a-P_{1,1}^p)+(1-s)(P_{0,1}^a-P_{0,1}^p))$, and let $W_{\infty}(s)=TW(s)$, where $TW(s)$ is the Whittle index value for state $s$ under infinite horizon. By solving these three constraints, we can get the three unknown parameters, \[C=-TW(s),\quad A=2TW(s),\]
\begin{equation*}
\resizebox{\linewidth}{!}{$
    k=-\log (\frac{2TW(s)}{\beta(s(P_{1,1}^a-P_{1,1}^p)+(1-s)(P_{0,1}^a-P_{0,1}^p))+TW(s)}-1)
    $}
\end{equation*}

Algorithm 1 shows how to use $W_T(s)$ in considering the fairness constraint under the finite horizon setting. As for the optimality of Algorithm 1 in the finite horizon case, we have to show that the value function and the Whittle index decay over time in the finite horizon case, i.e.,
\begin{gather*}
V_{\lambda_{T},T}(s)>V_{\lambda_{T-1},T-1}(s), \quad \forall s\in \{0,1\},\\
\forall T>1: \lambda_T>\lambda_{T-1}>\lambda_0 = 0.
\end{gather*}
This is to make the same argument as in case of infinite horizon, where we show that activating the fairness ensuring arm one step earlier results in lower value. 


We show a lemma first, which can lead to the proof of the Whittle index decay.
\begin{lemma}\label{lem:inc}
For all $s\in \{0,1\}$, we have $V_{\lambda_{T},T}(s)>V_{\lambda_{T-1},T-1}(s)$.
\end{lemma}

% \subsection{Proof of Lemma~\ref{lem:inc}}
\begin{proof}\label{proof:fully_obs2}
For state $s\in \{0,1 \}$, we can always find a policy that ensures $V_{\lambda_{T},T}(s)>V_{\lambda_{T-1},T-1}(s)$. For example, assume the optimal policy for the state $s$ with the residual time horizon $T-1$ is $\pi$. Then we can always construct a policy $\pi^{\prime}$ for the same state $s$ with the residual time horizon $T$: follow $\pi$ for the first $T-1$ time slots, and then pick the action for the last slot $T$ according to the observed state $s^{\prime}$. Because the reward is either $0$ or $1$, we could have 
\begin{equation}
\resizebox{\linewidth}{!}{$
V_{\lambda_{T},T;\pi^{\prime}}(s) = V_{\lambda_{T-1},T-1;\pi}(s) + \beta^{T} V_{\lambda_{1},1}(s^\prime)>V_{\lambda_{T-1},T-1;\pi}(s).
$}
\end{equation}
\end{proof}
Note this lemma is also suitable for the partially observable case.
Then we present the Theorem and corresponding proof.


\begin{theorem}\label{thm:decay_fully}
For a finite horizon $T$, the Whittle index $\lambda_T$ is the value that satisfies the equation $V_{\lambda_T,T}(s,a=0)= V_{\lambda_T,T}(s,a=1)$ for the observed state $s$. Assuming indexability holds, the Whittle index will decay as the value of horizon $T$ decreases: $\forall T>1: \lambda_T>\lambda_{T-1}>\lambda_0 = 0$.
\end{theorem}


\begin{proof_sketch}
Because the state $s$ is fully observable, we can easily calculate $\lambda_0$ and $\lambda_1$ by solving the equation $V_{\lambda,T}^p(s)=V_{\lambda,T}^a(s)$. We then could derive $\lambda_T>\lambda_{T-1}$ by obtaining $\frac{\partial \lambda_T}{\partial T}>0$ for $s\in \{ 0,1\}$. The detailed proof is given below.
\end{proof_sketch}
\begin{proof}
% See Appendix.
% ~\ref{subsec:app_decay}
Consider the discount reward criterion with discount factor of $\beta\in [0,1]$ (where $\beta =1 $ corresponds to the average criterion). 
\begin{equation}
\resizebox{\linewidth}{!}{$
\begin{aligned}
        &s + \lambda +\beta (s P_{1,1}^p + (1-s)P_{0,1}^p)= s+\beta (s P_{1,1}^a+(1-s)P_{0,1}^a)\\
        \rightarrow &\lambda =\beta(s(P_{1,1}^a-P_{1,1}^p)+(1-s)(P_{0,1}^a-P_{0,1}^p)) 
\end{aligned}
$}
\end{equation}
Because $P_{1,1}^a-P_{1,1}^p>0$ and $P_{0,1}^a-P_{0,1}^p>0$, we have $\lambda>0$. Now we show $\lambda_T>\lambda_{T-1}$. Equivalently, this can be expressed as $\frac{\partial \lambda_T}{\partial T}>0$. Because the state is fully observable, we first get the closed form of $\lambda_T$.
\begin{itemize}[leftmargin=*]
    \item{\verb|Case 1:|} The state $s=0$, 
    \begin{equation}
    \resizebox{1\linewidth}{!}{$
    \begin{aligned}
        &0 + \lambda_T +\beta (P_{0,0}^p V_{\lambda_{T-1},T-1}(0)+P_{0,1}^p V_{\lambda_{T-1},T-1}(1))\\
        = &0+\beta (P_{0,0}^a V_{\lambda_{T-1},T-1}(0)+P_{0,1}^a V_{\lambda_{T-1},T-1}(1))\\
        \rightarrow &\lambda_T =\beta(V_{\lambda_{T-1},T-1}(0)(P_{0,0}^a-P_{0,0}^p)+V_{\lambda_{T-1},T-1}(1)(P_{0,1}^a-P_{0,1}^p)).
\end{aligned}
$}
\end{equation}
Intuitively, we can have $V_{\lambda_{T},T}(0)>V_{\lambda_{T-1},T-1}(0)$ (see Lemma~\ref{lem:inc}), and  $V_{\lambda_{T},T}(1)>V_{\lambda_{T-1},T-1}(1)$, we obtain $\frac{\partial V_{\lambda_{T},T}(0)}{\partial T}>0$ and $\frac{\partial V_{\lambda_{T},T}(1)}{\partial T}>0$. Hence, we can get
\begin{equation}
\resizebox{\linewidth}{!}{$
\frac{\partial \lambda_T}{\partial T}
= \beta((P_{0,0}^a-P_{0,0}^p)\frac{\partial V_{\lambda_{T-1},T-1}(0)}{\partial T}+(P_{0,1}^a-P_{0,1}^p)\frac{\partial V_{\lambda_{T-1},T-1}(1)}{\partial T}) >0
$}
\end{equation}
    \item{\verb|Case 2:|} The state $s=1$,
\begin{equation}
\resizebox{\linewidth}{!}{$
    \begin{aligned}
        &1 + \lambda_T +\beta (P_{1,0}^p V_{\lambda_{T-1},T-1}(0)+P_{1,1}^p V_{\lambda_{T-1},T-1}(1))\\
        = &1+\beta (P_{1,0}^a V_{\lambda_{T-1},T-1}(0)+P_{1,1}^a V_{\lambda_{T-1},T-1}(1))\\
        \rightarrow &\lambda_T =\beta(V_{\lambda_{T-1},T-1}(0)(P_{1,0}^a-P_{1,0}^p)+V_{\lambda_{T-1},T-1}(1)(P_{1,1}^a-P_{1,1}^p)).
    \end{aligned}
    $}
\end{equation}
    Similarly, we have
\begin{equation}
\resizebox{\linewidth}{!}{$
\frac{\partial \lambda_T}{\partial T}
= \beta((P_{1,0}^a-P_{1,0}^p)\frac{\partial V_{\lambda_{T-1},T-1}(0)}{\partial T}+(P_{1,1}^a-P_{1,1}^p)\frac{\partial V_{\lambda_{T-1},T-1}(1)}{\partial T}) >0
$}
\end{equation}    
\end{itemize}
Thus $\forall T>1: \lambda_T>\lambda_{T-1}>\lambda_0 = 0$.
\end{proof}


The proof of optimality for Algorithm 1 in case of finite horizon is similar to the infinite horizon setting.  


Most of the time, the states $\{s_1,\dots,s_N\}\in\{ 0,1\}^N$ are not observable until the action is taken. The decision-maker, on the other hand, can infer the current states based on the observation history by keeping track of the belief states $\omega$.




\section{Proofs of Theorem~\ref{thm:FC_infinite} and Theorem~\ref{thm:FC_finite}}
\label{app:proof}

\subsection{Boundary Lemma}
\begin{lemma}\label{lem:boundary}
For the finite time horizon $T$,  $V_{\lambda_T,T}(\omega_1)-V_{\lambda_T,T}(\omega_2)$ is bounded, where we have 
\begin{equation}
\resizebox{\linewidth}{!}{$
\begin{aligned}
(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t
\leq V_{\lambda_T,T}(\omega_1)-V_{\lambda_T,T}(\omega_2) \leq
(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t
% \frac{\omega_1-\omega_2}{1-\beta}
\end{aligned}
$}
\end{equation}
\end{lemma}
\begin{proof}
We prove the lower bound by induction, and the upper bound can be proven similarly. 

When $T=1$, we start from the definition of the value function $V_{\lambda_T,T}(\omega)$ to have
\begin{itemize}[leftmargin=*]
    \item passive actions:
    \begin{equation}
    \resizebox{\linewidth}{!}{$
    \begin{aligned}
        V_{\lambda_1,1}(\omega_1,a=0)-V_{\lambda_1,1}(\omega_2,a=0) &= \lambda_1 +\omega_1 - \lambda_1 -\omega_2 \\&= \omega_1-\omega_2
    \end{aligned}
    $}
    \end{equation}
    \item active actions:
    \begin{equation}
    V_{\lambda_1,1}(\omega_1,a=1)-V_{\lambda_1,1}(\omega_2,a=1) =  \omega_1-\omega_2
    \end{equation}
\end{itemize}
We get $V_{\lambda_1,1}(\omega_1)-V_{\lambda_1,1}(\omega_2) = \omega_1 -\omega_2$. Now we assume
% $V_{\lambda_T,T}(\omega_1,a=0)-V_{\lambda_T,T}(\omega_2,a=0) = \sum_{t=0}^{T-1}\beta^t(\omega_1 -\omega_2)$ and $V_{\lambda_T,T}(\omega_1,a=1)-V_{\lambda_T,T}(\omega_2,a=1) =(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t$
$V_{\lambda_T,T}(\omega_1)-V_{\lambda_T,T}(\omega_2) \geq(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t$
holds for some $T\geq 1$ (the induction hypothesis); then for time horizon $T+1$, we have
\begin{itemize}[leftmargin=*]
    \item passive actions:
    \begin{equation}
    \resizebox{\linewidth}{!}{$
    \begin{aligned}
        &V_{\lambda_{T+1},T+1}(\omega_1,a=0)-V_{\lambda_{T+1},T+1}(\omega_2,a=0) \\= 
        &\left(\lambda_{T+1} +\omega_1 + \beta V_{\lambda_{T},T}(\omega_1(1))\right)- \left(\lambda_{T+1} +\omega_2+\beta V_{\lambda_{T},T}(\omega_2(1))\right) \\
        =& \omega_1-\omega_2+\beta \left(V_{\lambda_{T},T}(\omega_1(1))-V_{\lambda_{T},T}(\omega_2(1))\right)\\
        \geq& \omega_1 - \omega_2 +\beta (\omega_1 -\omega_2) \sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t\\
        \geq& \omega_1 - \omega_2 +(\omega_1 -\omega_2)\sum_{t=1}^{T}\beta^t(P_{1,1}^a-P_{0,1}^a)^t \text{ Line $\ast$}\\
        =&(\omega_1 -\omega_2)\sum_{t=0}^{T}\beta^t(P_{1,1}^a-P_{0,1}^a)^t
    \end{aligned}
    $}
    \end{equation}
    Line $\ast$ is because $0\leq P_{1,1}^a-P_{0,1}^a<1$.
    \item active actions:
    \begin{equation}
    \resizebox{\linewidth}{!}{$
    \begin{aligned}
        &V_{\lambda_{T+1},T+1}(\omega_1,a=1)-V_{\lambda_{T+1},T+1}(\omega_2,a=1) \\
        = &\left(  \omega_1 +\beta(\omega_1  V_{\lambda_{T},T}(P_{1,1}^a)+(1-\omega_1)V_{\lambda_{T},T}(P_{0,1}^a))   \right) \\ 
        &- \left(  \omega_2 +\beta(\omega_2 V_{\lambda_{T},T}(P_{1,1}^a)+(1-\omega_2)V_{\lambda_{T},T}(P_{0,1}^a))   \right) \\
        =&\omega_1-\omega_2 + \beta\left( (\omega_1-\omega_2)(V_{\lambda_{T},T}(P_{1,1}^a)-V_{\lambda_{T},T}(P_{0,1}^a))   \right)\\
        =& \omega_1-\omega_2+\beta(V_{\lambda_{T},T}(\omega_1(1))-V_{\lambda_{T},T}(\omega_2(1)))\\
        \geq& \omega_1 - \omega_2 +\beta(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t\\
        \geq& \omega_1 - \omega_2 +(\omega_1-\omega_2)\sum_{t=1}^{T}\beta^t(P_{1,1}^a-P_{0,1}^a)^t \text{ Line $\ast$}\\
        =&(\omega_1-\omega_2)\sum_{t=0}^{T}\beta^t(P_{1,1}^a-P_{0,1}^a)^t
    \end{aligned}
    $}
    \end{equation}
\end{itemize}
Line $\ast$ is because $0\leq P_{1,1}^a-P_{0,1}^a<1$. Thus we have $V_{\lambda_T,T}(\omega_1)-V_{\lambda_T,T}(\omega_2)\geq 
(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t$. Similarly, we can prove the upper bound.
\end{proof}

\begin{lemma}\label{lem:infinite_boundary}
For the infinite residual time horizon $T\rightarrow \infty$, $V_{\lambda_T,T}(\omega_1)-V_{\lambda_T,T}(\omega_2)$ is bounded. Specifically, we have
\begin{equation}
\resizebox{\linewidth}{!}{$
\begin{aligned}
\frac{\omega_1 -\omega_2}{1-\beta(P_{1,1}^a-P_{0,1}^a)}
\leq V_{\lambda_T,T}(\omega_1)-V_{\lambda_T,T}(\omega_2) \leq
\frac{\omega_1-\omega_2}{1-\beta}
\end{aligned}
$}
\end{equation}
\end{lemma}
\begin{proof}
This can be viewed as a special case of the finite residual time horizon setting where $T\rightarrow \infty$.
Thus we can easily derive the lower and upper bound according to the formula for the geometric series:
\begin{equation*}
    \underset{T\rightarrow \infty}{\lim}(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t(P_{1,1}^a-P_{0,1}^a)^t = \frac{\omega_1 -\omega_2}{1-\beta(P_{1,1}^a-P_{0,1}^a)}
\end{equation*}
and
\begin{equation*}
    \underset{T\rightarrow \infty}{\lim}(\omega_1-\omega_2)\sum_{t=0}^{T-1}\beta^t = \frac{\omega_1-\omega_2}{1-\beta}
\end{equation*}
\end{proof}



Consider the single-armed bandit process with subsidy $\lambda$ under the infinite time horizon $T\rightarrow \infty$, we have:
\begin{equation}
\resizebox{\linewidth}{!}{$
    \displaystyle
\begin{aligned}
  V_{\lambda,\infty}(\omega) &= \max
    \begin{cases}
      \lambda+\omega +\beta  V_{\lambda,\infty}(\tau^1(\omega)) & \text{passive}\\
       \omega+\beta\left(\omega V_{\lambda,\infty}(P_{1,1}^a)+(1-\omega) V_{\lambda,\infty}(P_{0,1}^a)\right) & \text{active}
    \end{cases}
\end{aligned}
$}
\end{equation}
and we can get 
\begin{equation}\label{eq:infinite_partial}
\resizebox{\linewidth}{!}{$
    \displaystyle
    \begin{aligned}
    \frac{\partial V_{\lambda,\infty}(\omega)}{\partial \omega} &=
    \begin{cases}
       1 +\beta  \frac{ \partial V_{\lambda,\infty}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega} & \text{passive}\\
       1+\beta\left( V_{\lambda,\infty}(P_{1,1}^a)- V_{\lambda,\infty}(P_{0,1}^a)\right) & \text{active}
    \end{cases}
\end{aligned}
$}
\end{equation}

Similarly, for the finite residual time horizon $T$ we have:
\begin{equation}\label{eq:finite_partial}
\resizebox{\linewidth}{!}{$
    \displaystyle
    \begin{aligned}
    \frac{\partial V_{\lambda,T}(\omega)}{\partial \omega} &=
    \begin{cases}
       1 +\beta  \frac{ \partial V_{\lambda,T-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega} & \text{passive}\\
       1+\beta\left( V_{\lambda,T-1}(P_{1,1}^a)- V_{\lambda,T-1}(P_{0,1}^a)\right) & \text{active}
    \end{cases}
\end{aligned}
$}
\end{equation}
Note that for any belief state $\omega$, $\tau^1(\omega)$ is the $1$-step belief state update of $\omega$ when the passive arm is unobserved for another $1$ consecutive slot. According to Eq.~\eqref{eq:1}, we have $\tau^1(\omega) = \omega P_{1,1}^p + (1-\omega)P_{0,1}^p$, thus 
\begin{equation}\label{eq:partial_omega_1}
    0< \frac{\partial \tau^1(\omega)}{\partial \omega} = (P_{1,1}^p-P_{0,1}^p) <1
\end{equation}



\begin{lemma}\label{lem:partial_value_bound}
For the finite residual time horizon $T$, we have $\frac{\partial V_{\lambda_T,T}(\omega)}{\partial \omega} \geq \min \{1 +\beta(P_{1,1}^p-P_{0,1}^p) \sum_{t=0}^{T-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t], 1+\beta (P_{1,1}^a-P_{0,1}^a) \sum_{t=0}^{T-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t]   \}$
\end{lemma}
\begin{proof}
According to Eq.~\ref{eq:finite_partial}, for the passive action, we have:
\begin{equation}\label{eq:finite_partial_lower_1}
\resizebox{\linewidth}{!}{$
    \begin{aligned}
            &1 +\beta  \frac{ \partial V_{\lambda,T-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega}\\
            =&1 +\beta \underset{\delta\rightarrow 0}{\lim}\frac{V_{\lambda,T-1}(\tau^1(\omega)+\delta)-V_{\lambda,T-1}(\tau^1(\omega))}{\delta} (P_{1,1}^p-P_{0,1}^p)\\
    \end{aligned}
$}
\end{equation}
Let $\omega_1 = \tau^1(\omega)+\delta$ and $\omega_2=\tau^1(\omega)$. Applying the lower bound in Lemma~\ref{lem:boundary} with residual horizon $T-1$, we have $V_{\lambda,T-1}(\tau^1(\omega)+\delta)-V_{\lambda,T-1}(\tau^1(\omega))\geq (\omega_1-\omega_2)\sum_{t=0}^{T-2}\beta^t(P_{1,1}^a-P_{0,1}^a)^t = \delta \sum_{t=0}^{T-2}\beta^t(P_{1,1}^a-P_{0,1}^a)^t$. Thus Eq.~\ref{eq:finite_partial_lower_1} becomes:
\begin{equation}\label{eq:finite_partial_lower_2}
% \resizebox{\linewidth}{!}{$
    \begin{aligned}
        &1 +\beta  \frac{ \partial V_{\lambda,T-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega}\\
        \geq& 1 +\beta(P_{1,1}^p-P_{0,1}^p) \sum_{t=0}^{T-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t]\\
    \end{aligned}
% $}
\end{equation}
Similarly, for the active action, according to lower bound in Lemma~\ref{lem:boundary}, we have:
\begin{equation}\label{eq:finite_partial_lower_3}
% \resizebox{\linewidth}{!}{$
    \begin{aligned}
            &1+\beta\left( V_{\lambda,T-1}(P_{1,1}^a)- V_{\lambda,T-1}(P_{0,1}^a)\right)\\
            \geq & 1+\beta (P_{1,1}^a-P_{0,1}^a) \sum_{t=0}^{T-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t]\\
    \end{aligned}
% $}
\end{equation}
Therefore, we have $\frac{\partial V_{\lambda_T,T}(\omega)}{\partial \omega} \geq \min \{ (P_{1,1}^p-P_{0,1}^p), (P_{1,1}^a-P_{0,1}^a)\}\cdot \beta \cdot \sum_{t=0}^{T-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t]+1$

\end{proof}

\begin{lemma}\label{lem:partial_value_bound_infinite}
For the infinite residual time horizon $T\rightarrow \infty$, we have $\frac{\partial V_{\lambda_T,T}(\omega)}{\partial \omega} \geq \min \{1 +\frac{\beta(P_{1,1}^p-P_{0,1}^p)}{ 1-\beta(P_{1,1}^a-P_{0,1}^a)}, \frac{1}{1-\beta (P_{1,1}^a-P_{0,1}^a)} \}$.
\end{lemma}
\begin{proof}
The proof is similar to that of Lemma~\ref{lem:partial_value_bound} for the finite setting; the result follows by letting $T\rightarrow \infty$ and applying the geometric series formula, since $\beta(P_{1,1}^a-P_{0,1}^a)<1$.
\end{proof}


\begin{lemma}\label{lem:partial_value_bound_upper_finite}
For the finite residual time horizon $T$, we have $\frac{\partial V_{\lambda_T,T}(\omega)}{\partial \omega} \leq \min \{1 +(P_{1,1}^p-P_{0,1}^p) \sum_{t=1}^{T-1}\beta^t, 1+ (P_{1,1}^a-P_{0,1}^a) \sum_{t=1}^{T-1}\beta^t   \}$
\end{lemma}
\begin{proof}
The proof is similar to the proof of Lemma~\ref{lem:partial_value_bound}. According to Eq.~\ref{eq:finite_partial}, we have:
\begin{itemize}[leftmargin=*]
    \item passive actions:
    \begin{equation}\label{eq:finite_partial_upper_1}
    \resizebox{\linewidth}{!}{$
    \begin{aligned}
            &1 +\beta  \frac{ \partial V_{\lambda,T-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega}\\
            =&1 +\beta \underset{\delta\rightarrow 0}{\lim}\frac{V_{\lambda,T-1}(\tau^1(\omega)+\delta)-V_{\lambda,T-1}(\tau^1(\omega))}{\delta} (P_{1,1}^p-P_{0,1}^p)\\
    \end{aligned}
$}
\end{equation}
Let $\omega_1 = \tau^1(\omega)+\delta$ and $\omega_2=\tau^1(\omega)$. Applying the upper bound in Lemma~\ref{lem:boundary} with residual horizon $T-1$, we have $V_{\lambda,T-1}(\tau^1(\omega)+\delta)-V_{\lambda,T-1}(\tau^1(\omega))\leq (\omega_1-\omega_2)\sum_{t=0}^{T-2}\beta^t = \delta \sum_{t=0}^{T-2}\beta^t$. Thus Eq.~\ref{eq:finite_partial_upper_1} becomes:
\begin{equation}\label{eq:finite_partial_upper_2}
% \resizebox{\linewidth}{!}{$
    \begin{aligned}
        &1 +\beta  \frac{ \partial V_{\lambda,T-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega}\\
        \leq& 1 +\beta(P_{1,1}^p-P_{0,1}^p) \sum_{t=0}^{T-2}[\beta^t]\\
    \end{aligned}
% $}
\end{equation}
    \item active actions, similarly, according to upper bound in Lemma~\ref{lem:boundary}, we have:
\begin{equation}\label{eq:finite_partial_upper_3}
% \resizebox{\linewidth}{!}{$
    \begin{aligned}
            &1+\beta\left( V_{\lambda,T-1}(P_{1,1}^a)- V_{\lambda,T-1}(P_{0,1}^a)\right)\\
            \leq & 1+\beta (P_{1,1}^a-P_{0,1}^a) \sum_{t=0}^{T-2}[\beta^t]\\
    \end{aligned}
% $}
\end{equation}
\end{itemize}
Therefore, we have $\frac{\partial V_{\lambda_T,T}(\omega)}{\partial \omega} \leq \min \{1 +(P_{1,1}^p-P_{0,1}^p) \sum_{t=1}^{T-1}\beta^t, 1+ (P_{1,1}^a-P_{0,1}^a) \sum_{t=1}^{T-1}\beta^t   \}$

\end{proof}

\begin{lemma}\label{lem:partial_value_bound_upper_infinite}
For the infinite residual time horizon $T\rightarrow \infty$, we have $\frac{\partial V_{\lambda_T,T}(\omega)}{\partial \omega} \leq \min \{ 1+\frac{ \beta (P_{1,1}^p-P_{0,1}^p)}{1-\beta}, 1+\frac{ \beta (P_{1,1}^a-P_{0,1}^a)}{1- \beta}   \}$
\end{lemma}
\begin{proof}
The proof is similar to that of Lemma~\ref{lem:partial_value_bound_upper_finite} for the finite setting; the result follows by letting $T\rightarrow \infty$.
\end{proof}


\subsection{Condition for the optimality of Algorithm 1 under finite/infinite horizon}
We now give the proof for the Theorem~\ref{thm:FC_infinite} and Theorem~\ref{thm:FC_finite}.


According to the Eq.~\ref{eq:1}, we can compute the belief gap between $\omega$ and 1-time step belief update $\tau^1(\omega)$:
\begin{equation}
    \Delta \omega = \tau^{1}(\omega) - \omega = (P_{1,1}^p - P_{0,1}^p-1)\omega +P_{0,1}^p
\end{equation}

Remark that we could get a strict condition that depends only on arm A. This is because change from policy $\pi^\ast$ to $\pi$ will only lead to a decrease in the value function for other arms as the optimal actions determined by the Whittle index algorithm will be influenced, henceforth the value will be decreased. Consider the single-arm A, as we discussed earlier, the belief state update process is either monotonically increasing or monotonically decreasing.
% for the infinite time horizon, we have:

% \begin{equation}
% \resizebox{\linewidth}{!}{$
%     \displaystyle
% \begin{aligned}
%   V_{\lambda,\infty}(\omega) &= \max
%     \begin{cases}
%       \lambda+\omega +\beta  V_{\lambda,\infty}(\tau^1(\omega)) & \text{passive}\\
%       \omega+\beta\left(\omega V_{\lambda,\infty}(P_{1,1}^a)+(1-\omega) V_{\lambda,\infty}(P_{0,1}^a)\right) & \text{active}
%     \end{cases}
% \end{aligned}
% $}
% \end{equation}

% For the finite residual time horizon $T$ we have:
% \begin{equation}\label{eq:finite_update}
% \resizebox{\linewidth}{!}{$
%     \displaystyle
% \begin{aligned}
%   V_{\lambda,T}(\omega) &= \max
%     \begin{cases}
%       \lambda+\omega +\beta  V_{\lambda,T-1}(\tau^1(\omega)) & \text{passive}\\
%       \omega+\beta\left(\omega V_{\lambda,T-1}(P_{1,1}^a)+(1-\omega) V_{\lambda,T-1}(P_{0,1}^a)\right) & \text{active}
%     \end{cases}
% \end{aligned}
% $}
% \end{equation}

\begin{figure}[ht]
    \centering
    \includegraphics[width=0.98\linewidth]{fig_omega2-cropped.pdf}
    \caption{The forward and reverse policy}
    \label{fig:forward_reverse}
\end{figure}
\paragraph{Case 1:} The belief state monotonically increases as time passes.
Formally, this can be expressed as $\frac{\partial \omega_{t}}{\partial t}>0$, or $\Delta \omega >0$.
We now derive the condition for the optimality of our algorithm for the case 1 under finite time horizon $T$. Consider first (any) period of length $L$, an arm $i$ has not been activated for the past $(L-1)$ time slots. Thus it needs to be pulled at time step $t=L$ according to our algorithm. 
% We only consider the passive set in the Fig~\ref{fig:forward_reverse}. 
Assume that the residual time horizon is $h$ at time step $t=L$, where we have $h+L=T$.
We move the active action at time step $t=L$ to one slot earlier, then at time step $t=L-1$, the residual time horizon is $h+1$, and assume that belief state is $\omega$ at time step $t=L-1$. We here discuss the finite horizon case because the infinite horizon could be viewed as a special case of the finite horizon setting as $T\rightarrow \infty (h\rightarrow \infty)$.

Because the belief state increases as time passes, the value gap, which we define as $\Delta V_{\lambda,h}(\omega) =  V_{\lambda,h}(\omega,a=1)-V_{\lambda,h}(\omega,a=0)$, moves from left to right as the residual time horizon decreases. For the single-arm process, if we can show that the gap $\Delta V_{\lambda,h}(\omega)$ increases from left to right (i.e., $\Delta V_{\lambda,h}(\omega)$ increases as the belief state increases), then moving the active action that ensures fairness at time step $t=L$ one step earlier (i.e., from right to left) results in a smaller gap $\Delta V_{\lambda,h}(\omega)$. Thus it is optimal to keep the active action at the end of the period to satisfy the fairness constraint. This requires that
\begin{equation}
    \frac{\partial V_{\lambda,h}(\omega,a=0)}{\partial \omega}\leq \frac{\partial V_{\lambda,h}(\omega,a=1)}{\partial \omega}
\end{equation}

According to the expression for $\lambda$, we have
\begin{equation}\label{eq:deviation_finite}
\resizebox{\linewidth}{!}{$
    \displaystyle
    \begin{aligned}
    \frac{\partial V_{\lambda,h}(\omega)}{\partial \omega} &=
    \begin{cases}
       1 +\beta  \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega} & \text{passive}\\
       1+\beta\left( V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a)\right) & \text{active}
    \end{cases}
\end{aligned}
$}
\end{equation}


As shown in the gray area of the left Fig.~\ref{fig:forward_reverse}, at time step $t=L-1$, we derive the technical condition for the optimality of our algorithm in the gray area under the infinite residual time horizon:

\begin{equation}
\resizebox{1\linewidth}{!}{$
    \begin{aligned}
        &(P_{1,1}^p-P_{0,1}^p)\left(1+\frac{\beta \Delta_3}{1-\beta} \right){\left(1-\beta(P_{1,1}^a-P_{0,1}^a)\right)} \leq (P_{1,1}^a - P_{0,1}^a)\\
        % \rightarrow& \frac{(P_{1,1}^p-P_{0,1}^p)\Delta_1}{1-\beta(P_{1,1}^a - P_{0,1}^a)} \leq \frac{P_{1,1}^a - P_{0,1}^a}{1-\beta} \text{ Line 1}\\
        \rightarrow& (P_{1,1}^p-P_{0,1}^p)\left(1+\frac{\beta \Delta_3}{1-\beta} \right) \leq \frac{P_{1,1}^a - P_{0,1}^a}{1-\beta(P_{1,1}^a-P_{0,1}^a)} \text{ Line 1}\\
        \rightarrow& (P_{1,1}^p-P_{0,1}^p)\left(1+\frac{\beta \Delta_3}{1-\beta} \right) \leq V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a) \text{ Line 2}\\
        \rightarrow&  (P_{1,1}^p-P_{0,1}^p) \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \leq V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a) \text{ Line 3}\\      
        \rightarrow&  \frac{\partial \tau^1(\omega)}{\partial \omega} \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \leq \left( V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a)\right) \text{ Line 4}\\
        \rightarrow&  1 +\beta  \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega}\leq 1+\beta\left( V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a)\right) \text{ Line 5}\\
        \rightarrow &\frac{\partial V_{\lambda,h}(\omega,a=0)}{\partial \omega}\leq \frac{\partial V_{\lambda,h}(\omega,a=1)}{\partial \omega} \text{ Line 6}
    \end{aligned}
    $}
\end{equation}

Line 1 is obtained by algebraic rearrangement. Line 2 is obtained from the lower bound in Lemma~\ref{lem:boundary}. Line 3 is obtained from Lemma~\ref{lem:partial_value_bound_upper_infinite} when assuming $h\rightarrow \infty$. Line 4 is obtained from Eq.~\ref{eq:partial_omega_1}. Line 5 is obtained by multiplying both sides by $\beta$ and adding $1$. Line 6 is obtained from Eq.~\ref{eq:deviation_finite}. Here, $\Delta_3 = \min \{(P_{1,1}^p-P_{0,1}^p),(P_{1,1}^a-P_{0,1}^a)\}$.

Similarly, we can derive the technical condition for the finite residual time horizon, which is 

\begin{equation}
\resizebox{1\linewidth}{!}{$
    (P_{1,1}^p-P_{0,1}^p)\left(\Delta_4\beta\sum_{t=0}^{h-2}[\beta^t]+1\right) \leq (P_{1,1}^a - P_{0,1}^a)\sum_{t=0}^{h-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t]
    $}
\end{equation}
where $\Delta_4= \min \{ (P_{1,1}^p-P_{0,1}^p), (P_{1,1}^a-P_{0,1}^a)\}$.


\begin{figure}[ht]
    \centering
    \includegraphics[width=0.9\linewidth]{fig_fully_ob_fairness.pdf}
    \caption{Proof of Theorem~\ref{thm:FC_infinite} and Theorem~\ref{thm:FC_finite}}
    \label{fig:fully_ob_fairness}
\end{figure}


\paragraph{Case 2:} The belief state monotonically decreases as time passes. Formally, this can be expressed as $\frac{\partial \omega_{t}}{\partial t}<0$, or $\Delta \omega <0$.
Similarly, we can derive the condition for the optimality of our algorithm for Case 2 under the finite time horizon $T$.
Because the belief state decreases as time passes, the value gap, which we define as $\Delta V_{\lambda,h}(\omega) =  V_{\lambda,h}(\omega,a=1)-V_{\lambda,h}(\omega,a=0)$, moves from right to left as the residual time horizon decreases. For the single-arm process, if we can show that the gap $\Delta V_{\lambda,h}(\omega)$ decreases from right to left (i.e., $\Delta V_{\lambda,h}(\omega)$ decreases as the belief state decreases), then moving the active action that ensures fairness at time step $t=L$ one step earlier results in a larger gap $\Delta V_{\lambda,h}(\omega)$. Thus it is optimal to keep the active action at the end of the period, i.e., at time step $t=L$. This requires that
\begin{equation}
    \frac{\partial V_{\lambda,h}(\omega,a=0)}{\partial \omega}\geq \frac{\partial V_{\lambda,h}(\omega,a=1)}{\partial \omega}
\end{equation}

As shown in the gray area of the left Fig.~\ref{fig:forward_reverse}, at time step $t=L-1$, we derive the technical condition for the optimality of our algorithm in the gray area under the infinite residual time horizon:

\begin{equation}
\resizebox{1\linewidth}{!}{$
    \begin{aligned}
        &(P_{1,1}^p-P_{0,1}^p)(1-\beta)\Delta_1 \geq (P_{1,1}^a - P_{0,1}^a)\left(1-\beta(P_{1,1}^a - P_{0,1}^a)\right)\\
        \rightarrow& \frac{(P_{1,1}^p-P_{0,1}^p)\Delta_1}{1-\beta(P_{1,1}^a - P_{0,1}^a)} \geq \frac{P_{1,1}^a - P_{0,1}^a}{1-\beta} \text{ Line 1}\\
        \rightarrow& (P_{1,1}^p-P_{0,1}^p)\Delta_1\underset{h\rightarrow \infty}{\lim}\sum_{t=0}^{h-2}\beta^t(P_{1,1}^a-P_{0,1}^a)^t \geq \frac{P_{1,1}^a - P_{0,1}^a}{1-\beta} \text{ Line 2}\\
        \rightarrow& (P_{1,1}^p-P_{0,1}^p)\Delta_1\underset{h\rightarrow \infty}{\lim}\sum_{t=0}^{h-2}\beta^t(P_{1,1}^a-P_{0,1}^a)^t  \geq V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a) \text{ Line 3}\\
        \rightarrow&  (P_{1,1}^p-P_{0,1}^p) \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \geq V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a) \text{ Line 4}\\      
        \rightarrow&  \frac{\partial \tau^1(\omega)}{\partial \omega} \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \geq \left( V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a)\right) \text{ Line 5}\\
        \rightarrow&  1 +\beta  \frac{ \partial V_{\lambda,h-1}(\tau^1(\omega))}{\partial \tau^1(\omega)} \frac{\partial \tau^1(\omega)}{\partial \omega}\geq 1+\beta\left( V_{\lambda,h-1}(P_{1,1}^a)- V_{\lambda,h-1}(P_{0,1}^a)\right) \text{ Line 6}\\
        \rightarrow &\frac{\partial V_{\lambda,h}(\omega,a=0)}{\partial \omega}\geq \frac{\partial V_{\lambda,h}(\omega,a=1)}{\partial \omega} \text{ Line 7}
    \end{aligned}
    $}
\end{equation}

Line 1 is obtained via  mathematical transformation. Line 2 is obtained from the formula for the geometric series as $\beta(P_{1,1}^a-P_{0,1}^a)<1$. Line 3 is obtained from the upper bound in Lemma~\ref{lem:boundary}. Line 4 is obtained from the Lemma~\ref{lem:partial_value_bound} when assuming $h\rightarrow \infty$. Line 5 is obtained from Eq.~\ref{eq:partial_omega_1}. Line 6 is obtained from the mathematical transformation. Line 7 is obtained from the Eq.~\ref{eq:deviation_finite}.

Similarly, we can derive the technical condition for the finite residual time horizon, which is 
\begin{equation}
\resizebox{1\linewidth}{!}{$
    (P_{1,1}^p-P_{0,1}^p)\left(\Delta_2\beta\sum_{t=0}^{h-2}[\beta^t(P_{1,1}^a-P_{0,1}^a)^t]+1\right) \geq (P_{1,1}^a - P_{0,1}^a)\sum_{t=0}^{h-2}\beta^t
    $}
\end{equation}
where $\Delta_2= \min \{ (P_{1,1}^p-P_{0,1}^p), (P_{1,1}^a-P_{0,1}^a)\}$.

When the belief state is in the white area of the passive set, we need to consider both arm A and the other arms in the active set. We give a detailed discussion in Appendix~\ref{ap_subsec:general_condition}.



\section{Other Proofs}
\subsection{Derivation of Equation}
The belief state can be calculated in closed form with the given transition probabilities. Let $\tau^u_i(\omega_{i,s}(t))=\omega_{i,s}(t+u)$ denote the $u$-step belief state update of $\omega_{i,s}(t)$ when the unobserved arm $i$ is updated for $u$ consecutive slots without being selected. Formally,
\begin{equation}\label{eq:closed}
\resizebox{1\linewidth}{!}{$
\begin{aligned}
        &\omega_{i,s}(u)= \tau^{u-1}(P_{s,1}^{a,i})  \text{ where }\\ &\tau^{u}_i(\omega) =\frac{P_{0,1}^{p,i}-(P_{1,1}^{p,i}-P_{0,1}^{p,i})^u(P_{0,1}^{p,i}-\omega(1+P_{0,1}^{p,i}-P_{1,1}^{p,i}))}{(1+P_{0,1}^{p,i}-P_{1,1}^{p,i})}
\end{aligned}
$}
\end{equation}

This is because $\tau^u(\omega)=\omega \tau^u(1)+(1-\omega) \tau^{u}(0)$, where $\tau^u(1)$ is the $u$-step transition probability from $1$ to $1$ when arm is unobserved, and $\tau^u(0)$ is the $u$-step transition probability from $0$ to $1$ if the transition matrix $P_{s,s^\prime}^{a} = P_{s,s^\prime}^{p} = P_{s,s^\prime}$. From the eigen-decomposition of the transition matrix $\mathcal{P}$, we can have $$\tau^u(1) = \frac{P_{0,1}+(1-P_{1,1})(P_{1,1}-P_{0,1})^u}{1+P_{0,1}-P_{1,1}}$$ 
and 
$$\tau^u(0)=\frac{P_{0,1}(1-(P_{1,1}-P_{0,1})^u)}{1+P_{0,1}-P_{1,1}}$$ 
and solve it to get
\begin{equation*}
\resizebox{1\linewidth}{!}{$
    \tau^u(\omega) = \frac{P_{0,1}-(P_{1,1}-P_{0,1})^u(P_{0,1}-\omega(1+P_{0,1}-P_{1,1}))}{(1+P_{0,1}-P_{1,1})}
    $}
\end{equation*}
However, since $P_{s,s^\prime}^{a}\neq P_{s,s^\prime}^{p}$ in general, we have $\omega_{i,s}(1)=P_{s,1}^{a,i}$, and $\tau^u_i(\omega)$ is as shown in Eq.~\ref{eq:closed}, which leads to $\omega_{i,s}(u)$.


\subsection{Proof of Whittle index decay under partially observable setting (Theorem~\ref{thm:decay})}
\begin{proof}
We can use the induction to prove the index decay. 
Again, $\lambda_0$ satisfies: $V_{\lambda_0,0}(\omega,a=0)=V_{\lambda_0,0}(\omega,a=1)$ for any belief state $\omega\in [0,1 ]$. We can have $\lambda_0+\omega =\omega$, thus $\lambda_0 = 0$. Similarly, $\lambda_1$ can be solved by assuming equation  $V_{\lambda_1,1}(\omega,a=0)=V_{\lambda_1,1}(\omega,a=1)$:
\begin{equation}
\resizebox{\linewidth}{!}{$
\begin{aligned}
        &\omega + \lambda +\beta (\omega P_{1,1}^p + (1-\omega)P_{0,1}^p)= \omega+\beta (\omega P_{1,1}^a+(1-\omega)P_{0,1}^a)\\
        \rightarrow &\lambda =\beta(\omega(P_{1,1}^a-P_{1,1}^p)+(1-\omega)(P_{0,1}^a-P_{0,1}^p)) 
\end{aligned}
$}
\end{equation}
As it is true in the real-world that $P_{s,1}^a>P_{s,1}^p$, thus we have $\lambda_1>0=\lambda_0$. Now we assume the hypothesis that $\lambda_{u}>\lambda_{u-1}$ for $u \in \{1,\dots, T\}$ holds, we must show: $\lambda_{T+1}>\lambda_T$.
There is a similar work by~\citet{mate2020collapsing}; however, they only show that $\lambda_T>\lambda_1$ for all $T > 1$. Their conclusion is built on the following fact: for two non-decreasing, linear functions $f_1(\lambda)$ and $f_2(\lambda)$ of $\lambda$ and two points $x_1,x_2$, whenever $f_1(x_1)\leq f_2(x_1)$ and $f_1(x_2)\geq f_2(x_2)$, and $\frac{\partial f_1}{\partial x}\geq \frac{\partial f_2}{\partial x}$, we have $x_1\leq x_2$.
We here take advantage of this fact and show a different way to demonstrate the index decay in the partially observable setting such that $\lambda_{T+1} > \lambda_{T}$.


We first use $V_{\lambda,T+1}(\omega,a=0)$ to denote function $f_1^p$, and set $f_2^a=V_{\lambda,T+1}(\omega,a=1)$. Then it is obvious that the value of $f_1^p(\omega,\lambda,t)$ will increase as the exogenous reward $\lambda$ increase, whereas the value of $f_2^a(\omega,\lambda,t)$ will increase slower according to the expression. Thus we have the following:
\begin{equation}
\label{eq:decay_1}
   \frac{\partial f_1^p(\omega,\lambda,t)}{\partial \lambda}>\frac{\partial f_2^a(\omega,\lambda,t)}{\partial \lambda} 
\end{equation}
We can prove $\lambda_{T+1} > \lambda_{T}$ through contradiction. We first assume that $\lambda_{T} \geq \lambda_{T-1}$ holds. Then we could have :
\begin{equation}
    f_1^p(\omega,\lambda_{T-1},T-1)=f_2^a(\omega,\lambda_{T-1},T-1);
\end{equation}
and 
\begin{equation}\label{eq:decay_T}
\resizebox{1\linewidth}{!}{$
    f_1^p(\omega,\lambda_{T},T-1)>f_1^p(\omega,\lambda_{T-1},T-1)=f_2^a(\omega,\lambda_{T-1},T-1)>f_2^a(\omega,\lambda_{T},T-1).
    $}
\end{equation}
Similarly, we can also have
\begin{equation}\label{ineq:decay_T}
    f_1^p(\omega,\lambda_{T},T)=f_2^a(\omega,\lambda_{T},T).
\end{equation}
We can get
\begin{equation}\label{eq:partial_for1}
    \frac{\partial f_1^p(\omega,\lambda_T,t)}{\partial t}<\frac{\partial f_2^a(\omega,\lambda_T,t)}{\partial t}
\end{equation}

Now suppose, for the sake of contradiction, that $\lambda_{T+1} \leq \lambda_{T}$.
According to the definition of $\lambda_{T+1}$, we have
\begin{equation}\label{eq:deay_T+1}
    f_1^p(\omega,\lambda_{T+1},T+1)=f_2^a(\omega,\lambda_{T+1},T+1);
\end{equation} 
It is obvious that $\frac{\partial f_1^p(\omega,\lambda,t)}{\partial t}>0$, and $\frac{\partial f_2^a(\omega,\lambda,t)}{\partial t}>0$ (from Lemma~\ref{lem:inc}). According to Eq.~\ref{eq:partial_for1} and Eq.~\ref{ineq:decay_T}, we can have
\begin{equation}\label{ineq:decay_T+1}
    f_1^p(\omega,\lambda_T,T+1)<f_2^a(\omega,\lambda_T,T+1)
\end{equation}
This requires that Eq.~\ref{eq:partial_for1} is also satisfied from $T$ to $T+1$. This is equivalent to showing
\begin{equation}\label{eq:cond_decay}
    f_1^p(\omega,\lambda_T,T+1) < f_2^a(\omega,\lambda_T,T+1)
\end{equation}
Because 
\begin{equation}
\begin{aligned}
    f_1^p(\omega,\lambda_T,T+1) &= \omega + \lambda_T + \beta V_{\lambda_T, T}(\omega_1)\\
    f_2^a(\omega,\lambda_T,T+1) &= \omega + \beta V_{\lambda_T, T}(\omega_2)
\end{aligned}
\end{equation}
where $\omega_1 = P_{1,1}^p \omega + P_{0,1}^p (1-\omega)$ and $\omega_2 = P_{1,1}^a \omega + P_{0,1}^a (1-\omega)$. Because we have $\omega_2>\omega_1$ and $\omega_2>\omega$, thus we can get
\begin{equation}
    V_{\lambda_T, T}(\omega_2) > V_{\lambda_T, T}(\omega_1) \text{ and } V_{\lambda_T, T}(\omega_2)>V_{\lambda_T, T}(\omega)
\end{equation}

According to the definition of $\lambda_T$, we have:
\begin{equation}
    \begin{aligned}
             f_1^p(\omega,\lambda_T,T)&=f_2^a(\omega,\lambda_T,T)\\
             \omega + \lambda_T + \beta V_{\lambda_{T-1}, T-1}(\omega_1) &= \omega + \beta V_{\lambda_{T-1}, T-1}(\omega_2)
    \end{aligned}
\end{equation}
Replace with Eq.~\ref{eq:cond_decay}, we have:
\begin{equation}
\resizebox{\linewidth}{!}{$
\begin{aligned}
    &\beta V_{\lambda_{T-1}, T-1}(\omega_2) -\beta V_{\lambda_{T-1}, T-1}(\omega_1) + \beta V_{\lambda_{T}, T}(\omega_1) < \beta V_{\lambda_{T}, T}(\omega_2) \\
    \rightarrow & V_{\lambda_{T-1}, T-1}(\omega_2) - V_{\lambda_{T}, T}(\omega_2) <  V_{\lambda_{T-1}, T-1}(\omega_1) -  V_{\lambda_{T}, T}(\omega_1)
\end{aligned}
$}
\end{equation}
It is equivalent to show 
\begin{equation}
\begin{aligned}
    &\frac{\partial (V_{\lambda_{T-1}, T-1}(\tau(\omega)) - V_{\lambda_{T}, T}(\tau(\omega)))}{\partial \tau(\omega)}\frac{\partial \tau(\omega)}{\partial \omega} < 0
    % \rightarrow &\frac{\partial (V_{\lambda_{T-1}, T-1}(\omega))}{\partial \omega} < \frac{\partial ( V_{\lambda_{T}, T}(\omega))}{\partial \omega}
\end{aligned}
\end{equation}
This is intuitively correct, but we leave as future work the question of whether an additional condition is required to make it always hold. Since this is not relevant to the fairness constraint studied in this paper, we only want to illustrate that it is difficult to derive the Whittle index value in the finite horizon case.

As $\lambda_{T+1} \leq \lambda_{T}$, according to  Eq.~\ref{eq:deay_T+1} and Eq.~\ref{eq:decay_1}, we can have
\begin{equation}\label{eq:decay_4}
   f_1^p(\omega,\lambda_{T},T+1)\geq f_2^a(\omega,\lambda_{T},T+1) 
\end{equation}

Eq.~\ref{eq:decay_4} and Eq.~\ref{ineq:decay_T+1} contradict each other. Hence $\lambda_{T+1}>\lambda_T$.
% However, as we have shown $\frac{\partial f_1^p(\omega,\lambda,t)}{\partial \lambda}>\frac{\partial f_2^a(\omega,\lambda,t)}{\partial \lambda}$. 
% Those equations (~\ref{eq:decay_T},~\ref{eq:partial_for1} and~\ref{eq:decay_4}) will conflict with each other. Thus we have $\lambda_{T+1}>\lambda_T$.
\end{proof}



\subsection{Proof of Theorem~\ref{thm:q_learning}}\label{ap_subsec:proof_q_learning}
\begin{proof}
We provide our proof, which is based on the work by~\citet{biswas2021learn}. Let $\phi^{\ast}$ be the set of actions containing the $k$ arms with the highest-ranking values of $Q_i(s,a=1,l)-Q_i(s,a=0,l)$, and let $\phi^{\prime}$ be a set containing any $k$ arms that are not among the top $k$.
% $\phi^{-,\ast}$ is the set that includes any arms that are not among the top $k$ arms.
Let $\phi^{-,\ast}$ and $\phi^{-,\prime}$ denote the set that includes all of the arms except those in set $\phi^{\ast}$ and $\phi^{\prime}$, respectively. We add the subscript $i$ here in order to avoid ambiguity in the Q-values of distinct arms $i$ at a given state. We could have:
\begin{equation}
\begin{aligned}
    \sum_{i^{\ast}\in \phi^{\ast}}\left[  Q_{i^{\ast}}^{\ast}(s_{i^{\ast}},a_{i^{\ast}}=1,l_{i^{\ast}})-Q_{i^{\ast}}^{\ast}(s_{i^{\ast}},a_{i^{\ast}}=0,l_{i^{\ast}}) \right] \geq \\
    \sum_{j\in \phi^{\prime}}\left[  Q_{j}^{\ast}(s_j,a_j=1,l_j)-Q_{j}^{\ast}(s_j,a_j=0,l_j)\right]
\end{aligned}
\end{equation}
\begin{equation}
    \begin{aligned}
    \sum_{i^{\ast}\in \phi^{\ast}} Q_{i^{\ast}}^{\ast}(s_{i^{\ast}},a_{i^{\ast}}=1,l_{i^{\ast}})+\sum_{j\in \phi^{\prime}}Q_{j}^{\ast}(s_j,a_j=0,l_j) \geq \\
    \sum_{j\in \phi^{\prime}}  Q_{j}^{\ast}(s_j,a_j=1,l_j)+\sum_{i^{\ast}\in \phi^{\ast}} Q_{i^{\ast}}^{\ast}(s_{i^{\ast}},a_{i^{\ast}}=0,l_{i^{\ast}}) 
    \end{aligned}
\end{equation}
Adding $\underset{i\notin \phi^{\ast} \& i\notin \phi^{\prime}}{\sum}  Q_{i}^{\ast}(s_i,a_i=0,l_i)$ on both sides,
\begin{equation}\label{eq:equivalence}
    \begin{aligned}
        \sum_{i^{\ast}\in \phi^{\ast}} Q_{i^{\ast}}^{\ast}(s_{i^{\ast}},a_{i^{\ast}}=1,l_{i^{\ast}})+\sum_{j\in \phi^{-,\ast}}Q_{j}^{\ast}(s_j,a_j=0,l_j) \geq \\
        \sum_{i\in \phi^{\prime}}  Q_{i}^{\ast}(s_i,a_i=1,l_i)+\sum_{j\in \phi^{-,\prime}} Q_{j}^{\ast}(s_{j},a_{j}=0,l_{j})
    \end{aligned}
\end{equation}
As can be seen from Equation~\ref{eq:equivalence}, taking the intervention action for the arms in the set $\phi^{\ast}$ maximizes $\left\{ \sum_{i=1}^N Q_i^\ast(s,a,l)\right\}$.
\end{proof}




\subsection{Proof of Theorem~\ref{thm:Q_2}}
\label{subsec:thm:Q_2}
\begin{proof}
Convergence is contingent on a particular sequence of episodes observed in the real process~\cite{watkins1992q}. The first condition is easily satisfied owing to the presence of the fairness constraint. It is a reasonable assumption under the $\epsilon$-greedy action selection mechanism that any state-action pair can be visited an unlimited number of times as $T\rightarrow \infty$. The second condition has been well-studied in~\cite{hirsch1989convergent,watkins1992q,jaakkola1994convergence}, and it guarantees that when the condition is met, the Q-value converges to the optimal $Q^{\ast}(s,a,l)$. As a result, $Q_i(s,a=1,l)-Q_i(s,a=0,l)$ converges to $Q_i^{\ast}(s,a=1,l)-Q_i^{\ast}(s,a=0,l)$. Also, $Q_i^{\ast}(s,a=1,l)-Q_i^{\ast}(s,a=0,l)$ is the calculated Q-Learning based Whittle index, and choosing top-ranked arms based on these values would lead to an optimal solution.

\end{proof}








\section{General Condition}\label{ap_sec:general_condition}
\subsection{A general condition for Theorem~\ref{thm:FC_infinite_fully}}\label{proof:fully_obs}\label{ap_subsec:general_condition_fully}
% \begin{proof}
During a time interval, we consider the two arms A and B as shown in Figure~\ref{fig:fully_ob_fairness}, where B is the $k$-th ranked arm in Algorithm 1. We assume that the optimal policy is $\pi^\ast$, which only considers the fairness constraint when it is violated at the end of the interval, implying that arm $A$ has not been activated in the past $(L-1)$ slots, as shown on the left. We move the action $a_L=1$ one slot earlier, where action $a_L$ is the action that ensures the fairness constraint. We denote this policy as $\pi$, shown on the right. We first consider the fully observable setting here.
Assume that the state of arm A at time step $L-1$ is $s_A$ and the state of arm B is $s_B$, with $s_A,s_B\in \{0,1 \}$. Let the corresponding Whittle indices be $\lambda_A$ for arm A and $\lambda_B$ for arm B. We can calculate the value function $V_{\lambda_A,\infty}(s_A)$ for arm A from time slot $L-1$ under the policy $\pi^\ast$; we have
\begin{equation}\label{eq:fully_1}
\resizebox{\linewidth}{!}{$
    \begin{aligned}
        V_{\lambda_A,\infty}(s_A)=s_A+\lambda_A+\beta\{P_{1,1}^p s_A+P_{0,1}^p(1-s_A)+
        \beta[(P_{1,1}^p s_A\\
        +P_{0,1}^p(1-s_A))V_{\lambda_A,\infty}(P_{1,1}^a) +(1-P_{1,1}^p s_A-P_{0,1}^p(1-s_A))V_{\lambda_A,\infty}(P_{0,1}^{a})  ]   \}
    \end{aligned}
    $}
\end{equation}
Note that $V_{\lambda,\infty}(\omega)=\omega V_{\lambda,\infty}(1)+(1-\omega)V_{\lambda,\infty}(0)$. Similarly, we can get the value function for the arm A under the policy $\pi$:
\begin{equation}\label{eq:fully_2}
\resizebox{\linewidth}{!}{$
    \begin{aligned}
        V_{\lambda_A,\infty}(s_A)=s_A+\beta(s_A V_{\lambda_A,\infty}(P_{1,1}^a)+(1-s_A)V_{\lambda_A,\infty}(P_{0,1}^a))
    \end{aligned}
    $}
\end{equation}
Similarly, the value function for the arm B under policy $\pi^\ast$:
\begin{equation}\label{eq:fully_3}
\resizebox{1\linewidth}{!}{$
    \begin{aligned}
        V_{\lambda_B,\infty}(s_B)=s_B+\beta(s_B V_{\lambda_B,\infty}(P_{1,1}^a)+(1-s_B)V_{\lambda_B,\infty}(P_{0,1}^a)),
    \end{aligned}
    $}
\end{equation}
and the value function for the arm B under policy $\pi$:
\begin{equation}\label{eq:fully_4}
\resizebox{1\linewidth}{!}{$
    \begin{aligned}
        V_{\lambda_B,\infty}(s_B)=\lambda_B+s_B+\beta (V_{\lambda_B,\infty}( s_B P_{1,1}^p+(1-s_B)P_{0,1}^p)),
    \end{aligned}
    $}
\end{equation}
Subtracting Eq.~\ref{eq:fully_2} from Eq.~\ref{eq:fully_1} and Eq.~\ref{eq:fully_4} from Eq.~\ref{eq:fully_3}, and requiring the sum of these two differences to be greater than $0$, we obtain the general condition for Theorem~\ref{thm:FC_infinite_fully}. The explicit calculation of the value function can be found in~\citep{liu2010indexability}.
% \end{proof}


\subsection{General Condition for Theorem~\ref{thm:FC_infinite} and Theorem~\ref{thm:FC_finite}}
\label{ap_subsec:general_condition}
This is similar to the fully observable case in Appendix~\ref{proof:fully_obs}.
Assume that the optimal action set calculated by the Whittle index policy is $A$, and that the set of the last several actions used to satisfy the fairness constraints is $B$, where action $b\in B$ is for arm $i$, as shown in Figure~\ref{fig:NIB}.
\begin{figure}[th]
    \centering
    \includegraphics[width=0.85\linewidth]{fig_belief_Whittle.pdf}
    \caption{Proof of Theorem~\ref{thm:FC_infinite}}
    \label{fig:NIB}
\end{figure}
If we shift the action $b\in B$ at the end of the time slot one slot earlier, and add another action $c$ for arm $j$ into the action set $B$, then, according to the belief state update process, the subsequent actions for arms other than $i$ and $j$ will not be influenced. As a result, we only need to consider arm $i$ and arm $j$. If we show that the modified policy does not have a greater value function, then we can repeatedly move the action ${b}$ one slot earlier and show that the value function never exceeds that of the original policy. Hence, we can conclude that the value of the modified policy is no greater than that of the optimal policy. The Whittle index approach is still applicable under the fairness constraint unless the last few slots are used to ensure that the fairness criteria are met. The numerical condition is as follows.

We consider the partially observable setting. Assume that the probability that arm A is in the good state at time slot $L-1$ is $\omega_i$, and that the corresponding probability for arm B is $\omega_j$; let the corresponding Whittle index be $\lambda_i$ for arm A and $\lambda_j$ for arm B. Computing the value function $V_{\lambda_i,\infty}(\omega_i)$ from time slot $L-1$ onward under the policy $\pi^\ast$, we have
\begin{equation}\label{eq:eq_pi^ast}
\begin{aligned}
      V_{\lambda_i,\infty}(\omega_i)=\lambda_i+\omega_i+\beta(\omega_i P_{1,1}^p+(1-\omega_i)P_{0,1}^p) \\
    +\beta^2\left\{ (A+B)V_{\lambda_i,\infty}(1) + (C+D)V_{\lambda_i,\infty}(0) \right\}      
\end{aligned}
\end{equation}
where $A=P_{1,1}^p(\omega_i P_{1,1}^p+(1-\omega_i)P_{0,1}^p)$, $B=P_{0,1}^a(1-\omega_i P_{1,1}^p-(1-\omega_i)P_{0,1}^p)$, $C=(1-P_{1,1}^a)(\omega_i P_{1,1}^p+(1-\omega_i)P_{0,1}^p)$ and $D=(1-P_{0,1}^a)(1-\omega_i P_{1,1}^p-(1-\omega_i)P_{0,1}^p)$.

Similarly, we can get the value function for the arm A under the policy $\pi$:
\begin{equation}\label{eq:eq_pi}
\begin{aligned}
      V_{\lambda_i,\infty}(\omega_i)=\omega_i+\beta(\omega_i P_{1,1}^a+(1-\omega_i)P_{0,1}^a+\lambda_i) \\
    +\beta^2\left\{ (A+B^\prime)V_{\lambda_i,\infty}(1) + (C^\prime+D^\prime)V_{\lambda_i,\infty}(0) \right\}, 
\end{aligned}
\end{equation}
where $B^\prime=B-P_{0,1}^a+P_{0,1}^p$, $C^\prime=C+\omega_i(P_{1,1}^a-P_{1,1}^p)+(1-\omega_i)(P_{0,1}^a-P_{0,1}^p)$ and $D^\prime=D+\omega_i(P_{1,1}^p-P_{1,1}^a)+(1-\omega_i)(P_{0,1}^p-P_{0,1}^a)+(P_{0,1}^a-P_{0,1}^p)$.
Subtracting Eq.~\ref{eq:eq_pi} from Eq.~\ref{eq:eq_pi^ast}, we obtain
\begin{equation}\label{eq:change_A}
\resizebox{\linewidth}{!}{$
\begin{aligned}
            \lambda_i(1-\beta) + \beta(\omega_i(P_{1,1}^p-P_{1,1}^a)+(1-\omega_i)(P_{0,1}^p-P_{0,1}^a))+\\ \beta^2((P_{0,1}^a-P_{0,1}^p)V_{\lambda_i,\infty}(1)+(P_{0,1}^p-P_{0,1}^a)V_{\lambda_i,\infty}(0))
\end{aligned}
$}
\end{equation}
Similarly, for arm B, we obtain the change in the value function from the policy $\pi^\ast$ to $\pi$:
\begin{equation}\label{eq:change_B}
\resizebox{\linewidth}{!}{$
\begin{aligned}
         \lambda_j(\beta-1) - \beta(\omega_j(P_{1,1}^p-P_{1,1}^a)-(1-\omega_j)(P_{0,1}^p-P_{0,1}^a))+\\ \beta^2((P_{0,1}^a-P_{0,1}^p)V_{\lambda_j,\infty}(1)+(P_{0,1}^p-P_{0,1}^a)V_{\lambda_j,\infty}(0))   
\end{aligned}
$}
\end{equation}
Summing Eq.~\ref{eq:change_A} and Eq.~\ref{eq:change_B} and requiring the sum to be greater than $0$ gives the condition under which the optimal policy of Theorem~\ref{thm:FC_infinite} for the RMAB with the fairness constraint is to select the arm to play when its fairness constraint is violated in the time interval, under the partially observable setting. The explicit computation of the value function can be found in~\citep{liu2010indexability}.






\begin{figure*}[ht]
  \centering
  \begin{minipage}[b]{\linewidth}
  \includegraphics[width=0.33\linewidth]{fig_constraint_FC_2.pdf} 
  \includegraphics[width=0.33\linewidth]{fig_constraint_Prior_3.pdf} 
  \includegraphics[width=0.33\linewidth]{fig_constraint_Q.pdf}
  \end{minipage}
  \caption{The average reward of each arm over the time length $T=1000$ for different fairness constraint strengths, with a small penalty for violating the fairness constraint.}
  \label{fig:sensity}
\end{figure*}

\begin{figure*}[ht]
  \centering
  \begin{minipage}[b]{\linewidth}
  \includegraphics[width=0.33\linewidth]{fig_resource_FC.pdf} 
  \includegraphics[width=0.33\linewidth]{fig_resource_Prior_3.pdf} 
  \includegraphics[width=0.33\linewidth]{fig_resource_Q.pdf}
  \end{minipage}
  \caption{The average reward of each arm over the time length $T=1000$ for different intervention levels, with a small penalty for violating the fairness constraint.}
  \label{fig:k_level}
\end{figure*}


\section{Visualization of algorithm}\label{app:vis}
We give a visualization of our proposed Whittle-index-based approach for handling the fairness constraint in Figure~\ref{fig:Belief_vis}.
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.98\linewidth]{fig_Belief.pdf}
    \caption{Visualization of Whittle index approach with fairness constraints.}
    \label{fig:Belief_vis}
\end{figure}

The belief state MDP works as follows. Initially, after an action, the state $s\in\{0,1\}$ of the selected arm is observed. The belief state then changes to $P_{s,1}^a$ one slot later, which is represented by the blue node at the head of the chain. Subsequent passive actions cause the belief state to evolve along the same chain, according to the initial observation. If the arm is activated again under the proposed algorithm, it transitions to the head of one of the chains with probability given by its belief state, as shown by the black arrow. If the arm's fairness constraint is not met, i.e., it has not been chosen in the last $L-1$ time slots, it is activated at time slot $L$ and goes to the head of one of the chains (as shown by the red dashed arrow).

% \begin{figure*}[ht]
%   \centering
%   \begin{minipage}[b]{\linewidth}
%   \includegraphics[width=0.33\linewidth]{resource_FC.pdf} 
%   \includegraphics[width=0.33\linewidth]{resource_Prior.pdf} 
%   \includegraphics[width=0.33\linewidth]{resource_Q.pdf}
%   \end{minipage}
%   \caption{The average reward of each arm over the time length $T=1000$.}
%   \label{fig:k_level}
% \end{figure*}
\section{Additional Results}
\label{app:additional}
\subsection{Fairness Constraints Strength}
In this section, we provide the average reward results for different fairness constraints, and see how they influence the overall performance. 
The strength of fairness restrictions is represented by the combination of $L$ and $\eta$.
% For instance, $\eta$ is a parameter for determining lowest bound of an arm should be activated during a time period of length $L$.
For instance, $\eta$ is a parameter that determines the lower bound on the number of times an arm should be activated in a decision period of length $L$. A smaller $L$, on the other hand, indicates that a strict fairness constraint should be addressed in a shorter time length. For ease of explanation, we fix the value of $\eta=2$, which means that an arm will be activated at least twice in any window of $L$ time steps. We can change the value of $L$ to vary the fairness constraint level.
We investigate three different categories of fairness constraint strength as follows:
\begin{itemize}
    \item \textbf{Strong level}: The strong fairness constraints impose strict restrictions on the action. Here we assume that $L$ under the strong fairness constraints satisfies $\frac{k\times L}{N}=1.3$, which translates to at most $30\%$ of the arms being engaged twice before all arms have been pulled.
    \item \textbf{Medium level}: We define the medium fairness constraints by solving: $\frac{k\times L}{N}=2$.
    \item \textbf{Low level}: The low strength of fairness constraints can be interpreted as a low fairness restriction on the distribution of the resources, i.e., we have $\frac{k\times L}{N}=3$, which means all arms will receive the health intervention before each arm has been activated three times on average.
\end{itemize}
We provide the average reward results in Figure~\ref{fig:sensity}. Again, the left graph shows the performance of the Whittle index approach with fairness constraints when the transition model is known, the middle graph presents the result of the Thompson sampling-based approach for Whittle index calculation, and the right graph shows the result for the Q-Learning based Whittle index approach. Our proposed approach can handle fairness constraints at different strength levels without sacrificing significantly on the solution quality.
% \begin{figure*}[ht]
%   \centering
%   \begin{minipage}[b]{\linewidth}
%   \includegraphics[width=0.33\linewidth]{constraint_FC.pdf} 
%   \includegraphics[width=0.33\linewidth]{constraint_Prior.pdf} 
%   \includegraphics[width=0.33\linewidth]{constraint_Q.pdf}
%   \end{minipage}
%   \caption{The average reward of each arm over the time length $T=1000$.}
%   \label{fig:sensity}
% \end{figure*}




\subsection{Intervention Level}
In this part, we present the performance results for various resource levels where the fairness constraint $L$ is fixed, and we ensure that $k \times L < N$. Here, we let $L=30$ and $N=100$, and we examine the performance for intervention ratios $\frac{k}{N}=5\%, 10\%, 20\%, 30\%$, respectively; the results are shown in Figure~\ref{fig:k_level}. We can see that our proposed approach for handling the fairness constraint consistently outperforms the Random and Myopic baselines regardless of the intervention strength, while not differing significantly from the optimal value obtained without taking fairness constraints into account.

% \begin{figure*}[ht]
%   \centering
%   \begin{minipage}[b]{\linewidth}
%   \includegraphics[width=0.33\linewidth]{resource_FC.pdf} 
%   \includegraphics[width=0.33\linewidth]{resource_Prior.pdf} 
%   \includegraphics[width=0.33\linewidth]{resource_Q.pdf}
%   \end{minipage}
%   \caption{The average reward of each arm over the time length $T=1000$.}
%   \label{fig:k_level}
% \end{figure*}


% \subsection{Hybrid Transition Matrix}
% % In the hibrid setting, we assume that the transition matrix $\mathcal{P}$ will change in during the process. 
% % To investigate the performance of our approaches under the dynamic change of transition probabilities, it suffices to consider a single change in the middle of the time steps.
% We assume that the transition matrix $\mathcal{P}$ will change during the process in the hybrid setting. It is sufficient to evaluate a single change in the midst of the time steps to investigate the performance of our approaches under the dynamic change of transition probabilities.

\bibliography{li_493-supp}

\end{document}
