%\begin{center}
%\centering
%\textbf{\large{Supplementary material for "\texttt{BanditQ:} Fair Multi-Armed Bandits with Guaranteed Rewards per Arm" }}
%\end{center}
\newpage
\onecolumn
\title{ Appendix for \\
\textsc{BanditQ:} Fair Bandits with Guaranteed Rewards}
\maketitle
\appendix
%\vspace{-140pt}
%h
%\vspace{-85pt}
%\section{Appendix} \label{appendix}
\section{On the feasibility assumption}\label{feas-sec}
Throughout the paper, we assume that the target rate vector $\bm{\vec \lambda}$ is feasible. In practice, we can ensure the feasibility by estimating the expected rewards from past data and requiring that condition \eqref{feas-constr} is strictly satisfied with a reasonable margin. To put it quantitatively, let $\hat{\bm{\mu}}$ be the estimated expected reward vector where it is known that $||\hat{\bm{\mu}} -\bm{\mu}||_\infty \leq \epsilon,$ for a small error bound $\epsilon \geq 0$. Then, for the required reward rate vector $\bm{\vec \lambda}$ to be feasible, using the first-order Taylor's series expansion, it is sufficient that: 
\begin{eqnarray} \label{feasibility_test}
	\sum_i \frac{\lambda_i}{\hat{\mu_i}} + \epsilon \sum_i \frac{\lambda_i}{\hat{\mu_i}^2} \leq 1.
\end{eqnarray}
Although the estimated mean rewards can reasonably be used for determining the feasibility of the required reward rates, they cannot possibly be used for the online selection of the arms with no regret, as even a small constant error in the estimated rewards may lead to a linear regret. 
\iffalse
\section{Challenges in adapting the classic bandit policies to the \BQ problem} \label{adapt}
%\edit{todo}
Classic bandit policies, such as Upper-confidence bound (UCB), Explore-then-commit (ETC), and Hedge, attempt to pull the most-rewarding arm $1-o(1)$ fraction of times to achieve a sublinear regret against the best arm in hindsight. Hence, these policies ensure that the fraction of pulls of non-optimal arms tends to zero (otherwise, the policy would incur a linear regret). However, in our problem, even the arm with the smallest mean reward must be pulled a strictly positive fraction of times to satisfy the fairness constraints. Hence, any algorithm that ultimately "commits" to pulling one of the arms (such as ETC, which pulls all sub-optimal arms a positive fraction of times and then commits) would not work in our setting. Secondly, the fairness constraint is a long-term constraint. At any intermediate round, unless the policy keeps track of the cumulative rewards (which is done through the queueing process here), there does not seem to be a straightforward way to ascertain that the policy would eventually satisfy the constraints. Thirdly, Proposition \ref{avg-regret} shows that the \BQ policy achieves a time-average regret of $O(\sqrt{T})$. Hence, even without any fairness constraints, the \BQ policy performs well.
\fi

%\section{Proof of Theorem \ref{q_bd}} \label{q_bd_pf}
%First, we will derive a sublinear bound for the expected queue lengths under the \BQ policy. We will see that the violation and regret bounds follow from this result.
%\paragraph{Bounding the queue lengths:}
%Since the reward components are bounded in $[0,1],$ using the fact that $\sum_i r_i(\tau)(x_i(\tau)-x_i^*) \leq 1, \forall \tau,$ we have that $\textrm{Regret}_t(\bm{x}^*)\geq -t.$ Hence, from Eq.\ \eqref{main-ineq3}, we have for all $t\geq 1:$
%\begin{eqnarray} \label{ineq1}
%	\sum_i\mathbb{E}Q_i^2(t) \leq 2(V+1)t + 4 \sqrt{2\sum_{\tau=1}^t\sum_{i}\mathbb{E}Q_i^2(\tau) } + 4V \sqrt{2Nt}.
%\end{eqnarray}
%Hence, for any round $1\leq \tau \leq t,$ we have that 
%\begin{eqnarray*}
%		\sum_i \mathbb{E}Q_i^2(\tau) \leq 2(V+1)t + 4 \sqrt{2\sum_{\tau=1}^t\sum_{i}\mathbb{E}Q_i^2(\tau) } + 4V \sqrt{2Nt}.
%\end{eqnarray*}
%Summing up the above inequality for all $\tau \in [1,t],$ we have 
%\begin{eqnarray*}
%	R^2(t) \leq 2(V+1)t^2 + 4 \sqrt{2N}V t^{\nicefrac{3}{2}} + 4\sqrt{2} t R(t).
%\end{eqnarray*}
%where we have defined $R(t) \equiv \sqrt{\sum_{\tau=1}^t \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)}.$ Solving the above quadratic inequality in $R(t)$, we conclude that
%\begin{eqnarray} \label{r-bd}
%	 R(t) = O(t)+ O(t\sqrt{V})+O(N^{\nicefrac{1}{4}}\sqrt{V}t^{\nicefrac{3}{4}})= O(t\sqrt{V}).
%\end{eqnarray} 
%Plugging in this bound in \eqref{ineq1}, we have 
%\begin{eqnarray} \label{q-sq-bd}
% \mathbb{E}Q_i^2(t) = O(Vt) + O(t\sqrt{V})+ O(V\sqrt{Nt}) = O(Vt) \stackrel{\textrm{(Jensen's ineq.)}}{\implies}  \mathbb{E}Q_i(t) = O(\sqrt{Vt}).	
%\end{eqnarray}
%\paragraph{Bounding the violation penalty:}
%	Upon expanding \eqref{q-ev}, we obtain the following well-known representation of the Lindley recursion \citep[pp. 92]{asmussen2003applied}: 
%	\begin{eqnarray} \label{q-len-bd}
%		Q_i(t) = \sup_{1\leq \tau \leq t}(0, \lambda_i \tau - \sum_{z=t-\tau+1}^t r_i(z) x_i(z)), ~ \forall i \in \mathcal{P}.
%	\end{eqnarray}
%Combining Eq.\ \eqref{q-len-bd} with the bound \eqref{q-sq-bd}, we can bound the constraint violation penalty as 
%	\begin{eqnarray*}
%		\mathbb{V}(T) \leq \max_{i \in \mathcal{P}} \mathbb{E}Q_i(T) = O(\sqrt{VT}). 
%	\end{eqnarray*} 
%\paragraph{Bounding the regret:}
%	Substituting the bound \eqref{r-bd} into the inequality \eqref{main-ineq3} and using the fact that $Q_i^2(T) \geq 0, \forall i, t,$ we have for any $\bm{x}^* \in \Omega$ 
%	\begin{eqnarray*}
%	2V \textrm{Regret}_T (\bm{x}^*)  
%		\leq O(T) + O(T\sqrt{V})+ O(V\sqrt{NT}), 
%	\end{eqnarray*}
%	\emph{i.e.,}
%	\begin{eqnarray*}
%		\textrm{Regret}_T(\bm{x}^*)  = O(\frac{T}{V})+ O(\frac{T}{\sqrt{V}})+ O(\sqrt{NT})= O(\max(\frac{T}{\sqrt{V}}, \sqrt{NT})).
%	\end{eqnarray*}
%	Since $V=\Theta(\sqrt{T}),$ the above inequality immediately yields
	%\begin{eqnarray*}
%		$\textrm{Regret}_T = O((NT)^{3/4}).$

\section{$O(\sqrt{T})$ Regret of the \BQ Policy with no target rates} \label{bq_no_lambda}
We now consider the classical and special case when there are no specific target rates for any of the arms, \emph{i.e.,} $\lambda_i=0, \forall i.$ Hence, from Eqn.\ \eqref{q-ev}, we have that $Q_i(t)=0, \forall i,t.$ Furthermore, with $\bm{\lambda}=\bm{0},$ the comparator class $\Omega$ coincides with the set of all probability distributions over $N$ arms ($\Delta_N$). We have the following result.
\begin{proposition} \label{zero-target-rate}
	With no pre-specified target reward rates, \emph{i.e.,} $\bm{\lambda}=\bm{0},$ the \BQ policy achieves regret bounds of $O(\sqrt{Nt})$ and $\tilde{O}(N\sqrt{t})$ for the full-information and bandit feedback settings, respectively. 
\end{proposition}
Intuitively, the above result can be understood from the fact that, in this case, the surrogate rewards $\bm{r}'(t)$ of the \BQ policy are simply a scaled version of the original rewards $\bm{r}(t).$ See below for a formal proof.

\textbf{Proof:}
\paragraph{Full-information setting:}
From the regret decomposition inequality \eqref{main-ineq3}, we have that 
\begin{eqnarray*}
	2V \textrm{Regret}_t(x^*) \leq 2t + 4V \sqrt{2Nt}.
\end{eqnarray*}
Setting $V=\sqrt{T},$ we have that 
\begin{eqnarray*}
	\textrm{Regret}_t(x^*) \leq \frac{t}{V}+ 2 \sqrt{2Nt} = O(\sqrt{Nt}).
\end{eqnarray*}

\paragraph{Bandit information setting:} The proof is almost identical to the full-information case. Setting $Q_i(t)=0, \forall i,t,$ in  the regret decomposition inequality \eqref{main_bd2}, we have that 
\begin{eqnarray*}
	2V \textrm{Regret}_t(x^*) \leq 2t + \tilde{O}(NV\sqrt{t}+V\sqrt{Nt}). 
\end{eqnarray*}
Setting $V=\sqrt{T},$ the above yields 
\begin{eqnarray*}
	\textrm{Regret}_t(x^*) = \tilde{O}(N\sqrt{t}).
\end{eqnarray*}


\section{Proof of Proposition \ref{rate-prop}} \label{rate-prop-proof}
%\begin{proof}
	
Using Proposition \ref{q_bd}, we have that $\mathbb{E}Q_i(t) \stackrel{\textrm{(Jensen's ineq.)}}{\leq}\sqrt{\mathbb{E}Q_i^2(t)} = O(N^{1/4}T^{3/4}), ~\forall i\in \mathcal{P}, t\in [T].$ Let $\mathcal{I} \subseteq [1,T]$ be any sub-interval of length $l = |\mathcal{I}|$. 	
	Substituting the above bound in Eq.\ \eqref{q-len-bd}, we have for any $i \in \mathcal{P}:$
	\begin{eqnarray*}
	\inf_{\mathcal{I}}\mathbb{E}\big(\nicefrac{1}{|\mathcal{I}|}\sum_{z \in \mathcal{I}} r_i(z) x_i(z)\big) \geq \lambda_i  - O(\frac{N^{1/4}T^{3/4}}{l}),
	\end{eqnarray*}
	which gives a finite-time guarantee for the expected reward accrual rate for each arm in the protected set $\mathcal{P}.$
	Hence, as long as $\nicefrac{T^{3/4}}{l} \to 0,$ we have
	\begin{eqnarray*}
		\liminf_{|\mathcal{I}| \to \infty} |\mathcal{I}|^{-1}\mathbb{E} \big[\sum_{t \in \mathcal{I}} r_i(t)x_i(t)\big] \geq \lambda_i, ~\forall i \in \mathcal{P}.
	\end{eqnarray*} 
	%Finally, using Eq.\ \eqref{q-len-bd} once again, we can bound the violation penalty as 
%	\begin{eqnarray*}
%		\mathbb{V}(T) \leq \max_{i \in \mathcal{P}} \mathbb{E}Q_i(T) = O(N^{1/4}T^{3/4}). 
%	\end{eqnarray*}
%\end{proof}
\iffalse
\subsection{Proof of Theorem \ref{regret_prop}} \label{regret_prop_proof}
%\begin{proof}
	Substituting the queue length bounds from Proposition \eqref{q_bd} into the inequality \eqref{main-ineq3} and using the fact that $Q_i^2(T) \geq 0, \forall i, t,$ we have 
	\begin{eqnarray*}
		%&&2V \textrm{Regret}_T \\
	2V \textrm{Regret}_T   
		\leq 2T + O(\sqrt{N\sqrt{N}\sum_{\tau=1}^tT^{3/2}}) + O(T)= O(N^{3/4}T^{5/4}). 
	\end{eqnarray*}
	Since $V=\Theta(\sqrt{T}),$ the above inequality immediately yields
	%\begin{eqnarray*}
		$\textrm{Regret}_T = O((NT)^{3/4}).$
	%\end{eqnarray*}
%\end{proof}
\fi
%\section{Proof of Proposition \ref{avg-regret}} \label{avg-regret-proof}
%Define $S_t^2 \equiv \sum_i \mathbb{E} Q_i^2(t).$ From Eq.\ \eqref{main_ineq}, we have for all $t \in [T]:$
%\begin{eqnarray*}
%	S_t^2 + 2V \textrm{Regret}_t(\bm{x}^*) \leq 2t + 4 \sqrt{2 \sum_{\tau=1}^t S_\tau^2} + 4V \sqrt{2Nt} \leq 2T+ 4 \sqrt{2 \sum_{\tau=1}^T S_\tau^2}+4V \sqrt{2NT}.
%\end{eqnarray*}
%Summing up the above inequality from $t=1$ to $t=T$ and defining $z_T\equiv \sqrt{\sum_{\tau=1}^T S_\tau^2},$   the above inequality yields
%\begin{eqnarray} \label{avg-ineq2}
% z_T^2 - 4Tz_T + 2V \sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq 2T^2 + 4V\sqrt{2N}T^{3/2}.
%\end{eqnarray}
%Upon completing the square, we have  $z_T^2 - 4Tz_T = (z_T-2T)^2 -4T^2 \geq -4T^2. $ Hence, from \eqref{avg-ineq2}, we conclude that: 
%\begin{eqnarray*}
%	 \frac{1}{T}\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq 3\frac{T}{V} + 2\sqrt{2NT}.
%\end{eqnarray*}
%The result follows upon setting $V=\Theta(\sqrt{T}).$ 














%\section{Proof of Proposition \ref{improved_bd}} \label{improved_bd_proof}
%From Eq.\ \eqref{main-ineq3}, we have for any fixed $t$ and any $1\leq \tau \leq t:$
%\begin{eqnarray} \label{new-eq}
%	\sum_i \mathbb{E}Q_i^2(\tau) \leq 2t + 4 \sqrt{2\sum_{\tau=1}^t\sum_{i} \mathbb{E}Q_i^2(\tau) }~~ \forall t \geq 1, \forall i.
%\end{eqnarray}
%Summing up the above inequalities for $1\leq \tau \leq t$ and defining $z_t^2 \equiv  \sum_{\tau=1}^t \sum_{i} \mathbb{E}Q_i^2(\tau),$ we have 
%\begin{eqnarray*}
%	z_t^2 \leq 2t^2 + 4 \sqrt{2}tz_t.
%\end{eqnarray*}
%Solving the above quadratic inequality, we conclude that 
%\begin{eqnarray*}
%	\sqrt{\sum_{\tau=1}^t \sum_{i} \mathbb{E}Q_i^2(\tau)} = z_t \leq  6t. 
%\end{eqnarray*}
%Substituting the above bound in \eqref{new-eq} and using Jensen's inequality, we conclude that $\mathbb{E}Q_i(t) \leq 6\sqrt{t}, \forall i \in [N].$
\iffalse
Since $\lambda_i \leq 1, \forall i \in \mathcal{P},$ trivially we have that $Q_i(\tau) \leq \tau, \forall \tau,$ \emph{i.e.,} $\mathbb{E}Q_i^2(\tau) \leq \tau^2, \forall i,\tau.$ We now use a repeated refinement technique to go from this trivial bound to the claimed bound.

 As the induction step, assume that $\mathbb{E}Q^2_i(\tau) \leq c\tau^{\delta}, \forall \tau$ for some $\delta \geq 1$ and $c= 64N.$ Then, from Eq.\ \eqref{new-eq}, we have that for any $t \geq 1:$
\begin{eqnarray*}
	\mathbb{E}Q_i^2(t) \leq 2t + 4\sqrt{2Nc \frac{t^{1+\delta}}{1+\delta}} \leq 2t+ 4\sqrt{Nc}t^{\frac{1+\delta}{2}} \leq c t^{\frac{1+\delta}{2}}, ~\forall i, t,
\end{eqnarray*}
where we have used the fact that $\delta \geq 1$ and 
%Using the bound $\sqrt{x+y} \leq \sqrt{x}+ \sqrt{y},$ from the above, we obtain for all $i \in \mathcal{P}, t\geq 1:$
%\begin{eqnarray*}
%	\mathbb{E}Q^2_i(t) \leq \sqrt{2t} + 2\sqrt{c}k^{\nicefrac{1}{4}}t^{\nicefrac{1}{4}+ \delta/2} \leq ct^{\nicefrac{1}{4} +\nicefrac{\delta}{2}},
%\end{eqnarray*}
$c=64N.$
 Hence, the successive upper bounds to the square of the expected queue lengths constitute a sequence of the form $\mathbb{E}Q^2_i(t)\leq c t^{\delta_n}, \forall i,t,$ where 
$\delta_{n+1}= \max(1, \frac{1+\delta_n}{2}),$ with $\delta_1=1.$ It is easy to verify that the above sequence $\{\delta_n\}_{n\geq 1}$ converges to $1.$ Hence, we have that $\mathbb{E}Q_i^2(t) \leq 64Nt.$ Finally, using Jensen's inequality, we conclude that $\mathbb{E}Q_i(t) \leq \sqrt{\mathbb{E}Q_i^2(t)} \leq \sqrt{ct} \leq 8\sqrt{Nt}.$ This completes the proof of the result.
\fi
\iffalse
\subsection{Proof of Theorem \ref{non-neg-thm}} \label{non-neg-thm-pf} 
\paragraph{Queue length bound:}
Setting $\bm{x}^*=\bm{p}^*$ in \eqref{main-ineq3} and using the assumption that $\textrm{Regret}_t(\bm{p}^*) \geq 0, \forall t\geq 1,$ we have 
\begin{eqnarray} \label{str-bd-q}
	 \sum_i \mathbb{E}Q_i^2(t) \leq 2t +  4\sqrt{2\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) } + 4V \sqrt{2Nt}.
\end{eqnarray}
For any $t \geq 1,$ define $S_t^2 \equiv \sum_i \mathbb{E} Q_i^2(t).$ From the above equation, we have for any $1\leq \tau \leq t$:
\begin{eqnarray*}
	S_\tau^2 \leq 2t + 4 \sqrt{2 \sum_{\tau=1}^t S_\tau^2} + 4V \sqrt{2Nt}.
	\end{eqnarray*}
Summing up the above inequality from $\tau=1$ to $\tau=t$ and defining $z_t\equiv \sqrt{\sum_{\tau=1}^t S_\tau^2},$ we obtain the following quadratic inequality from the above:
\begin{eqnarray} \label{avg-ineq3}
 z_t^2 \leq 2t^2 + 4V\sqrt{2N}t^{3/2} + 4\sqrt{2}tz_t \implies z_t \leq 6t + 3\sqrt{V} N^{\nicefrac{1}{4}}t^{\nicefrac{3}{4}}.
\end{eqnarray}
Plugging in the above bound on $z_t$ into \eqref{str-bd-q}, we obtain
\begin{eqnarray}
	\sum_i \mathbb{E}Q_i^2(t) &=& O(\max(t, \sqrt{V}N^{\nicefrac{1}{4}}t^{\nicefrac{3}{4}}, V \sqrt{Nt}) \nonumber\\
	\stackrel{\textrm{(Jensen's ineq.)}}{\implies} \mathbb{E}Q_i(t) &=& O(\max(\sqrt{t}, N^{\nicefrac{1}{8}}V^{\nicefrac{1}{4}}t^{\nicefrac{3}{8}}, \sqrt{V} (Nt)^{\nicefrac{1}{4}}).\label{q-bd2-str2}
\end{eqnarray}
Finally, substituting \eqref{avg-ineq3} into \eqref{main-ineq3}, using the fact that $\mathbb{E}Q_i^2(t) \geq 0,$ and remembering that queue lengths are independent of the benchmark used in the regret bound, we conclude that for any $x^* \in \Omega(\vec{\lambda}):$
\begin{eqnarray} \label{reg-bd-str1}
	 V \textrm{Regret}_t(\bm{x}^*)= O(\max(t, V\sqrt{Nt}, \sqrt{V}N^{\nicefrac{1}{4}}t^{\nicefrac{3}{4}})) \implies \textrm{Regret}_t(\bm{x}^*)= O(\max(\nicefrac{t}{V}, \sqrt{Nt}, N^{\nicefrac{1}{4}}t^{\nicefrac{3}{4}}/\sqrt{V})). 
\end{eqnarray}
The final result now follows from \eqref{q-bd2-str2} and \eqref{reg-bd-str1} upon setting $V=\sqrt{T}.$
\fi


\section{Proof of Theorem \ref{mon-q-thm}} \label{mon-q-thm-pf}
Let $\bm{x}^*$ be an optimal fixed feasible randomized action. From Eqn.\ \eqref{main-ineq3}, we have that 
\begin{eqnarray*}
	2V \textrm{Regret}_t(\bm{x}^*) \leq  4\sqrt{2\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau)}- \sum_{i} \mathbb{E}Q_i^2(t)+ 2t +4V\sqrt{2Nt}.
\end{eqnarray*}
Define $Q^2(\tau)= \sum_i \mathbb{E}Q_i^2(\tau), \forall \tau \geq 1.$ Using the monotonicity assumption \ref{mon-q}, the above inequality yields
\begin{eqnarray*}
	2V \textrm{Regret}_t(\bm{x}^*) &\leq& \underbrace{4\sqrt{2t}Q(t)- Q^2(t)}_{(A)}+ 2t +4V\sqrt{2Nt} \\
	&\stackrel{(a)}{\leq} & 10t + 4V\sqrt{2Nt}.
\end{eqnarray*}
where in (a), we have upper-bounded the quadratic (A), which is of the form $ax-x^2 $, by $a^2/4 \equiv 8t$. Hence, we have
\begin{eqnarray*}
	\textrm{Regret}_t \leq \frac{5t}{V}+ 2 \sqrt{2Nt}.
\end{eqnarray*}



%Upon completing the square, we conclude that: 
%\begin{eqnarray*}
%	 z_t \leq .
%\end{eqnarray*}
%The result follows upon setting $V=\Theta(\sqrt{T}).$ 
%\subsection{Bound on the diameter of the queueing processes $\{\textbf{Q}(t)\}_{t=1}^T$} 
\section{Proof of Proposition \ref{uniform_bd_lemma}} \label{unif-bd-pf}
%\label{dia-bd}
%For ease of exposition, the proof of the theorem is divided into three interrelated results.
%We begin our analysis by first deriving a sublinear bound for $\mathbb{E}Q_i^2(t).$
%\begin{proposition}
%	Under the action of the \BQ policy with bandit feedback, we have 
%	\[ \mathbb{E}Q_i^2(t)=\tilde{O}(\max(Vt, \sqrt{N}t^{\nicefrac{3}{2}})) , \forall i, t.\]
%\end{proposition}
%\begin{proof}
%	Recall that from Eqn.\ \eqref{reg-bd9} we have: 
%	\begin{eqnarray*}
%		\sum_{i} \mathbb{E}Q_i^2(t) + 2V \textrm{Regret}_t (\bm{x}^*) \leq 2t + \tilde{O}\big(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}\big).
%	\end{eqnarray*}
%	Using the fact that $r_i(t) \leq 1, \forall i,t,$ we have $\textrm{Regret}_t(x^*) \geq -t.$ Hence, from the above, we obtain
%	\begin{eqnarray}\label{q-bd-bandit}
%		\sum_{i} \mathbb{E}Q_i^2(t) \leq 2(V+1)t+ \tilde{O}\big(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}\big),
%	\end{eqnarray}
%	which resembles Eqn.\ \eqref{ineq1} in the full-information setting. Defining $R(t) \equiv \sqrt{\sum_{\tau=1}^t \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)},$ and working similarly as in the full-information setting, we have the following quadratic inequality:
%	\begin{eqnarray} \label{R-bd-bandit}
%		R^2(t) \leq 2(V+1)t^2 + \tilde{O}\big(\sqrt{N} tR(t)+NV t^{\nicefrac{3}{2}} + \sqrt{N}t^{\nicefrac{5}{2}}\big) \implies R(t) = \tilde{O}\big(\max(t\sqrt{V}, N^{\nicefrac{1}{4}}t^{\nicefrac{5}{4}})\big).
%	\end{eqnarray}
%	Substituting the above bound in \eqref{q-bd-bandit}, we conclude that for each $i \in [N]:$
%	\begin{eqnarray*}
%		\mathbb{E}Q_i^2(t) = \tilde{O}(\max(Vt, \sqrt{N}t^{\nicefrac{3}{2}})).
%	\end{eqnarray*}
%\end{proof}
%
%The following proposition gives a sublinear bound to the diameter $\mathbb{E}\big[\max_{i, t \in [T]}Q_{i}(t)\big]$ of the queueing processes $\{\textbf{Q}(t)\}_{t=1}^T$. This result will be used to derive a sublinear regret bound for the \BQ policy. 
%
%\begin{proposition}\label{uniform_bd_lemma}
%Under the action of the \BQ policy, for any round $T\geq 1,$ we have the following bound for the expected maximum of the queueing processes
%	\[ \mathbb{E}\big[\max_{i, t \in [T]}Q_{i}(t)\big] = \tilde{O}(\max(\sqrt{VT}, N^{\nicefrac{1}{4}}T^{3/4})).\]
%\end{proposition}
\begin{proof}
Using Eq.\ \eqref{main_ineq} and the fact that $|r_i(t)|\leq 1, \forall i, t,$ we have the following sample-path wise bound on the square of the queue lengths:
	\begin{eqnarray} \label{sample-path}
		\sum_i Q_i^2(t) &\leq& 2(V+1)t + 2 \sum_{\tau=1}^t \sum_{i} Q_i(\tau-1) \big(\lambda_i-r_i(\tau) x_i^* \big)+2\textrm{Regret}^{\Xi}_t \nonumber\\
		&\leq& 2(V+1)t + \tilde{O}(\sqrt{N\sum_{\tau=1}^t\sum_i Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{Nt}\max_{i, \tau \in [1,t]}Q_i(\tau)) + 2\sum_i M_t^i, \label{q-bd-bandit-ref} \\
		&\stackrel{(a)}{\leq}& 2(V+1)t + \tilde{O}(\sqrt{N\sum_{\tau=1}^t\sum_i Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}) + 2\sum_i M_t^i,
	\end{eqnarray}
	where in the above, we have substituted the upper bound to the regret of the surrogate problem from Eq.\ \eqref{bandit_reg-bd} (as in Eqn.\ \eqref{reg-bd9}), used the fact that $Q_i(\tau) \leq \tau$ in step (a), and for each $i \in [N],$ we have defined the stochastic process $\{M_t^i\}_{t \geq 1}$ as follows:
	\begin{eqnarray} \label{mg-def}
		M_t^i = \sum_{\tau=1}^t  Q_i(\tau-1) \big(\lambda_i'-r_i(\tau) x_i^* \big),~ t \geq 1.
	\end{eqnarray}
where $\lambda_i' \stackrel{(\textrm{def.})}{=} x_i^*\mathbb{E}r_i(\tau) = x_i^*\mu_i \geq \lambda_i.$
	%Hence, recalling that $V=\Theta(\sqrt{T}),$ 
	Taking the maximum of both sides with respect to all rounds $t \in [T]$ for some $T \geq 1,$ we have
	\begin{eqnarray*}
		\max_{i, t \in [T]}Q_i^2(t) \leq 2VT+ \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i Q_i^2(\tau) }+NV\sqrt{T}+ \sqrt{N}T^{3/2}) + 2\sum_i \max_{t \in [T]} M_t^i.
	\end{eqnarray*}
	Taking the expectation of both sides of the above inequality, we obtain
	\begin{eqnarray} \label{mg-bound}
		\mathbb{E}\big[\max_{i, t \in [T]}Q_i^2(t)\big] &\leq& 2VT+ \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i \mathbb{E}Q_i^2(\tau) }+ \sqrt{N}T^{3/2}) + 2\sum_i \mathbb{E}\big[\max_{t\in [T]} M_t^i \big] \nonumber \\
		&\stackrel{(a)}{\leq}& 2VT+ \tilde{O}\big(\max(T\sqrt{V}, N^{\nicefrac{1}{4}}T^{\nicefrac{5}{4}})\big)+ \tilde{O}(\sqrt{N}T^{3/2})+ 2\sum_i \mathbb{E}\big[\max_t M_t^i \big] \\
		&=& \tilde{O}(\max(VT, \sqrt{N}T^{3/2})) + 2\sum_i \mathbb{E}\big[\max_t M_t^i \big], 
	\end{eqnarray}
	where in step (a), we have used the bound for $R(T)$ from Eqn.\ \eqref{R-bd-bandit}.
	Next, we claim that each of the processes $\{M_t^i\}_{t\geq 1}$ is a zero-mean martingale process with respect to the natural filtration $\{\mathcal{F}_\tau\}_{\tau \geq 1}$. This follows from the definition \eqref{mg-def} as $Q_i(\tau-1) \in \mathcal{F}_{\tau-1}$ is pre-visible, and the random variable $r_i(\tau)$ is independent of $\mathcal{F}_{\tau -1}$ s.t. $\mathbb{E}(\lambda_i' - r_i(\tau)x_i^*)=0.$  
	%Using classic results 
	Using the $L^2$ maximum inequality for Martingales \citep[Theorem 4.4.4]{durrett2019probability}, \citep[Theorem 3.4]{doob1953stochastic}, \citep{dubins1988sharp}, we have
	%we know that the diameter of a Martingale with a last term is bounded by twice the square root of the variance of the last term. Hence,
	\begin{eqnarray} \label{M-bd1}
		\mathbb{E}[\max_{t \in [T]} M^i_t] \leq 2\sqrt{\mathbb{E}(M^i_T)^2}.
	\end{eqnarray}
	Since $\{M_t^i\}_{t \geq 1}$ is a zero-mean martingale sequence, using the Pythagorean formula for martingales \citep[Eq. (b), Section 12.1]{williams1991probability} and the fact that $|\lambda_i' - r_i(\tau) x_i^*| \leq 1,$ we have
	\begin{eqnarray} \label{M-bd2}
		\mathbb{E}(M_T^i)^2 &\leq&  \sum_{\tau=1}^T \mathbb{E}Q_i^2(\tau-1) \\
		&\leq& R^2(T), \nonumber
	\end{eqnarray}
	where we have defined $R(T) \equiv \sqrt{\sum_{\tau=1}^T \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)}$. Combining the above with Eq.\ \eqref{mg-bound}, we obtain the desired bound for the diameter of the queueing process:
	\begin{eqnarray*}
		\mathbb{E}\big[\max_{i, t \in [T]}Q_i^2(t)\big] \leq \tilde{O}(\max(VT, \sqrt{N}T^{3/2})) + O(R(T)) = \tilde{O}(\max(VT, \sqrt{N}T^{3/2})),
	\end{eqnarray*}
	where we have again used the bound for $R(T)$ from Eqn.\ \eqref{R-bd-bandit}.
	The result stated in the proposition finally follows from an application of Jensen's inequality.
	\end{proof}
%	Combining the above two results, the following proposition gives the worst-case regret bound for the \BQ policy under the bandit feedback.
%	\begin{proposition} \label{reg-bd-bandit-2}
%		The worst-case regret of the \BQ policy under the  bandit feedback is bounded as
%		\begin{eqnarray*}
%				\textrm{Regret}_T = \tilde{O}(\max(\frac{T\sqrt{N}}{\sqrt{V}}, \frac{N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}}{V},N\sqrt{T})).
%		\end{eqnarray*}
%	\end{proposition}
%	\begin{proof}
%		From Eqn.\ \eqref{main_bd2}, we have 
%		\begin{eqnarray} \label{main-bd43}
%			\sum_i \mathbb{E}Q_i^2(T)+2V \textrm{Regret}_T(\bm{x}^*) \leq 2T + \tilde{O}\bigg(\sqrt{N}R(T)+NV \sqrt{T} + V\sqrt{NT}+\sqrt{NT}\mathbb{E}\big[\max_{i,\tau \in [T]}(Q_i(\tau))\big]\bigg),
%		\end{eqnarray}
%		where $R(T) \equiv \sqrt{\sum_{\tau=1}^T \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)}$. Plugging in the upper bound for $R(T)$ from Eqn.\ \eqref{R-bd-bandit} and the diameter of the queueing process from Proposition \ref{uniform_bd_lemma}, we obtain:
%		 \begin{eqnarray*}
%		 	2V \textrm{Regret}_T(\bm{x}^*) = \tilde{O}(\max(T\sqrt{NV}, NV\sqrt{T}, N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}})).
%		 \end{eqnarray*}
%		 Hence,
%		 \begin{eqnarray*}
%		 	\textrm{Regret}_T(\bm{x}^*) = \tilde{O}(\max(\frac{T\sqrt{N}}{\sqrt{V}}, \frac{N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}}{V},N\sqrt{T})).
%		 \end{eqnarray*}
%	\end{proof}
	
\iffalse	
\subsection{Proof of Theorem \ref{regret_prop2}} \label{regret_prop2_proof}
From Eq.\ \eqref{main_bd2}, we have that 
\begin{eqnarray*}
2V \textrm{Regret}_T &\leq& 	2T + \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{T} + V\sqrt{NT}+\sqrt{NT}\mathbb{E}\big[\max_{i,t \in [T]}(Q_i(t))\big] \\
&\leq & \tilde{O}(N^{5/4} T^{5/4}+ NV \sqrt{T} + N^{3/4} T^{5/4}). 
\end{eqnarray*}
where we have substituted the bounds from Eq.\ \eqref{q_len_bd4} and Proposition \ref{uniform_bd_lemma}. Finally, dividing both sides by $V = \Theta (\sqrt{T}),$ we have 
\begin{eqnarray*}
	\textrm{Regret}_T = \tilde{O}(N^{5/4} T^{3/4}).
\end{eqnarray*} 
\fi
\iffalse
\subsection{Proof of Proposition \ref{avg-regret2}} \label{avg-regret2-proof}
As in the proof of Proposition \ref{avg-regret}, define $S_t^2 \equiv \sum_i \mathbb{E} Q_i^2(t).$ With $V=\Theta(T),$ we still have $||g_t||_\infty \leq  \max_t (Q(t-1)+V)= O(T).$ Let $c_T$ be a constant s.t. $c_T=O(\log T)$. Then, from the self-bounding inequality \eqref{main_bd2}, we have that 
\begin{eqnarray*}
	S_t^2 + 2V \textrm{Regret}_t(\bm{x}^*) &\leq& 2T + 2c_T (\sqrt{N \sum_{\tau=1}^T S_\tau^2} + NV \sqrt{T} + \sqrt{N}T^{3/2}).	%&\leq& 2T+ 4 \sqrt{2 \sum_{\tau=1}^T S_\tau^2}+4V \sqrt{2NT}.
\end{eqnarray*}
Summing up the above inequality from $t=1$ to $t=T$ and defining $z_T\equiv \sqrt{\sum_{\tau=1}^T S_\tau^2},$   the above inequality yields:
\begin{eqnarray*}
	z_T^2 - 2Tc_T\sqrt{N}z_T + 2V\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq 2T^2 + 2c_TT(NV \sqrt{T} + \sqrt{N}T^{3/2}). 
\end{eqnarray*}
Upon completing the square, we have that 
\begin{eqnarray*}
	2\frac{V}{T}\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq NTc_T^2 + 2T+ 2c_T(NV \sqrt{T} + \sqrt{N}T^{3/2}).  
\end{eqnarray*}
Hence, the time-averaged regret can be bounded as:
\begin{eqnarray*}
	\frac{1}{T}\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq O(Nc_T^2 \frac{T}{V}) + O(\frac{T}{V})+\tilde{O}(N \sqrt{T}+\sqrt{N}\frac{T^{3/2}}{V}).
\end{eqnarray*}
\fi

\section{Proof of Proposition \ref{rate-violation-bandit-no-reward}} \label{rate-violation-bandit-no-reward-proof}
Setting $V=0$ and plugging in the bound from Proposition \ref{uniform_bd_lemma}, we have the following bound from \eqref{main_bd2} for any round $1\leq t \leq T:$
\begin{eqnarray} \label{q-bd-bandit-pf}
	\sum_i \mathbb{E}Q_i^2(t) \leq 2T + \tilde{O}\big(\sqrt{N \sum_{\tau=1}^T \sum_i \mathbb{E}Q_i^2(\tau)} + N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}\big).
\end{eqnarray}
Define $z_T^2 \equiv \sum_{i}\sum_{t=1}^T \mathbb{E}Q_i^2(t).$ Summing up the inequalities \eqref{q-bd-bandit-pf} from $t=1$ to $t=T,$ we obtain 
\begin{eqnarray*}
	z_T^2 \leq 2T^2 + \tilde{O}(\sqrt{N}Tz_T + N^{\nicefrac{3}{4}}T^{\nicefrac{9}{4}}) \implies z_T = \tilde{O}(N^{\nicefrac{3}{8}}T^{\nicefrac{9}{8}}).
\end{eqnarray*}
Plugging in the above bound in \eqref{q-bd-bandit-pf}, we conclude that
\[\sum_i \mathbb{E}Q_i^2(T) = \tilde{O}(N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}) \stackrel{\textrm{(Jensen's ineq.)}}{\implies} \mathbb{E}Q_i(T) = \tilde{O}(N^{\nicefrac{3}{8}}T^{\nicefrac{5}{8}}),~\forall i \in [N]. \]

\iffalse

\subsection{Proof of Theorem \ref{q-mon-bandit}} \label{q-mon-bandit-pf}

Using Eqn.\ \eqref{q-bd-bandit-ref}, taking maximum of both sides w.r.t. $t\in [T]$ and then taking expectation, we have 
\begin{eqnarray*}
	\mathbb{E}[\max_{i, t \in [T]} Q_i^2(t)] \leq 2(V+1)T + \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{T} + \sqrt{NT}\mathbb{E}[\max_{i, t \in [1,T]}Q_i(t)]) + 2\sum_i \mathbb{E}[\max_{t \in [T]} M_t^i]
\end{eqnarray*}
Using Jensen's inequality on the LHS and the inequalities \eqref{M-bd1} and \eqref{M-bd2} to bound the right-most expectation in the above, we have the following quadratic inequality: 
\begin{eqnarray*}
	x^2 \leq \tilde{O}(\sqrt{NT}x+VT +NV\sqrt{T}+ \sqrt{N\sum_{t=1}^T \sum_i  \mathbb{E}Q_i^2(\tau)}),
\end{eqnarray*}
where we have defined $x \equiv \mathbb{E}[\max_{i, t \in [1,T]}Q_i(t)]$ and used Jensen's inequality for the concave square-root function on the right. Solving the above quadratic inequality, we have
\begin{eqnarray*}
	x \leq \tilde{O} (\sqrt{VT}).
\end{eqnarray*}
\edit{This seems to mess up the bound!}

\subsection{Proof of Theorem \ref{mon-bandit}} \label{mon-bandit-pf}
Since the sequence of r.v.s $\{Q_i(t)\}_{t \geq 1}$ are assumed to be stochastically monotone, there exists a probab 
\fi
\section{Pseudocode for the \BQ policy in the bandit feedback setup} \label{BQ_bandit}
As discussed in the main text, the \BQ policy in the Bandit feedback setting uses the scale-free MAB algorithm of \citet{putta2022scale} in conjunction with the surrogate reward function defined in Eq.\ \eqref{reward_def2}. The complete pseudocode of the \BQ policy is given below in Algorithm \ref{fair-MAB-bandit-info}. 
\begin{algorithm}
\caption{\BQ Policy in the Bandit-feedback setting}
\label{fair-MAB-bandit-info}
\begin{algorithmic}[1]
\State \algorithmicrequire{ Target reward rate vector $\bm{\vec{\lambda}}$, $\eta \gets N, \gamma \gets 1/2$, Regularizer $F(q)= \sum_{i=1}^N \big(f(q(i))-f(1/N)\big),$ where $f(x)=-\log(x).$} 
\State $\bm{Q} \gets \bm{0}, \bm{p} \gets [1/N, 1/N, \ldots, 1/N], V\gets \sqrt{T}, S\gets 1, \bm{\tilde{R}}\gets 0.$ \algorithmiccomment{\emph{Initialization}}
\ForEach {round $t=1:T$:}
\State $\bm{x} \gets (1-\gamma)\bm{p} + \gamma/N$. \algorithmiccomment{\emph{Updating the sampling distribution}}
\State Sample an arm $I_t \in [N]$ from the distribution $\bm{x}$.  
\State Observe the reward of the selected arm $r_{I_t}(t)$\algorithmiccomment{\emph{Bandit feedback}}
%\ForEach {arms $i \in \mathcal{P}$:}
\State 
%\begin{eqnarray*} 
	$Q_i \gets \big(Q_i+ \lambda_i - r_i(t)\mathds{1}(I_t=i)\big)^+, ~\forall i\in \mathcal{P}. $\algorithmiccomment{\emph{Updating the queues}}
%\end{eqnarray*}
%\EndForEach
\State $r'_i \gets \big(Q_i + V\big)r_i(t)\mathds{1}(I_t=i), ~\forall i$ \algorithmiccomment{\emph{Computing the surrogate rewards}}
\State $\tilde{r}_i \gets \frac{r'_i}{x_i} \mathds{1}(I_t=i)$ \algorithmiccomment{\emph{Estimating the rewards via the inverse propensity scores (IPS)}}
\State $\bm{\tilde{R}}\gets \bm{\tilde{R}} + \bm{\tilde{r}}$ \algorithmiccomment{\emph{Updating the cumulative estimated surrogate rewards}}
\State $\gamma \gets \min(1/2, \sqrt{N/t}).$ 
%\State $S \gets S + ||\bm{r}'(t)||^2.$ \algorithmiccomment{\emph{Accumulating the norm of past gradients}}
\State $S \gets S+ \eta^{-1}\sup_{\bm{q} \in \Delta_N}\big(\langle \tilde{\bm{r}}, \bm{q}-\bm{p} \rangle -\textrm{Breg}_F(\bm{q}||\bm{p})\big).$  \label{opt1}
\State $\eta \gets N/S$ \algorithmiccomment{\emph{Adaptively choosing the learning rate}}
\State $\bm{p}\gets \arg \min_{\bm{q} \in \Delta_{N}} \big[ F(\bm{q}) - \eta \langle \bm{q}, \bm{\tilde{R}}\rangle \big]$ \algorithmiccomment{ \emph{The \texttt{FTRL} step}} \label{opt2}
	%\State $\bm{x}\gets \Pi_{\Delta_N}\bigg(\bm{x}+ \frac{\bm{r}'(t)}{\sqrt{2S}} \bigg)$ \algorithmiccomment{\emph{Implementing the online gradient ascent step}}
\EndForEach
\end{algorithmic}
\end{algorithm}
In line \ref{opt1} of the pseudocode, $\textrm{Breg}_F(x||y)$ denotes the usual Bregman divergence between the points $x$ and $y$ with respect to the convex function $F(\cdot),$ \emph{i.e.,}
\begin{eqnarray*}
	\textrm{Breg}_F(x||y) = F(x)-F(y)- \langle \nabla F(y), x-y\rangle .
\end{eqnarray*}



%\subsection{Simulation details and Additional Numerical Results}  \label{sim-addl}
\section{Efficient implementation of the optimization module} \label{opt_implementation}

To speed up the simulation, we implemented a custom-made optimizer for the optimization steps \ref{opt1} and \ref{opt2} involved in the \BQ algorithm in the bandit-feedback setting. For this, we directly solved the KKT optimality condition, where we computed the optimal KKT multiplier by using the classic Newton-Raphson root-finding algorithm. This empirically resulted in about \emph{two orders} of magnitude speed-up compared to using standard convex optimization packages such as \texttt{CVX} \citep{grant2011cvx}.  

%\cmt{Report the run-time and other details here}. 

Let $\bm{r} \in \mathbb{R}^N$ be a given $N$-dimensional real vector. After some simple algebraic manipulations, both the optimization problems in steps \ref{opt1} and \ref{opt2} of Algorithm \ref{fair-MAB-bandit-info} %for a generic input parameter vector $\bm{r}$ 
can be expressed in the following form: 

\begin{eqnarray} \label{obj_fun}
\texttt{OPT}(\bm{r}):~	\max \sum_{i=1}^N \log x_i + \langle \bm{r}, \bm{x} \rangle
\end{eqnarray}
Subject to,
\begin{eqnarray} \label{constr}
	\sum_i x_i=1,~ x_i \geq 0, ~\forall i \in [N]. 
\end{eqnarray}

Since the objective function \eqref{obj_fun} is strictly concave, and the constraint \eqref{constr} is linear, using the KKT condition, a probability vector $\bm{x}^*$ is an optimal point for the above problem if and only if there exists a real number $\mu \in \mathbb{R}$ s.t. 
\begin{eqnarray} \label{kkt1}
	\frac{1}{x_i^*} + r_i + \mu =0 \implies x_i^*= - (r_i+\mu)^{-1}, ~\forall i,
\end{eqnarray} 
where $\bm{x}^*$ satisfies the feasibility condition \eqref{constr}. For the non-negativity constraint on $\bm{x}^*$, we must have: 
\begin{eqnarray*}
	r_i+\mu < 0 \implies \mu < -\max_i r_i.
\end{eqnarray*}
Finally, we require that 
\begin{eqnarray*}
	\sum_i x_i^* =1.
\end{eqnarray*}
\emph{i.e.,}
\begin{eqnarray} \label{newton-raphson}
	\sum_i \frac{1}{r_i+\mu}+1=0.
\end{eqnarray}
We now use the Newton-Raphson method for solving \eqref{newton-raphson} starting from $\mu^{(0)}=  -\max_i r_i -1,$ which satisfies the constraint $\mu < -\max_i r_i.$  The algorithm is given below:

  \begin{algorithm}
\caption{Custom optimizer for the problem \texttt{OPT} ($\bm{r}$)}
\label{optimizer}
\begin{algorithmic}[1]
\State \algorithmicrequire{ $\bm{r},$  $\texttt{tolerance}\gets 10^{-8}.$ }
\State $\mu\gets  -\max_i r_i -1, \texttt{error}\gets 1.$
\While{$\texttt{error}>\texttt{tolerance}$}
 \begin{eqnarray*}
	\mu \gets \mu + \frac{\sum_i \frac{1}{r_i+\mu}+1}{\sum_i \frac{1}{(r_i+\mu)^2}}.
\end{eqnarray*}
$\texttt{error} \gets |\sum_i \frac{1}{r_i+\mu}+1|.$
\EndWhile
\State $x_i^* \gets -(r_i+\mu)^{-1}, ~\forall i.$
\State Return $\bm{x}^*.$
\end{algorithmic}
\end{algorithm}
% The projection-step on to in the full information setting is implemented using the 

\section{Additional numerical results} \label{addl_sim}
\subsection{Comparison with an oracle policy}\label{addl_sim1}
\begin{figure}[h!]
	\centering
	\includegraphics[scale=0.4]{./Figures/Reward_comparison.pdf}
	\caption{Comparison of reward accrued by the \BQ policy and the Oracle \textsc{LFG} policy ($\eta = 100$)}
	\label{rew-comp}
\end{figure}
In this section, we compare the performance of the \BQ policy with an \emph{Oracle} policy that knows the optimal fraction of pulls of each arm to satisfy the required reward rate constraints. With the given mean reward vector $\bm{\mu}$ and the required reward rate vector $\bm{\lambda}$, the optimal fractions of pulls can be easily computed to be $f_1=\nicefrac{\lambda_1}{\mu_1}=\nicefrac{1}{2}, f_2=\nicefrac{\lambda_2}{\mu_2}=\nicefrac{1}{3}, f_3=0, f_4=1-(\nicefrac{1}{2}+\nicefrac{1}{3})=\nicefrac{1}{6}, f_5=0.$ In the above computation, we have used the fact that Arm \#4 is the most rewarding arm. We emphasize that the oracle policy should have \emph{exact} knowledge of the mean reward vector $\bm{\mu}$---a non-zero error in the value of the reward vector leads to either not achieving the target rates, or incurring a linear regret, or both.  

Note that the online policy proposed by \citet{patil2021achieving} \emph{cannot} be used with the above profile of fraction of pulls, as their policy requires the target fraction of pulls of each arm to be at most $\nicefrac{1}{N-1} = \nicefrac{1}{4}.$ Hence, we use the UCB-based policy proposed by \citet{li2019combinatorial} called \emph{Learning with Fairness Guarantee} (\textsc{LFG}) as the benchmark. \textsc{LFG} uses queue variables to balance meeting the target fraction of pulls and achieving a small regret. However, as stated in \citet[Theorem 2]{li2019combinatorial}, the best-known regret bound of the \textsc{LFG} policy increases linearly with time.  


\paragraph{Observation:} From Figure \ref{rew-comp}, we see that the proposed \BQ policy yields strictly better cumulative rewards compared to the Oracle \textsc{LFG} policy that knows the optimal fraction of arm pulls to meet the given reward rate constraints. This result can be attributed to the fact that the \BQ policy directly takes into account the reward realizations through the queue evolutions, whereas the Oracle \textsc{LFG} policy works based only on the expected rewards.  

\subsection{Large-scale simulation with $N=1000$ arms} \label{addl_sim2}

\begin{figure*}[t]
  \centering
  \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/large_scale/Reward_rates_full_info_cropped.pdf}
   \caption{\small{Reward accrual rates in the full-information setting}}
   \label{rew_full}
  \end{minipage}
   \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/large_scale/Q_lengths_full_info_cropped.pdf}
   \caption{\small{Queue lengths in the full-information setting}}
   \label{q_full}
  \end{minipage}
   \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/large_scale/Regret_full_info_cropped.pdf}
   \caption{\small{Regret of \BQ in the full-information setting}}
   \label{reg_full}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/large_scale/Reward_rates_bandit_feedback_cropped.pdf}
   \caption{\small{Reward accrual rates in the bandit feedback setting}}
   \label{rew_bf}
  \end{minipage}
  %\hfill
  \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/large_scale/Q_lengths_Bandit_feedback_cropped.pdf}
   \caption{\small{Queue lengths in the bandit feedback setting}}
   \label{q_bf}
  \end{minipage}
 %\hfill
   \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/large_scale/Regret_bandit_feedback_cropped.pdf}
   \caption{\small{Regret of \BQ in the bandit feedback setting}}
   \label{reg_bf}
  \end{minipage}
\end{figure*}
Figures~\ref{rew_full}--\ref{reg_bf} show the performance of the \textsc{BanditQ} policy with $N=1000$ arms in both the full-information and the bandit feedback settings. The mean rewards $\bm{\mu}$ of the arms are sampled uniformly at random from the interval $[0,1].$ As before, we consider two protected arms---arm 1 and arm 2---and set $\lambda_1= \mu_1/2, \lambda_2=\mu_2/3.$ The plots show that even for a large instance, the \BQ policy continues to perform satisfactorily in terms of both regret and achieving the target rates. 


