\section{\BQ policy with bandit feedback} \label{bandit-feedback}
%The additional difficulty in the bandit feedback setting compared to the full-information set-up is that if a policy selects an arm $I_t \in [N],$ on round $t,$ then 
Under bandit feedback, only the reward of the selected arm, \emph{i.e.,} $r_{I_t}(t),$ is revealed to the policy at the end of round $t$. The reader should compare this with the full-information setup where the entire reward vector $\bm{r}(t)$ is revealed irrespective of the action. To deal with the resulting \emph{exploration-vs-exploitation} trade-off in the limited information setup, we replace the full-information \textsc{OGA} policy \eqref{oga-update} with an adversarial MAB policy, proposed recently by \citet{putta2022scale}, that enjoys a \emph{scale-free} second-order regret bound similar to Eq.\ \eqref{data-dep-bd}. Their \emph{Follow-the-regularized-leader} (FTRL)-based MAB policy uses the standard inverse propensity score to estimate the reward vectors and employs a log-barrier regularizer in the FTRL algorithm with a carefully chosen learning rate schedule. The arms are finally selected by mixing a uniform exploration component with the distribution obtained from the FTRL algorithm. For completeness, we describe the \BQ policy in the bandit information setting in Appendix \ref{BQ_bandit}. \citet{putta2022scale} showed that their proposed MAB policy works for \emph{any} real loss vector (unlike, \emph{e.g.,} EXP3, which requires non-negative losses) and enjoys the following scale-free adaptive regret bound.

\begin{theorem}[\cite{putta2022scale}] \label{ref_th2}
MAB Algorithm 1 of \cite{putta2022scale}, when run with the  oblivious linear reward sequence with coefficient vectors $\{\bm{g}_t\}_{t=1}^T,$ enjoys the following scale-free regret bound: 
\begin{eqnarray} \label{bandit_reg-bd}
	\textrm{Regret}_T = \tilde{O}\bigg(\sqrt{N\sum_{t=1}^T||\bm{g}_t||_2^2} + \max_{t \in [T]} ||\bm{g}_t||_\infty\sqrt{NT}\bigg).
\end{eqnarray}
\end{theorem}
It can be seen that the only essential difference between the above expression and that of the \textsc{OGA} regret bound in Eq.\ \eqref{data-dep-bd} is the presence of the additional term $\tilde{O}(\max_{t \in [T]} ||\bm{g}_t||_\infty\sqrt{NT})$ in the former. With a more careful analysis using martingales, our previous arguments go through with minimal changes. We now outline the main differences between the full information and the bandit setup. 

\textbf{Notation:} Let us encode the index of the selected arm $I_t$ on round $t$ by the one-hot encoded vector $\bm{X}(t)=\big(X_1(t), X_2(t), \ldots, X_N(t)\big) \in \{0,1\}^N$, where $X_i(t)= \mathds{1}(I_t=i), \forall i.$ Thus, if $x_i(t)$ denotes the conditional probability that the $i$\textsuperscript{th} arm is pulled, we have $\mathbb{P}(X_i(t)=1|\mathcal{F}_{t-1})= 1-\mathbb{P}(X_i(t)=0|\mathcal{F}_{t-1})=x_i(t)$ and $\mathbb{E}(X_i(t)|\mathcal{F}_{t-1})=x_i(t), \forall i,t.$ 
%Hence the marginal distributions satisfy the relation $\mathbb{E}(X_i(t)|\mathcal{F}_{t-1})=x_i(t), \forall i,t.$

\paragraph{Queueing recursion and the auxiliary MAB problem:} Note that the queueing recursion \eqref{q-ev} for the full-feedback setting does not work in the case of bandit feedback because the rewards of the unobserved arms are not revealed. However, it is straightforward to modify the recursion \eqref{q-ev} by replacing the sampling probabilities $\bm{x}(t)$ with the corresponding random realizations $\bm{X}(t).$ Hence, in the bandit setting, the queueing evolution for the $i$\textsuperscript{th} arm reads:
\begin{eqnarray} \label{q-ev-2}
	Q_i(t)=\big(Q_i(t-1)+ \lambda_i - r_i(t)X_i(t)\big)^+, ~Q_i(0)=0.
\end{eqnarray}
Eq.\ \eqref{q-ev-2} is well-defined in the bandit feedback setting as $X_i(t)=0$ if $i \neq I_t.$ Hence, the recursion \eqref{q-ev-2} does not depend on the reward of any arm which was not played. Next, analogous to the full-information setting (Eq.\ \eqref{reward_def}), the \textsc{BanditQ} policy defines an instance of an adversarial MAB problem $\Xi^{\textsc{Bandit}}$ where the surrogate reward of the $i$\textsuperscript{th} arm on round $t$ is defined as: 
\begin{eqnarray} \label{reward_def2}
	r'_i(t) \equiv \big(Q_i(t-1) + V\big)r_i(t), ~\forall i \in [N].
\end{eqnarray} 
As before, the surrogate rewards are not bounded \emph{a priori} due to the presence of the queueing variables. 
%Hence, we use a black box MAB policy with a scale-free second-order regret bound proposed in \citet{putta2022scale}, which is an FTRL policy with a logarithmic regularizer with a carefully chosen adaptive learning rate schedule. 
%For completeness, we define the entire policy in Appendix \ref{}.  


%\begin{theorem}[\cite[Algo 1]{putta2022scale}]  
%	d
%\end{theorem}
 
\subsection{Analysis} 
As before, the components of the surrogate reward gradients are given by $\bm{g}_{t,i}= r_i'(t)= \big(Q_i(t-1) + V\big)r_i(t).$
%Set $V=\Theta(\sqrt{T}), \forall t.$ 
%Next, we control the last term in the regret bound \eqref{bandit_reg-bd}. Note that $\max_t ||\bm{g}_t||_\infty = \max_{t=1}^T \max_i (Q_i(t-1)+V) \stackrel{(\textrm{a.s.})}{=} O(T),$ where we have used the fact that $Q_i(t) \stackrel{(\textrm{a.s.})}{\leq } T, \forall t\in [T].$ 
Using the quadratic potential function $\Phi(\cdot)$ defined in Eq.\ \eqref{potential_def} and working identically up to step (c) of Eq.\ \eqref{main-ineq3}, we derive the following self-bounding inequality:
\begin{eqnarray} 
&&\sum_{i} \mathbb{E}Q_i^2(t) + 2V \textrm{Regret}_t (\bm{x}^*) \nonumber \\
	%&=&\sum_{i} \mathbb{E}Q_i^2(t) + 2V \sum_{\tau=1}^t  \mathbb{E}\sum_i r_i(\tau) \big(x_i^*-X_i(\tau)\big)\nonumber \\
	&\leq& 2t + 2 \mathbb{E}\big[ \textrm{Regret}^{\Xi^{\textsc{Bandit}}}_t\big]\nonumber \\
	&\stackrel{(a)}{\leq} & 2t + \tilde{O}\bigg(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{t} + \nonumber \\
	&&V\sqrt{Nt}+\sqrt{Nt}\mathbb{E}\big[\max_{i,\tau \in [t]}(Q_i(\tau))\big]\bigg)\label{main_bd2}\\
	&\stackrel{(b)}{\leq}&  2t + \tilde{O}\big(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}\big), \label{reg-bd9}
\end{eqnarray}
where, in step (a), we have used the regret bound from Theorem \ref{ref_th2}, and in step (b), we have used the trivial bound $Q_i(t) \leq t, \forall t \in [T], \forall i$. The following theorem gives the performance of the \BQ policy with bandit feedback.

\begin{theorem}\label{q_bd-bandit}
	%Upon setting $V=\Theta(\sqrt{T}), \forall t\geq 1$, 
	In the bandit feedback setting, the \BQ policy achieves the following regret and target rate violation bounds:
	\begin{eqnarray*}
			\textrm{Regret}_T &=& \tilde{O}(\max(\frac{T\sqrt{N}}{\sqrt{V}}, \frac{N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}}{V},N\sqrt{T})), \\
		\mathbb{V}(T) &=& \tilde{O}(\max(\sqrt{VT}, N^{\nicefrac{1}{4}}T^{\nicefrac{3}{4}})).
	\end{eqnarray*}
	In particular, upon setting $V=\sqrt{T},$ we obtain 
		\begin{eqnarray*}
		\textrm{Regret}_T = \tilde{O}(N^{\nicefrac{3}{4}}T^{\nicefrac{3}{4}}), ~ \mathbb{V}(T) = \tilde{O}(N^{\nicefrac{1}{4}}T^{\nicefrac{3}{4}}).
	\end{eqnarray*}
\end{theorem}
Compared to the full-information setting, the proof in the bandit setting uses a more sophisticated Martingale-based argument to control the maximum of the queueing process for bounding the second term in the regret expression \eqref{bandit_reg-bd}. 
%\begin{proof}
%See Appendix \ref{dia-bd} for the proof. 
%\section{Proof of Theorem \ref{q_bd-bandit}}
%\label{dia-bd}
To simplify the exposition, the proof of Theorem \ref{q_bd-bandit} is broken into three interrelated propositions.
We begin our analysis by first deriving a bound for $\mathbb{E}Q_i^2(t)$ that is subquadratic in $t$ whenever $V=o(t),$ which in turn implies a sublinear bound on the queue lengths.
\begin{proposition} \label{th5-pf1}
	Under the action of the \BQ policy with bandit feedback, we have 
	\[ \mathbb{E}Q_i^2(t)=\tilde{O}(\max(Vt, \sqrt{N}t^{\nicefrac{3}{2}})) , \forall i, t.\]
	Hence, using Jensen's inequality, we have $\mathbb{V}(T) =  \tilde{O}(\max(\sqrt{VT}, N^{\nicefrac{1}{4}}T^{\nicefrac{3}{4}})).$

\end{proposition}
\begin{proof}
	Recall that from Eqn.\ \eqref{reg-bd9} we have: 
	\begin{eqnarray*}
		&&\sum_{i} \mathbb{E}Q_i^2(t) + 2V \textrm{Regret}_t (\bm{x}^*) \leq 2t + \\
		&&\tilde{O}\big(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}\big).
	\end{eqnarray*}
	Using the fact that $r_i(t) \leq 1, \forall i,t,$ we have $\textrm{Regret}_t(\bm{x}^*) \geq -t.$ Hence, from the above, we obtain
	\begin{eqnarray}\label{q-bd-bandit}
		&&\sum_{i} \mathbb{E}Q_i^2(t) \leq 2(V+1)t+ \nonumber \\
		&&\tilde{O}\big(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}\big),
	\end{eqnarray}
	which resembles Eqn.\ \eqref{ineq1} in the full-information setting. Defining $R(t) \equiv \sqrt{\sum_{\tau=1}^t \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)}$ and working similarly as in the full-information setting, we have the following quadratic inequality:
	\begin{eqnarray} \label{R-bd-bandit}
		R^2(t) &\leq& 2(V+1)t^2 + \nonumber\\
		&&\tilde{O}\big(\sqrt{N} tR(t)+NV t^{\nicefrac{3}{2}} + \sqrt{N}t^{\nicefrac{5}{2}}\big) \nonumber\\
		\implies R(t) &=& \tilde{O}\big(\max(t\sqrt{V}, N^{\nicefrac{1}{4}}t^{\nicefrac{5}{4}})\big).
	\end{eqnarray}
	Substituting the above bound in \eqref{q-bd-bandit}, we conclude that for each $i \in [N]:$
	\begin{eqnarray*}
		\mathbb{E}Q_i^2(t) = \tilde{O}(\max(Vt, \sqrt{N}t^{\nicefrac{3}{2}})).
	\end{eqnarray*}
\end{proof}

The next proposition establishes a sublinear bound on the diameter $\mathbb{E}\big[\max_{i, t \in [T]}Q_{i}(t)\big]$, which appears on the RHS of \eqref{main_bd2}.
%of the queueing processes $\{\textbf{Q}(t)\}_{t=1}^T$. This result will be used to derive a sublinear regret bound for the \BQ policy. 

\begin{proposition}\label{uniform_bd_lemma}
Under the action of the \BQ policy, for any round $T\geq 1,$ we have the following bound for the expected maximum of the queueing processes
	\[ \mathbb{E}\big[\max_{i, t \in [T]}Q_{i}(t)\big] = \tilde{O}(\max(\sqrt{VT}, N^{\nicefrac{1}{4}}T^{3/4})).\]
\end{proposition}
The proof of Proposition \ref{uniform_bd_lemma} is technical and is given in Section \ref{unif-bd-pf} in the Appendix.
%\begin{proof}
%Using Eq.\ \eqref{main_ineq} and the fact that $|r_i(t)|\leq 1, \forall i, t,$ we have the following sample-path wise bound on the square of the queue lengths:
%	\begin{eqnarray} \label{sample-path}
%		\sum_i Q_i^2(t) &\leq& 2(V+1)t + 2 \sum_{\tau=1}^t \sum_{i} Q_i(\tau-1) \big(\lambda_i-r_i(\tau) x_i^* \big)+2\textrm{Regret}^{\Xi}_t \nonumber\\
%		&\leq& 2(V+1)t + \tilde{O}(\sqrt{N\sum_{\tau=1}^t\sum_i Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{Nt}\max_{i, \tau \in [1,t]}Q_i(\tau)) + 2\sum_i M_t^i, \label{q-bd-bandit-ref} \\
%		&\stackrel{(a)}{\leq}& 2(V+1)t + \tilde{O}(\sqrt{N\sum_{\tau=1}^t\sum_i Q_i^2(\tau) }+NV \sqrt{t} + \sqrt{N}t^{3/2}) + 2\sum_i M_t^i,
%	\end{eqnarray}
%	where in the above, we have substituted the upper-bound to the regret of the surrogate problem from Eq.\ \eqref{bandit_reg-bd}, (as in Eqn.\ \eqref{reg-bd9}), used the fact that $Q_(\tau) \leq \tau$, and for each $i \in [N],$ we have defined the stochastic process $\{M_t^i\}_{t \geq 1}$ as follows:
%	\begin{eqnarray} \label{mg-def}
%		M_t^i = \sum_{\tau=1}^t  Q_i(\tau-1) \big(\lambda_i'-r_i(\tau) x_i^* \big),~ t \geq 1.
%	\end{eqnarray}
%where $\lambda_i' \stackrel{(\textrm{def.})}{=} x_i^*\mathbb{E}r_i(\tau) = x_i^*\mu_i \geq \lambda_i.$
%	%Hence, recalling that $V=\Theta(\sqrt{T}),$ 
%	Taking the maximum of both sides with respect to all rounds $t \in [T]$ for some $T \geq 1,$ we have
%	\begin{eqnarray*}
%		\max_{i, t \in [T]}Q_i^2(t) \leq 2VT+ \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i Q_i^2(\tau) }+NV\sqrt{T}+ \sqrt{N}T^{3/2}) + 2\sum_i \max_{t \in [T]} M_t^i.
%	\end{eqnarray*}
%	Taking the expectation of both sides of the above inequality, we obtain
%	\begin{eqnarray} \label{mg-bound}
%		\mathbb{E}\big[\max_{i, t \in [T]}Q_i^2(t)\big] &\leq& 2VT+ \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i \mathbb{E}Q_i^2(\tau) }+ \sqrt{N}T^{3/2}) + 2\sum_i \mathbb{E}\big[\max_{t\in [T]} M_t^i \big] \nonumber \\
%		&\stackrel{(a)}{\leq}& 2VT+ \tilde{O}\big(\max(T\sqrt{V}, N^{\nicefrac{1}{4}}T^{\nicefrac{5}{4}})\big)+ \tilde{O}(\sqrt{N}T^{3/2})+ 2\sum_i \mathbb{E}\big[\max_t M_t^i \big] \\
%		&=& \tilde{O}(\max(VT, \sqrt{N}T^{3/2})) + 2\sum_i \mathbb{E}\big[\max_t M_t^i \big], 
%	\end{eqnarray}
%	where in step (a), we have used the bound for $R(T)$ from Eqn.\ \eqref{R-bd-bandit}.
%	Next, we claim that each of the processes $\{M_t^i\}_{t\geq 1}$ is a zero-mean martingale process with respect to the natural filtration $\{\mathcal{F}_\tau\}_{\tau \geq 1}$. This follows from the definition \eqref{mg-def} as $Q_i(\tau-1) \in \mathcal{F}_{\tau-1}$ is pre-visible and the random variable $r_i(\tau)$ is independent of $\mathcal{F}_{\tau -1}$ s.t. $\mathbb{E}(\lambda_i' - r_i(\tau)x_i^*)=0.$  
%	%Using classic results 
%	Using the $L^2$ maximum inequality for Martingales \citep[Theorem 4.4.4]{durrett2019probability}, \citep[Theorem 3.4]{doob1953stochastic}, \citep{dubins1988sharp}, we have
%	%we know that the diameter of a Martingale with a last term is bounded by twice the square root of the variance of the last term. Hence,
%	\begin{eqnarray} \label{M-bd1}
%		\mathbb{E}[\max_{t \in [T]} M^i_t] \leq 2\sqrt{\mathbb{E}(M^i_T)^2}.
%	\end{eqnarray}
%	Since $\{M_t\}_{t \geq 1}$ is a zero-mean martingale sequence, using the Pythagorean formula for martingales \cite[Eq. (b), Section 12.1]{williams1991probability} and the fact that $|\lambda_i' - r_i(\tau) x_i^*| \leq 1,$ we have
%	\begin{eqnarray} \label{M-bd2}
%		\mathbb{E}(M_T^i)^2 &\leq&  \sum_{\tau=1}^T \mathbb{E}Q_i^2(\tau-1) \\
%		&\leq& R^2(T), \nonumber
%	\end{eqnarray}
%	where we have defined $R(T) \equiv \sqrt{\sum_{\tau=1}^T \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)}$. Combining the above with Eq.\ \eqref{mg-bound}, we obtain the desired bound for the diameter of the queueing process:
%	\begin{eqnarray*}
%		\mathbb{E}\big[\max_{i, t \in [T]}Q_i^2(t)\big] \leq \tilde{O}(\max(VT, \sqrt{N}T^{3/2})) + O(R(T)) = \tilde{O}(\max(VT, \sqrt{N}T^{3/2})),
%	\end{eqnarray*}
%	where, we have again used the bound for $R(T)$ from Eqn.\ \eqref{R-bd-bandit}.
%	The result stated in the lemma finally follows from an application of Jensen's inequality.
%	\end{proof}
	Combining the above two results, the following proposition gives the worst-case regret bound for the \BQ policy under the bandit feedback.
	\begin{proposition} \label{reg-bd-bandit-2}
		The worst-case regret of the \BQ policy under the  bandit feedback is bounded as
		\begin{eqnarray*}
				\textrm{Regret}_T = \tilde{O}(\max(\frac{T\sqrt{N}}{\sqrt{V}}, \frac{N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}}{V},N\sqrt{T})).
		\end{eqnarray*}
	\end{proposition}
	\begin{proof}
		From Eqn.\ \eqref{main_bd2}, we have 
		\begin{eqnarray} \label{main-bd43}
			\sum_i \mathbb{E}Q_i^2(T)+2V \textrm{Regret}_T(\bm{x}^*) \leq 2T  + \tilde{O}\bigg(\sqrt{N}R(T)+ \nonumber \\ NV \sqrt{T} + V\sqrt{NT}+\sqrt{NT}\mathbb{E}\big[\max_{i,\tau \in [T]}(Q_i(\tau))\big]\bigg),
		\end{eqnarray}
		where $R(T) \equiv \sqrt{\sum_{\tau=1}^T \sum_{i=1}^N \mathbb{E}Q_i^2(\tau)}$. Plugging in the upper bound for $R(T)$ from Eqn.\ \eqref{R-bd-bandit} and the diameter of the queueing process from Proposition \ref{uniform_bd_lemma}, we obtain:
		 \begin{eqnarray*}
		 	2V \textrm{Regret}_T(\bm{x}^*) = \tilde{O}(\max(T\sqrt{NV}, NV\sqrt{T}, N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}})).
		 \end{eqnarray*}
		 Hence,
		 \begin{eqnarray*}
		 	\textrm{Regret}_T(\bm{x}^*) = \tilde{O}(\max(\frac{T\sqrt{N}}{\sqrt{V}}, \frac{N^{\nicefrac{3}{4}}T^{\nicefrac{5}{4}}}{V},N\sqrt{T})).
		 \end{eqnarray*}
		 
%	\end{proof}
	
\iffalse	
\subsection{Proof of Theorem \ref{regret_prop2}} \label{regret_prop2_proof}
From Eq.\ \eqref{main_bd2}, we have that 
\begin{eqnarray*}
2V \textrm{Regret}_T &\leq& 	2T + \tilde{O}(\sqrt{N\sum_{\tau=1}^T\sum_i \mathbb{E}Q_i^2(\tau) }+NV \sqrt{T} + V\sqrt{NT}+\sqrt{NT}\mathbb{E}\big[\max_{i,t \in [T]}(Q_i(t))\big] \\
&\leq & \tilde{O}(N^{5/4} T^{5/4}+ NV \sqrt{T} + N^{3/4} T^{5/4}). 
\end{eqnarray*}
where we have substituted the bounds from Eq.\ \eqref{q_len_bd4} and Proposition \ref{uniform_bd_lemma}. Finally, dividing both sides by $V = \Theta (\sqrt{T}),$ we have 
\begin{eqnarray*}
	\textrm{Regret}_T = \tilde{O}(N^{5/4} T^{3/4}).
\end{eqnarray*} 
\fi
\iffalse
\subsection{Proof of Proposition \ref{avg-regret2}} \label{avg-regret2-proof}
As in the proof of Proposition \ref{avg-regret}, define $S_t^2 \equiv \sum_i \mathbb{E} Q_i^2(t).$ With $V=\Theta(T),$ we still have $||g_t||_\infty \leq  \max_t (Q(t-1)+V)= O(T).$ Let $c_T$ be a constant s.t. $c_T=O(\log T)$. Then, from the self-bounding inequality \eqref{main_bd2}, we have that 
\begin{eqnarray*}
	S_t^2 + 2V \textrm{Regret}_t(\bm{x}^*) &\leq& 2T + 2c_T (\sqrt{N \sum_{\tau=1}^T S_\tau^2} + NV \sqrt{T} + \sqrt{N}T^{3/2}).	%&\leq& 2T+ 4 \sqrt{2 \sum_{\tau=1}^T S_\tau^2}+4V \sqrt{2NT}.
\end{eqnarray*}
Summing up the above inequality from $t=1$ to $t=T$ and defining $z_T\equiv \sqrt{\sum_{\tau=1}^T S_\tau^2},$   the above inequality yields:
\begin{eqnarray*}
	z_T^2 - 2Tc_T\sqrt{N}z_T + 2V\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq 2T^2 + 2c_TT(NV \sqrt{T} + \sqrt{N}T^{3/2}). 
\end{eqnarray*}
Upon completing the square, we have that 
\begin{eqnarray*}
	2\frac{V}{T}\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq NTc_T^2 + 2T+ 2c_T(NV \sqrt{T} + \sqrt{N}T^{3/2}).  
\end{eqnarray*}
Hence, the time-averaged regret can be bounded as:
\begin{eqnarray*}
	\frac{1}{T}\sum_{t=1}^T\textrm{Regret}_t(\bm{x}^*) \leq O(Nc_T^2 \frac{T}{V}) + O(\frac{T}{V})+\tilde{O}(N \sqrt{T}+\sqrt{N}\frac{T^{3/2}}{V}).
\end{eqnarray*}
\fi

\end{proof}
Proposition \ref{th5-pf1} and Proposition \ref{reg-bd-bandit-2}, taken together, establish Theorem \ref{q_bd-bandit}. 

Following exactly the same arguments, the result in Proposition \ref{rate-prop} can be shown to hold in the bandit feedback setting as well. Finally, as in the full-information setting, we now discuss the case when one is only interested in satisfying the target rate constraints while disregarding the accrued rewards. The following proposition gives a bound on the cumulative violation in the bandit setting.

\begin{proposition} \label{rate-violation-bandit-no-reward}
	Setting $V=0,$ the cumulative constraint violation under the \BQ policy in the bandit setting can be bounded for any $T \geq 1$ as follows:
	\[ \mathbb{V}(T) \leq \max_i \mathbb{E}Q_i(T) = \tilde{O}(N^{\nicefrac{3}{8}}T^{\nicefrac{5}{8}}) . \]
\end{proposition}
%It can be seen that 
The above bound is slightly worse compared to the $O(\sqrt{T})$ bound in the full-information setting (Proposition \ref{improved_bd}). 
See Section \ref{rate-violation-bandit-no-reward-proof} in the Appendix for the proof of Proposition \ref{rate-violation-bandit-no-reward}. 

\iffalse We now make the following assumption on the base MAB sub-routine to derive a stronger $O(\sqrt{T})$ regret bound.
\begin{assumption} \label{q-mon-bandit2}
	Under the action of the chosen MAB subroutine, each of the queue length processes $\{Q_i(t)\}_{t \geq 1}$ are stochastically non-decreasing in time $t$. 
\end{assumption}
Many closely-related markov chains, \emph{e.g.,} the birth-death chain with the zero initial state are known to satisfy Assumption \ref{q-mon-bandit2} \citep[Proposition 9.2.4]{ross1995stochastic} \citep[Theorem 6.1]{van1980stochastic}, \citep{keilson1977monotone}. 

Under this assumption, we now have the following sharper regret bound.   
 %Interestingly, unlike the full-information setting, Assumption \ref{mon-q} does not seem to yield an improved regret bound in the bandit feedback setting due to the presence of the extra term in the regret expression \eqref{bandit_reg-bd}.
  \begin{theorem} \label{mon-bandit}
  	Under Assumption \ref{q-mon-bandit2}, the regret of the \BQ policy in the bandit feedback setting can be bounded as $\textrm{Regret}_t\leq \frac{5t}{V}+ 2 \sqrt{2Nt}.$
  	Hence, with $V=\sqrt{T},$ we have $\textrm{Regret}_t = O(\sqrt{Nt}).$ 
  	\end{theorem}
  	\fi
\iffalse

Hence, using the fact that  $\sum_i r_i(\tau)(X_i(\tau) -x_i^*) \leq 1, \forall \tau,$ similar to Eq.\ \eqref{ineq1}, we have that for all $t \geq 1:$ 
\begin{eqnarray} \label{q_len_bd5}
	\mathbb{E}Q_i^2(t) &\leq& 2(V+1)t + \tilde{O}(\sqrt{N\sum_{\tau=1}^t\sum_i \mathbb{E}Q_i^2(\tau) }+ NT+ \nonumber \\
	&&\sqrt{N}T^{3/2}) = \tilde{O}(NT^{3/2}),
\end{eqnarray}
where, in the last inequality, we have again used the trivial bound $Q_i(t) \leq T, \forall t \in [T]$ on the RHS.
Substituting the above bound back in the RHS of Eq.\ \eqref{q_len_bd5}, we get an improved bound
\begin{eqnarray}\label{q_len_bd4}
	\mathbb{E}Q_i^2(t) = \tilde{O}(\sqrt{N}T^{3/2}), ~ \forall t \in [T].
\end{eqnarray}
 Eq.\ \eqref{q_len_bd4} yields the following counterpart to Proposition \ref{rate-prop} with an identical proof, which we omit. 
 
\begin{theorem} \label{rate-prop2}
	Setting $V=\Theta(\sqrt{T}),$ for any interval $\mathcal{I} \subseteq [T]$ such that $T^{3/4}=o(|\mathcal{I}|),$ the \textsc{BanditQ} policy in the bandit information setting yields: 
	\begin{eqnarray*}
			&&\liminf_{|\mathcal{I}| \to \infty} |\mathcal{I}|^{-1}\mathbb{E}\sum_{t \in \mathcal{I}} r_i(t)x_i(t) 
	  \geq \lambda_i, ~\forall i \in \mathcal{P},\\
	  &&\textrm{and}~~ \mathbb{V}(T) = O(N^{1/4}T^{3/4}).
	\end{eqnarray*}
\end{theorem}
%Finally, substituting the bound \eqref{q_len_bd4} into the RHS of the self-bounding inequality \ \eqref{main_bd2} immediately yields the following 

Our final result is the following sublinear regret bound for the \BQ policy in the bandit setting.
However, the proof is more technical as we now need to strengthen Eq.\ \eqref{q_len_bd4} to bound the \emph{diameter} of the queueing processes $\{\bm{Q}(t)\}_{t\geq 1},$ \emph{i.e.,} $\mathbb{E}(\max_{i,t} Q_{i}(t)).$ See Proposition \ref{uniform_bd_lemma} in the Appendix for the derivation of this bound using Martingale methods. Using this result, we now establish the following regret bound for the \BQ policy under bandit feedback. See Appendix \ref{regret_prop2_proof} for the proof.
%The proof, again, remains identical to the full-information case and is omitted.
\begin{theorem}\label{regret_prop2}
Upon setting $V=\sqrt{T}, t\in [T],$ the \textsc{BanditQ} policy achieves a regret bound of $\tilde{O}(N^{5/4}T^{3/4})$ in the bandit feedback setting.  
%	\begin{eqnarray*}
		%$\textrm{Regret}_T = O(T^{3/4}).$
%	\end{eqnarray*}
\end{theorem}
\fi
%Our result improves upon the $O(T^{5/6})$ violation penalty established by \citet[Theorem 1]{cai2018online} under independence assumptions.
\iffalse
Finally, we can also derive a stronger regret bound similar to Proposition \ref{avg-regret} when considering average regret. However, in this case, we need a larger value of the parameter $V=\Theta(T).$
\begin{proposition} \label{avg-regret2}
Under the \BQ policy with bandit feedback with $V_t=V=\Theta(T), \forall t,$ we have that 	%\begin{eqnarray*}
		$\frac{1}{T}\sum_{t=1}^T \textrm{Regret}_t(\bm{x}^*) = \tilde{O}(N\sqrt{T}),$ for any $\bm{x}^* \in \Omega.$ 
\end{proposition}
The proof is similar to that of Proposition \ref{avg-regret}. We provide a proof sketch in Appendix \ref{avg-regret2-proof}.
\fi


\begin{figure*}[t]
  \centering
  \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/Reward_rates_full_info_cropped.pdf}
   \caption{\small{Reward accrual rates in the full-information setting}}
   \label{rew_full}
  \end{minipage}
   \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/Q_lengths_full_info_cropped.pdf}
   \caption{\small{Queue lengths in the full-information setting}}
   \label{q_full}
  \end{minipage}
   \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/Regret_full_info_cropped.pdf}
   \caption{\small{Regret of \BQ in the full-information setting}}
   \label{reg_full}
  \end{minipage}
  \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/Reward_rates_bandit_feedback_cropped.pdf}
   \caption{\small{Reward accrual rates in the bandit feedback setting}}
   \label{rew_bf}
  \end{minipage}
  %\hfill
  \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/Q_lengths_Bandit_feedback_cropped.pdf}
   \caption{\small{Queue lengths in the bandit feedback setting}}
   \label{q_bf}
  \end{minipage}
 %\hfill
   \begin{minipage}[b]{0.3\linewidth}
   \centering
    \includegraphics[width=\linewidth]{./Figures/Regret_bandit_feedback_cropped.pdf}
   \caption{\small{Regret of \BQ in the bandit feedback setting}}
   \label{reg_bf}
  \end{minipage}
\end{figure*}



\paragraph{Remarks:} 
\iffalse 1. When all target rates are zero (\emph{i.e.,} $\vec{\bm{\lambda}}=\vec{\bm{0}}$), the fair prediction problem reduces to the classic MAB problem, which is known to have a minimax regret bound of $O(\sqrt{NT})$ \citep{lattimore2020bandit}. Hence, improving the regret bound given by Theorem \ref{q_bd-bandit} might be possible. The main challenge for proving an $O(\sqrt{T})$ regret bound appears to be controlling the term $\mathbb{E}(\max_{i,t}Q_i(t))$ in the regret expression \eqref{main_bd2}.  
\fi
%We leave the question of the tightness of the above regret bound as an interesting open problem. 
  Technically, the scale-free regret bound given in Theorem \ref{ref_th2} was derived for \emph{oblivious} adversaries, which fix the entire sequence of reward vectors at $t=0$. However, in our case, the surrogate reward vector $\bm{r}'(t)$ in Eqn.\ \eqref{reward_def2} is determined by the past actions of the policy through the variable $\bm{Q}(t-1)$. To see why we can still use the regret bound \eqref{bandit_reg-bd}, note that the surrogate reward $\bm{r}'(t)$ does not depend on the current action $\bm{X}(t).$ Hence, we can invoke the regret bound for an imaginary adversary that decides the reward vector $\bm{r}'(t)$ at the end of round $t-1$. Since the reward on round $t$ does not affect the previous actions of the policy, the regret bound \eqref{bandit_reg-bd} applies to our problem. 














