\section{Bandits with Knapsacks (\texttt{BwK})} \label{BwK}

We now consider the Bandits with Knapsacks (\texttt{BwK}) problem in adversarial environments in the full information set-up \citep{immorlica2022adversarial} in the Budget regime $B_T=\Omega(T)$. In particular, we consider a sequential model, where, starting from a sublinear amount of resource at the beginning, the $i$\textsuperscript{th} resource arrives at the deterministic rate of $0\leq \lambda_i \leq 1$ units per round. Clearly, our sequential model is strictly stronger than the static budget model where the entire budget $B_T$ is available before the game starts.

The action $\bm{x}(t)$ consumes $b_i(t)x_i(t)$ amount of resource $ i\in [N],$ and yields an overall reward of $\langle \bm{r}(t), \bm{x}(t)\rangle = \sum_i r_i(t) x_i(t).$  Without any loss of generality, we assume that the reward $r_i(t)$ and the consumption $b_i(t)$ for each arm lie in the interval $[0,1], \forall t\geq 1.$

\subsection{Characterization of the Optimal Offline Benchmark} \label{offline2} 
We measure the regret of the proposed \texttt{BanditQ} policy against the reward accrued by any fixed distribution $\bm{x}^* \in \Delta_N$ over the experts that respects the resource budget constraints. For this, \citet{badanidiyuru2018bandits} consider a linear relaxation where, at each round, the learner receives the same reward vector $\bm{r}(t)= \bar{\bm{r}}$ and the same consumption vector $\bm{b}(t)= \bar{\bm{b}}, \forall t\geq 1,$ where we define
\begin{eqnarray} \label{benchmark_def}
	\bar{\bm{r}}\equiv\frac{1}{T}\sum_{t=1}^T \bm{r}(t), ~ \bar{\bm{b}} \equiv \frac{1}{T}\sum_{t=1}^T \bm{b}(t).
\end{eqnarray}  
Next, we find the optimal static distribution $\bm{x}^* \in \Delta_N$ for maximizing the cumulative reward as follows: 
\begin{eqnarray*}
	\texttt{OPT}_{\texttt{LP}}= \max_{\bm{x}} \sum_i \bar{r}_i x_i,
\end{eqnarray*}
Subject to
\begin{eqnarray} \label{opt_constr}
\bar{b}_i x_i \leq \lambda_i, ~\forall i \in [N].	
\end{eqnarray}
\citet[Lemma 3.1]{badanidiyuru2018bandits} showed that $\texttt{OPT}_{\texttt{FD}} \leq T\,\texttt{OPT}_{\texttt{LP}}.$ In this paper, we consider a weaker benchmark where the averaging in Eq.\ \eqref{benchmark_def} is done over all windows of size $w\geq 1,$ \emph{i.e.,}
 \begin{eqnarray} \label{benchmark_def2}
	 \bar{\bm{b}}_{w, \tau} \equiv \frac{1}{w}\sum_{t=\tau}^{\tau+w-1} \bm{b}(t).
\end{eqnarray}  
This leads us to consider the relaxed offline benchmark:
\begin{eqnarray*}
	\texttt{OPT}'_{\texttt{LP}}= \max_{\bm{x}} \sum_i \bar{r}_i x_i
\end{eqnarray*}
Subject to
\begin{eqnarray} \label{opt_constr2}
\bar{b}_{i,w, \tau} x_i \leq \lambda_i, ~\forall i \in [N], \forall \tau.	
\end{eqnarray}
Although the above benchmark could be weaker than $\texttt{OPT}_{\texttt{FD}}$,  on the flip side, the \texttt{BanditQ} policy is oblivious to the window size $w$ as long as it is constant (independent of $T$). 
\subsection{Policy design} \label{policy2}

 To each expert $i \in [N],$ we associate a non-negative state variable $Q_i(t),$ which evolves as follows:
\begin{eqnarray} \label{Q_ev2}
	Q_i(t)= \big(Q_i(t-1)+b_i(t)x_i(t)-\lambda_i\big)^+, ~Q_i(0)=0.
\end{eqnarray}
The intuition is that any online policy $\pi^{\texttt{BwK}}=\{\bm{x}(t)\}_{t\geq 1}$ that stabilizes the queues ensures that the rate of consumption of any resource is roughly bounded by its rate of generation or arrival. 
As before, we define the potential function 
\begin{eqnarray} \label{potential_def2}
	\Phi(\tau) = \sum_i Q_i^2(\tau), ~~\tau \geq 0.
\end{eqnarray}
From Eqn.\ \eqref{Q_ev2}, we have 
\begin{eqnarray*}
	Q_i^2(\tau) &\leq& \big(Q_i(\tau-1)+ b_i(\tau)x_i(\tau)-\lambda_i\big)^2 \\
	&\leq & Q_i^2(\tau-1) + x_i(\tau) + 1 + 2Q_i(\tau-1)\big(b_i(\tau)x_i(\tau)-\lambda_i\big),
\end{eqnarray*}
where, in the above inequality, we have used the fact that $b_i(\tau), x_i(\tau), \lambda_i \in [0,1].$ Summing up the above inequality over all experts $i \in [N],$ we obtain the following upper-bound on the change of potential 
\begin{eqnarray*}
	\Phi(\tau)-\Phi(\tau-1) \leq N+1 + 2\sum_{i \in [N]}Q_i(\tau-1)\big(b_i(\tau)x_i(\tau)-\lambda_i\big). 
\end{eqnarray*}
Fix any feasible $\bm{x}^* \in \Delta_N$ that satisfies the resource conservation constraints $b_i(\tau) x_i^* \leq \lambda_i, \forall i \in [N], \forall \tau \geq 1.$ Let $\{V_\tau\}_{\tau\geq 1}$ be a non-negative non-decreasing sequence of real numbers. Define a surrogate prediction problem $\Xi$ where the reward for the $i$\textsuperscript{th} arm is defined as
\begin{eqnarray*}
	r_i'(\tau)= V_\tau r_i(\tau)- Q_i(\tau-1) b_i(\tau), ~~\forall i, \tau.
\end{eqnarray*}
Note that, unlike the original problem, the rewards in the surrogate problem $\Xi$ need not be bounded. 

\subsection{Analysis and Regret bounds}
Consider the following sequence of inequalities:
\begin{eqnarray} \label{main_eq2}
	&& \Phi(\tau)-\Phi(\tau-1) + 2V_\tau \big[\sum_{i} r_i(\tau)(x_i^* - x_i(\tau))\big] \nonumber \\
	&\leq& N+1 + 2\sum_i Q_i(\tau-1)(b_i(\tau)x_i^*-\lambda_i)+ 2\sum_i r_i'(\tau)(x_i^*-x_i(\tau)) \nonumber \\
	&\stackrel{(a)}{\leq} & N+1 + 2\sum_i r_i'(\tau)\big(x_i^*-x_i(\tau)\big),
\end{eqnarray}
where, in (a), we have used the feasibility of the offline benchmark $\bm{x}^*$ which has the property that $b_i(\tau)x_i^* \leq \lambda_i.$ Summing up \eqref{main_eq2} for $\tau=1$ to $\tau=t$ and recalling the definition of the quadratic potential function from \eqref{potential_def2}, we obtain:
\begin{eqnarray} \label{eq2}
	\sum_{i}Q_i^2(t) + 2\sum_{\tau=1}^tV_\tau\sum_i r_i(\tau)(x_i^*-x_i(\tau)) \leq (N+1)t + 2 \textrm{Regret}^{\Xi}_t.
\end{eqnarray}
Using the Online Gradient Ascent (OGA) policy for the surrogate problem, the regret term can again be bounded as:
\begin{eqnarray*}
	\textrm{Regret}_t^\Xi &\leq& 2 \sqrt{\sum_{\tau=1}^t ||\bm{r'}(\tau)||^2_2} \\
	&=& 2\sqrt{\sum_{\tau=1}^t \sum_i (V_\tau r_i(\tau)- Q_i(\tau-1) b_i(\tau))^2} \\
	&\stackrel{(a)}{\leq} & 2\sqrt{\sum_{\tau=1}^t \sum_i 2\big(V_\tau^2+ Q_i^2(\tau-1)\big) }\\
	&\stackrel{(b)}{\leq}& 2\sqrt{2 \sum_{\tau=1}^t \sum_i Q_i^2(\tau)} + 2\sqrt{2N\sum_{\tau=1}^t V_\tau^2}.
 \end{eqnarray*}
where in (a), we have used the fact that $(x-y)^2 \leq 2(x^2+y^2)$ and in (b), we have used the fact that $\sqrt{x+y} \leq \sqrt{x}+\sqrt{y}.$ Substituting the above regret bound in \eqref{eq2}, we obtain the following inequality:
\begin{eqnarray}\label{master_ineq}
	\sum_{i}Q_i^2(t) + 2\sum_{\tau=1}^tV_\tau\sum_i r_i(\tau)(x_i^*-x_i(\tau)) \leq (N+1)t +  4 \sqrt{2\sum_{\tau=1}^t\sum_{i} Q_i^2(\tau) } + 4\sqrt{2N\sum_{\tau=1}^t V_\tau^2},
\end{eqnarray}
which is identical to Eqn.\ \eqref{main_ineq} apart from the coefficient of $t$ in the first term of the RHS. Hence, using the same argument as before, we obtain the following results:

\begin{proposition}
	Set $V_t=V=\Theta(\sqrt{T}), 1\leq t \leq T.$ Then the \texttt{BanditQ} policy ensures the following
	\begin{eqnarray*}
		\textrm{Regret}_t = O(T^{3/4}), ~\textrm{and}~ \sum_{z=t-\tau+1}^t b_i(z) x_i(z) \leq \lambda_i \tau  + O(T^{3/4}), \forall 1\leq \tau \leq t \leq T.
	\end{eqnarray*}
\end{proposition}
The above shows that, starting with a sublinear $O(T^{3/4})$ amount of budget at the beginning, the \texttt{BanditQ} policy satisfies the resource consumption constraints at all times $1\leq t \leq T$ and achieves a sublinear regret of $O(T^{3/4})$ against all feasible offline static predictions. 
















