%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools,amsthm, amssymb, mathrsfs, algpseudocode,algorithm, dsfont, amsfonts,listings,bm,tikz} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newtheorem*{theorem*}{Theorem}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newcommand{\expec}{\mathbb{E}}
\newcommand{\prob}{\mathbb{P}}
\newcommand{\indicator}{\mathds{1}}

\newcommand{\Var}{\mathrm{Var}}


\title{Heavy-tailed Linear Bandit with Huber Regression}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}

\author[1]{Minhyun Kang}
\author[1,2]{Gi-Soo Kim}

% Add affiliations after the authors
\affil[1]{
Artificial Intelligence Graduate School, UNIST,
Ulsan, Republic of Korea
}
\affil[2]{
Department of Industrial Engineering,  UNIST,
Ulsan, Republic of Korea
}



  
 \begin{document}
\maketitle

\begin{abstract}
Linear bandit algorithms have been extensively studied and have been shown to be successful in sequential decision tasks despite their simplicity. Many algorithms, however, work under the assumption that the reward is the sum of a linear function of observed contexts and a sub-Gaussian error. In practical applications, errors can be heavy-tailed, especially in financial data. In such reward environments, algorithms designed for sub-Gaussian error may underexplore, resulting in suboptimal regret. In this paper, we relax the reward assumption and propose a novel linear bandit algorithm which works well under heavy-tailed errors as well. The proposed algorithm utilizes Huber regression. When contexts are stochastic with a positive definite covariance matrix and the $(1+\delta)$-th moment of the error is bounded by a constant, we show that the high-probability upper bound of the regret is $O(\sqrt{d}T^{\frac{1}{1+\delta}}(\log dT)^{\frac{\delta}{1+\delta}})$, where $d$ is the dimension of context variables, $T$ is the time horizon, and $\delta\in (0,1]$. This bound improves on the state-of-the-art regret bound of the Median of Means and Truncation algorithms by a factor of $\sqrt{\log T}$ and $\sqrt{d}$ for the case where the time horizon $T$ is unknown. We also remark that when $\delta=1$, the order is the same as the regret bound of linear bandit algorithms designed for sub-Gaussian errors. We support our theoretical findings with synthetic experiments.
\end{abstract}

\section{Introduction}\label{sec:intro}
Bandit algorithms are widely used in sequential decision-making problems such as mobile health \citep{lei2017actor} and clinical trials \citep{villar2015multi}, where the goal of the learning agent is to select good actions successively out of many available actions at each time point. Linear bandits make use of contextual information when choosing the actions, or \emph{arms}.
Upon choosing an arm, a random reward is revealed to the agent. The agent then learns the reward model using the rewards observed so far under the assumption that the expected value of the reward is a linear function of context variables. Using the updated model and various exploration strategies, the agent chooses the next arm.

Most bandit research is studied under the assumption that the error distribution is sub-Gaussian.  However, the tails of the data distribution might not decay as fast as those of a sub-Gaussian distribution in practical applications including financial markets and insurance \citep{rachev2003handbook,stehlik2010favorable,ibragimov2015heavy}. Our work relaxes the sub-Gaussian assumption and proposes a new algorithm that is robust to heavy-tailed errors.
We assume the noise of the reward, $\epsilon$, has finite $(1+\delta)$-th moment, i.e., $\expec[|\epsilon|^{1+\delta}]\leq \nu_{\delta} < \infty$ for some $\delta\in(0,1].$ This assumption is common in the bandit literature that considers heavy-tailed errors \citep{bubeck2013bandits,medina2016no,shao2018almost,xue2020nearly}.
We propose to use the Huber estimator \citep{huber1964robust} to estimate the reward model parameters.
The Huber loss works as a square loss when the input variable is small and works as an absolute loss otherwise. It suppresses the loss value when an observation deviates significantly from the mean, so that it does not dominate the other observations. \cite{sun2020adaptive} proposed an adaptive Huber estimator with a robustification parameter adapted to the dimension of the covariates, the sample size, and the moment bound on the error. The upper bound of the $L_2$-norm of the estimation error is tight but is inversely proportional to the minimum eigenvalue of the Gram matrix of the covariates. When the covariates are independently and identically distributed (i.i.d.) and are sampled from a distribution with a positive definite covariance matrix, it can be shown that the minimum eigenvalue of their Gram matrix is strictly bigger than a positive constant with high probability for sufficiently large samples. However, data accumulated by bandit agents are not i.i.d.\ since the arm selection depends on the arms chosen before. Hence, the Gram matrix formed by contexts has no guarantee to have a strictly positive minimum eigenvalue, and the estimation error can be higher. In this work, we propose to use the forced-sampling method of \citet{goldenshluger2013linear} and \citet{bastani2020online} to guarantee the minimum eigenvalue bound in the non-i.i.d.\ bandit setting.

 In Section 2, we explain our problem settings. Then in Section 3, previous works on the linear bandit with heavy-tailed errors are reviewed. We introduce previous results needed for our theoretical analysis of estimator and the formal definition of the Huber regression in Section 4. In Section 5, a description of the proposed algorithm and its theoretical analysis are presented. Section 6 is devoted to the simulation study of the proposed algorithm compared with three existing algorithms. 



\section{Problem Formulation}
In linear bandits, we assume that the expected value of the reward is linear in the contexts. The bandit agent aims to learn the unknown linear parameter through consecutive arm pulls. We denote the $d$-dimensional context vector at time $t$ by $X_t \in \mathbb{R}^d$. In each time step, $X_t$ is sampled independently from an unknown distribution $P_{\mathcal{X}}$. We denote the number of arms as $K$. When the learning agent pulls the $i$-th arm at time $t$, the arm reveals a stochastic reward
\begin{equation} y_{i,t} = X_t^T\beta_i +\epsilon_{i,t},\label{form1}\end{equation}
where $\beta_i\in \mathbb{R}^d$ is an arm-specific fixed parameter and each $\epsilon_{i,t}$ is a heavy-tailed noise. We denote the index of the arm pulled by the bandit agent at time $t$ as $a(t)$. Hence at each time $t$, the agent observes one reward, $y_t:=y_{a(t),t}$. We also assume $\expec [\epsilon_{i,t}~|~\mathcal{F}_{t-1}]=0$ and $\expec\left[|\epsilon_{i,t}|^{1+\delta}~|~\mathcal{F}_{t-1}\right]\leq \nu_{\delta} <\infty,$ for $\nu_{\delta}>0$ and for some $\delta \in (0,1]$, where $\mathcal{F}_{t} = \sigma(X_1,\dots,X_{t+1}, a(1), a(2), \dots, a(t),y_1,\dots,y_t)$ denotes the $\sigma$-algebra generated by observations $\{X_1,\dots,X_{t+1}, a(1), a(2), \dots, a(t), y_1,\dots,y_t\}$. Note that sub-Gaussian errors satisfy this assumption with $\delta=1$.
	The regret at time $t$ is defined by
	\begin{equation*}
 r_t := X_t^T\beta_{a^*(t)} - X_t^T\beta_{a(t)},
 \end{equation*}
	where 
 $a^*(t) = \arg\max_{j\in[K]} X_t^T\beta_j$ is the optimal arm. 

In related literature on bandits for linear payoffs, the formulation is sometimes presented as follows, where instead of arm-specific parameters and single context, we have a single parameter $\beta\in\mathbb{R}^d$ and arm-specific contexts, $X_{i,t}\in\mathbb{R}^d$ for each arm $i$.
\begin{equation} y_{i,t} = X_{i,t}^T\beta +\epsilon_{i,t}.\label{form2}\end{equation}
However, we note that an algorithm designed for one setting can always be applied to the other setting. For example, when the algorithm designed for (\ref{form1}) needs to be applied on (\ref{form2}), we simply let $X_t=\{X_{1,t}^T ~\cdots ~ X_{K,t}^T\}^T \in\mathbb{R}^{Kd}$ and $\beta_i=\{\mathbf{0}^T ~\cdots~\mathbf{0}^T~\beta^T~\mathbf{0}^T ~\cdots~\mathbf{0}^T\}^T\in\mathbb{R}^{Kd}$, where $\mathbf{0}$ is a $d$-dimensional zero vector and all the elements up to the $d\times (i-1)$-th element and after the $d\times i$-th element are all 0 in $\beta_i$. On the other hand, when we apply an algorithm designed for (\ref{form2}) on setting (\ref{form1}), we simply let $\beta=\{\beta_1^T~\cdots~\beta_K^T\}^T\in\mathbb{R}^{Kd}$ and  $X_{i,t}=\{\mathbf{0}^T ~\cdots~\mathbf{0}^T~X_t^T~\mathbf{0}^T ~\cdots~\mathbf{0}^T\}^T\in\mathbb{R}^{Kd}$, where $X_t$ occupies the $i$-th block of $d$ coordinates.

\paragraph{Notations} For any vector $v\in\mathbb{R}^d$ and positive semi-definite matrix $A\in\mathbb{R}^{d\times d}$, we let $||v||_{A}:=\sqrt{v^TAv}$. We let $[N]=\{1,2,\cdots,N\}$ for any natural number $N$. We let $\lambda_{\mathrm{min}}(A)$ and $\lambda_{\mathrm{max}}(A)$ be the minimum and maximum eigenvalues of matrix $A$ respectively. For any vector $v \in \mathbb{R}^d$, $v = (v_1,\dots,v_d)^T$, the $L_2$-norm is $||v||_2 = (\sum_{i=1}^d |v_i|^2)^{\frac{1}{2}}$ and the max norm is $||v||_{\infty} = \max_{1 \leq i \leq d}|v_i|$. For any symmetric matrices $A, B \in \mathbb{R}^{d\times d}$, let $A\succcurlyeq B$ if $A-B$ is positive semi-definite. For any subset $A$ of some set $W$, let $\mathds{1}_A(\cdot)$ denote the indicator function of $A$. For $a,b \in \mathbb{R}$, $a \vee b := \max\{a,b\}$. The set $\mathbb{N}$ denotes the set of natural numbers. When $Z \subset \mathbb{N}$, we denote the Gram matrix as $\hat{\Sigma}(Z) := \frac{1}{|Z|} \sum_{r \in Z} X_rX_r^T.$


\section{Related Work}\label{sec:related work}
\begin{table*}[t] 	
\centering
\caption{Regret orders of linear bandit algorithms designed for heavy-tailed rewards}\label{table:compare}
	\begin{tabular}{ |c|c|c| } 
		\hline
		Algorithm & Regret Order & Fixed Context Space  \\ 
		\hline
		MoM \citep{medina2016no} & $O(\sqrt{d}T^{\frac{1+2\delta}{1+3\delta}}(\log T)^{\frac{3\delta+1}{2(1+\delta)}})$ & Yes\\ 
		\hline
		Truncation \citep{medina2016no} & $O(dT^{\frac{2+\delta}{2(1+\delta)}}\log T)$ & No \\ 
		\hline
		MENU \citep{shao2018almost} &$O(d^{\frac{3+\delta}{2(1+\delta)}}T^{\frac{1}{1+\delta}}(\log T)^{\frac{3\delta+1}{2(1+\delta)}})$& Yes\\
		\hline
		TOFU \citep{shao2018almost} & $O(dT^{\frac{1}{1+\delta}}(\log T)^{\frac{3\delta+1}{2(1+\delta)}})$ & No\\
		\hline
		SupLinBMM \citep{xue2020nearly}  & $O(\sqrt{d}(\log T)^{\frac{3}{2}} T^{\frac{1}{1+\delta}})$ & Yes\\
		\hline
		SupLinBTC \citep{xue2020nearly} & $O(\sqrt{d}(\log T)^2T^{\frac{1}{1+\delta}})$ & No\\
		\hline
		Huber Bandit (ours) & $O(\sqrt{d}T^{\frac{1}{1+\delta}}(\log dT)^{\frac{\delta}{1+\delta}})$ & No\\
		\hline
		
		
	\end{tabular}

\end{table*}
When errors are sub-Gaussian and the time horizon is unknown, LinOFUL\citep{abbasi2011improved} is the state-of-the-art algorithm, achieving a tight regret bound of order $\tilde{O}(d\sqrt{T})$ for setting (\ref{form2}), where $\tilde{O}(\cdot)$ ignores logarithmic terms. The strategy of LinOFUL is to construct a tight confidence region of the true parameter $\beta$ at each time step and pull the arm according to the {\it Optimism in the Face of Uncertainty (OFU)} principle. The confidence region at time $t$ is an ellipsoid of the form $C_t=\{\tilde{\beta}: ||\tilde{\beta}-\hat{\beta}_t||_{V_t}\leq \alpha_t\}$, in which the center $\hat{\beta}_t$ is the Ridge estimator of $\beta$ using the contexts and rewards of the chosen arms up to time $t$ as covariate and outcome respectively and $V_t$ is the Gram matrix of the covariates. The Mahalanobis norm $||\beta-\hat{\beta}_t||_{V_t}\approx||\sum_{r=1}^tX_{a(r),r}\epsilon_{a(r),r}||_{V_t^{-1}}$ is a self-normalized martingale, where $\sum_{r=1}^tX_{a(r),r}\epsilon_{a(r),r}$ forms a martingale and $V_t\approx\sum_{r=1}^tX_{a(r),r}X_{a(r),r}^T$ is the normalization term. \cite{abbasi2011improved} proved that when $\epsilon$ is sub-Gaussian, $C_t$ contains the true parameter with high probability for small enough positive value of $\alpha_t$. The main challenge in the proof is that the self-normalized martingale is not a martingale when arms are chosen adaptively. Hence, standard Azuma-Hoeffding inequalities cannot apply. \cite{abbasi2011improved} applied the inequality for self-normalized martingales \citep{de2004self,pena2009self} instead.  

When errors $\epsilon_{i,t}$ are not sub-Gaussian but have only finite $(1+\delta)$-th moments, the self-normalized inequality does not hold anymore. Therefore, we do not have a guarantee that the Ridge estimator of $\beta$ converges to the true value fast enough to allow a tight confidence region and optimal regret. Hence, other estimators than the naive Ridge estimators should be considered. Recently, novel estimators \citep{medina2016no,shao2018almost,xue2020nearly} have been proposed motivated from the Median of Means (MoM) method and truncation method of \cite{bubeck2013bandits} for multi-armed bandits without contexts. All these works follow \cite{abbasi2011improved} in that they first construct a confidence region of $\beta$ and then choose the arm according to the OFU principle. 

\cite{medina2016no} was the first to extend the MoM method to the estimation of the linear regression parameter, $\beta$. They proposed an algorithm which proceeds in batches, where during each batch, the agent pulls the same arm with the same context variable repeatedly. At the end of each batch, the algorithm computes the MoM of the rewards which share the same context variable. Then the algorithm updates the Ridge estimator of $\beta$ using the context and the MoM of rewards as a new covariate-outcome pair. The key idea of the method is that while the errors of the individual rewards are heavy-tailed, the error of the MoM can be shown to be sub-Gaussian with high probability. Therefore, the self-normalized inequality applies straightforwardly and a tight confidence region can be constructed. The paper derived a high-probability upper bound of the regret of order $O(\sqrt{d}T^{\frac{1+2\delta}{1+3\delta}}(\log T)^{\frac{3\delta+1}{2(1+\delta)}})$. We remark that when $\delta=1$, the order reduces to $\tilde{O}(\sqrt{d}T^{\frac{3}{4}})$, which is suboptimal when applied to sub-Gaussian rewards which have finite second moments. The MoM method does not recover the tight $O(\sqrt{T})$ regret bound despite the sub-Gaussianity of MoMs because we need multiple samples, up to $O(T^{\frac{1+\delta}{1+3\delta}})$ samples, to construct a single MoM. 

\cite{shao2018almost} refined the MoM method of \cite{medina2016no} and proposed a novel algorithm called MENU which enjoys a tighter regret upper bound. MENU also executes in batches and requires pulling the same arm with the same context repeatedly in each batch. Instead of computing the MoM of rewards, however, MENU updates multiple estimates of $\beta$ where each estimate is updated using only one context-reward pair. Among the different estimates $\hat{\beta}^1, \hat{\beta}^2, \cdots, \hat{\beta}^k$, where $k$ is the size of the batch, MENU then selects the estimate which has the median value of $||\hat{\beta}^j-\beta||_{V_t}$ where $\beta$ is the true parameter. While $\hat{\beta}^1, \hat{\beta}^2, \cdots, \hat{\beta}^k$ do not all have a tight estimation error bound, it can be shown that the $\hat{\beta}^j$ which has the median Mahalanobis distance to $\beta$ achieves a tight estimation error bound. This refined estimator requires fewer samples $k$ in each batch to achieve the same estimation error bound as in the MoM method. Consequently, the regret of MENU is   
$O(d^{\frac{3+\delta}{2(1+\delta)}}T^{\frac{1}{1+\delta}}(\log T)^{\frac{3\delta+1}{2(1+\delta)}})$. Now we observe that the bound is optimal with respect to $T$ when $\delta=1$. The MENU algorithm is easy to employ and runs fast. However, the restriction that the same context variable should be observed during the same batch can be restrictive. In practice, context variables are often stochastic. Even for the same arm, the context information may change. In this case, we cannot run MENU.

\cite{medina2016no} proposed an alternative estimator which does not require to pull the same arm with the same context variable in a row. The algorithm computes a Ridge estimator using truncated reward $\hat{y}_t = y_t\mathds{1}_{|y_t|\leq b_t}$, where the value of $b_t$ increases with time $t$. The quantity $\hat{y}_t$ is bounded but biased. Hence, the error $\hat{y}_t-X_{a(t),t}^T\beta$ can be decomposed into a sub-Gaussian variable plus a bias term. The authors carefully choose the value of $b_t$ to trade off the self-normalized bound for the sub-Gaussian variables and the bound on the cumulative bias. Consequently, their algorithm achieves a high probability regret bound of $O(dT^{\frac{2+\delta}{2(1+\delta)}}\log T)$. This bound also does not recover the $O(\sqrt{T})$ bound when $\delta=1$.

\cite{shao2018almost} refined the truncation estimator of \cite{medina2016no}. Instead of truncating the reward $y_t$, they truncated each element of $V_t^{-1/2}X_{a(r),r}y_{r}$ for every $r=1,2,\cdots,t$ by a time increasing threshold $b_t$. Hence, the truncation depends not only on the reward but also on the contexts of the chosen arms so far. Also, at each time $t$, the truncation is re-operated on all observations up to time $t$. This increases the time complexity of the algorithm but makes the estimator more accurate to obtain the tight regret upper bound of $O(dT^{\frac{1}{1+\delta}}(\log T)^{\frac{3\delta+1}{2(1+\delta)}})$ which reproduces the optimal regret bound with respect to $T$ when $\delta =1.$ The analysis however relies on Bernstein's inequalities for martingales, which can be applied to self-normalized martingales under restrictive conditions only. We note that a self-normalized martingale becomes a martingale only when the contexts of chosen arms constitute a fixed design, i.e., when the covariates used in the estimator are fixed prior to observing the outcomes(rewards). In adaptively collected data, we usually do not have a fixed design since the covariate at time $t$ is chosen based on the rewards up to time $t-1$.  

\cite{xue2020nearly} blended the median of means and truncation method with the SupLinUCB algorithm \citep{chu2011contextual} and achieved the regret bounds $O(\sqrt{d}(\log T)^{\frac{3}{2}} T^{\frac{1}{1+\delta}})$ and $O(\sqrt{d}(\log T)^2 T^{\frac{1}{1+\delta}})$, respectively. They refined the estimators of \cite{medina2016no} and \cite{shao2018almost} so that the contexts at the current time point are also considered when taking median of means and truncation. However, the derivation of the confidence interval for $\beta$ based on their estimators is valid under fixed design only. Therefore, the authors adopted the phased structure of SupLinUCB\citep{chu2011contextual} which ensures that the contexts of arms chosen at time points in the same phase constitute a fixed design \citep{auer2002using}. Therefore, the arms that are chosen at time points in the same phase are only correlated with rewards from precedent phases and not correlated with rewards in the same phase. Hence, when an estimator $\hat{\beta}$ based on MoM or truncation is computed from observations in the same phase, the Hoeffding's inequality can be applied to the self-normalized martingales. The bounds of \cite{xue2020nearly} are state-of-the-art, shaving off $\sqrt{d}$ factor from the bounds of \cite{shao2018almost}. However, their algorithm requires to know the time horizon $T$ prior to running the algorithm to determine the optimal number of phases. In this paper, we propose a novel algorithm which does not require the knowledge of $T$.

We present the regret bounds of the aforementioned algorithms in terms of $d$ and $T$ in Table \ref{table:compare}.

In recent works in multi-armed bandits without contexts, there are algorithms which do not require the prior knowledge of $\nu_{\delta}$ \citep{lee2020optimal} and even $\delta$ as well \citep{huang2022adaptive}. Removing these constraints in linear bandit would be promising.




\section{Preliminaries}\label{sec:setting}

 In this paper, we propose to estimate the parameter $\beta_i$ in (\ref{form1}) with Huber regression. The Huber loss function \citep{huber1964robust} is defined by
	\begin{equation*}
	    \mathit{l}_{\tau}(x)= \begin{cases}
		x^2/2, \ &\text{ if } |x|\leq \tau,\\
		\tau|x|-\tau^2/2, \ &\text{ if } |x|>\tau, 
	\end{cases}
	\end{equation*}
 for some robustification parameter $\tau>0.$
	When $Z$ is a set of time steps, the Huber estimator of $\beta$ fitted on data observed at time steps in $Z$ is defined as
\begin{equation*}
	\hat{\beta}(Z) = \arg\min_{\tilde{\beta} \in \mathbb{R}^d} \mathcal{L}_{\tau} (\tilde{\beta},Z)
\end{equation*}
where
\begin{equation*}
	\mathcal{L}_{\tau}(\tilde{\beta},Z) = \frac{1}{|Z|}\sum_{r \in Z} \mathit{l}_{\tau} (y_r-X_r^T \tilde{\beta}).
\end{equation*}

The Huber loss works as a square loss when the difference $|y_{r}-X_{r}^T\tilde{\beta}|$ is smaller than $\tau$ and works as an absolute loss otherwise. \cite{sun2020adaptive} analyzed the estimation error bound of Huber estimator in the fixed-design setting. They derived the value of $\tau$ which minimizes the estimation error bound in the following theorem. 

 \begin{theorem}[Theorem 1 of \cite{sun2020adaptive}]\label{th:huber}
	For any $\alpha >0, \ \tau_0 \geq \nu_{\delta}, $ the estimator $\hat{\beta}([t])$ with $\tau = \tau_0(t/\alpha)^{1/(1+\delta)}$ satisfies the bound
	\begin{equation*}
	    ||\hat{\beta}([t])-\beta ||_2 \leq 4c_l^{-1}L\tau_0d^{1/2}\left(\frac{\alpha}{t}\right)^{\frac{\delta}{1+\delta}}
	\end{equation*} with probability at least $1-(2d+1)e^{-\alpha}$,
	provided that 
 \begin{align*}
     t &\geq\max\{8M^4c_l^{-1}\alpha,\ 2^{4+\delta}M^2c_l^{-1}\alpha, \ 16\sqrt{2}c_l^{-1}LMd^{1/2}\alpha\}.
 \end{align*}
	Here, $c_l \leq \lambda_{\min}(\hat{\Sigma}([t])),$ $\hat{\Sigma}([t])=\frac{1}{t}\sum_{r\in [t]}X_{r}X_r^T$, $M = \max_{1\leq r \leq t} ||X_r||_2, \ L = \max_{1\leq r \leq t} ||X_r||_{\infty}.$
		\end{theorem}
In Theorem \ref{th:huber}, we observe that the estimation error bound is proportional to the inverse of $c_l,$ the minimum eigenvalue of the Gram matrix $\hat{\Sigma}([t])$. The following theorem shows that when $X_r$'s are sampled independently and identically from a distribution with positive-definite covariance matrix, then $\lambda_{\mathrm{min}}(\hat{\Sigma}([t]))$ is larger than a positive constant with high probability for sufficiently large $t$. 

\begin{theorem}[Theorem 1.1 of \cite{tropp2012user}] \label{th:eigen}
	Consider a finite sequence $\{B_k\}$ of independent, random, self-adjoint matrices with dimension $d$. Assume $B_k \succcurlyeq0$ and $\lambda_{\max}(B_k) \leq M \ a.s.$ Then
	\begin{equation*}
	    \prob(\lambda_{\min}(\sum_k B_k) \leq \alpha \mu_{\min}) \leq d\exp\left(-\frac{(1-\alpha)^2\mu_{\min}}{2M}\right),
	\end{equation*}
	where $\alpha \in [0,1], \ \mu_{\min}:= \lambda_{\min}(\expec[\sum_k B_k]).$
	\end{theorem}

In bandits however, the problem is that we do not have i.i.d. data due to adaptivity in the choice of arms. In (\ref{form1}), even if $X_t$'s are sampled i.i.d. from $P_{\mathcal{X}}$, we can only use a subset of $X_t$'s for the estimation of $\beta_i$ for each arm $i$. Due to adaptivity in the choice of arms, the subset is not a random sample from the full set of $X_t$'s. We propose to use the forced-sampling strategy of \cite{goldenshluger2013linear} to tackle this problem. The main idea behind the strategy is to maintain two estimators for each $\beta_i$, one using a small number of observations forcibly sampled in an i.i.d. fashion and another estimator which uses all observations, both forcibly sampled and adaptively sampled. The forced-sample estimator is then used to discard the suboptimal arms and concentrate the arm choices to optimal arms. Consequently, a constant portion of the adaptively sampled data is guaranteed to follow an i.i.d. distribution over a specific region in $\mathcal{X}$. Hereby, the all-sample estimator which uses both forcibly and adaptively sampled data enjoys a tight estimation error bound. 

\cite{goldenshluger2013linear} first proposed the forced sampling strategy for a two-armed contextual bandit setting. \cite{bastani2020online} extended this strategy to the multiple-arm setting and used it to prove a tight estimation error bound of the Lasso estimator in bandit settings, which also requires a minimum eigenvalue bound similar to the one the Huber estimator requires. 
 
Before proceeding, we state a few assumptions here.

\begin{assumption}\label{assum:model} At each time $t$, a context variable $X_t\in\mathcal{X}\subset\mathbb{R}^d$ is sampled i.i.d.\ from $P_{\mathcal{X}}$. When arm $i$ is pulled, the arm returns a stochastic reward $y_{i,t}$ as in equation (\ref{form1}). 
\end{assumption}

	\begin{assumption}\label{assum:norm} Without loss of generality, we assume
		\begin{equation*}
		    ||X_t||_2 \leq 1, \ ||\beta_i||_2 \leq 1, \ \forall t \in [T], i \in [K].
		\end{equation*}
	\end{assumption}

	\begin{assumption}[Arm optimality, Assumption 3 of \cite{bastani2020online}]\label{assum:armset}
		The arm set is partitioned into two sets, optimal arms and sub optimal arms. 
  \begin{equation*}
      [K] = K_{opt}\cup K_{sub} \text{ where } K_{opt}\cap K_{sub} = \emptyset.
  \end{equation*}
		$i\in K_{sub}$ satisfies for $h>0,$
  \begin{equation*}
      X^T\beta_i < \max_{j\neq i}X^T\beta_j -h, ~\ \forall X\in \mathcal{X}.
  \end{equation*}
		For $i \in K_{opt}, \ \exists$ non-empty set
  \begin{equation*}
      U_i = \left\{X\in \mathcal{X} \;\middle|\; X^T\beta_i > \max_{j\neq i} X^T\beta_j +h\right\}
  \end{equation*}
		such that $\prob_{\mathcal{X}}(X \in U_i) \geq \mathit{p} >0.$ 
	\end{assumption}
	\begin{assumption}\label{assume:eigen}
		For all $i\in K_{opt}$ defined in Assumption \ref{assum:armset}, $\lambda_{\min}\left(\expec [XX^T | X \in U_i]\right) \geq \gamma$, for $\gamma >0,$ where expectation is taken with respect to the distribution $P_{\mathcal{X}}$. 
	\end{assumption}

Assumption \ref{assume:eigen} states that the expected Gram matrix of contexts in $U_i$ is positive definite, for each $i\in K_{opt}.$ Assumptions \ref{assum:armset} and \ref{assume:eigen} also guarantee a positive minimum eigenvalue for $\mathbb{E}[XX^T]$ via the following lemma.
\begin{lemma}\label{lemma3}
Let $U$ be a set with $\prob (X \in U) \geq \mathit{p}$.
If $\lambda_{\min}\left(\expec[XX^T|X \in U]\right) \geq \gamma$ for $\gamma >0,$ then
\begin{equation*}
    \lambda_{\min}\left(\expec[XX^T]\right) \geq \gamma \mathit{p}.
\end{equation*}
\end{lemma}



 
\section{Proposed Estimator and Algorithm}\label{sec:algorithm}

 For a fixed forced-sampling parameter $q\in\mathbb{N}$, let the set $T_i := \{(2^n -1)Kq+j \mid n \in \mathbb{N}\cup \{0\},\ j \in \{q(i-1)+1, q(i-1)+2,\dots, qi\}\}$ be the set of predetermined forced sampling time steps for arm $i$ and $T_{i,t}=T_i \cap [t]$ be the set of forced sampling time steps until time $t$. Since $|T_{i,t}|=O(\log T)$, the regret at forced sampling steps is at most $O(K\log T)$. Let the set of time steps where arm $i$ is pulled, either forcedly or adaptively, until time $t$ be $S_{i,t}=\{r \mid a(r) =i, r\leq t\}$; we call it the all-sample set of arm $i$. We have $T_{i,t}\subset S_{i,t}.$

\begin{algorithm}
\caption{Huber bandit}\label{alg1}
\begin{algorithmic}[1]
\State \textbf{Input:} $ \ q, \ h, \ \nu_{\delta}, \ \alpha$
\State $\hat{\beta}(T_{i,0})=\hat{\beta}(S_{i,0})=0^d$ for all $i\in[K]$
\For{$t \in[T]$}
\State Observe $X_t \sim P_{\mathcal{X}}$
    \If{$t\in T_i$ for some $i\in[K]$}
    \State $a(t) =i$
\Else
    \State $\mathcal{D}=\{i \in [K]| \max_{j\in[K]}X_t^T\hat{\beta}(T_{j,t-1})-X_t^T\hat{\beta}(T_{i,t-1})\leq \frac{h}{2}\}.$
    \State $a(t) = \arg\max_{i \in \mathcal{D}} X_t^T\hat{\beta}(S_{i,t-1})$
\EndIf
\State Update $S_{a(t),t} = S_{a(t),t-1} \cup \{t\}$
\State Observe reward $y_t = X_t^T\beta_{a(t)}+\epsilon_{a(t),t}$
\If{$t\in T_i$ for some $i\in[K]$}
\State $\tau(T_{i,t}) =\nu_{\delta} (|T_{i,t}|/\log(t^2(2d+1)/\alpha))^{1/(1+\delta)}$
\State $\hat{\beta}(T_{i,t}) = \arg\min_{\beta \in \mathbb{R}^d} \frac{1}{|T_{i,t}|}\sum_{r \in T_{i,t}} \mathit{l}_{\tau}(y_r-X_r^T\beta)$
\Else
\State $\tau(S_{a(t),t}) = \nu_{\delta}(|S_{a(t),t}|/\log(t^2(2d+1)/\alpha))^{1/(1+\delta)}$
\State $\hat{\beta}(S_{a(t),t}) = \arg\min_{\beta \in \mathbb{R}^d} \frac{1}{|S_{a(t),t}|}\sum_{r \in S_{a(t),t}} \mathit{l}_{\tau}(y_r-X_r^T\beta)$
\EndIf
\EndFor
\end{algorithmic}
\end{algorithm}

 The proposed algorithm (Algorithm \ref{alg1}) works as follows. At each time $t \geq 1$, we observe context $X_t$. If $t\in T_i$ for some arm $i$, we pull arm $i$. Otherwise, we choose the arm using a two-step procedure. First, we eliminate arms that are estimated to be suboptimal using the forced-sampling estimators $\hat{\beta}(T_{i,t-1})$. Afterward, we choose the arm $i$ which has the maximum value of $X_t^T\hat{\beta}(S_{i,t-1})$ among the arms that survived the first step.
 
 The following theorem shows that the proposed algorithm has regret upper bound of order $O(\sqrt{d}T^{\frac{1}{1+\delta}}(\log dT)^{\frac{\delta}{1+\delta}})$.
 \begin{theorem}\label{th:regret}
Suppose Assumptions \ref{assum:model}-\ref{assume:eigen} hold. When
\begin{align*}
&q \geq 6\left(\frac{32(\tau_0\vee 1)d^{1/2}}{h\gamma \mathit{p}} \right)^{\frac{1+\delta}{\delta}}, \ t \geq \frac{(Kq)^2}{\phi},\\ &C_1 = (1+\delta)\frac{64 \tau_0}{\gamma \mathit{p}}\left(\frac{4}{\mathit{p}}\right)^{\delta/(1+\delta)}
  \end{align*}
 and $\tau_0 \geq \nu_{\delta},$
the cumulative regret $R(T)$ is bounded by
\begin{align*}
      R(T) &\leq C_1 T^{\frac{1}{1+\delta}}\left(\log(T^2(2d+1)10K/\phi)\right)^{\delta/(1+\delta)}\sqrt{d}\\
    & +4Kq\log T + \frac{2(Kq)^2}{\phi} 
\end{align*}
with probability at least $1-\phi.$
\end{theorem}
\paragraph{Proof} We first need to adapt Theorem \ref{th:huber}, which is originally proved for independent data, to work for adaptively collected data. In the original version of the theorem, the i.i.d.\ assumption is exploited in two parts: (i) bounding the norm of the gradient of the loss function, which can be written as the sum of nonlinear functions of i.i.d.\ errors, and (ii) guaranteeing that the minimum eigenvalue of the Gram matrix is bounded below by a positive constant. For (i), since the errors are no longer i.i.d., we derive a novel proof using martingale inequalities in Lemma \ref{allsamplelemma}. As for (ii), we borrow the idea of the forced sampling strategy used in existing works, which we see later. %Then we obtain a tight estimation error bound of the all sample estimator for arms in $K_{opt}$.
    \begin{lemma}[All sample estimator bound]\label{allsamplelemma}
Let $\tau = \tau_0(|S_{i,t}|/\log(t^2(2d+1)/\alpha))^{1/(1+\delta)},$ $\tau_0 \geq \nu_{\delta}.$
If $t \geq(Kq)^2$ and $c_l \leq \lambda_{\min}(\hat{\Sigma}(S_{i,t}))$, we have
 with probability at least $1-\frac{\alpha}{t^2}$, 
\begin{align*}
    &||\hat{\beta}(S_{i,t})-\beta_i||_2 \\
    &\leq \left(\frac{4}{\mathit{p}t}\log(t^2(2d+1)/\alpha)\right)^{\delta/(1+\delta)} 4\tau_0d^{1/2}c_l^{-1}
\end{align*}  

for $\ i \in [K]$.
\end{lemma}
\begin{proof}
    We mainly provide the part we solve differently from the proof of \cite{sun2020adaptive}. 
    If we know the minimum eigenvalue bound of the Gram matrix, we can obtain $L_2$-norm bound
    \begin{equation*}
         ||\hat{\beta}(S_{i,t})-\beta_i||_2 \leq 2|S_{i,t}|^{-1}c_l^{-1}||\sum_{r\in S_{i,t}} \psi_{\tau}(\epsilon_r) X_r||_2
    \end{equation*}
     where $\psi_{\tau}(\epsilon_r) := \mathit{l}_{\tau}'(\epsilon_r)$ is the derivative of the Huber loss. A detailed explanation can be found in Section C.3 of \cite{sun2020adaptive}. The $L_2$-norm is in turn bounded by the max-norm:
     \begin{align}
       \left\lVert\sum_{r \in S_{i,t}} \psi_{\tau}(\epsilon_r)X_r\right\rVert_2 
        & \leq d^{1/2}\left\lVert\sum_{r \in S_{i,t}} \psi_{\tau}(\epsilon_r)X_r\right\rVert_{\infty}\nonumber\\
        & = d^{1/2}\tau\max_{1\leq j\leq d}\left|\sum_{r \in S_{i,t}}(X_{rj}/\tau)\psi_{\tau}(\epsilon_r)\right|, \label{eq3}
    \end{align}
    where we use $\max_{r \in S_{i,t}} ||X_r||_{\infty} \leq 1$ and $X_{rj}$ is $j$-th element of the context $X_r$.  We observe that in bandit settings, the right-hand side of (\ref{eq3}) is the sum of adapted data. To bound this quantity, we invoke a bound on supermartingales. We can first construct a supermartingale as follows. Let
    \[M_t:=\mathrm{exp}\left(\sum_{r=1}^t\left[(X_{rj}/\tau)\psi_{\tau}(\epsilon_r)-\frac{\nu_{\delta}}{\tau^{1+\delta}}\right]\right).\]
    Then 
    \begin{align*}
    &\mathbb{E}[M_t|\mathcal{F}_{t-1}]\\
    &=M_{t-1}\mathbb{E}\left[\mathrm{exp}\left((X_{tj}/\tau)\psi_{\tau}(\epsilon_t)\right)|\mathcal{F}_{t-1}\right]/\mathrm{exp}\left(\frac{\nu_{
    \delta}}{\tau^{1+\delta}}\right)\\
    &\leq M_{t-1}\expec[1+X_{tj}(\epsilon_t/\tau)+|\epsilon_t/\tau|^{1+\delta}|\mathcal{F}_{t-1}]/\mathrm{exp}\left(\frac{\nu_{
    \delta}}{\tau^{1+\delta}}\right)\\
    &\leq  M_{t-1}\left[1+\frac{\nu_{\delta}}{\tau^{1+\delta}}\right]/\mathrm{exp}\left(\frac{\nu_{
    \delta}}{\tau^{1+\delta}}\right)\\
    &\leq M_{t-1}\mathrm{exp}\left(\frac{\nu_{
    \delta}}{\tau^{1+\delta}}\right)/\mathrm{exp}\left(\frac{\nu_{
    \delta}}{\tau^{1+\delta}}\right)\\
    &=M_{t-1}
    \end{align*}
   shows that $M_t$ is a supermartingale with $M_0:=1$, where the first inequality is derived from 
    \begin{equation*}
   	-\log(1-u+|u|^{1+\delta}) \leq \frac{1}{\tau} \psi_{\tau}(\tau u) \leq \log (1+u+|u|^{1+\delta})
   \end{equation*}
$~\forall u \in \mathbb{R}$ and
     \begin{align*}
        &\exp((X_{rj}/\tau)\psi_{\tau}(\epsilon_r)) \\
        &\leq (1+\epsilon_r/\tau +|\epsilon_r/\tau|^{1+\delta})^{X_{rj}\mathds{1}_{X_{rj}>0}} \\
        &\cdot(1-\epsilon_r/\tau +|\epsilon_r/\tau|^{1+\delta})^{-X_{rj}\mathds{1}_{X_{rj}<0}} \\
        &\leq 1+X_{rj}(\epsilon_r/\tau) +|\epsilon_r/\tau|^{1+\delta}.
    \end{align*}
 
  Iteratively applying the law of total expectation on $M_t$ gives $\mathbb{E}[M_t]\leq 1$ and hence,
    \begin{align*}
        \expec\left[\exp\left(\sum_{r \in S_{i,t}} (X_{rj}/\tau)\psi_{\tau}(\epsilon_r)\right)\right]     
        \leq \exp(|S_{i,t}|\nu_{\delta} \tau^{-1-\delta}).
    \end{align*}
    Markov's inequality gives
    \begin{align*}
        &\prob\left(\sum_{r \in S_{i,t}} (X_{rj}/\tau)\psi_{\tau}(\epsilon_r)>\nu_{\delta}|S_{i,t}|z\right)\\
        &\leq \exp(-\nu_{\delta}|S_{i,t}|z  )\expec\left[\exp\left(\sum_{r \in S_{i,t}} (X_{rj}/\tau)\psi_{\tau}(\epsilon_r)\right)\right]\\
        & \leq \exp(\nu_{\delta}|S_{i,t}|(\tau^{-1-\delta}-z))
    \end{align*}
    for any $z>0.$
    Then when $\tau \geq (2/z)^{1/(1+\delta)}$ and $z=2\nu_{\delta}^{-1}|S_{i,t}|^{-1}\log(t^2(2d+1)/\alpha)$, with probability at least $1-\alpha/t^2$, the same bound as in Theorem \ref{th:huber} can be obtained. 
\end{proof}
We note that in Lemma \ref{allsamplelemma}, one does not need to invoke a bound for self-normalized supermartingales if the minimum eigenvalue of the Gram matrix is lower bounded by a positive constant. However, in bandit settings, guaranteeing a positive constant lower bound for the minimum eigenvalue of Gram matrices is challenging. As mentioned earlier, we utilize the forced-sampling strategy to address this challenge. We prove below through a sequence of lemmas that by means of forced-sampling, the Gram matrix $\hat{\Sigma}(S_{i,t})$ of the all-sample estimator of any arm $i\in K_{opt}$ has a minimum eigenvalue greater than a positive constant.  
%For the minimum eigenvalue bound for a adaptively chosen data set $S_{i,t}$, we use the following lemma.
\begin{lemma}[Lemma EC.23 of \cite{bastani2020online}]\label{eigenlemma1}
		Let $A$ be a set of random variables. Consider a subset $A' \subset A$ of i.i.d. random variables. If $\lambda_{\min}(\hat{\Sigma}(A')) \geq \gamma$ for some $\gamma >0,$ then
  \begin{equation*}
      \lambda_{\min}(\hat{\Sigma}(A)) \geq \frac{|A'|}{|A|}\gamma.
  \end{equation*}
	\end{lemma}
Lemma \ref{eigenlemma1} states that we can get the minimum eigenvalue bound of the Gram matrix for a possibly non i.i.d. set $A$, if we know the bound for an i.i.d. subset of the set $A$. We will construct the i.i.d. subset of the set $S_{i,t}$ and show that the size of the subset is proportional to $|S_{i,t}|$ in terms of $t$. 

We first establish the estimation error bound of the forced sampling estimator, which plays a crucial role in constructing a sufficiently big i.i.d. subset of $S_{i,t}$ for every $i\in K_{opt}$. 
\begin{lemma}[Forced-sampling estimator bound]\label{forcedsampling}
 Define an event at time $t$ as
    \begin{equation*}
    A_t : = \left\{||\hat{\beta} (T_{i,t})-\beta_i||_2 \leq \frac{h}{4}, \  \forall i\in [K]\right\}.
    \end{equation*}
For $\alpha \in (0,1)$, we have $\prob\left(A_t\right)\geq 1-2K\alpha/t^2$,  
provided that $t\geq(Kq)^2.$
\end{lemma}
\begin{proof}
The lemma is a direct application of Theorem \ref{th:huber}. The proof follows the lines of Proposition 2 of \cite{bastani2020online}. Since the forced-sampling set $T_{i,t}$ is fixed deterministically prior to running the algorithm, the samples in $T_{i,t}$ are i.i.d. Therefore, the minimum eigenvalue bound of the Gram matrix can be derived from Lemma \ref{lemma3} and Theorem \ref{th:eigen}.
\end{proof}

Using Lemma \ref{forcedsampling}, we can prove the following Lemma \ref{lemmaec18}, which states that under the high-probability event $A_t$, the set $\mathcal{D}$ of arms remaining after discarding the suboptimal arms using $\hat{\beta}(T_{i,t})$ contains arms from $K_{opt}$ only. Therefore, under this high-probability event, the algorithm makes action choices using the estimates $\hat{\beta}(S_{i,t})$ of arms $i\in K_{opt}$ only. Hence, we need to guarantee a sufficiently large i.i.d. subset of contexts for arms in $K_{opt}$ only.
    \begin{lemma}[Lemma EC.18 of \cite{bastani2020online}]\label{lemmaec18}
If $A_{t-1}$ holds, then $\mathcal{D}=\{i \in [K]| \max_{j\in[K]}X_t^T\hat{\beta}(T_{j,t-1})-X_t^T\hat{\beta}(T_{i,t-1})\leq \frac{h}{2}\}$ contains the optimal arm $a^*(t)$ and no arms from $K_{sub}$.   
\end{lemma}

Now we derive an i.i.d. subset of the set $S_{i,t}$ for any arm $i\in K_{opt}$.
We first define the following subset of $[t]$ which describes the time steps where contexts are sampled from the optimal regions $U_i$'s and where the most recently updated forced sampling estimators have low estimation error bound. 
    \begin{equation*}
        \mathcal{A}_{i,t} := \{r\in[t] | A_{r-1} \text{ holds, } X_r \in U_i \text{ and } r\notin \cup_{j\in[K]} T_{j,t}\}.
    \end{equation*}
    Observe that the random variables $\{X_r|r \in \mathcal{A}_{i,t}\}$ are i.i.d. in $U_i$. This is because the event $\{X_r \in U_i \}$ is independent of the event $A_{r-1}$ while the event $\{r\notin T_{j,t}\}$ is deterministic. Therefore, we can prove that $\lambda_{\min}(\hat{\Sigma}(\mathcal{A}_{i,t}))$ is strictly positive with high probability via Assumption \ref{assume:eigen} and Theorem \ref{th:eigen}.


 
Lemma \ref{allsetlemma0} states that $\mathcal{A}_{i,t}$ is a subset of $S_{i,t}$ and Lemma \ref{allsetlemma} states that the size of $\mathcal{A}_{i,t}$ is large enough. Particularly, Lemma \ref{allsetlemma} guarantees that the size of $\mathcal{A}_{i,t}$ is at least proportional to $t$.
    \begin{lemma}[All sample set, Lemma EC.11 of \cite{bastani2020online}]\label{allsetlemma0}
    For $i\in K_{opt}$, if $t\in \mathcal{A}_{i,t}$, then $a(t)=i$.
    \end{lemma}
    \begin{lemma}[All sample set, Lemma EC.14 of \cite{bastani2020online}] \label{allsetlemma}
    If $t \geq (Kq)^2$, for $i\in K_{opt}$,
    \begin{equation*}
        \prob(|\mathcal{A}_{i,t}| \geq t\mathit{p}/4)\geq 1-\frac{\alpha}{t^2}.
    \end{equation*}
\end{lemma}

Substituting $S_{i,t}$ into $A$ and $\mathcal{A}_{i,t}$ into $A'$ in Lemma \ref{eigenlemma1}, we get the minimum eigenvalue bound of $\hat{\Sigma}(S_{i,t})$. Since $|\mathcal{A}_{i,t}|$ is proportional to $t$ and $|S_{i,t}|\leq t,$ the ratio $|A'|/|A|$ is of constant order.
Theorem \ref{th:eigen} states that 
     \begin{equation*}
    	\prob\left(\lambda_{\min}\left(\hat{\Sigma}(\mathcal{A}_{i,t})\right)\leq \frac{\gamma}{2}\right) \leq d\exp\left(-\frac{|\mathcal{A}_{i,t}| \gamma}{8}\right),
    \end{equation*}
    for $\gamma$ in Assumption \ref{assume:eigen}. The size of the set $\mathcal{A}_{i,t}$ is guaranteed by Lemma \ref{allsetlemma}. When $t\geq \frac{d}{\alpha}$ and $q \geq \frac{192}{\gamma \mathit{p}}$, with probability at least $1-\frac{\alpha}{t^2},$
    \begin{align*}
    	|\mathcal{A}_{i,t}| > \frac{t\mathit{p}}{4} \geq \frac{|T_{i,t}|\mathit{p}}{4} \geq \frac{8}{\gamma}(\log(d/\alpha)+\log t^2),
    \end{align*} and hence
    \begin{equation*}
    	\lambda_{\min}\left(\hat{\Sigma}(\mathcal{A}_{i,t})\right) \geq \frac{\gamma}{2}.
    \end{equation*}
    Therefore, together with Lemma \ref{eigenlemma1}, with probability at least $1-\frac{2 \alpha}{t^2},$ 
    \begin{equation}\label{eq:mineigen}
         	\lambda_{\min}\left(\hat{\Sigma}(S_{i,t})\right) >\frac{\gamma|\mathcal{A}_{i,t}|}{2|S_{i,t}|}>\frac{\gamma |\mathcal{A}_{i,t}|}{2t} > \frac{\mathit{p}\gamma}{8}. 
    \end{equation}


Finally, we are ready to prove Theorem \ref{th:regret}.

\begin{proof}[Proof of Theorem \ref{th:regret}]
  	We consider two cases: (a) when $t < (Kq)^2/\phi$ or $t \in \cup_{i\in[K]}T_{i,t}$, and (b) when $t \geq (Kq)^2/\phi$ and $t \notin \cup_{i\in[K]} T_{i,t}$. For (a), we know that $|\cup_{i\in[K]}T_{i,t}| \leq 2Kq\log T$ by definition of $T_{i,t}$. Hence, the number of time steps falling in case (a) is at most $2Kq\log T + (Kq)^2/\phi.$ We roughly bound the regret at these time steps using Assumption \ref{assum:norm}. For (b), due to Lemma \ref{forcedsampling}, with probability at least $1-2\alpha K/t^2$, $A_t$ holds. When $A_t$ holds, due to Lemma \ref{lemmaec18}, we choose an arm from $K_{opt}$ only. Then we can obtain the minimum eigenvalue bound in (\ref{eq:mineigen}) $\forall i \in \mathcal{D}$ with probability at least $1-2\alpha K/t^2$. The regret at time $t$ is 
   \begin{align*}
   	r_t & = X_t^T\beta_{a^*(t)}-X_t^T\beta_{a(t)}\\
   	& = X_t^T\beta_{a^*(t)}-X_t^T\hat{\beta}(S_{a(t),t})+X_t^T\hat{\beta}(S_{a(t),t})-X_t^T\beta_{a(t)}\\
   	& \leq X_t^T\beta_{a^*(t)}-X_t^T\hat{\beta}(S_{a^*(t),t})+X_t^T\hat{\beta}(S_{a(t),t})-X_t^T\beta_{a(t)}\\
   	& \leq ||X_t||_2||\beta_{a^*(t)}-\hat{\beta}(S_{a^*(t),t})||_2 \\
   	&+||X_t||_2||\beta_{a(t)}-\hat{\beta}(S_{a(t),t})||_2.
   \end{align*} 
   With probability at least $1-\alpha K/t^2$, the $L_2$-norm bound in Lemma \ref{allsamplelemma} holds $\forall i \in \mathcal{D}$. Then, with probability at least $1-10\alpha K$, 
\begin{align*}
	&\sum_{t=1}^T r_t \\
 \leq &2\sum_{t=1}^T \left(\frac{4}{\mathit{p}t}\log(t^2(2d+1)/\alpha)\right)^{\delta/(1+\delta)} \frac{32\tau_0d^{1/2}}{\gamma \mathit{p}}  \\
	\leq &\sum_{t=1}^T \left(\frac{4}{\mathit{p}t}\log(T^2(2d+1)/\alpha)\right)^{\delta/(1+\delta)} \frac{64\tau_0d^{1/2}}{\gamma \mathit{p}}\\
	\leq& (T^{\frac{1}{1+\delta}}-1)(1+\delta) \left(\frac{4}{\mathit{p}}\log(T^2(2d+1)/\alpha)\right)^{\delta/(1+\delta)} \cdot \\
 &\frac{64\tau_0d^{1/2}}{\gamma \mathit{p}}.
\end{align*}
Then, together with (a), letting $\phi = 10\alpha K$ gives the desired result.
 \end{proof}
We additionally provide Theorem \ref{expectedregret}, which shows that under the additional margin condition in Assumption \ref{assum:margin}, we can obtain $\tilde{O}(\log T)$ expected
regret.
\begin{assumption}[Assumption 2 of \cite{bastani2020online}]\label{assum:margin}
	$\exists C_0 \in \mathbb{R}^+$ such that $\forall i,j \in [K]$ where $i\neq j$,
 \begin{equation*}
     \prob(0<|X^T(\beta_i -\beta_j)| \leq \kappa ) \leq C_0\kappa ~\ \forall \kappa \in \mathbb{R}^+.
 \end{equation*}
	\end{assumption}
 \begin{theorem}\label{expectedregret}
Suppose Assumptions \ref{assum:model}-\ref{assum:margin} hold. When
\begin{align*}
	&q \geq 6\left(\frac{32(\tau_0\vee 1)d^{1/2}}{h\gamma \mathit{p}} \right)^{\frac{1+\delta}{\delta}}, \ t \geq \frac{(Kq)^2}{\phi},\\
 &C_2 = 2^{14}\frac{\tau_0^2C_0}{\gamma^2p^3}, \ C_3=2^{12+\frac{4\delta}{1+\delta}}\left(\frac{1+\delta}{1-\delta}\right)\frac{\tau_0^2C_0}{\gamma^2p^{\frac{2+4\delta}{1+\delta}}}
\end{align*}
and $\tau_0 \geq \nu_{\delta},$
     the expected regret is
         \begin{align*}
         &\sum_{t=1}^T\mathbb{E}[r_t]  \\
         &= \sum_{t=1}^T\mathbb{E}[X_t^T\beta_{a^*(t)}-X_t^T\beta_{a(t)}]\\
         &\leq C_2(\log T)(\log T+1)dK+ (\log T+1)12K(2d+1)
         \end{align*}
        when $\delta =1$ and
         \begin{align*}
         &\sum_{t=1}^T\mathbb{E}[r_t] \\
         &\leq C_3 T^{\frac{1-\delta}{1+\delta}}(\log T)^{\frac{2\delta}{1+\delta}}dK
         +(\log T+1)12K(2d+1)
     \end{align*} 
     when $ 0<\delta<1.$
 \end{theorem}
The full proof of Theorem \ref{expectedregret} is deferred to the Supplementary Material.

    
\section{Experiments}

\begin{figure}[!htb]
\centering
\includegraphics[width=0.9\linewidth]{figure/df=2}
\includegraphics[width=0.9\linewidth]{figure/df=1.5}
\includegraphics[width=0.9\linewidth]{figure/df=1.1}
\caption{Cumulative regret of Huber bandit, OLS bandit, TOFU.}\label{fig:regret}
\end{figure}

We compare the Huber bandit algorithm with OLS bandit \citep{bastani2020online}, TOFU \citep{shao2018almost} and SupLinBTC \citep{xue2020nearly}. OLS bandit has the same forced-sampling structure as the Huber bandit, but the Ordinary Least Squares (OLS) estimator replaces the Huber estimator. TOFU and SupLinBTC are designed for the setting with a single parameter $\beta \in \mathbb{R}^d$ and arm-specific contexts $X_{i,t}\in \mathbb{R}^d, i\in[K]$. To bring the algorithms to our setting, we reshape arm parameters $\beta_i \in \mathbb{R}^d, i\in [K]$ into one parameter $\beta \in \mathbb{R}^{dK}$ and context $X_t$ into $X_{i,t}\in \mathbb{R}^{dK}, i\in[K]$ such that $X_{i,t}^T\beta = X_t^T\beta_i$.
We randomly generate context $X_t\in \mathbb{R}^d$ and arm parameters $\beta_i \in \mathbb{R}^d, i \in [K]$ from a uniform distribution on $[-1,1]$ and normalize them to satisfy Assumption \ref{assum:norm}. Due to the random generation of the samples, Assumption \ref{assum:armset} might not be satisfied. Instead, we arbitrarily set $h$ to $0.2$. Nevertheless, the Huber bandit shows good performance.  The error term $\epsilon_t$ is generated from Student's $t$-distribution with degrees of freedom $\mathit{df} \in \{1.1,1.5,2\}$ and multiplied by 0.1 to balance with the norm condition of $\beta_i$ and $X_t$. If $\mathit{df} > n$, the $n$-th moment of Student's $t$-distribution is finite. In the experiment, we assume $1+\delta = \mathit{df} -0.05$ moments exist. We take the context dimension $d=5,$ the number of arms $ K=10,$ and the time horizon $T= 1000$. The TOFU algorithm has one hyperparameter $\mathit{conf}$ which controls the size of the confidence interval of the regression parameter. Our algorithm and OLS bandit both have the hyperparameter $q$ which controls the number of forced sampling steps. We run the algorithms with $\mathit{conf}=0.5,1,1.5,2$ and $q=2,3,4$ and report the results of the values that resulted in the lowest average regret. We run 100 iterations each with a new data set. The results of cumulative regret of Huber, OLS, TOFU and SupLinBTC averaged over 100 iterations are shown in Figure \ref{fig:regret}. The shade of the graph shows the standard deviation of 100 iterations. We use $\alpha = 0.01$ for all algorithms. 

\section{Conclusion}
In this paper, we proposed the Huber bandit algorithm, which is robust to heavy-tailed errors. The theoretical analysis shows that when contexts are stochastic with a positive definite covariance matrix, the algorithm achieves the regret bound of $O(\sqrt{d}T^{\frac{1}{1+\delta}}(\log dT)^{\frac{\delta}{1+\delta}})$ which matches the state-of-the-art regret upper bound for linear bandits with sub-Gaussian errors in terms of the time horizon $T$ when $\delta=1$. Its practical performance was demonstrated empirically by comparing it with OLS bandit and two existing bandit algorithms designed for heavy-tailed data.

\section*{Acknowledgements} This work was supported by the Institute of Information \& communications Technology Planning \& evaluation (IITP) grants funded by the Korea government (MSIT) (No. 2020-0-01336, Artificial Intelligence Graduate School Program (UNIST); No. 2022-0-00469, Development of Core Technologies for Task-oriented Reinforcement Learning for Commercialization of Autonomous Drones) and the “Research on multi-armed bandit methodologies for online sequential decision” Project Fund (1.200107.01) of UNIST, South Korea.



\bibliography{reference}
\end{document}
