\documentclass[accepted]{uai2024} % accepted (non-anonymous) version; remove [accepted] for the anonymous initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{dsfont}
\usepackage{hyperref}
\usepackage{bbold}
\usepackage{algorithm}
\usepackage{algorithmicx,algpseudocode}
% Algorithm helpers. The definitions below use internal macros whose names
% contain @, so they are wrapped in \makeatletter ... \makeatother.
\makeatletter
% Horizontal skip used by \Stage: if \ALG@thistlm is 0pt, move back by
% \leftmargin; otherwise move forward by \ALG@thistlm.
\def\ALG@special@indent{%
    \ifdim\ALG@thistlm=0pt\relax
        \hskip-\leftmargin
    \else
        \hskip\ALG@thistlm
    \fi
}
% \Input: algorithm item whose label is the bold text "Input:".
\algnewcommand\algorithmicinput{\textbf{Input:}}
\algnewcommand\Input{\item[\algorithmicinput]}
% \Stage{<text>}: algorithm item with an empty label that prints
% "Initialization: <text>" after the skip computed above.
\newcommand{\Stage}[1]{\item[]\noindent\ALG@special@indent \textbf{Initialization:}\ #1}
% Restore the normal category code of @ so it is not treated as a letter
% in the remainder of the preamble and the document body.
\makeatother

% Use the following line for the initial blind version submitted for review:
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}
\usepackage{pifont}
\usepackage{amsmath}
\usepackage{subfigure}
\usepackage{caption}
\usepackage{amsthm}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
    
\usepackage{mathtools} % amsmath with fixes and additions
\renewcommand{\thempfootnote}{\arabic{mpfootnote}}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xcolor,colortbl}
\usepackage{amssymb}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

% Theorem-like environments (amsthm).
% `theorem` owns the counter and is numbered within each section; every other
% environment below is declared with [theorem] and therefore shares it.
% Plain style: theorem, proposition, lemma, corollary.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definition style: definition, assumption.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remark style: remark.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} expands to <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Blackboard-bold and calligraphic shorthands.
% NOTE(review): these use \def, which overwrites existing definitions without
% warning. In particular \P (paragraph sign) and \H (double-acute accent) are
% standard LaTeX commands; after this point they produce the math symbols
% below instead -- confirm that no text or bibliography entry relies on them.
\def\E{{\mathbb E}}
\def\R{{\mathbb R}}
\def\P{{\mathbb P}}
\def\X{{\mathcal X}}
\def\H{{\mathcal H}}
\def\F{{\mathcal F}}
% Symbol shorthands: T with subscript "init", tilde/hat variants of \Theta,
% \lambda with subscript \perp, and bold \Lambda.
\def\Ti{T_{\text{init}}}
\def\tTheta{\widetilde{\Theta}_\zeta}
\def\txTheta{\widetilde{\Theta}_x}
\def\hTheta{\widehat{\Theta}}
\def\plambda{{\lambda}_{\perp}}
\def\bLambda{{\boldsymbol{\Lambda}}}
% Auto-sized double-bar norms: plain, and with subscripts "nuc", "op", "F".
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\nunorm}[1]{\left\lVert#1\right\rVert_{\text{nuc}}}
\newcommand{\opnorm}[1]{\left\lVert#1\right\rVert_{\text{op}}}
\newcommand{\fnorm}[1]{\left\lVert#1\right\rVert_{\text{F}}}
% \vecc{<M>} typesets vec(<M>); \inp{<a>}{<b>} typesets the bracket <a,b>.
\newcommand{\vecc}[1]{\text{vec}(#1)}
\newcommand{\inp}[2]{\langle #1,#2 \rangle}


\title{Low-rank Matrix Bandits with Heavy-tailed Rewards}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Yue Kang}
\author[2]{Cho-Jui Hsieh}
\author[1]{Thomas C. M. Lee}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    UC Davis\\
    Davis, CA, USA
}
\affil[2]{%
    Google and Department of Computer Science, UCLA\\
    Los Angeles, CA, USA
}
  
\begin{document}
\maketitle

\begin{abstract}
In stochastic low-rank matrix bandit, the expected reward of an arm is equal to the inner product between its feature matrix and some unknown $d_1$ by $d_2$ low-rank parameter matrix $\Theta^*$ with rank $r \ll d_1\wedge d_2$. While all prior studies assume the payoffs are mixed with sub-Gaussian noises, in this work we loosen this strict assumption and consider the new problem of \underline{low}-rank matrix bandit with \underline{h}eavy-\underline{t}ailed \underline{r}ewards (LowHTR), where the rewards only have finite $(1+\delta)$ moment for some $\delta \in (0,1]$. By utilizing the truncation on observed payoffs and the dynamic exploration, we propose a novel algorithm called LOTUS attaining the regret bound of order $\tilde O(d^\frac{3}{2}r^\frac{1}{2}T^\frac{1}{1+\delta}/\tilde{D}_{rr})$\footnote{$\tilde{O}$ ignores polylogarithmic factors. We denote $d \coloneqq d_1 \vee d_2$ and $\tilde{D}_{rr} \coloneqq (D_{rr}-1) \mathbb{1}_{\delta = 1}+1$ where $D_{rr}$ is the $r$-th singular value of $\Theta^*$.} without knowing $T$, which matches the state-of-the-art regret bound under sub-Gaussian noises~\citep{lu2021low,kang2022efficient} with $\delta = 1$. Moreover, we establish a lower bound of the order $\Omega(d^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} T^\frac{1}{1+\delta}) = \Omega(T^\frac{1}{1+\delta})$ for LowHTR, which indicates our LOTUS is nearly optimal in the order of $T$. In addition, we improve LOTUS so that it does not require knowledge of the rank $r$ with $\tilde O(dr^\frac{3}{2}T^\frac{1+\delta}{1+2\delta})$ regret bound, and it is efficient under the high-dimensional scenario. We also conduct simulations to demonstrate the practical superiority of our algorithm.
\end{abstract}

\section{Introduction}\label{sec:intro}
The Multi-armed Bandit (MAB) has proven to be a powerful framework to model various decision-making problems with great applications to medical trials~\citep{villar2015multi}, personalized recommendation~\citep{li2010contextual}, and hyperparameter learning~\citep{ding2022syndicated,kang2023online}, etc. To leverage the side information (contexts) of arms in real-world scenarios, the most important variant of MAB, named stochastic linear bandit (SLB), has been extensively investigated.
%where the expected payoff obeys a linear relationship with the corresponding context vector. 
However, the rise of high-dimensional sparse data in modern applications~\citep{zou2006adaptive,han2022uncertainty} has revealed the inefficiencies of the traditional SLB, particularly in its failure to account for sparsity. To address this limitation, the stochastic high-dimensional bandit with low-dimensional structures has emerged as the pioneering model, such as the LASSO bandit~\citep{bastani2020online} and the low-rank matrix bandit~\citep{jun2019bilinear}. In this work, we investigate the stochastic low-rank matrix bandit, where at each round $t$ the agent first observes the arm set $\X_t \subseteq \R^{d_1 \times d_2}$ consisting of context matrices ($\X_t$ can be infinite and changing over time). Then the agent pulls an arm $X_t \in \X_t$ and only obtains its associated noisy reward $y_t = \inp{X_t}{\Theta^*} + \eta_t$ with some inherent low-rank parameter $\Theta^*$ and zero-mean white noise $\eta_t$. This bandit problem is broadly applicable in recommendation systems with pair contexts, like dating service and combined flight-hotel promotion~\citep{kang2022efficient}. 


In all existing literature on low-rank matrix bandit, a default assumption is that the noise $\eta_t$ is sub-Gaussian conditioned on historical observations~\citep{jun2019bilinear}. However, in various real-world scenarios such as financial markets~\citep{bradley2003financial,cont2000herd}, there's a notable trend where extreme noise, a.k.a. heavy-tailed noise, in observations occurs more frequently than what would be expected under a sub-Gaussian distribution, in which case previous studies would become futile. These heavy-tailed observations do not exhibit exponential decay and may crucially affect the estimation. To address this challenge, a line of algorithms has been proposed to handle heavy-tailed noise under MAB~\citep{bubeck2013bandits} and SLB~\citep{medina2016no}. However, to the best of our knowledge, effectively managing heavy-tailed noise under the more complex and efficient low-rank matrix bandit framework remains unexplored. In this study, we examine this crucial problem: \underline{low}-rank matrix bandit with \underline{h}eavy-\underline{t}ailed \underline{r}ewards (LowHTR). Specifically, to keep consistent with the heavy-tailed studies under MAB and SLB, we assume that the noise has finite $(1+\delta)$ moment for some $\delta \in (0,1]$. We first propose an efficient algorithm named LOTUS when $T$ is unrevealed to the agent. Then we demonstrate that it matches the regret lower bound of LowHTR in the order of $T$, ignoring logarithmic factors. Our LOTUS can be further improved to be agnostic to rank $r$ with slightly worse regret bound. 

%Unlike the conventional sub-Gaussian framework, heavy-tailed rewards do not exhibit exponential decay and would have an infinite variance when $\delta < 1$. This distinction may crucially affect our estimation~\citep{bubeck2013bandits}.


% In this work, we study the problem of LowHTR by proposing an efficient algorithm named LOTUS even when $T$ is unrevealed. We then demonstrate it attains the lower bound of LowHTR in terms of $T$ ignoring logarithmic factors. Our LOTUS can be further improved to be agnostic to the rank $r$ with slightly worse regret bound. Our detailed contributions can be summarized as follows: 
The detailed contributions of our work can be summarized as follows:
(1) inspired by the success of Huber loss~\citep{kang2023heavy,sun2020adaptive} and nuclear norm penalization~\citep{negahban2011estimation}, we first revisit the convex-relaxation-based estimator to approximate the low-rank parameter matrix with heavy-tailed noise. As far as we're aware, our work is the first one to solve the trace regression problem under arbitrary heavy-tailed noise with bounded $(1+\delta)$ moment ($\delta \in (0,1)$), which is highly non-trivial and stands as a noteworthy advancement on its own merits. (2) Equipped with the aforementioned estimator, we develop an algorithm named LOTUS for LowHTR. LOTUS exploits the estimated subspace by proposing a sub-method called LowTO that extends from the TOFU algorithm~\citep{shao2018almost} designed for SLB with heavy-tailed noise. Our LowTO truncates the rewards to mitigate the heavy-tailed effect and penalizes the redundant features within the sparsity structure. When the total horizon $T$ is unrevealed, our algorithm could adaptively switch between exploration and exploitation to achieve the $\tilde O(d^\frac{3}{2}r^\frac{1}{2}T^\frac{1}{1+\delta}/\tilde{D}_{rr})$ regret bound. (3) We further provide a lower bound for LowHTR of order $\Omega( d^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} T^\frac{1}{1+\delta})$, which indicates that our LOTUS is nearly optimal in the scale of $T$. (4) While all existing works on low-rank matrix bandits require a priori knowledge of the rank $r$, we further improve our LOTUS to operate without knowing $r$ even under the more difficult heavy-tailed setting with $\tilde O(dr^\frac{3}{2}T^\frac{1+\delta}{1+2\delta} + d^\frac{3}{2} r^\frac{1}{2}T^\frac{1}{1+\delta})$ regret bound, which is better than the trivial one in high-dimensional case, i.e. when $d \gtrsim T^{\frac{\delta^2}{(1+2\delta)(1+\delta)}}$. Intuitively, it obtains a useful rank $\hat{r}$ by truncating the estimated singular values at each batch. 
(5) The practical superiority of our LOTUS is then firmly validated in our simulations.


\textbf{Notations:} For any vector $x \in \mathbb{R}^n$, we use $\norm{x}_p$ to denote the $l_p$-norm of the vector $x$ and $\norm{x}_H = \sqrt{x^\top H x}$ to denote its weighted $2$-norm with regard to some positive definite matrix $H \in \mathbb{R}^{n \times n}$. For matrices $X, Y \in \mathbb{R}^{n_1 \times n_2}$, we use $\opnorm{X}$, $\nunorm{X}$ and $\fnorm{X}$ to denote the operator norm, nuclear norm and Frobenius norm of the matrix $X$ respectively, and we write $\inp{X}{Y} \coloneqq \textbf{trace}(X^\top Y)$ as their inner product. We also write $f(n) \asymp g(n)$ if $f(n) = O(g(n))$ and $g(n) = O(f(n))$, $f(n) \gtrsim g(n)$ if $g(n) = O(f(n))$, and $f(n) \lesssim g(n)$ if $f(n) = O(g(n))$, and these are the common notations used in the high-dimensional statistics literature~\citep{wainwright2019high}.

\section{Related Work}\label{sec:related}
\paragraph{Bandit under Heavy-tailedness}
Research on bandits with heavy-tailed rewards assumes the noise has finite $(1+\delta)$ moment, $\delta \in (0,1]$, and most existing algorithms follow two key strategies: truncation and median of means. Starting with \cite{bubeck2013bandits}, a UCB-based algorithm was proposed for MAB with heavy-tailed rewards, enjoying a logarithmic regret bound. To extend their study to the SLB setting,~\cite{medina2016no} developed two algorithms based on the truncation and median of means ideas, but both methods could only attain the regret bound of order $\tilde O(T^{\frac{3}{4}})$ when $\delta = 1$, which fails to fulfill our expectations.~\cite{shao2018almost} then refined their results on SLB and introduced two algorithms with improved regret bound. They also constructed a matching lower bound with $T$.~\cite{xue2020nearly} investigated the finite arm case and provided two SupLinUCB-based~\citep{chu2011contextual} algorithms. Recently, \cite{kang2023heavy} borrowed the ideas from Huber regression and proposed an improved Huber bandit under finite arm sets. However, their work is confined to the low-dimensional bandit without sparsity, and their parameter vectors are presumed to be arm-dependent under the finite arm set. Another contemporary work~\cite{xue2023efficient} developed a nearly optimal algorithm for arbitrary arm sets with reduced computation in practice. Yet, none of these studies tackle the heavy-tailedness under the more challenging contextual high-dimensional bandits problem with sparsity, a useful niche our work aims to fill.

\paragraph{Low-rank Matrix Bandit} There has been a line of literature on stochastic low-rank matrix bandit with sub-Gaussian noise. Initially,~\cite{jun2019bilinear} introduced the bilinear low-rank matrix bandit problem and proposed the two-stage ESTR algorithm with $\tilde O(\sqrt{d^3rT}/D_{rr})$ regret bound. \cite{jang2021improved} then constructed a new algorithm improving the regret bound by $\sqrt{r}$. \cite{lu2021low,kang2022efficient} extended the problem setting to low-rank matrix bandit where feature matrices no longer have to be rank-one. Specifically, \cite{lu2021low} first proposed the LowGLOC with $\tilde O(\sqrt{d^3rT})$ regret bound, but this method is computationally prohibitive and cannot handle the contextual setting. Subsequently, \cite{lu2021low,kang2022efficient} developed 
several more efficient algorithms, achieving regret bound of order $\tilde O(\sqrt{d^3rT}/D_{rr})$. Our work broadens this research scope to encompass arbitrary heavy-tailed noise with bounded $(1+\delta)$ moment ($\delta \in (0,1]$), and our algorithm LOTUS obtains the $\tilde O(d^\frac{3}{2}r^\frac{1}{2}T^\frac{1}{1+\delta}/\tilde{D}_{rr})$ regret bound, which coincides with the aforementioned leading one with $\delta = 1$. Moreover, we showcase that our regret bound is optimal concerning the order of $T$ with a matching lower bound. Another notable limitation in existing algorithms for low-rank matrix bandits is their dependence on the rank $r$, which is impractical. We further improve our LOTUS method to be agnostic to $r$ with a slightly worse regret bound, which represents the first attempt at this real-world issue. \cite{jang2024efficient} recently proposed a new estimator utilizing the geometry of the arm set to conduct estimation.


\paragraph{Matrix Recovery under Heavy-tailedness} All studies on low-rank matrix estimation revolve around two ideas: Convex approaches tend to replace the classic square loss with some more robust ones, like the renowned Huber loss~\citep{huber1965robust,sun2020adaptive}. 
%\cite{elsener2018robust} proposed robust estimators for the noisy matrix completion, albeit with a requirement for symmetric noise. 
\cite{tan2022sparse} considered the sparse multitask regression under heavy-tailed noise, contrasting our focus on the trace regression problem. The two works most closely related to ours are~\cite{fan2021shrinkage,yu2023low}. \cite{fan2021shrinkage} established a two-step method for the robust trace regression, but they assumed the noise possesses finite $2k$ moment for $k>1$ and their approximation error is not even proportional to the noise size. \cite{yu2023low} further employed the Huber loss to develop an enhanced regressor with error aligned with the noise scale as long as the noise has bounded variance. In our work, we further complement their result and utilize the Huber-type estimator robust to noise with only finite $(1+\delta)$ moment for any $\delta \in (0,1]$, and we deduce the error rate of order $\tilde O(({d}/{n})^\frac{\delta}{1+\delta} \E(|\eta_t|^{1+\delta})^\frac{1}{1+\delta})$ scaling with the noise scale decently. On the other hand, nonconvex methods aim to seek local optima of the matrix recovery problem via gradient descent. The notable work~\citep{shen2022computationally} developed a Riemannian sub-gradient method and attained the optimal statistical rate under heavy-tailed noises with bounded $(1+\delta)$ moment, but their work relies on some additional assumptions like the noise is symmetric or zero-median. In summary, our work stands as the first solution to address the trace regression problem under arbitrary heavy-tailed noise with only bounded $(1+\delta)$ moment ($\delta \in (0,1)$), which is significant on its own strengths.

\section{Preliminaries}\label{sec:prelim}
We will present the setting of LowHTR and introduce the common assumptions for theoretical analysis in this section. Denote $T$ as the total horizon, which may be unknown to the agent. At each round $t \in [T]$, the agent is given an arm set $\X_t \subseteq \R^{d_1 \times d_2}$ ($d_1 \asymp d_2$) that can be fixed or varying over time. Then the agent chooses an arm $X_t \in \X_t$ and observes the associated stochastic reward $y_t$ such that, 
\begin{align}
    y_t = \inp{X_t}{\Theta^*} + \eta_t, \label{eq:problem}
\end{align}
where $\Theta^* \in \R^{d_1 \times d_2}$ is an unknown parameter matrix with rank $r \ll d_1 \wedge d_2$ and $\eta_t$ is the heavy-tailed noise. Specifically, we assume $\E(\eta_t|\F_t) = 0$ and $\E(|\eta_t|^{1+\delta}|\F_t) \leq c$ for some $\delta \in (0,1], c > 0$ conditional on the history filtration $\F_t = \{X_t,X_{t-1},\eta_{t-1}, \dots, X_1, \eta_1\}$, which indicates that $\E(y_t|\F_t) = \inp{X_t}{\Theta^*}$. The compact SVD of $\Theta^*$ can be written as $\Theta^* = UDV^\top$ for some $U \in \R^{d_1\times r}$ and $V \in \R^{d_2\times r}$, and we denote $D_{ii}$ as its $i$-th largest singular value. Furthermore, we define $X_{t}^* \coloneqq \arg\max_{X \in \X_t} \inp{X}{\Theta^*}$ as the feature matrix of the optimal arm at round $t$, and the goal is to minimize the cumulative regret in total $T$ rounds formulated as $R_T = \sum_{t=1}^T \left(\inp{X_t^*}{\Theta^*} - \inp{X_t}{\Theta^*}\right)$. 
% \begin{align}
%     R_T = \sum_{t=1}^T \inp{X_t^*}{\Theta^*} - \inp{X_t}{\Theta^*} \label{eq:cumulativeregret}
% \end{align}

Next, we present two mild and regular assumptions.
\begin{assumption}\label{assu:subg}
    We can find a sampling distribution $\mathcal{D}$ over ${\mathcal{X}_t}$ with the covariance matrix $\Sigma$, such that $\mathcal{D}$ is sub-Gaussian with parameter $\sigma^2 \asymp c_l \coloneqq \lambda_{\min}(\Sigma) \asymp 1/(d_1d_2)$.
\end{assumption}

Assumption~\ref{assu:subg} is commonly used in the modern low-rank matrix bandits~\citep{lu2021low,kang2022efficient}, and can be easily satisfied in many cases. For instance, when $\X_t$ is a region in $\R^{d_1 \times d_2}$ (e.g., Euclidean unit ball), we can find such a sampling distribution if the convex hull of this region contains a ball with some constant radius. And when $\X_t$ is a finite set, it suffices if the arms are IID drawn from some sub-Gaussian distribution at each time. Note a random matrix $X \in \R^{d_1\times d_2}$ follows a sub-Gaussian distribution with parameter $\sigma^2$ if, for any $t \geq 0$,
\[
\P(\inp{A}{X} \geq \sqrt{2} \fnorm{A} t ) \!\leq\! 2 \exp\left( -t^2/\sigma^2 \right), \, \forall A \!\in\! \R^{d_1\times d_2}.
\]
\begin{assumption}\label{assu:bounded}
    We have $\fnorm{\Theta^*} \leq S$, and for any $t \in [T], X \in \X_t$, it holds that $\fnorm{X} \leq S$.
\end{assumption}
Assumption~\ref{assu:bounded} is very standard in contextual bandit literature. As a consequence, we can deduce that $\E(|y_t|^{1+\delta} | \F_t) \leq 2^\delta S^{2+2\delta} + 2^\delta c \coloneqq b$. Based on the conditions on the sub-Gaussian parameter $\sigma$ in Assumption~\ref{assu:subg}, we can prove that $\fnorm{X}$ is bounded in a constant scale with high probability with its proof in Appendix~\ref{app:bound}. But for simplicity and consistency with previous literature, we still impose this common assumption to bound $\fnorm{X}$ here. Note our work can be naturally extended to the generalized low-rank matrix bandit problem by further assuming the derivative of the inverse link function is bounded in the interval $[-S^2,S^2]$. Such an adaptation would result in the final regret bound being affected only by a constant factor, and we will leave it as our future work.

\section{Methods}\label{sec:method}
In this section, we present our novel \underline{Lo}wTO With Es\underline{t}imated S\underline{u}b\underline{s}paces (LOTUS) algorithm for the LowHTR problem. Our algorithm runs in a batched format adapted from the doubling trick~\citep{besson2018doubling}. And inspired by the success of the two-stage framework in ESTR~\citep{jun2019bilinear}, in each batch our algorithm also first recovers the subspaces spanned by $\Theta^*$, and then invokes a new approach called LowTO that heavily penalizes on columns and rows complementary to our estimated subspaces. Contrasting prior works, our algorithm could dynamically switch between the exploration and exploitation stages so as to be agnostic to the horizon $T$, which is significantly more useful. We further improve LOTUS to operate without knowing the rank $r$, which further enhances its practicality.

Initially, we will introduce the nuclear penalized Huber-type low-rank matrix estimator under heavy-tailed noise as follows. Complementing the results in~\citep{yu2023low}, we further prove that the Huber-type estimator is robust to arbitrary heavy-tailed noise with the finite $(1+\delta)$ moment for $\delta \in (0,1)$ on the trace regression problem.

\begin{algorithm*}[t]
\caption{\underline{Lo}wTO With Es\underline{t}imated S\underline{u}b\underline{s}paces (LOTUS)} \label{alg:lotus}
\begin{algorithmic}[1]
\Input Arm set $\mathcal{X}_t$, sampling distribution $\mathcal{D}_t$, $\delta, T_0, \eta, \lambda, 
%\{\lambda_i\}_{i=1}^{+\infty},\{\tau_i\}_{i=1}^{+\infty}, 
\{\lambda_{i,\perp}\}_{i=1}^{+\infty}$. 
\Stage The history buffer index set $\mathcal{H}_1 = \{\}$, the exploration buffer index set $\mathcal{H}_2 = \{\}$.
% \Stage \textbf{1: Subspace Estimation}
\State Pull arm $X_t \in \mathcal{X}_t$ according to $\mathcal{D}_t$ and observe payoff $y_t$. Then add $(X_t, y_t)$ into $\mathcal{H}_1$ and $\mathcal{H}_2$ for $t \leq T_0$.
\For{$i=1,2,\dots$ until the end of iterations}
\State Set the exploration length $T_1 = \min\left\{ \left[\frac{d^{2+4\delta} r^{1+\delta}}{D_{rr}^{2+2\delta}} 2^{i(1+\delta)} \right]^{\frac{1}{1+3\delta}}, 2^i \right\}$. 
\State For iteration $t$ from $|\mathcal{H}_1|+1$ to $|\mathcal{H}_1|+T_1$, pull arm $X_t \in \mathcal{X}_t$ according to $\mathcal{D}_t$ and observe payoff $y_t$. Then add $(X_t, y_t)$ into $\mathcal{H}_1$ and $\mathcal{H}_2$
\State Obtain the estimate $\widehat \Theta$ based on Eqn. \eqref{eqn:hestimator} with $\mathcal{H}_2$, where we set $\tau_i \asymp  \left(|\H_2|/(d+ \ln{(2^{i+1}/\epsilon)})\right)^{\frac{1}{1+\delta}} c^{\frac{1}{1+\delta}}, \lambda_i \asymp \sigma \left((d+ \ln{(2^{i+1}/\epsilon)})/|\H_2|\right)^{\frac{\delta}{1+\delta}} c^{\frac{1}{1+\delta}}$.
\State Calculate the full SVD of $\widehat \Theta = [\widehat U, \widehat U_\perp] \, \widehat D \, [\widehat V, \widehat V_\perp]^\top$ where $\widehat U \in \mathbb{R}^{d_1 \times r}, \widehat V \in \mathbb{R}^{d_2 \times r}$.
\State For $T_2 = 2^i-T_1$ rounds, invoke LowTO with $\delta, [\widehat U, \widehat U_\perp], [\widehat V, \widehat V_\perp], \lambda, \lambda_{i,\perp}, \mathcal{H}_1$ and obtain the updated $\mathcal{H}_1$.
\EndFor
% \Stage \textbf{2: Sparse Generalized Linear Bandits}
% \State Rotate the arm feature set: $\mathcal{X}^\prime \coloneqq [\widehat U, \widehat U_\perp]^\top \mathcal{X} [\widehat V, \widehat V_\perp]$ and the admissible parameter space: $\mathit{\Theta}^{\prime} \coloneqq [\widehat U, \widehat U_\perp]^\top \mathit{\Theta} [\widehat V, \widehat V_\perp]$.
% \State Define the vectorized arm set so that the last $(d_1-r) \cdot (d_2-r)$ components are negligible:
% % \IfThenElse{$\hat \theta_t \in \mathit{\Theta}_0$}{$\btheta_t = \hat \theta_t$}{compute $\btheta_t$ according to \eqref{ttheta2}};
% \begin{align}
%     \mathcal{X}_0 \coloneqq \{\vecc{\mathcal{X}^\prime_{1:r,1:r}},\vecc{\mathcal{X}^\prime_{r+1:d_1,1:r}},  \vecc{\mathcal{X}^\prime_{1:r,r+1:d_2}},\vecc{\mathcal{X}^\prime_{r+1:d_1,r+1:d_2}}\}, \label{rotate_arm}
% \end{align}
% and similarly define the parameter set:
% \begin{align}
%     \mathit{\Theta}_0 \coloneqq \{\vecc{\mathit{\Theta}^\prime_{1:r,1:r}},\vecc{\mathit{\Theta}^\prime_{r+1:d_1,1:r}}, \vecc{\mathit{\Theta}^\prime_{1:r,r+1:d_2}},\vecc{\mathit{\Theta}^\prime_{r+1:d_1,r+1:d_2}}\}. \label{rotate_theta}
% \end{align}
% \State For $T_2 = T -T_1$ rounds, invoke (P)LowGLM-UCB with $\mathcal{X}_0,\mathit{\Theta}_0,k = (d_1+d_2)r-r^2,(\lambda_0,\lambda_\perp)$.
\end{algorithmic}
\end{algorithm*}

\subsection{Low-rank Matrix Estimation}\label{subsec:lowrank}
Suppose we collect $n$ pairs of data $\{(X_i, y_i)\}$ according to some distribution satisfying Assumption~\ref{assu:subg} for $X_i$ and the model of Eqn.~\eqref{eq:problem} for the associated $y_i$ after time $n$. Define the Huber loss~\citep{huber1965robust} $l_\tau(\cdot)$ parameterized by the robustification $\tau > 0$~\citep{sun2020adaptive} as:
$$l_\tau(x) = 
\begin{cases} 
x^2/2 & \text{if } |x| \leq \tau, \\
\tau|x| - \tau^2/2 & \text{if } |x| > \tau.
\end{cases}$$ 
To obtain a low-rank matrix estimate, we use the nuclear norm penalization as a convex surrogate for the rank and implement the following nuclear norm regularized Huber regressor~\citep{yu2023low} to recover the subspaces under heavy-tailedness:
\begin{gather}
    \widehat \Theta = \arg\min_{\Theta \in \R^{d_1 \times d_2}} \hat L_{\tau,[n]}(\Theta) + \lambda \nunorm{\Theta}, \label{eq:estimator} \\ \hat L_{\tau,[n]}(\Theta) = \frac{1}{n}\sum_{i \in [n]} l_\tau \left(y_i - \inp{X_i}{\Theta}\right), \nonumber
\end{gather}
where $\tau$ and $\lambda$ stand for the Huber loss robustification and the nuclear norm penalization parameters, respectively. 


We then establish the following statistical properties of the estimator defined in Eqn. \eqref{eq:estimator}:

\begin{theorem}\label{thm:estimator}
    By extending Assumption~\ref{assu:subg} with any order of $\sigma$ and $c_l$, with probability at least $1-\epsilon$, the low-rank estimator $\widehat \Theta$ in Eqn. \eqref{eq:estimator} with $\tau \asymp \left(n/(d+ \ln{(1/\epsilon)})\right)^{\frac{1}{1+\delta}} c^{\frac{1}{1+\delta}}$ and $\lambda \asymp \sigma \left((d+ \ln{(1/\epsilon)})/n\right)^{\frac{\delta}{1+\delta}} c^{\frac{1}{1+\delta}}$ satisfies
    $$\fnorm{\widehat \Theta - \Theta^*} \leq C_1 \frac{\sigma}{c_l} \left(\frac{d+ \ln{(1/\epsilon)}}{n}\right)^{\frac{\delta}{1+\delta}} c^{\frac{1}{1+\delta}} \sqrt{r}, $$
    for some constant $C_1$ as long as we have $n \gtrsim dr\nu^3, d, \nu^2$, and $(d-\ln{(\epsilon)})\sqrt{r\nu^3}$ with $\nu = \sigma^2/c_l$.
\end{theorem}
The proof of Theorem~\ref{thm:estimator} involves a construction of the restricted strong convexity for the empirical Huber loss function $\hat L_\tau(\cdot)$ and a deduction of an upper bound for $\opnorm{\nabla \hat L_\tau(\Theta^*)}$, and the details are presented in Appendix~\ref{app:estimator}. Note Theorem~\ref{thm:estimator} generally holds without any restriction on the scale of $\sigma$ and $c_l$. Provided the noise has a finite variance, i.e., $\delta = 1$, the deduced $l_2$-error rate aligns with the minimax value~\citep{fan2019generalized} under the standard penalized low-rank estimator with sub-Gaussian noise. Based on our knowledge, this is the first error bound in the trace regression problem under noise with finite $(1+\delta)$ moment ($\delta < 1$) assuming nothing further.

To solve the convex optimization problem in Eqn.~\eqref{eq:estimator}, we adopt the local adaptive majorize-minimization (LAMM) method~\citep{fan2018lamm,sun2020adaptive,yu2023low} that is fast to use and scalable to large datasets. This method constructs an isotropic quadratic function to upper bound the Huber loss and utilizes a majorize-minimization algorithm for finding the optimal solution. One noteworthy advantage of this procedure is that the minimizer often yields a closed-form solution. Due to the space limit, we defer more details and the pseudocode to Appendix~\ref{app:lamm}.


\subsection{LOTUS: The Rank $r$ is Known}\label{subsec:lotus}

We will present our LOTUS algorithm in this subsection. To improve the two-stage framework introduced in~\cite{jun2019bilinear} which requires the knowledge of $T$ and to further yield robust performance against heavy-tailedness, our LOTUS adaptively switches between exploration and exploitation in a batch manner without knowing $T$, and is equipped with a new LowTO algorithm designed for heavy-tailed rewards. The LOTUS algorithm is presented in Algorithm~\ref{alg:lotus}, with three core steps introduced in detail as follows:


\textbf{Adaptive Exploration and Exploitation:} Drawing inspiration from the doubling trick~\citep{besson2018doubling}, after some warm-up iterations of size $T_0$, our LOTUS operates with batches until termination where the batch sizes increase exponentially as $\{2^i\}_{i=1}^{+\infty}$.
We define $\H_1$ and $\H_2$ as the history and exploration buffer index sets, where after time $t$ all the indexes $[t]$ of past observations are included in $\H_1$ while $\H_2$ only contains sample indexes particularly used for subspace estimation of $\Theta^*$. At the $i$-th batch of length $2^i$, we first set $T_1^i = \min\{(d^{2+4\delta} r^{1+\delta} 2^{i+i\delta}/D_{rr}^{2+2\delta})^\frac{1}{1+3\delta}, 2^i \}$ as the exploration length, and we randomly sample $T_1^i$ arms according to the sampling distribution in Assumption~\ref{assu:subg} and put their indexes into both $\H_1$ and $\H_2$. Subsequently, we obtain an estimate $\widehat \Theta$ based on Eqn.~\eqref{eq:estimator} with samples indexed by $\H_2$, and then leverage the recovered subspaces in the remaining $T_2^i = 2^i - T_1^i$ rounds as the exploitation phase, where we invoke a new algorithm named LowTO. The details of this exploitation phase will be elaborated in the following two points. As shown in Algorithm~\ref{alg:lotus} line 7, indexes of observations under LowTO are only added to $\H_1$ but not $\H_2$ and hence will not be used for matrix estimation. Unlike the traditional doubling trick that restarts the algorithm at each batch, our algorithm facilitates interaction across different batches. Specifically, at the $i$-th batch, it utilizes all the samples in $\H_1$ and $\H_2$ accumulated from the previous batches for more informed decision-making. Another point to highlight is that our LOTUS algorithm can also be run in a more randomized manner with the same regret bound: at the $i$-th batch, there is an option to explore with a probability of $T_1^i/2^i$ and to exploit with the remaining probability. We defer its pseudocode to Appendix~\ref{app:alter}. For simplicity, we consider our original approach in this work, which involves an initial exploration phase of deterministic length followed by the use of LowTO.

\textbf{Subspace Transformation:}
At the $i$-th batch, after we randomly sample arms for a carefully designed duration and add their observations into $\H_2$, we first acquire the estimated $\widehat \Theta$ based on the current $\H_2$ as shown in Eqn. \eqref{eqn:hestimator}. With the knowledge of $r$, then we can obtain its corresponding full SVD as $\widehat \Theta = [\widehat U,\widehat U_{\perp}] \widehat D [\widehat V, \widehat V_{\perp}]^\top$ where $\widehat U \in \R^{d_1 \times r}, \widehat U_{\perp} \in \R^{d_1 \times (d_1-r)},\widehat V \in \R^{d_2 \times r}$ and $\widehat V_{\perp} \in \R^{d_2 \times (d_2-r)}$.
\begin{align}
    \widehat \Theta = \arg\min_{\Theta \in \R^{d_1 \times d_2}} \hat{L}_{\tau_i, \H_2}(\Theta) + \lambda_i \norm{\Theta}_{\text{nuc}} \label{eqn:hestimator}
\end{align}
Intuitively, Theorem~\ref{thm:estimator} implies that our estimated column and row subspaces should align with the ground truth $U, V$. Borrowing the ideas from ESTR~\citep{jun2019bilinear}, we aim to transform the original LowHTR into the linear bandit problem under heavy-tailed rewards with some sparsity feature. Specifically, we first orthogonally rotate the action set $\X_j$ in the exploitation phase as 
\begin{gather}
    \X_j^- = \left\{[\widehat U,\widehat U_{\perp}]^\top X [\widehat V, \widehat V_{\perp}]: \, X \in \X_j\right\}, \label{eq:rotateset} \\
    \Theta^{*,\prime} = [\widehat U,\widehat U_{\perp}]^\top \Theta^* [\widehat V, \widehat V_{\perp}]. \label{eq:rotatetheta}
\end{gather}
Define the total dimension $p \coloneqq d_1d_2$ and the effective dimension $k \coloneqq p - (d_1-r)(d_2-r)$. We perform a tailored vectorization of the arm set $\X_j^-$ as in Algorithm~\ref{alg:lowto} line 4 to obtain a new arm set $\X_t^\prime \subseteq \R^p $, and denote $\theta^*$ to be the corresponding rearranged version of $\vecc{\Theta^{*,\prime}}$ such that $\theta^*_{k+1:p} = \vecc{\Theta^{*,\prime}_{r+1:d_1,r+1:d_2}}$. Then it holds that $\theta^*_{k+1:p}$ is nearly zero based on the results in~\cite{stewart1990matrix} and Theorem~\ref{thm:estimator}. The formal result is shown as follows for the $i$-th batch with probability at least $1-\epsilon$:
\begin{align}
    \norm{\theta^*_{k+1:p}}_2 \lesssim S_\perp  \coloneqq  \frac{r \sigma^2 c^{\frac{2}{1+\delta}}}{c_l^2 D_{rr}^2}\left(\frac{d+ \ln{(1/\epsilon)}}{|\H_2|}\right)^{\frac{2\delta}{1+\delta}} , \label{eq:sparse}
\end{align}
with the parameter setting that
\begin{gather*}
    \tau_i \asymp \left(|\H_2|/(d+ \ln{(1/\epsilon)})\right)^{\frac{1}{1+\delta}} c^{\frac{1}{1+\delta}}, \\
    \lambda_i \asymp \sigma \left((d+ \ln{(1/\epsilon)})/|\H_2|\right)^{\frac{\delta}{1+\delta}} c^{\frac{1}{1+\delta}}.
\end{gather*}
Its complete proof is presented in Appendix~\ref{app:sparse}. Consequently, we can simplify the LowHTR problem to an equivalent $p$-dimensional linear bandits under heavy-tailedness with a unique sparse pattern, i.e., the final $(p-k)$ entries of $\theta^*$ are almost zero based on Eqn.~\eqref{eq:sparse}.

Following the recovery of row and column subspaces of $\Theta^*$ and the particular arm set transformation after $T_1^i$ rounds in the $i$-th batch, we will leverage the resulting almost-low-dimensional structure by using the following LowTO algorithm for the rest of the batch's duration.

\begin{algorithm}[t]
\caption{LowTO} \label{alg:lowto}
\begin{algorithmic}[1]
\Input $T,\delta,[\widehat U, \widehat U_\perp], [\widehat V, \widehat V_\perp], \lambda_0, \lambda_{\perp}, \mathcal{H}_1$.
\Stage $M \!= \!\sum_{(x,y) \in \mathcal{H}_1^\prime} x x^\top \!+ \!\Lambda = \sum_{t=1}^{|\mathcal{H}_1^\prime|} x_{s,t} x_{s,t}^{\top}\! +\! \Lambda$, $X^\top \!= \![x_{s,1},\dots,x_{s,|\mathcal{H}_1^\prime|}], [u_1,\dots,u_p]^\top\! = \! M^{-\frac{1}{2}} X^\top$
\begin{align*}
&\text{with }\mathcal{H}_1^\prime  =  \left\{\left(x_{s,t}^\top =[\vecc{\widehat{U}^\top X \widehat{V}}^\top, \vecc{\widehat{U}^\top X \widehat{V}_\perp}^\top,  \right.\right. \\ &\left.\left.\hspace{-0.28 cm} \vecc{\widehat{U}_\perp^\top X \widehat{V}}^\top, \vecc{\widehat{U}_\perp^\top X \widehat{V}_\perp}^\top], y_{s,t} = y\right) \! : \! (X, y) \! \in\! \mathcal{H}_1 \right\}\!. \\
&\hspace{1 cm}\Lambda = \text{diag}([\underbrace{\lambda_0,\dots,\lambda_0}_{
    \textstyle
    \begin{gathered}
       k
    \end{gathered}},\underbrace{\lambda_\perp, \dots, \lambda_\perp}_{
    \textstyle
    \begin{gathered}
       p-k
    \end{gathered}}])
\end{align*}
\For{$t = 1$ {\bfseries to} $T$}
\State Get $\hat{y}_i \!=\! [y_{s,1} \mathds{1}_{u_{i,1}y_{s,1} \leq b_{t-1}}, \dots, y_{t-1} \mathds{1}_{u_{i,|\mathcal H_1|+t-1}y_{t-1}}$ \newline$ 
_{\leq b_{t-1}}]^\top$ for $i \in [p]$, where $\hat y_i \in \R^{|\mathcal H_1|+t-1}$.
\State Calculate $\hat{\theta}_{t-1} = M^{-1/2} [u_1^\top \hat{y}_1, \dots, u_p^\top \hat{y}_p]^\top$.
\State Transform the arm set $\mathcal{X}_t$ as
\begin{align*}
    \mathcal{X}_t^{\prime} \!=\! &\left\{ [\vecc{\widehat{U}^\top X \widehat{V}}^\top, \vecc{\widehat{U}^\top X \widehat{V}_\perp}^\top, \vecc{\widehat{U}_\perp^\top X \widehat{V}}^\top, \right. \\
    &\left.\vecc{\widehat{U}_\perp^\top X \widehat{V}_\perp}^\top]^\top \in \mathbb{R}^{p}: X \in \mathcal{X}_t \right\}.
\end{align*}
\State Pull $x_t = \arg\max_{x \in \mathcal{X}_t^{\prime}} x^\top \hat{\theta}_{t-1} +\beta_{t-1} \norm{x}_{M^{-1}}$ and observe the reward $y_t$. 
\State Restore $x_t$ into its original matrix form $X_t$ and then add $(X_t, y_t)$ into $\mathcal{H}_1$.
\State Update $M = M + x_t x_t^\top, X^\top = [X^\top, x_t]$ and $[u_1,\dots,u_p]^\top = M^{-1/2} X^\top$.
\EndFor
\State \Return The history buffer $\mathcal{H}_1$.
\end{algorithmic}
\end{algorithm}


\textbf{LowTO Algorithm:}
To begin with, we reformulate the resulting $p$-dimensional linear bandit problem under heavy-tailed rewards in the following way: at round $t$, the agent chooses an arm $x_t \in \X_t^\prime$ of dimension $p$ where $\X_t^\prime$ is a rearranged vectorization of $\X_t^-$ as defined in Algorithm~\ref{alg:lowto} line 4, and observes a noisy payoff $y_t = x_t^\top\theta^* + \eta_t$ mixed with some heavy-tailed noise $\eta_t$. 

Our LowTO algorithm is presented in Algorithm~\ref{alg:lowto}. Inspired by LowOFUL in the ESTR method~\citep{jun2019bilinear}, to exploit the additional pattern of $\theta^*$ shown in Eqn.~\eqref{eq:sparse}, we propose the almost-low-dimensional TOFU (LowTO) algorithm that extends the truncation-based TOFU~\citep{shao2018almost}. The original TOFU trims the observed payoffs for each dimension individually and takes the contexts of historical arms into account for the truncation, which could yield a tight regret bound of order $\tilde O(pT^\frac{1}{1+\delta})$. As shown in Algorithm~\ref{alg:lowto} line 2, our LowTO also truncates each entry of $M^{-1/2}x_iy_i$ for $i=1,\dots,t-1$ at time $t$ by some increasing threshold $b_t$. Different from TOFU, when calculating the estimator $\hat \theta$ in Algorithm~\ref{alg:lowto} line 3, we put a weighted regularizer as the diagonal matrix $\Lambda = \text{diag}(\lambda_0,\dots,\lambda_0,\lambda_\perp,\dots,\lambda_\perp)$ with $\lambda_0$ only applied to the first $k$ coordinates. By amplifying $\lambda_\perp$, we ensure greater penalization is applied to the final $p-k$ elements of $\hat \theta$ leading to their diminished values, and this phenomenon is well intended under the almost-low-dimensional structure. Subsequently, we utilize a UCB-based criterion to choose the pulled arm according to Algorithm~\ref{alg:lowto} line 5, where we also decrease the variation of the last $p-k$ elements with $M^{-1}$ to further reduce their impact on the decision-making. It is also noteworthy that we always reuse all the past observations stored in $\H_1$ at each batch when initializing the matrix $M$, which can facilitate a consistent and accurate estimator $\hat \theta$ in the early stage of the exploitation phase. And the randomly drawn samples in $\H_1$ contain more stochasticity and thus are more preferable for the parameter estimation.  

We then state the regret bound of LowTO in Theorem~\ref{thm:lowto}:

\begin{theorem}\label{thm:lowto}
    Suppose the input $\H_1$ is of size $H \lesssim T$ and we run our LowTO algorithm for $T$ rounds. By setting $b_t = (b/\log(2p/\epsilon))^{\frac{1}{1+\delta}} (t+H)^\frac{1-\delta}{2+2\delta}, \beta_t = 4 \sqrt{p} b^\frac{1}{1+\delta} \log(2p/\epsilon)^\frac{\delta}{1+\delta} (t+H)^\frac{1-\delta}{2+2\delta} + \sqrt{\lambda_0} S + \sqrt{\lambda_\perp} S_\perp$ with $\lambda_\perp = S^2T/(k \log(1+\frac{S^2 T}{k \lambda_0}))$, with probability at least $1-\epsilon$, the regret of LowTO can be bounded by:
    $$\tilde O\left(\sqrt{kp} \left(T+H\right)^\frac{1}{1+\delta} + \sqrt{kT} + S_\perp T \right),$$
    where $S_\perp$ is the upper bound of $\norm{\theta^*_{k+1:p}}_2$ as shown in Eqn.~\eqref{eq:sparse} depending on $|\H_2|$.
\end{theorem}

In the standard linear bandit setting under heavy-tailed noise, we can recover the same regret bound as TOFU, of order $\tilde O(p \cdot T^\frac{1}{1+\delta})$, by setting $S_\perp = S$ and $\lambda_\perp = \lambda_0$.

\textbf{Overall regret:} Now we are ready to present the overall regret bound for LOTUS in the following Theorem~\ref{thm:lotus}.

\begin{theorem}\label{thm:lotus}
    Suppose we use the configuration of LowTO described in Theorem~\ref{thm:lowto} and the parameter values of LOTUS shown in Algorithm~\ref{alg:lotus} for each batch, and set $\epsilon$ as $\epsilon/2^{i+1}$ in $\beta_t$ (formulated in Theorem~\ref{thm:lowto}) for the $i$-th batch. Then with probability at least $1-\epsilon$, it holds that
    $$R(T) \leq \tilde O\left( d^{\frac{2+4\delta}{1+3\delta}} r^{\frac{1+\delta}{1+3\delta}} T^{\frac{1+\delta}{1+3\delta}} / D_{rr}^{\frac{2+2\delta}{1+3\delta}} + d^\frac{3}{2} r^\frac{1}{2} T^\frac{1}{1+\delta} \right),$$
    under the condition that $T_1 \geq 5 d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}/D_{rr}^\frac{1+\delta}{\delta}$. Furthermore, we can simplify the above result as
    $$R(T) \leq \begin{cases}
        \tilde O\!\left(d^\frac{3}{2} r^\frac{1}{2} T^\frac{1}{2} / D_{rr}\right)\!, \delta = 1; \\
        \tilde O\! \left(d^\frac{3}{2} r^\frac{1}{2} T^\frac{1}{1+\delta}\right)\!, \delta <1, T \gtrsim (dr)^\frac{1+\delta}{2\delta}/{D_{rr}^{\frac{2(1+\delta)^2}{\delta(1-\delta)}}}.
    \end{cases}$$
\end{theorem}
Note the regret bound in Theorem~\ref{thm:lotus} improves upon the one attained for a simple linear bandit reduction, which contains the order of $d^2$. When the rewards have bounded variance, i.e., $\delta = 1$, our regret bound matches the modern one for low-rank matrix bandit under sub-Gaussian noise up to logarithmic terms~\citep{lu2021low,kang2022efficient}. 



\subsection{LOTUS: The Rank $r$ is Unknown}\label{subsec:lotusunknown}

While all existing algorithms for low-rank matrix bandits require prior knowledge of the rank $r$, this information is never revealed to agents in real-world applications, and hence misspecification of $r$ will not only undermine the theoretical foundations but also severely compromise the performance of these methods. To solve this crucial challenge, in this section we aim to enhance our LOTUS algorithm to be agnostic to $r$ even under the more complex heavy-tailed scenario. For the Lasso bandit, which is another popular and easier high-dimensional bandit with sparsity, some algorithms~\citep{oh2021sparsity,ariu2022thresholded} free of the sparsity index have been recently introduced. However, when compared with our work, all of them necessitate some additional assumptions on the structure of the underlying parameter as well as the sampling distribution. For example, \cite{oh2021sparsity} further assumes that the active entries of the parameter vector are relatively independent and the skewness of the sampling distribution is bounded. This fact substantiates the huge difficulty of devising an efficient algorithm for LowHTR without additional conditions. Note our work also opens up a potential avenue for exploring low-rank matrix bandits without the need for knowledge about $r$, and we believe that completely addressing this intriguing problem must require more specific assumptions and investigations.

To improve our batched-explore-then-exploit-based LOTUS algorithm, an intuitive idea is to estimate the effective rank of $\widehat \Theta$ right after the matrix recovery in each batch. By trimming the estimated singular values $\{\widehat D_{ii}\}_{i=1}^d$ with some craftily designed increasing sequence that is deduced from Theorem~\ref{thm:estimator}, we could obtain a useful rank $\hat r$ with $\hat r \leq r$ and then only focus on the top-$\hat r$ row and column subspaces. We can demonstrate that all the ground truth singular values $\{D_{ii}\}_{i=\hat r+1}^d$ omitted are nearly null and hence negligible. Therefore, by penalizing the subspaces parallel to those omitted directions with a similar idea used in our original LOTUS, we could enjoy the low-rank benefit of LowHTR. Specifically, to modify line 6 and line 7 in Algorithm~\ref{alg:lotus}, we abuse the notation here and denote $\widehat D$ as the singular value matrix of $\widehat \Theta$ that is deduced in line 5. Subsequently, we estimate the useful rank $\hat r$ as
\begin{align*}
\hat r =  \biggl( \min & \left\{i \in [d\!+\!1]\! :\! \widehat D_{ii} \!\leq\!  C_1 \frac{\sigma \sqrt{i}}{c_l} \left(\frac{d+ \ln{(2^{i+1}/\epsilon)}} {|\H_2|}\right)^{\frac{\delta}{1+\delta}} \right. \\
&\hspace{2.5 cm} \left. \cdot c^{\frac{1}{1+\delta}}\right\}- 1 \biggr) \vee 1,
\end{align*}
where $C_1$ is some specific constant in Theorem~\ref{thm:estimator} and $\widehat D_{(d+1)(d+1)}$ is set to be $0$ to avoid the empty set case. Afterward, we rewrite the full SVD of $\widehat \Theta$ as $\widehat \Theta = [\widehat U, \widehat U_\perp] \, \widehat D \, [\widehat V, \widehat V_\perp]^\top$ with $\widehat U \in \mathbb{R}^{d_1 \times \hat r}, \widehat V \in \mathbb{R}^{d_2 \times \hat r}$ for each batch in line 6. In new line 7 of our improved LOTUS, we then input the new $[\widehat U, \widehat U_\perp]$ and $[\widehat V, \widehat V_\perp]$ with the estimated rank $\hat r$ as described above, and the effective dimension $k$ in the following subspace estimation and LowTO implementation will become $k = p - (d_1 - \hat r)(d_2 - \hat r)$. Note $\hat r$ might differ across different batches, but $\hat r \leq r$ consistently holds.

Conclusively, we can obtain the following regret bound of our improved LOTUS algorithm agnostic to $r$:

\begin{theorem}\label{thm:lotus2}
    By using the same setting and conditions of LOTUS as described in Theorem~\ref{thm:lotus} and Algorithm~\ref{alg:lotus} with $T_1 = \min\left\{d \cdot 2^{\frac{i(1+\delta)}{1+2\delta}}, 2^i \right\}$ in line 3 of Algorithm~\ref{alg:lotus}, and utilizing the estimated useful rank $\hat r$ to set the corresponding value of $k$ at each batch, the cumulative regret of our LOTUS agnostic to $r$ can be bounded as
    $$R(T) \leq \tilde O \left( d^\frac{3}{2} r^\frac{1}{2} T^\frac{1}{1+\delta} + d r^\frac{3}{2} T^\frac{1+\delta}{1+2\delta}\right),$$
    with probability at least $1-\epsilon$.
\end{theorem}
% The above regret bound is efficient under the high-dimensional scenario, i.e., $d \gtrsim T^{\frac{\delta^2}{(1+2\delta)(1+\delta)}}$. 
While there exists a disparity between our derived regret bound in cases where $r$ remains undisclosed and the optimal one, as previously discussed in this section, it would prove exceptionally difficult to devise an algorithm for LowHTR that remains agnostic to $r$ while achieving a similar regret bound without more stringent assumptions. Solving this issue would necessitate the formulation of more specific assumptions on the underlying structure of the arm matrices and $\Theta^*$.

Moreover, we will showcase the superior efficiency of our LOTUS algorithm in both scenarios, whether the agent possesses knowledge of $r$ or not, in the following experimental results in Section~\ref{sec:exp}.

\section{Lower bounds}\label{sec:lower}
In this section, we provide a lower bound for the expected cumulative regret in LowHTR particularly regarding the order of $T$. The result is given as follows:
\begin{theorem}\label{thm:lower}
    Under the LowHTR problem with $d,r,T$ and $S=1$ in Assumption~\ref{assu:bounded}, there exists an instance with a fixed $\X_t$ containing $(d-1)r$ arms for which any algorithm must suffer an expected regret of order $\Omega(d^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} T^\frac{1}{1+\delta})$, i.e., $\E(R_T) \gtrsim d^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} T^\frac{1}{1+\delta} \gtrsim T^\frac{1}{1+\delta}$.
\end{theorem}
Theorem~\ref{thm:lower} demonstrates that our LOTUS could attain the lower bound for LowHTR regarding the order of $T$ when $r$ is given. And this lower bound is tight with $r = d$ and finite arm sets since it matches the minimax rate for standard linear bandits under heavy-tailed noise~\citep{xue2020nearly}. Further exploring the regret lower bound for $d$ and $r$ under LowHTR is notably challenging, given the fact that even for the simpler low-rank matrix bandits under sub-Gaussian noise this problem is not thoroughly studied~\citep{kang2022efficient}. And the regret lower bound may differ in the order of $d$ when the arm set is infinitely large and arbitrary~\citep{shao2018almost}. We will leave them as future directions.

\section{Experiments}\label{sec:exp}
\begin{figure*}[t]
\begin{minipage}[b]{0.33\linewidth}
    \centering
    \includegraphics[width = 0.99\textwidth]{t.jpg}
    % \vspace{-1mm}
    % \captionof*{figure}{(b)}
\end{minipage}
\begin{minipage}[b]{0.33\linewidth}
    \centering
    \includegraphics[width = 0.99\textwidth]{pareto.jpg}
    % \vspace{-1mm}
    % \captionof*{figure}{(d)}
\end{minipage}
\begin{minipage}[b]{0.33\linewidth}
    \centering
    \includegraphics[width = 0.995\textwidth]{laplace.jpg}
    % \vspace{-1mm}
    % \captionof*{figure}{(f)}
\end{minipage}
\begin{minipage}[b]{0.33\linewidth}
    \centering
    \includegraphics[width = 0.99\textwidth]{t_con.jpg}
    % \vspace{-1mm}
    % \captionof*{figure}{(a)}
\end{minipage}
\begin{minipage}[b]{0.33\linewidth}
    \centering
    \includegraphics[width = 0.99\textwidth]{pareto_con.jpg}
    % \vspace{-1mm}
    % \captionof*{figure}{(c)}
\end{minipage}
\begin{minipage}[b]{0.33\linewidth}
    \centering
    \includegraphics[width = 0.995\textwidth]{laplace_con.jpg}
    % \vspace{-1mm}
    % \captionof*{figure}{(e)}
\end{minipage}
\caption{Plots of cumulative regrets of LowESTR and our proposed LOTUS with fixed or changing contextual arm set under t-distribution, Pareto, and Laplace heavy-tailed noise. We use the LOTUS algorithm agnostic to $r$ in the first three experiments displayed in the first row, and we utilize the value of $r$ in LOTUS in experiments shown in the second row.}
\label{plt:exp}
\end{figure*}

We demonstrate that our proposed LOTUS yields superior performance over the existing LowESTR algorithm~\citep{lu2021low} in the presence of heavy-tailed noise under a suite of simulations. Since our work is the first one to study the LowHTR problem and currently there is no existing method for comparison, we utilize the LowESTR algorithm specifically designed for the sub-Gaussian noise to validate the robustness of our proposed LOTUS. LowESTR also borrows the idea of the two-stage framework from ESTR, and it improves upon ESTR on the computational efficiency of the matrix recovery step. It requires both the knowledge of the horizon $T$ and the rank $r$ as inputs. In the following experiments, we showcase that it becomes vulnerable and achieves suboptimal performance under heavy-tailed noise in practice as expected. The values of hyperparameters in our LOTUS are strictly aligned with their theoretical results deduced in Theorem~\ref{thm:estimator} and Theorem~\ref{thm:lowto}.

We consider two different settings of the parameter matrices $\Theta^*$ with $d_1=d_2=10$ and $r=2$. For the first scenario, we set the parameter matrix as a diagonal matrix $\Theta^* = \text{diag}([7,4,0,\dots,0])$. The arm set is fixed where we draw $500$ random matrices from $\{X \in \R^{10 \times 10}: \fnorm{X} \leq 1 \}$ in the beginning. And we implement the improved LOTUS algorithm introduced in Subsection~\ref{subsec:lotusunknown} that is unaware of the rank $r$ in this scenario. For the second case, we consider a more challenging parameter matrix $\Theta^*$ such that its first row represents a random vector of norm $7$ and its second row is a perpendicular vector of norm $4$ with other entries set to $0$. Contrasting the first scenario, we consider a contextual arm set with $10$ feature matrices drawn from $\{X \in \R^{10 \times 10}: \fnorm{X} \leq 1 \}$ at each round. And we use the original LOTUS algorithm introduced in Subsection~\ref{subsec:lotus} requiring the knowledge of $r=2$. For the heavy-tailed noise $\eta_t$, we consider the following three types of distribution for both scenarios introduced above:

\begin{itemize}
    \item \textbf{Student's t-distribution:} The density function is given as $f(x) \asymp (1+x^2/\nu)^{-\frac{\nu+1}{2}}$ with degree of freedom parameter $\nu > 0$ and $x \in \R$. By setting $\nu = 1.7$, it has infinite variance but finite $1.5$ moment bounded by $6$. The heavy-tail index is equal to $1.60$.\footnote{A greater heavy-tail index~\citep{hoaglin2000understanding} above $1$ indicates stronger fluctuation and heavy-tailedness of the distribution.}
    \item \textbf{Pareto distribution:} The density function is given as $f(x) \asymp \alpha/(x+1)^{\alpha+1}$ for some shape parameter $\alpha > 0$ and $x > 0$. By setting $\alpha = 1.9$, it also has infinite variance but finite $1.5$ moment bounded by $5$. And the heavy-tail index is equal to $2.20$.
    \item \textbf{Laplace distribution:} The density function is formulated as $f(x) \asymp \exp(-|x|/b)$ with some scale parameter $b$ for $x \in \R$. By setting $b=1$, the distribution possesses a finite variance bounded by $2$. The heavy-tail index of this distribution is $1.36$.
\end{itemize}

According to Figure~\ref{plt:exp}, we observe that our LOTUS algorithm consistently exhibits superior and more resilient performance across all six scenarios compared to LowESTR. This advantage is particularly evident when dealing with distributions with a higher heavy-tail index, which is aligned with our expectations. On the contrary, LowESTR performs fairly well in the presence of Laplace noise with a finite variance but struggles when faced with Pareto noise possessing stronger heavy-tailedness. Furthermore, it is noteworthy that the cumulative regret of the LOTUS algorithm exhibits a batch-wise increase, with a progressively clearer sub-linear pattern emerging in subsequent batches. This fact firmly validates the practical superiority of our LOTUS algorithm under both cases when the rank $r$ is given or not.

\section{Conclusions}\label{sec:conclu}
In this work, we introduce and examine the new problem of LowHTR, and we propose a robust algorithm named LOTUS that can be agnostic to $T$ and even the rank $r$ with a slightly milder regret bound. We also develop a matching lower bound to demonstrate our LOTUS is nearly optimal in the order of $T$. Meanwhile, we prove that our Huber-type estimator could solve the trace regression problem under arbitrary heavy-tailed noise with finite $(1+\delta)$ moment ($\delta \in (0,1]$) and its Frobenius norm error is of scale $\tilde O((d/n)^\frac{\delta}{1+\delta} \E(|\eta|^{1+\delta})^\frac{1}{1+\delta})$.  The practical superiority of our proposed method is validated under simulations.

\textbf{Limitations:} Although our work represents the first solution to the low-rank matrix bandits without knowing $r$, it leaves a gap with our deduced lower bound. Closing this regret gap seems highly non-trivial without additional assumptions~\citep{oh2021sparsity}, and we will leave it as a future work.




\begin{acknowledgements} 
We appreciate the constructive feedback from the anonymous reviewers and area chair. This work was partially supported by the National Science Foundation under grants CCF-1934568, DMS-1916125, DMS-2113605, DMS-2210388, IIS-2008173 and IIS-2048280.
\end{acknowledgements}

% References
\bibliography{kang_26}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newpage

\onecolumn

\title{Appendix for Low-rank Matrix Bandits with Heavy-tailed Rewards\\(Supplementary Material)}
\maketitle



\appendix
\section{Analysis of Theorem~\ref{thm:estimator}}\label{app:estimator}
The analysis of Theorem~\ref{thm:estimator} is inspired by and extended from~\cite{yu2023low}.
\subsection{Preliminaries}\label{app:estimatorlemmas}
\begin{lemma}\label{lem:bernstein}(Bernstein Inequality)
    Let $X$ be a random variable with mean $\mu$ and variance $\sigma^2$. Assume we can find some $b >0$ such that 
    $$\E |X - \mu|^k \leq \frac{1}{2} k! \sigma^2 b^{k-2}, \; k = 3 ,4 ,5 ,\dots$$
    Then it holds that
    $$\P \left( |X - \mu| \geq t \right) \leq 2 \exp \left(- \frac{t^2}{2(\sigma^2 + bt)} \right), \; \forall t > 0.$$
\end{lemma}

\begin{corollary}\label{coro:bernstein}(Adapted from Bernstein Inequality)
    Let $X$ be a random variable with mean $\mu$ and variance $\sigma^2$. Assume we can find some $b >0$ such that 
    $$\E |X - \mu|^k \leq \frac{1}{2} k! \sigma^2 b^{k-2}, \; k = 3 ,4 ,5 ,\dots$$
    Then it holds that
    $$\P \left( X - \mu \geq \sqrt{2t} \sigma + 2bt \right) \leq \exp{(-t)}, \; \forall t > 0.$$
\end{corollary}
\proof 
Based on Lemma~\ref{lem:bernstein}, we have that for any $t>0$
\begin{align*}
\P\left( X - \mu \geq \sqrt{2t} \sigma + 2bt \right) \leq \exp{\left( - \frac{(\sqrt{2t} \sigma + 2bt)^2}{2\sigma^2 + 2b(\sqrt{2t}\sigma + 2bt)} \right)} &\leq 
\exp{\left( - \frac{2\sigma^2 t + 4b^2t^2 + 4\sqrt{2} b \sigma t^\frac{3}{2} }{2\sigma^2 + 4b^2 t + 2\sqrt{2} b \sigma \sqrt{t}} \right)} \\
& \leq \exp(-t).
\end{align*}
\hfill \qedsymbol

\begin{definition}(Local Restricted Strong Convexity) For the empirical loss function $\hat L_\tau(\cdot)$, we can define the event of local restricted strong convexity $\mathcal{E}(s,l,\kappa)$ in terms of the radius parameter $s,l$ and the curvature parameter $\kappa$ as
$$\mathcal{E}(s,l,\kappa) = \left\{ \inf_{\Theta \in \mathcal{M}(\Theta^*,s,l)} \frac{\inp{\nabla \hat L_\tau(\Theta) - \nabla \hat L_\tau(\Theta^*)}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}^2} \geq \kappa
 \right\},$$
where $\mathcal{M}(\Theta^*,s,l) = \left\{ \Theta \in \R^{d_1 \times d_2}: \fnorm{\Theta - \Theta^*} \leq s, \nunorm{\Theta - \Theta^*} \leq l \fnorm{\Theta - \Theta^*}  \right\}$.
    
\end{definition}

We assume $d_1 \geq d_2$ without loss of generality, and denote $\widehat \Delta \coloneqq \widehat \Theta - \Theta^*$ in the following argument. To start with, we will show that our target $\fnorm{\widehat \Delta}$ can be bounded conditioned on the event $\mathcal E(s,l,\kappa)$ and $\lambda \geq 2 \opnorm{\nabla \hat L_\tau(\Theta^*)}$.
\begin{theorem}\label{thm:thm1}
    Conditioned on the event $\lambda \geq 2 \opnorm{\nabla \hat L_\tau(\Theta^*)}$ and the event $\mathcal E(s,l,\kappa)$ with $s \geq 9 \sqrt{r} \frac{\lambda}{\kappa}$ and $l \geq 4 \sqrt{2r}$, then we can deduce that
    $$\fnorm{\widehat \Delta} = \fnorm{\widehat \Theta - \Theta^*} \leq 9 \sqrt{r} \cdot \frac{\lambda}{\kappa}.$$
\end{theorem}
\proof We will prove Theorem~\ref{thm:thm1} by contradiction. Assume we have that $\lambda \geq 2 \opnorm{\nabla \hat L_\tau(\Theta^*)}$ and $\mathcal E(s,l,\kappa)$ holds with $s \geq 9 \sqrt{r} \frac{\lambda}{\kappa}$ and $l \geq 4 \sqrt{2r}$, and we assume $\fnorm{\widehat \Delta} > 9\sqrt{r} \cdot \frac{\lambda}{\kappa}$ holds. Define $\widetilde \Theta_x = \Theta^* + x (\hTheta - \Theta^*)$ as a function of $x \in [0,1]$, then there exists some $\zeta \in (0,1)$ such that $\tTheta = \Theta^* + \zeta (\hTheta - \Theta^*)$ satisfying $\fnorm{\tTheta - \Theta^*} = 9\sqrt{r} \cdot \frac{\lambda}{\kappa}$ since $\fnorm{\widetilde \Theta_x - \Theta^*}$ is a continuous function in terms of $x \in [0,1]$. Furthermore, we define $Q(x) = \hat L_\tau(\txTheta) - \hat L_\tau(\Theta^*) - \inp{\nabla \hat L_\tau(\Theta^*)}{\txTheta - \Theta^*}$. Note $x \in [0,1] \rightarrow Q(x)$ can be easily shown as a convex function: first, we observe that $\txTheta$ is a linear function of $x$, and the Huber loss function defined in Section~\ref{subsec:lowrank} is convex~\citep{huber1965robust}, which implies that $\hat L_\tau(\txTheta)$ is convex. On the other hand, the inner product $\inp{\nabla \hat L_\tau(\Theta^*)}{\txTheta - \Theta^*}$ is bi-linear and hence naturally convex as well. Therefore, we know that $Q^\prime(x) = \inp{\nabla \hat L_\tau(\txTheta)-\nabla \hat L_\tau(\Theta^*)}{\hTheta - \Theta^*}$ is monotonically increasing. And it holds that 
\begin{align}
    \zeta Q^\prime(\zeta) \leq \zeta Q^\prime(1) \; \; \Longrightarrow \; \; \inp{\nabla \hat L_\tau(\tTheta)-\nabla \hat L_\tau(\Theta^*)}{\tTheta - \Theta^*} \leq \zeta \inp{\nabla \hat L_\tau(\hTheta)-\nabla \hat L_\tau(\Theta^*)}{\hTheta - \Theta^*}\label{eq:thm1:1}
\end{align}
To bound the right-hand side of Eqn.~\eqref{eq:thm1:1}, since $\hTheta$ is the solution to the convex optimization problem in Eqn.~\eqref{eq:estimator}, then we have the sub-gradient condition as:
$$\inp{\nabla \hat L_\tau (\hTheta) + \lambda \widehat Z}{\widehat \Theta - \Theta^*} \leq 0, \; \; \text{where } \widehat Z \in \partial \nunorm{\hTheta}.$$ Due to the definition of the sub-gradient, it holds that $\nunorm{\Theta^*} \geq \|{\hTheta}\|_{\text{nuc}} + \inp{\widehat Z}{\Theta^* - \hTheta}$. By assuming $\lambda \geq 2 \opnorm{\nabla \hat L_\tau(\Theta^*)}$, we can have that
\begin{align}
    \inp{\nabla \hat L_\tau(\hTheta) - \nabla \hat L_\tau(\Theta^*)}{\hTheta - \Theta^*} &\leq \inp{\lambda \widehat Z}{\Theta^* - \hTheta} + \inp{\nabla \hat L_\tau(\Theta^*)}{\Theta^* - \hTheta} \nonumber \\
    &\leq \lambda\left(\nunorm{\Theta^*} - \|{\hTheta}\|_{\text{nuc}}\right) + \frac{\lambda}{2} \nunorm{\Theta^* - \hTheta} \leq \frac{3 \lambda}{2} \nunorm{\widehat \Delta} \nonumber
\end{align}
To bound $\nunorm{\widehat \Delta}$, we utilize the regular procedure~\citep{negahban2011estimation,yu2023low}. We restate the notation and define the reduced SVD of $\Theta^*$ as $\Theta^* = U \Sigma V^\top$ with $U \in \R^{d_1 \times r}$ and $V \in \R^{d_2 \times r}$. Then we denote two sets as:
\begin{gather*}
    \mathbb M = \left\{ \Theta \in \R^{d_1 \times d_2}: \, \text{row}(\Theta) \subseteq \text{col}(V), \text{col}(\Theta) \subseteq \text{col}(U)  \right\}, \\
    \bar{\mathbb M}^\perp = \left\{ \Theta \in \R^{d_1 \times d_2}: \, \text{row}(\Theta) \subseteq \text{col}(V)^\perp, \text{col}(\Theta) \subseteq \text{col}(U)^\perp  \right\},
\end{gather*}
and hence $\mathbb M \subseteq \bar{\mathbb M}$. Next we will show that $\|{\widehat \Delta_{\bar{\mathbb M}^\perp}}\|_{\text{nuc}} \leq 3\|{\widehat \Delta_{\bar{\mathbb M}}}\|_{\text{nuc}}$ in the following part. First, since $\widehat \Theta$ is the solution to the problem defined in Eqn.\eqref{eq:estimator}, we have that 
$$\hat L_\tau(\widehat \Theta) + \lambda \nunorm{\widehat \Theta} \leq \hat L_\tau(\Theta^*) + \lambda \nunorm{\Theta^*} \; \; \Longleftrightarrow \; \; \hat L_\tau(\widehat \Theta) -  \hat L_\tau(\Theta^*) \leq \lambda \left( \nunorm{\Theta^*} - \nunorm{\widehat \Theta} \right).$$
For the left-hand side, it holds that
\begin{align}
    \hat L_\tau(\widehat \Theta) -  \hat L_\tau(\Theta^*) &\geq \inp{\nabla \hat L_\tau(\Theta^*)}{\widehat \Theta - \Theta^*} \geq - \opnorm{\nabla \hat L_\tau(\Theta^*)} \nunorm{\widehat \Delta} \nonumber \\
    &\geq - \opnorm{\nabla \hat L_\tau(\Theta^*)} \left(\nunorm{\widehat \Delta_{\bar{\mathbb M}}} + \nunorm{\widehat \Delta_{\bar{\mathbb M}^\perp}} \right) \geq - \frac{\lambda}{2} \left(\nunorm{\widehat \Delta_{\bar{\mathbb M}}} + \nunorm{\widehat \Delta_{\bar{\mathbb M}^\perp}} \right). \label{eq:thm1:2}
\end{align}
And for the right-hand side, we have that
$$\nunorm{\widehat \Theta} = \nunorm{\Theta^* + \widehat \Delta} = \nunorm{\Theta^*_{\mathbb M} +\widehat \Delta_{\bar{\mathbb M}} + \widehat \Delta_{\bar{\mathbb M}^\perp}} \geq \nunorm{\Theta^*_{\mathbb M}} +\nunorm{\widehat \Delta_{\bar{\mathbb M}^\perp}}  -\nunorm{\widehat \Delta_{\bar{\mathbb M}}},$$
and hence we have that 
\begin{align}
    \nunorm{\Theta^*} - \nunorm{\widehat \Theta} = \nunorm{\Theta^*_{\mathbb M}} - \nunorm{\widehat \Theta}  \leq \nunorm{\widehat \Delta_{\bar{\mathbb M}}} - \nunorm{\widehat \Delta_{\bar{\mathbb M}^\perp}}. \label{eq:thm1:3}
\end{align}
Combining the results from Eqn.~\eqref{eq:thm1:2} and Eqn.~\eqref{eq:thm1:3}, we can deduce that $\|{\widehat \Delta_{\bar{\mathbb M}^\perp}}\|_{\text{nuc}} \leq 3\|{\widehat \Delta_{\bar{\mathbb M}}}\|_{\text{nuc}}$. Next, since we have that rank$(\widehat \Delta_{\bar{\mathbb M}}) \leq 2r$, then based on Cauchy-Schwarz inequality it holds that
$$\nunorm{\widehat \Delta} \leq \nunorm{\widehat \Delta_{{\mathbb M}}} + \nunorm{\widehat \Delta_{\bar{\mathbb M}^\perp}} \leq \nunorm{\widehat \Delta_{\bar{\mathbb M}}} + \nunorm{\widehat \Delta_{\bar{\mathbb M}^\perp}} \leq 4 \nunorm{\widehat \Delta_{\bar{\mathbb M}}} \leq 4 \sqrt{2r} \fnorm{\widehat \Delta_{\bar{\mathbb M}}} \leq 4 \sqrt{2r} \fnorm{\widehat \Delta}.$$
Therefore, we can show that $\nunorm{\tTheta - \Theta^*} \leq 4 \sqrt{2r} \fnorm{\tTheta - \Theta^*}$. And remember that we assume $\fnorm{\tTheta - \Theta^*} = 9\sqrt{r} \cdot \frac{\lambda}{\kappa}$. These facts indicate that $\tTheta \in \mathcal{M}(\Theta^*,s,l)$ with $s \geq 9 \sqrt{r} \frac{\lambda}{\kappa}$ and $l \geq 4 \sqrt{2r}$. Therefore, based on the local restricted strong convexity, we have
$$\kappa \zeta \fnorm{\widehat \Delta} \fnorm{\tTheta - \Theta^*} = \kappa \fnorm{\tTheta - \Theta^*}^2 \leq \inp{\nabla \hat L_\tau(\tTheta) - \nabla \hat L_\tau(\Theta^*)}{\tTheta-\Theta^*}.$$
For the left-hand side, it holds that
$$\kappa \zeta \fnorm{\widehat \Delta} \fnorm{\tTheta - \Theta^*} = \kappa \zeta \fnorm{\widehat \Delta} 9 \sqrt{r} \frac{\lambda}{\kappa} =\zeta \lambda \fnorm{\widehat \Delta} 9 \sqrt{r} ,$$
and for the right-hand side, based on Eqn.~\eqref{eq:thm1:1} we have that
\begin{align*}
\inp{\nabla \hat L_\tau(\tTheta) - \nabla \hat L_\tau(\Theta^*)}{\tTheta-\Theta^*} &\leq \zeta\inp{\nabla \hat L_\tau(\hTheta) - \nabla \hat L_\tau(\Theta^*)}{\hTheta-\Theta^*} \\
& \leq \zeta \frac{3\lambda}{2} \nunorm{\widehat \Delta}  \leq \zeta 6\sqrt{2}  \lambda \sqrt{r} \fnorm{\widehat \Delta}.
\end{align*}
Consequently, we would have $9 \leq 6\sqrt{2}$, which is false since $6\sqrt{2} < 9$. This contradiction means that 
$$\fnorm{\widehat \Delta} \leq 9\sqrt{r} \cdot \frac{\lambda}{\kappa}.$$
\hfill \qedsymbol

Next, we will show the event $\mathcal E(s,l,\kappa)$ and the event $\lambda \geq 2 \opnorm{\nabla \hat L_\tau(\Theta^*)}$ hold with high probability individually. Specifically, we will first give an upper bound of $\opnorm{\nabla \hat L_\tau(\Theta^*)}$ in Theorem~\ref{thm:lambda} and then present the event $\mathcal E(s,l,\kappa)$ holds with high probability in Theorem~\ref{thm:e}.
\begin{theorem}\label{thm:lambda}
    By taking $\tau = (\frac{n}{5d-\ln(\epsilon)})^\frac{1}{1+\delta}c^\frac{1}{1+\delta}$, then with probability at least $1-\epsilon$, it holds that
    $$\opnorm{\nabla \hat L_\tau(\Theta^*)} \leq (10+11\sqrt{2}) \sigma \left(\frac{5d-\ln(\epsilon)}{n}\right)^\frac{\delta}{1+\delta} c^\frac{1}{1+\delta}.$$
\end{theorem}
\proof
Define the zero-mean random matrix $\Gamma = \nabla \hat L_\tau(\Theta^*) - \E \nabla \hat L_\tau(\Theta^*)$, then we have that
$$\opnorm{\nabla \hat L_\tau(\Theta^*)} = \opnorm{\nabla \hat L_\tau(\Theta^*) - \E \nabla \hat L_\tau(\Theta^*) + \E \nabla \hat L_\tau(\Theta^*)} \leq \opnorm{\Gamma} + \opnorm{\E \nabla \hat L_\tau(\Theta^*)}.$$
Therefore, we could control these two terms separately. Denote $S^{d-1} = \{u \in \R^d: \|u\|_2 = 1\}$. For the second term, we have that 
$$\nabla \hat L_\tau (\Theta^*) = -\frac{1}{n} \sum_{i=1}^n l^\prime_\tau(y_i - \inp{X_i}{\Theta^*}) X_i = - \frac{1}{n} \sum_{i=1}^n l^\prime_\tau(\eta_i) X_i.$$
Therefore, we can deduce that
\begin{align*}
    \opnorm{\E \nabla \hat L_\tau(\Theta^*)} &= \sup_{u \in S^{d_1-1}, v \in S^{d_2-1}} \frac{1}{n} \sum_{i=1}^n \E \left( l^\prime_\tau (\eta_i) u^\top X_i v \right) \\
    &= \sup_{u \in S^{d_1-1}, v \in S^{d_2-1}} \frac{1}{n} \sum_{i=1}^n \E \left(\E \left( l^\prime_\tau (\eta_i) u^\top X_i v | \mathcal{F}_i \right)  \right) \\
    & = \sup_{u \in S^{d_1-1}, v \in S^{d_2-1}} \frac{1}{n} \sum_{i=1}^n \E \left(u^\top X_i v \cdot \E \left( l^\prime_\tau (\eta_i) | \mathcal{F}_i \right) \right)
\end{align*}
By the expression of $l_\tau^\prime(\cdot)$, we can deduce that 
$$| \E \left( l^\prime_\tau (\eta_i) | \mathcal{F}_i \right) | = | \E \left( l^\prime_\tau (\eta_i) - \eta_i | \mathcal{F}_i \right) | \leq  
\E \left( \frac{|\eta_i|^{1+\delta}}{\tau^\delta} {\Big |} \mathcal{F}_i \right) \leq \frac{c}{\tau^\delta}$$
And since $u^\top X_i v$ is sub-Gaussian with the parameter $\sigma^2$, we have $\E(|u^\top X_i v|) \leq \sqrt{2\sigma^2}$. Conclusively, it holds that
\begin{align}
    \opnorm{\E \nabla \hat L_\tau(\Theta^*)} \leq \frac{\sqrt{2}}{\tau^\delta}c \cdot \sigma. \label{eq:thm2:imp1}
\end{align}
To bound the operator norm of $\Gamma$, we use the regular covering technique: Let $\mathcal N^d_{\frac{1}{4}}$ be the $1/4$ covering of $S^{d-1}$, then we claim that
\begin{align}
    \opnorm{\Gamma} \leq  \frac{5}{2} \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} u^\top \Gamma v. \label{eq:thm2:1}
\end{align}
To prove this result, for any $u \in S^{d_1-1}, v \in S^{d_2-1}$, we denote $S(u) \in \R^{d_1}$ $(S(v) \in \R^{d_2})$ as the nearest neighbor of $u$ $(v)$ in $\mathcal N^{d_1}_{\frac{1}{4}}$ ($\mathcal N^{d_2}_{\frac{1}{4}}$) such that $\|u-S(u)\|_2, \|v-S(v)\|_2 \leq \frac{1}{4}$. We take $u,v$ such that $u^\top \Gamma v = \opnorm{\Gamma}$. Therefore, it holds that
\begin{align*}
    \opnorm{\Gamma} = u^\top \Gamma v &= S(u)^\top \Gamma S(v) + (u-S(u))^\top \Gamma v + u^\top \Gamma (v - S(v)) + (u-S(u))^\top \Gamma (v - S(v)) \\
    &\leq \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} u^\top \Gamma v + \frac{1}{4} \opnorm{\Gamma} + \frac{1}{4} \opnorm{\Gamma} + \frac{1}{16} \opnorm{\Gamma} \leq \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} u^\top \Gamma v + \frac{3}{5} \opnorm{\Gamma},
\end{align*}
which leads to Eqn.~\eqref{eq:thm2:1}. And then it holds that
$$\opnorm{\Gamma} \leq \frac{5}{2} \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} \frac{1}{n} \sum_{i=1}^n \left[\E\left(l^\prime_\tau(\eta_i) u^\top X_i v \right) -  l^\prime_\tau(\eta_i) u^\top X_i v \right].$$
To bound the right-hand side term, we aim to use a union bound of probability with Corollary~\ref{coro:bernstein}. Since $u^\top X_i v$ is sub-Gaussian with parameter $\sigma$ for arbitrary $u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}$, then we have that for $k=2,3,\dots$
$$\E|u^\top X_i v|^k = \int_{0}^{\infty} \P\left(|u^\top X_i v|^k > t \right)dt \leq 2 \int_{0}^{\infty} \exp{\left(-\frac{t^2}{2k\sigma^2}\right)} dt \leq \frac{1}{2} \cdot k! \cdot (\sqrt{2}\sigma)^k.$$
The above results along  with the fact that $|l^\prime_\tau(\cdot)| \leq \tau$ can lead to the following inequality for $k=2,3,\dots$:
$$\E\Big|\sum_{i=1}^n l^\prime_\tau(\eta_i) u^\top X_i v \Big|^k \leq n\cdot \tau^{k-1-\delta} \E\left(|l^\prime_\tau(\eta_i)|^{1+\delta}  |u^\top X_i v|^k\right) \leq \frac{1}{2} \cdot k! \cdot \left(\sqrt{2}\sigma \tau\right)^{k-2}\cdot(2n\sigma^2\tau^{1-\delta}c).$$
Based on Corollary~\ref{coro:bernstein}, it holds that
$$\P\left(u^\top \Gamma v \geq 4 \sqrt{x\sigma^2\tau^{1-\delta} c} \frac{1}{\sqrt{n}} + 4\sqrt{2} \sigma \tau \frac{x}{n} \right) \leq e^{-x}.$$
By taking the union bound on all $u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}$ and using the fact that $9^{d_1+d_2} \leq e^{5d}$, it holds that
\begin{align}
    \P\left(\opnorm{\Gamma} \geq  \frac{5}{2} \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} u^\top \Gamma v   \geq 10\sigma \sqrt{c} \sqrt{\frac{5d-\ln{(\epsilon)}}{n}} \tau^{\frac{1-\delta}{2}} + 10 \sqrt{2} \sigma \tau \frac{5d-\ln(\epsilon)}{n} \right) \leq \epsilon. \label{eq:thm2:imp2}
\end{align}
Combining the results in Eqn.~\eqref{eq:thm2:imp1} and Eqn.~\eqref{eq:thm2:imp2}, we have that 
$$    \P\left( \opnorm{\nabla \hat L_\tau(\Theta^*)}  \geq 10\sigma \sqrt{c} \sqrt{\frac{5d-\ln{(\epsilon)}}{n}} \tau^{\frac{1-\delta}{2}} + 10 \sqrt{2} \sigma \tau \cdot \frac{5d-\ln(\epsilon)}{n} + \frac{\sqrt{2}}{\tau^\delta}c \cdot \sigma \right) \leq \epsilon.$$
By taking $\tau = \left(\frac{n}{5d-\ln(\epsilon)}\right)^\frac{1}{1+\delta} \cdot c^\frac{1}{1+\delta}$, we have that
$$\P \left( \opnorm{\nabla \hat L_\tau(\Theta^*)} \leq (10+11\sqrt{2}) \sigma c^\frac{1}{1+\delta} \cdot \left( \frac{5d-\ln(\epsilon)}{n} \right)^\frac{\delta}{1+\delta} \right) \geq 1 - \epsilon.$$
\hfill \qedsymbol
\begin{theorem}\label{thm:e}
    For any $s,l > 0$, if we take $\tau$ and $n$ such that
    \begin{gather*}
        \tau \geq \max\left\{32\sigma^2 s \sqrt{\frac{1}{c_l}}, \left(\frac{64\sigma^2c}{c_l} \right)^\frac{1}{1+\delta} \right\} \\
        n \geq \max \left\{ 8 \ln{(9)} (d_1 + d_2) ,\left(225 \sigma \sqrt{\ln(9)(d_1+d_2)} \frac{\tau l}{s c_l} \right)^2, \left(\frac{48 \sigma^2}{c_l}\sqrt{-2\ln(\epsilon)} \right)^2, -\frac{\tau^2}{c_l s^2} \ln(\epsilon) \right\}.
    \end{gather*}
    Then with probability at least $1-\epsilon$, the local restricted strong convexity $\mathcal{E}(s,l,\kappa)$ holds with $\kappa = \frac{c_l}{4}$.
\end{theorem}
\proof Given the values of $s,l > 0$, for the sake of simplicity we denote the set $\Phi$ as $\Phi = \mathcal{M}(\Theta^*,s,l) = \left\{ \Theta \in \R^{d_1 \times d_2}: \fnorm{\Theta - \Theta^*} \leq s, \nunorm{\Theta - \Theta^*} \leq l \fnorm{\Theta - \Theta^*}  \right\}$. Since the Huber loss is convex and differentiable, we have
\begin{align*}
    D(\Theta) &\coloneqq \inp{\nabla \hat L_\tau(\Theta) - \nabla \hat L_\tau(\Theta^*)}{\Theta - \Theta^*} \\
    & = \frac{1}{n} \sum_{i=1}^n \left(l^\prime_\tau(y_i - \inp{X_i}{\Theta^*}) - l^\prime_\tau(y_i - \inp{X_i}{\Theta})\right) \cdot \inp{X_i}{\Theta - \Theta^*} \\
    &\geq \frac{1}{n} \sum_{i=1}^n  \left(l^\prime_\tau(y_i - \inp{X_i}{\Theta^*}) - l^\prime_\tau(y_i - \inp{X_i}{\Theta})\right) \cdot \inp{X_i}{\Theta - \Theta^*} \cdot \mathbb{1}_{\Xi_i(\Theta)},
\end{align*}
where the last inequality holds since Huber loss is convex, and $\Xi_i(\Theta)$ is defined as
$$\Xi_i(\Theta)  = \left\{ | \eta_i| \leq \frac{\tau}{2} \right\} \cap \left\{ |\inp{X_i}{\Theta - \Theta^*}| \leq \frac{\tau}{2s} \fnorm{\Theta - \Theta^*} \right\}.$$
Note whenever $\Theta \in \Phi$ and $\Xi_i(\Theta)$ hold we have that
$$|y_i - \inp{X_i}{\Theta}| \leq |y_i - \inp{X_i}{\Theta^*}| + \frac{\tau}{2s} \cdot \fnorm{\Theta - \Theta^*} \leq \tau.$$
Since we have $l_\tau^{\prime\prime}(u) = 1$ with $|u| \leq \tau$, it holds that
$$D(\Theta) \geq \frac{1}{n} \sum_{i=1}^n \inp{X_i}{\Theta - \Theta^*}^2 \cdot \mathbb{1}_{\Xi_i(\Theta)}.$$
Furthermore, we define the function $\phi_R(x)$ with some $R>0$ as
$$\phi_R(x) = \begin{cases}
    x^2, &\text{ if } |x| \leq \frac{R}{2}; \\
    (x-R)^2, &\text{ if } \frac{R}{2} <x \leq R; \\
    (x+R)^2, &\text{ if } -R \leq x < -\frac{R}{2}; \\
    0, &\text{ otherwise}.
\end{cases}$$
And we know $\phi_R(\cdot)$ is $R$-Lipschitz continuous with the properties that
$$\phi_{\alpha R}(\alpha x) = \alpha^2 \phi_R(x) \; \forall \alpha > 0,  \; \, \text{and } x^2\cdot \mathbb{1}_{|x|\leq R/2} \leq \phi_R(x) \leq x^2\cdot \mathbb{1}_{|x|\leq R}.$$
Then we can deduce that
\begin{align*}
    \frac{D(\Theta)}{\fnorm{\Theta - \Theta^*}^2} &\geq \frac{1}{n} \sum_{i=1}^n \left( \frac{\inp{X_i}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}} \right)^2 \cdot \mathbb{1}_{\Xi_i(\Theta)} \geq \frac{1}{n} \sum_{i=1}^n \phi_{\frac{\tau}{2s}} \left(\frac{\inp{X_i}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}}\right) \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \\
    & \coloneqq \frac{1}{n} \sum_{i=1}^n \beta_{\tau, s}(X_i, \Theta, \eta_i) \\
    & = \frac{1}{n} \sum_{i=1}^n \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) + \frac{1}{n} \sum_{i=1}^n \beta_{\tau, s}(X_i, \Theta, \eta_i) - \frac{1}{n} \sum_{i=1}^n \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) \\
    & \geq \frac{1}{n} \sum_{i=1}^n \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) - \sup_{\Theta \in \Phi} \left| \frac{1}{n} \sum_{i=1}^n \beta_{\tau, s}(X_i, \Theta, \eta_i) - \frac{1}{n} \sum_{i=1}^n \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) \right| \\
    & \coloneqq A_1 - A_2.
\end{align*}
For simplicity we write $\Delta = \Theta - \Theta^*$ as a function of $\Theta$. To lower bound the first term $A_1$, we have that for any $i \in [n]$,
\begin{align*}
    \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) &\geq \E \left[ \left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^2 \cdot \mathbb{1}_{\{|\inp{X_i}{\Delta}| \leq \frac{\tau}{4s} \fnorm{\Delta}\}} \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right] \\
    &\geq \E\left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^2  - \E \left[ \left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^2 \cdot \mathbb{1}_{\{|\inp{X_i}{\Delta}| > \frac{\tau}{4s} \fnorm{\Delta}\}} \right]  - \E \left[ \left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^2 \cdot \mathbb{1}_{\{|\eta_i| > \frac{\tau}{2}\}} \right]  \\
    & \coloneqq A_{11} - A_{12} - A_{13}.
\end{align*}
Based on Assumption~\ref{assu:subg}, we have $A_{11} \geq c_l$. Furthermore, it holds that
\begin{gather*}
    A_{12} \leq \sqrt{\E\left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^4} \cdot \sqrt{\E\left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^4 \Big{/} \left(\frac{\tau}{4s}\right)^4} \leq 256\sigma^4 \cdot \frac{s^2}{\tau^2} \\
    A_{13} \leq \left(\frac{2}{\tau}\right)^{1+\delta} \E\left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^2 \cdot \E|\eta_i|^{1+\delta} \leq \frac{16}{\tau^{1+\delta}} \sigma^2 \cdot c.
\end{gather*}
By choosing that $\tau \geq \max\left\{32\sigma^2 s \sqrt{\frac{1}{c_l}}, \left(\frac{64\sigma^2c}{c_l} \right)^\frac{1}{1+\delta} \right\}$, it holds that $A_{12} \leq \frac{c_l}{4}$ and $A_{13} \leq \frac{c_l}{4}$, which indicates that
\begin{align}
    \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) \geq \frac{c_l}{2}, \; \, \forall i \in [n], \nonumber
\end{align}
which implies that 
\begin{align}
    A_1 \geq \frac{c_l}{2} \label{eq:thm3:imp1}
\end{align}
Afterward, we would like to upper-bound the term $A_2$. Since we have that $\forall i \in [n]$
$$0 \leq \beta_{\tau, s}(X_i, \Theta, \eta_i) \leq \frac{\tau^2}{16s^2}, \, \; \; \E\left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right)^2 \leq \E\left( \frac{\inp{X_i}{\Delta}}{\fnorm{\Delta}}\right)^4 \leq 16\sigma^4.$$
Then based on the Bousquet's inequality~\citep{bousquet2002concentration}, with probability at least $1-\epsilon$ it holds that 
\begin{align*}
    A_2 &\leq \E A_2 + \sqrt{\E A_2} \cdot \frac{\tau}{2s} \sqrt{\frac{-\ln(\epsilon)}{n}} + 4 \sigma^2  \sqrt{\frac{-2\ln(\epsilon)}{n}} + \frac{\tau^2}{16s^2} \frac{-\ln(\epsilon)}{3n} \\
    &\leq 2 \E A_2 + 4 \sigma^2 \sqrt{\frac{-2\ln(\epsilon)}{n}} + \frac{\tau^2}{16s^2} \frac{-4\ln(\epsilon)}{3n}.
\end{align*}
To bound the first term $\E A_2$, we use the regular Rademacher symmetrization argument by defining a series of iid Rademacher random variables $\{e_i\}$ with $\tilde X_i, \tilde \eta_i$ that are iid with $X_i ,\eta_i$:
\begin{align*}
    \E A_2 &=  \E \left[\sup_{\Theta \in \Phi} \left| \frac{1}{n} \sum_{i=1}^n \beta_{\tau, s}(X_i, \Theta, \eta_i) - \frac{1}{n} \sum_{i=1}^n \E \left(\beta_{\tau, s}(X_i, \Theta, \eta_i)\right) \right| \right] \\
    &\leq \E \left[\sup_{\Theta \in \Phi} \left| \left( \frac{1}{n} \sum_{i=1}^n \beta_{\tau, s}(X_i, \Theta, \eta_i) - \frac{1}{n} \sum_{i=1}^n \E \left(\beta_{\tau, s}(\tilde X_i, \Theta, \tilde \eta_i)\right) \right) e_i \right| \right] \\
    &\leq 2 \E \left[\sup_{\Theta \in \Phi}  \frac{1}{n} \sum_{i=1}^n \beta_{\tau, s}(X_i, \Theta, \eta_i) e_i  \right].
\end{align*}
Denote the set $c(l) \coloneqq \left\{ \Theta \in \R^{d_1 \times d_2}: \nunorm{\Theta - \Theta^*} \leq l \fnorm{\Theta - \Theta^*}  \right\}$. Recall that we define $\beta_{\tau, s}$ as:
$$\beta_{\tau, s}(X_i, \Theta, \eta_i) =  \phi_{\frac{\tau}{2s}} \left(\frac{\inp{X_i}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}}\right) \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} = \phi_{\frac{\tau}{2s}} \left(\frac{\inp{X_i}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}} \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right).$$
Define $c(t) = \frac{2s}{\tau} \phi_{\frac{\tau}{2s}}(t)$ and it is easy to show that $c(\cdot)$ is a $1$-Lipschitz function. By using the Talagrand's concentration inequality~\citep{wainwright2019high}, it holds that
\begin{align*}
    \E A_2 &\leq \frac{\tau}{s} \cdot \E\left[ \sup_{\Theta \in c(l)} \frac{1}{n}\sum_{i=1}^n e_i \cdot \frac{2s}{\tau} \cdot \phi_{\frac{\tau}{2s}} \left(\frac{\inp{X_i}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}} \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right) \right] \\
    &\leq \frac{\tau}{s} \cdot \E\left[ \sup_{\Theta \in c(l)} \frac{1}{n}\sum_{i=1}^n e_i \cdot \frac{2s}{\tau} \cdot \frac{\inp{X_i}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}} \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right] \\
    &\leq \frac{\tau}{s} \cdot \E\left[ \sup_{\Theta \in c(l)} \frac{1}{n} \opnorm{\sum_{i=1}^n e_i X_i\cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}} \cdot \nunorm{\frac{\Theta - \Theta^*}{\fnorm{\Theta - \Theta^*}}} \right] \\
    &\leq \frac{\tau l}{sn} \cdot \E \opnorm{\sum_{i=1}^n e_i X_i\cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}}.
\end{align*}
By using the same technique in the proof of Theorem~\ref{thm:lambda}, we can bound the operator norm by using the covering argument. Denote $\mathcal N^d_{\frac{1}{4}}$ be the $1/4$ covering of $S^{d-1}$, then it holds that
$$\E \opnorm{\sum_{i=1}^n e_i X_i\cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}} \leq \frac{5}{2} \cdot \E\left[ \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} \sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right].$$
Note for any pair of  $u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}$, we have that 
\begin{gather*}
    \E\left(\sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right)  = 0 \\
    \E\left(\sum_{i=1}^n |e_i|^k |u^\top X_i v|^k \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right) \leq \E |u^\top X_i v|^k \leq \frac{1}{2} \cdot k! \cdot (\sqrt{2}\sigma)^{k-2} \cdot 2\sigma^2, \; \; k=2,3,\dots.
\end{gather*}
We can write the moment generating function $M(\lambda)$ of the random variable $\sum_{i=1}^n e_i \cdot u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}$ as:
\begin{align*}
    M(\lambda) &= \E\left[ \exp \left( \lambda  \sum_{i=1}^n e_i \cdot u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}\right) \right] = \prod_{i=1}^n \E\left[ \exp \left( \lambda e_i \cdot u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}\right) \right] \\
    &\leq  \prod_{i=1}^n \left[ 1+ \frac{\lambda^2 \cdot 2\sigma^2}{2} + \frac{\lambda^2 \cdot 2\sigma^2}{2} \left( \sum_{k=3}^\infty (|\lambda| \sqrt{2} \sigma)^{k-2} \right) \right] \\
    &= \prod_{i=1}^n \left[ 1+ \frac{2\lambda^2 \sigma^2}{2} \cdot \frac{1}{1-\sqrt{2} \sigma |\lambda|} \right] \\
    &\leq \exp{\left(n \lambda^2 \sigma^2 \frac{1}{1- \sqrt{2}\sigma |\lambda|}\right)}, \; \; \; \; \; \; |\lambda| \leq \frac{1}{\sqrt{2}\sigma}.
\end{align*}
Therefore, it holds that for any $s_0 > 0$
\begin{align*}
    \E\left[ \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} \sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right] &= \frac{1}{s_0} \E \left[\ln\left( \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} \exp\left(s_0 \cdot \sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right) \right) \right] \\
    &\leq \frac{1}{s_0} \ln\left( \E \left[ \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}}  \exp \left(s_0 \cdot \sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}  \right)\right] \right) \\
    &\leq \frac{1}{s_0} \ln{\left(9^{d_1+d_2} \E \left[ \exp \left(s_0 \cdot \sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}}  \right)\right]\right)} \\
    & \leq \frac{(d_1+d_2)\ln(9) + n s_0^2 \sigma^2 \cdot \frac{1}{1-\sqrt{2}\sigma |s_0|} }{s_0}, \; \; \; \; \forall |s_0| \leq \frac{1}{\sqrt{2} \sigma}.
\end{align*}
By taking $s_0 = \frac{\sqrt{(d_1+d_2)\ln(9)}}{\sigma \cdot \sqrt{n}}$, and conditioned on $n \geq 8 \ln(9) (d_1+d_2)$, we have that
$$\E\left[ \max_{u \in \mathcal N^{d_1}_{\frac{1}{4}}, v \in \mathcal N^{d_2}_{\frac{1}{4}}} \sum_{i=1}^n e_i u^\top X_i v \cdot \mathbb{1}_{\{|\eta_i| \leq \frac{\tau}{2}\}} \right] \leq 3 \sqrt{\ln(9)} \cdot \sqrt{n(d_1+d_2)} \cdot \sigma.$$
And this fact implies that 
$$\E A_2 \leq \frac{15 \tau \sigma l}{2 s} \sqrt{\ln(9)} \sqrt{\frac{d_1+d_2}{n}}.$$
Conclusively, with probability at least $1-\epsilon$ we have that 
\begin{align*}
    A_2 \leq \frac{15 \tau \sigma l}{s} \sqrt{\ln(9)} \sqrt{\frac{d_1+d_2}{n}} + 4 \sigma^2 \sqrt{\frac{-2\ln(\epsilon)}{n}} + \frac{\tau^2}{16s^2} \frac{-4\ln(\epsilon)}{3n}.
\end{align*}
Therefore, by ensuring that 
$$n \geq \max \left\{ 8 \ln{(9)} (d_1 + d_2) ,\left(225 \sigma \sqrt{\ln(9)(d_1+d_2)} \frac{\tau l}{s c_l} \right)^2, \left(\frac{48 \sigma^2}{c_l}\sqrt{-2\ln(\epsilon)} \right)^2, -\frac{\tau^2}{c_l s^2} \ln(\epsilon) \right\},$$
we have 
\begin{align}
    \P\left( A_2 \leq \frac{c_l}{4} \right) \geq 1 - \epsilon.  \label{eq:thm3:imp2}
\end{align}
Given the results shown in Eqn.~\eqref{eq:thm3:imp1} and Eqn.~\eqref{eq:thm3:imp2}, we have that with probability at least $1-\epsilon$, it holds that 
$$\frac{\inp{\nabla \hat L_\tau(\Theta) - \nabla \hat L_\tau(\Theta^*)}{\Theta - \Theta^*}}{\fnorm{\Theta - \Theta^*}^2} \geq \frac{c_l}{4}, \; \; \; \; \forall \Theta \in \Phi.$$
\hfill \qedsymbol
\subsection{Proof of Theorem~\ref{thm:estimator}}\label{app:estimatorpf}
Theorem~\ref{thm:estimator} can be naturally proved based on the above Theorem~\ref{thm:thm1}, Theorem~\ref{thm:lambda} and Theorem~\ref{thm:e}. Here we assume $c_l$ and $\sigma$ are in constant scale in general, and for the LowHTR problem with $\sigma^2 \asymp c_l \asymp \frac{1}{d_1d_2}$, our proof can be slightly modified as we discuss later.

By taking $\lambda \asymp \sigma \left(\frac{d-\ln(\epsilon)}{n} \right)^\frac{\delta}{1+\delta} c^\frac{1}{1+\delta}$, and $\tau \asymp \left(\frac{n}{d-\ln{(\epsilon)}} \right)^\frac{1}{1+\delta} c^\frac{1}{1+\delta}$, we can guarantee that $\lambda \geq 2 \opnorm{\nabla \hat L_\tau(\Theta^*)}$ with probability at least $1-\epsilon$ from Theorem~\ref{thm:lambda}. By choosing $l \asymp 4\sqrt{2r}$ and $s = \frac{\tau}{32 \sigma^2} \sqrt{c_l}$, then the conditions in Theorem~\ref{thm:thm1} can be satisfied as long as $n \gtrsim (d-\ln{(\epsilon)})\sqrt{r\nu^3}$ where we denote $\nu = \frac{\sigma^2}{c_l}$. Furthermore, under the above setting, we know the local restricted strong convexity $\mathcal{E}(s,l,c_l/4)$ holds with probability at least $1-\epsilon$ as long as the conditions in Theorem~\ref{thm:e} hold. By reviewing the conditions of Theorem~\ref{thm:e}, we know it suffices to have $n \gtrsim d, \nu^2, dr \nu^3$. Therefore, with probability at least $1-2\epsilon$, the final error bound in Theorem~\ref{thm:thm1} indicates that
$$\fnorm{\widehat \Theta - \Theta^*} \lesssim \frac{\sigma}{c_l} \left(\frac{d+ \ln{(1/\epsilon)}}{n}\right)^{\frac{\delta}{1+\delta}} c^{\frac{1}{1+\delta}} \sqrt{r}.$$
\hfill \qedsymbol
\section{Proof of Theorem~\ref{thm:lowto}}\label{app:lowto}
We now prove the regret bound given in Theorem~\ref{thm:lowto}:
We have $\|\theta^*\| \leq S$ based on Section~\ref{sec:prelim} and $\|\theta_{k+1:p}^*\| \leq S_\perp$ for some small $S_\perp$. In the beginning, we have the transformed buffer set $\mathcal H_1^\prime$ of size $H \coloneqq |\mathcal H_1^\prime|$, and we write the pair information $(X,y)$ in $\H_1^\prime$ as $\{(x_{s,1},y_{s,1}),\dots,(x_{s,H},y_{s,H})\}$. And we denote $(x_{e,t},y_{e,t})$ as the pair of pulled arm and corresponding stochastic payoff at round $t$. To abuse the notation, at round $t+1$ we denote $\{(x_i,y_i)\}_{i=1}^{t+H}$ as the pairs of observations in the initial buffer set and obtained by the end of round $t$ in order.

At the beginning of the round $t+1$, the current $M \coloneqq M_t$ can be written as $M_t = \sum_{i=1}^H x_{s,i}x_{s,i}^\top + \sum_{j=1}^{t} x_{e,j} x_{e,j}^\top + \Lambda$, where $\Lambda$ is a positive diagonal matrix with $\lambda_0$ occupying the first $k$ diagonal entries and $\lambda_\perp$ the next $p-k$ entries. According to Algorithm~\ref{alg:lowto}, we denote $X_t \in \R^{(t+H)\times p}$ where each row of $X_t$ is the feature vector of the pulled arm (in the history buffer set or not). Assuming $t+H>p$, we denote its full SVD as $X_t = U_x \Sigma_x V_x^\top$ with $U_x \in \R^{(t+H)\times p}$ and $V_x \in \R^{p \times p}$. We also write $M_t = V_x(\Sigma_x^2 + \Lambda) V_x^\top \in \R^{p\times p}$. And we further denote
$$\begin{pmatrix}
u_1^\top \\
u_2^\top \\
\vdots \\
u_p^\top 
\end{pmatrix} = M_t^{-\frac{1}{2}} X_t^\top = V_x (\Sigma_x^2 + \Lambda)^{-\frac{1}{2}} \cdot \Sigma_x U_x^\top  \preceq V_x U_x^\top = \begin{pmatrix}
V_{x,11} &\cdots& V_{x,1p} \\
\vdots& \ddots& \vdots \\
V_{x,p1} &\cdots& V_{x,pp} \\ 
\end{pmatrix} \cdot 
\begin{pmatrix}
U_{x,1}^\top \\
\vdots \\
U_{x,p}^\top \\ 
\end{pmatrix} \in \R^{p \times (t+H)}.
$$
We first show that for all $i \in [p]$,
\begin{gather*}
    \|u_i\|_2 \leq \|\sum_{j=1}^p V_{ij} U_j\|_2 = \sqrt{\sum_{j=1}^p V_{ij}^2 \|U_j\|_2^2} = 1 \\
    \|u_i\|_{1+\delta} \leq (t+H)^{\frac{1}{1+\delta} - \frac{1}{2}} \cdot  \|u_i\|_{2} \leq (t+H)^\frac{1-\delta}{2(1+\delta)},
\end{gather*}
where the last inequality is deduced from the Cauchy-Schwarz inequality. With the formulation of $\hat\theta_t$ in Algorithm~\ref{alg:lowto} line 3, we have that
\begin{align*}
    \|\hat \theta_t - \theta^*\|_{M_t} &= \left\|M_t^{-\frac{1}{2}} \begin{pmatrix}
        u_1^\top \hat y_1 \\
        \vdots\\
        u_p^\top \hat y_p
    \end{pmatrix} - M_t^{-1} X_t^\top X_t \theta^*  - M_t^{-1} \Lambda \theta^*\right\|_{M_t} \\
    &\leq \left\|M_t^{-\frac{1}{2}} \begin{pmatrix}
        u_1^\top \hat y_1 \\
        \vdots\\
        u_p^\top \hat y_p
    \end{pmatrix} - M_t^{-\frac{1}{2}} \begin{pmatrix}
        u_1^\top  \\
        \vdots\\
        u_p^\top
    \end{pmatrix}  X_t \theta^* \right\|_{M_t} + \left\| \Lambda \theta^* \right\|_{M_t^{-1}} \\
    &\leq \norm{\begin{pmatrix}
        u_1^\top(\hat y_1 - X_t \theta^*)  \\
        \vdots\\
        u_p^\top (\hat y_p - X_t \theta^*)
    \end{pmatrix}}_2 + \norm{\theta^*}_\Lambda \\
    &\leq \sqrt{\sum_{i=1}^p \left(u_i^\top(\hat y_i - X_t \theta^*) \right)^2} + \sqrt{\lambda_0}S + \sqrt{\lambda_\perp} S_\perp.
\end{align*}
To present a bound on the first term, we divide it into two separate parts.
\begin{align*}
    u_i^\top (\hat y_i - X_t \theta^*) & = \sum_{j=1}^{t+H} u_{i,j}(\hat y_{i,j} - \E(y_j | \F_{j-1})) \\
    &= \sum_{j=1}^{t+H} u_{i,j}\left[(\hat y_{i,j} - \E(\hat y_{i,j}| \F_{j-1})) - \E(y_j \mathbb{1}_{\{|u_{i,j} y_j| > b_t\}} | \F_{j-1})\right] \\
    &\leq \left|\sum_{j=1}^{t+H} u_{i,j} (\hat y_{i,j} - \E(\hat y_{i,j}| \F_{j-1})) \right| + \left|\sum_{j=1}^{t+H} u_{i,j} \E(y_j \mathbb{1}_{\{|u_{i,j} y_j| > b_t\}} | \F_{j-1}) \right| \coloneqq A_1 + A_2
\end{align*}
For the first term $A_1$, based on Bernstein's inequality for martingales~\citep{seldin2012pac}, for any $i \in [p]$ it holds that with probability at least $1-\frac{\epsilon}{p}$:
\begin{align*}
    A_1 &\leq 2b_t\ln\left(\frac{2p}{\epsilon}\right) + \left| \frac{1}{2b_t} \sum_{j=1}^{t+H} \E \left[ u_{i,j}^2 \left(\hat y_{i,j} - \E(\hat y_{i,j} | \F_{j-1}) \right)^2 | \F_{j-1}\right] \right|  \\
    &\leq 2b_t\ln\left(\frac{2p}{\epsilon}\right) + \frac{b_t}{2} \left|  \sum_{j=1}^{t+H} \E \left[ \left( \frac{ u_{i,j}\left(\hat y_{i,j} - \E(\hat y_{i,j} | \F_{j-1})\right)}{b_t} \right)^2 | \F_{j-1}\right] \right| \coloneqq 2b_t\ln\left(\frac{2p}{\epsilon}\right) + \frac{b_t}{2} \left|  \sum_{j=1}^{t+H} \E \left[ T | \F_{j-1}\right] \right|.
\end{align*}
Since we know that $|T| \leq 1$ and hence $\E(T^2) \leq \E(|T|^{1+\delta})$, and we can then deduce that
\begin{align*}
    A_1 \leq 2b_t\ln\left(\frac{2p}{\epsilon}\right) + \frac{b_t}{2} \cdot \frac{\sum_{j=1}^{t+H} |u_{i,j}|^{1+\delta} \cdot b}{b_t^{1+\delta}} \leq 2b_t\ln\left(\frac{2p}{\epsilon}\right) + \frac{b}{2b_t^\delta} (t+H)^{\frac{1-\delta}{2}}.
\end{align*}
Therefore, we know that with probability at least $1-\epsilon$ the following result holds for all $i \in [p]$ simultaneously:
$$\left|\sum_{j=1}^{t+H} u_{i,j} (\hat y_{i,j} - \E(\hat y_{i,j}| \F_{j-1})) \right| \leq 2b_t\ln\left(\frac{2p}{\epsilon}\right) + \frac{b}{2b_t^\delta} (t+H)^{\frac{1-\delta}{2}}.$$
For the term $A_2$, with the help of H\"older's inequality, we have for all $i \in [p]$:
\begin{align*}
    A_2 &\leq \sum_{j=1}^{t+H} \E \left( |u_{i,j}y_j|^{1+\delta} \right)^\frac{1}{1+\delta} \cdot \E\left( \mathbb{1}_{|u_{i,j}y_j| > b_t} \right)^\frac{\delta}{1+\delta} \\
    &\leq \sum_{j=1}^{t+H} |u_{i,j}| \cdot b^\frac{1}{1+\delta} \cdot \P\left(|u_{i,j}y_j| > b_t \right)^\frac{\delta}{1+\delta} \\
    &\leq \sum_{j=1}^{t+H} |u_{i,j}| \cdot b^\frac{1}{1+\delta} \cdot \left(\frac{|u_{i,j}|^{1+\delta} b}{b_t^{1+\delta}} \right)^\frac{\delta}{1+\delta} \leq \frac{b}{b_t^\delta} \cdot (t+H)^\frac{1-\delta}{2}.
\end{align*}
Therefore, by taking
$$b_t = \left(\frac{b}{\ln\left(\frac{2p}{\epsilon}\right)} \right)^\frac{1}{1+\delta} \cdot (t+H)^\frac{1-\delta}{2+2\delta},$$ 
we can deduce that with probability at least $1-\epsilon$ the following result holds for all $i \in [p]$ simultaneously:
$$ u_i^\top (\hat y_i - X_t \theta^*) \leq 4b^\frac{1}{1+\delta} \left(\ln\left(\frac{2p}{\epsilon}\right)\right)^\frac{\delta}{1+\delta}\cdot (t+H)^\frac{1-\delta}{2+2\delta}.$$
Therefore, with probability at least $1-\epsilon$ it holds that
$$ \|\hat \theta_t - \theta^*\|_{M_t} \leq 2 \sqrt{p}\cdot b^\frac{1}{1+\delta} \left(\ln\left(\frac{2p}{\epsilon}\right)\right)^\frac{\delta}{1+\delta}\cdot (t+H)^\frac{1-\delta}{2+2\delta} \coloneqq \beta_t(\epsilon).$$
Denote the optimal arm at time $t+1$ as $x_{e,t+1}^*$. Therefore, the instance regret at time $t+1$ can be bounded by
\begin{align*}
    {x_{e,t+1}^*}^\top\theta^* - &x_{e,t+1}^\top\theta^* = {x_{e,t+1}^*}^\top\theta^* - {x_{e,t+1}^*}^\top \hat\theta_t + {x_{e,t+1}^*}^\top \hat\theta_t - x_{e,t+1}^\top \hat\theta_t + x_{e,t+1}^\top \hat\theta_t -  x_{e,t+1}^\top\theta^* \\
    & \leq \beta_t(\epsilon) \|x_{e,t+1}^*\|_{M_t^{-1}} + x_{e,t+1}^\top \hat\theta_t + \beta_t(\epsilon) \|x_{e,t+1}\|_{M_t^{-1}} -  x_{e,t+1}^\top \hat\theta_t - \beta_t(\epsilon) \|x_{e,t+1}^*\|_{M_t^{-1}} + \beta_t(\epsilon) \|x_{e,t+1}\|_{M_t^{-1}} \\
    &\leq \min\{S^2, 2 \beta_t(\epsilon)\|x_{e,t+1}\|_{M_t^{-1}} \}.
\end{align*}
Therefore, with probability at least $1-\epsilon$, it holds that
\begin{align*}
    \sum_{t=1}^T r_t &= \sum_{t=1}^T  \min\{S^2, 2 \beta_t\left(\frac{\epsilon}{T}\right)\|x_{e,t+1}\|_{M_t^{-1}} \} \\
    &\leq 2 \beta_T\left(\frac{\epsilon}{T}\right) \sum_{t=1}^T \min\{\frac{S^2}{\beta_T\left(\frac{\epsilon}{T}\right)}, \|x_{e,t+1}\|_{M_t^{-1}} \} \leq 2 \beta_T\left(\frac{\epsilon}{T}\right) \cdot \sqrt{T} \cdot \sqrt{\sum_{t=1}^T \min \{\|x_{e,t+1}\|_{M_t^{-1}}^2,1\}}
\end{align*}
We denote $\tilde M_{T+1} = \sum_{t=1}^T x_{e,t}x_{e,t}^\top + \Lambda$, and by Lemma 9 of~\cite{dani2008stochastic}, it holds that
\begin{align*}
    \sum_{t=1}^T \min \{\|x_{e,t+1}\|_{M_t^{-1}}^2,1\} &\leq 2 \ln\left( \frac{\det(\tilde M_{T+1})}{\det(\Lambda)} \right) \leq 2k\cdot \ln\left(1+ \frac{S^2}{k \lambda_0} T \right) + 2(p-k) \ln\left(1+ \frac{S^2}{(p-k)\lambda_\perp}T \right) \\
    &\leq  2k\cdot \ln\left(1+ \frac{S^2}{k \lambda_0} T \right) + \frac{2S^2}{\lambda_\perp} T \leq 4k\cdot \ln\left(1+ \frac{S^2}{k \lambda_0} T \right),
\end{align*}
by taking that 
$\lambda_\perp = \frac{S^2T}{k\ln\left(1+ \frac{S^2}{k \lambda_0} T \right)}$.
Therefore, with probability at least $1-\epsilon$, it holds that
\begin{align*}
    R(T) &\leq 2 \sqrt{T} \cdot \sqrt{4k\cdot \ln\left(1+ \frac{S^2}{k \lambda_0} T \right)} \cdot \left[2 \sqrt{p}\cdot b^\frac{1}{1+\delta} \left(\ln\left(\frac{2pT}{\epsilon}\right)\right)^\frac{\delta}{1+\delta}\cdot (T+H)^\frac{1-\delta}{2+2\delta} + \sqrt{\lambda_0} S + \sqrt{\lambda_\perp} S_\perp \right] \\
    & = \widetilde O\left( \sqrt{kp} \cdot T^\frac{1}{1+\delta} + \sqrt{kT} + S_\perp T \right).
\end{align*}
\hfill \qedsymbol
\section{Proof of Eqn.~(\ref{eq:sparse})}\label{app:sparse}
Our argument is adapted from the proof of Theorem 3 in \cite{jun2019bilinear}, and we will still present details here for completeness of our work. Furthermore, the proof of Theorem~\ref{thm:lotus2} in our work still relies on the same Lemma.

\begin{lemma}\textup{(Wedin's $\sin \Theta$ Theorem)}\label{lem:eqn6} Let the SVDs of matrices $A$ and $\tilde A$ be defined as follows:
\begin{gather*}
    \begin{pmatrix}
        U_1 & U_2 & U_3
    \end{pmatrix}^\top A \begin{pmatrix}
        V_1 & V_2 
    \end{pmatrix} = \begin{pmatrix}
        \Sigma_1 & 0 \\
        0 &\Sigma_2 \\
        0 & 0 
    \end{pmatrix}, \\
    \begin{pmatrix}
      \tilde  U_1 &\tilde U_2 &\tilde U_3
    \end{pmatrix}^\top \tilde A \begin{pmatrix}
        \tilde V_1 & \tilde V_2
    \end{pmatrix} = \begin{pmatrix}
        \tilde \Sigma_1 & 0  \\
        0 &\tilde \Sigma_2 \\
        0 & 0 
    \end{pmatrix}.
\end{gather*}
    Let $R = A \tilde V_1 - \tilde U_1 \tilde \Sigma_1$ and $S = A^\top \tilde U_1 - \tilde V_1 \tilde \Sigma_1$, and define $U_{1\perp} = [U_2 \; U_3]$ and $V_{1\perp} = V_2$. Suppose there is a number $q > 0$ such that
    $$\min_{i,j} |\sigma_i(\tilde \Sigma_1) - \sigma_j(\Sigma_2)| \geq q, \; \; \; \min_{i} \sigma_i(\tilde \Sigma_1) \geq q.$$
    Then it holds that
    $$\sqrt{\fnorm{U_{1\perp}^\top \tilde U_1}^2 + \fnorm{V_{1\perp}^\top \tilde V_1}^2 } \leq \frac{\sqrt{\fnorm{R}^2 + \fnorm{S}^2}}{q}.$$
\end{lemma}
Based on Lemma~\ref{lem:eqn6}, we define $A = \widehat \Theta, U_1 = \widehat U, \Sigma_1 = \widehat D, V_1 = \widehat V, \tilde A = \Theta^*, \tilde U_1 = U, \tilde \Sigma_1 = D, \tilde V_1 = V, q = D_{rr}$. Since $\Theta^* V = U D$ and ${\Theta^*}^\top U = V D$, the residuals in Lemma~\ref{lem:eqn6} become $R = (\widehat \Theta - \Theta^*) V$ and $S= (\widehat \Theta - \Theta^*)^\top U$, and then it holds that
$$\sqrt{2 \fnorm{\widehat U_\perp^\top U} \fnorm{\widehat V_\perp^\top V}} \leq \sqrt{\fnorm{\widehat U_\perp^\top U}^2 + \fnorm{\widehat V_\perp^\top V}^2} \leq \frac{\sqrt{\fnorm{R}^2 + \fnorm{S}^2}}{D_{rr}} \leq \frac{\sqrt{2}\cdot\fnorm{\widehat \Theta - \Theta^*}}{D_{rr}}.$$
And then by using the bound on $\fnorm{\widehat \Theta - \Theta^*}$ we can deduce that 
$$\|\theta^*_{k+1:p}\|_2 = \fnorm{\widehat U_\perp^\top U D V^\top \widehat V_\perp} \leq \fnorm{\widehat U_\perp^\top U} \fnorm{\widehat V_\perp^\top V} \cdot \opnorm{D} \lesssim \frac{r \sigma^2 c^{\frac{2}{1+\delta}}}{c_l^2 D_{rr}^2}\left(\frac{d+ \ln{(1/\epsilon)}}{|\H_2|}\right)^{\frac{2\delta}{1+\delta}}.$$
\hfill \qedsymbol
\section{Proof of Theorem~\ref{thm:lotus}}\label{app:lotus}
We now prove Theorem~\ref{thm:lotus} in this section. We first bring up the result shown in Eqn.~\eqref{eqn:hestimator} again: under Assumption~\ref{assu:subg}, if we estimate $\Theta^*$ based on the exploration set $\H_2$ of size $H$, then our estimator $\widehat \Theta$ satisfies the following property:
$$\|\theta^*_{k+1:p}\|_2  \lesssim \frac{r d^2 c^{\frac{2}{1+\delta}}}{ D_{rr}^2}\left(\frac{d+ \ln{(1/\epsilon)}}{H}\right)^{\frac{2\delta}{1+\delta}},$$
under $\sigma^2 \asymp c_l \asymp 1/(d_1d_2)$ with probability at least $1-\epsilon$. Our Algorithm~\ref{alg:lotus} first randomly samples arms for the first $T_1$ rounds, and then for the rest of the time horizon it utilizes a doubling-trick-based idea. Based on line $3$ of Algorithm~\ref{alg:lotus}, when we have that
$$\left[\frac{d^{2+4\delta} r^{1+\delta}}{D_{rr}^{2+2\delta}} 2^{i(1+\delta)} \right]^{\frac{1}{1+3\delta}} \geq 2^i \; \Longrightarrow \; i \leq \left\lfloor \log_2\left(\frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}}\right) \right\rfloor \coloneqq L,$$
then in the first $L$ batches, we will run out of time to do random exploration. Since we have that
$$\frac{2d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}} \geq \sum_{j=1}^L 2^j = 2^{L+1} - 2 \geq \frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}}-2,$$
we know before the batch $L+1$, we already repeat random sampling for $\Ti$ rounds, with
$$T_1 + \frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}}-2 \leq \Ti \leq T_1 + \frac{2d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}}.$$
For the sake of simplicity in our proof, we assume that our algorithm terminates exactly at the end of some batch, i.e. the $M$-th batch. And otherwise, our proof will be the same by using the index of the last batch. In other words, it holds that
$$\sum_{i=L+1}^M 2^i + \Ti = T \; \Longleftrightarrow \; 2^{M+1} = T + 2^{L+1} - \Ti.$$
Therefore, if we set $\epsilon$ as $\epsilon/2^{i+1}$ in both $\beta_t$ of Algorithm~\ref{alg:lowto} and $\lambda,\tau$ in the matrix estimation for the $i$-th batch, then based on Theorem~\ref{thm:lowto}, with probability at least $1-\epsilon$ it holds that
\begin{align*}
    R(T) &= \widetilde O \left( \Ti + \sum_{i=L+1}^M \left[ C \left(2^\frac{1+\delta}{1+3\delta} \right)^i + \sqrt{d^3r} \left(2^\frac{1}{1+\delta} \right)^i + \sqrt{dr2^i} + 2^i \cdot \frac{d^\frac{2+4\delta}{1+\delta}r}{D_{rr}^2} \cdot \left(\frac{1}{\Ti + \sum_{j=L+1}^i C \left(2^\frac{1+\delta}{1+3\delta} \right)^j } \right)^\frac{2\delta}{1+\delta} \right] \right) \\
    &= \widetilde O \left(A_1 + \sum_{i=L+1}^M [A_{i,2}+A_{i,3}+A_{i,4}+A_{i,5}] \right), 
\end{align*}
with $C = \left(\frac{d^{2+4\delta} r^{1+\delta}}{D_{rr}^{2+2\delta}}\right)^\frac{1}{1+3\delta}.$ For $A_1$, it naturally holds that
$A_1 \lesssim \Ti$.
For $A_{i,2}$, we have that 
\begin{align*}
    \sum_{i=L+1}^M A_{i,2} \lesssim C\cdot \frac{1}{2^\frac{1+\delta}{1+3\delta} - 1} \cdot T^\frac{1+\delta}{1+3\delta}.
\end{align*}
For $A_{i,3}$, we have that
\begin{align*}
    \sum_{i=L+1}^M A_{i,3} \lesssim \sqrt{d^3r} \frac{1}{2^\frac{1}{1+\delta}-1} \cdot (T - \Ti)^\frac{1}{1+\delta} \lesssim \sqrt{d^3r}\cdot T^\frac{1}{1+\delta}.
\end{align*}
For $A_{i,4}$, it holds that
\begin{align*}
    \sum_{i=L+1}^M A_{i,4} = \sqrt{dr} \sum_{i=L+1}^M \sqrt{2^i} \lesssim \sqrt{dr} \cdot \frac{1}{\sqrt{2}-1} \cdot (T -\Ti)^\frac{1}{2} \lesssim \sqrt{drT}.
\end{align*}
And finally for $A_{i,5}$ we can show that
\begin{align*}
    \sum_{i=L+1}^M A_{i,5} &= \sum_{i=L+1}^M 2^i \cdot \frac{d^\frac{2+4\delta}{1+\delta}r}{D_{rr}^2} \cdot \left(\frac{1}{\Ti + \sum_{j=L+1}^i C \left(2^\frac{1+\delta}{1+3\delta} \right)^j } \right)^\frac{2\delta}{1+\delta} \\
    &\lesssim  \sum_{i=L+1}^M 2 \cdot C \cdot \left( \frac{\left(2^\frac{1+\delta}{2\delta} \right)^i}{\frac{T_1-2}{C} + \frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}C}+ \sum_{j = L+1}^i \left( 2^\frac{1+\delta}{1+3\delta}\right)^j} \right)^\frac{2\delta}{1+\delta} \\
    &\lesssim 2 \cdot C \cdot \sum_{i=L+1}^M \left[ \frac{\left(2^\frac{1+\delta}{1+3\delta}-1\right)^\frac{2\delta}{1+\delta}}{2^\frac{(1+\delta)(2\delta)}{(1+3\delta)(1+\delta)}} \cdot 2^{\left(\frac{1+\delta}{1+3\delta}\right)i} \right] \lesssim C \cdot T^\frac{1+\delta}{1+3\delta},
\end{align*}
given that
\begin{align*}
    T_1 \geq 2 - \frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}} + \left( \frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}}\right)^\frac{1+\delta}{1+3\delta} \cdot \frac{1}{2^\frac{1+\delta}{1+3\delta}-1} \cdot C \geq 2 + \left(\frac{2}{\sqrt{2}-1}\right) \cdot \frac{d^\frac{1+2\delta}{\delta} r^\frac{1+\delta}{2\delta}}{D_{rr}^\frac{1+\delta}{\delta}}.
\end{align*}
Therefore, with the above condition on $T_1$ satisfied, the following result holds with probability at least $1-\epsilon$
\begin{align*}
    R(T) = \widetilde O \left( \frac{d^\frac{2+4\delta}{1+3\delta} r^\frac{1+\delta}{1+3\delta}}{D_{rr}^\frac{2+2\delta}{1+3\delta}} \cdot T^\frac{1+\delta}{1+3\delta} + d^\frac{3}{2} r^\frac{1}{2} T^\frac{1}{1+\delta} \right).
\end{align*}
\hfill \qedsymbol
\section{Proof of Theorem~\ref{thm:lotus2}}\label{app:lotus2}

The proof of Theorem~\ref{thm:lotus2} is adapted from that of Theorem~\ref{thm:lotus} presented in the above Appendix~\ref{app:lotus}. According to~\cite{li1998relative}, it holds that
$$|\sigma_i(\widehat \Theta) - \sigma_i(\Theta^*)| \leq \fnorm{\widehat \Theta - \Theta^*}, \; \; \forall i \in [d].$$
Denote $H$ as the size of the exploration buffer set $\H_2$ at the end of the exploration phase for the $i-$th batch, then according to Theorem~\ref{thm:estimator} we know that
\begin{align}
    \fnorm{\widehat \Theta - \Theta^*} \leq C_1 \frac{\sigma \sqrt{r}}{c_l} \left(\frac{d+ \ln{(2^{i+1}/\epsilon)}} {H}\right)^{\frac{\delta}{1+\delta}}  \cdot c^{\frac{1}{1+\delta}} \coloneqq E, \; \; C_1 > 0, \label{eq:effrank}
\end{align}
with probability at least $1-\epsilon/2^{i+1}$. We define the useful rank $\hat r$ as:
\begin{align*}
\hat r =  \min & \left\{i \in [d+1] : \hat D_{ii} \leq  C_1 \frac{\sigma \sqrt{i}}{c_l} \left(\frac{d+ \ln{(2^{i+1}/\epsilon)}} {H}\right)^{\frac{\delta}{1+\delta}} \cdot c^{\frac{1}{1+\delta}} \coloneqq R(i)\right\} - 1 \wedge 1,
\end{align*}
We will first show that $\hat D_{(r+1)(r+1)} \leq R(r+1)$, and hence $\hat r \leq r$, whenever Eqn.~\eqref{eq:effrank} holds. This is because $\Theta^*$ has rank $r$, i.e. $D_{(r+1)(r+1)} = 0$, and therefore $\hat D_{(r+1)(r+1)} \leq E = R(r) < R(r+1)$. Furthermore, we will show that all the subspaces we remove based on our estimated $\hat r$ are sufficiently small. Specifically, we know that 
$$D_{(\hat r+1)(\hat r+1)} \leq \widehat D_{(\hat r+1)(\hat r+1)} + |\widehat D_{(\hat r+1)(\hat r+1)} - D_{(\hat r+1)(\hat r+1)}| \leq R(\hat r+1) + E \leq 2 R(r+1).$$
To abuse the notation, we rewrite the SVD of $\widehat \Theta$ and $\Theta^*$ as
\begin{gather*}
    \widehat \Theta = \begin{pmatrix}
        \widehat U & \widehat U_r & \widehat U_\perp
    \end{pmatrix} \cdot \begin{pmatrix}
        \widehat D_{\hat r} & 0 & 0 \\
        0 & \widehat D_{r - \hat r} & 0 \\
        0 & 0 & \widehat D_{0}
    \end{pmatrix} \cdot \begin{pmatrix}
        \widehat V^\top \\
        \widehat V_r^\top \\
        \widehat V_\perp^\top
    \end{pmatrix} \\
    \Theta^* = \begin{pmatrix}
        \tilde U & \tilde U_r & \tilde U_\perp
    \end{pmatrix} \cdot \begin{pmatrix}
        \tilde D_{\hat r} & 0 & 0 \\
        0 & \tilde D_{r - \hat r} & 0 \\
        0 & 0 & 0
    \end{pmatrix} \cdot \begin{pmatrix}
        \tilde V^\top \\
        \tilde V_r^\top \\
        \tilde V_\perp^\top
    \end{pmatrix}.
\end{gather*}
And by making sure that $H$ is sufficiently large such that $R(r+1) \leq D_{rr}/2$, we have that 
$$\min |\sigma_i(D_{\hat r}) - \sigma_j(D_{r - \hat r})| \geq \frac{D_{rr}}{2}, \; \; \min \sigma_i(D_{\hat r}) \geq D_{rr}.$$
In Lemma~\ref{lem:eqn6}, with $A = \widehat \Theta, U_1 = \widehat U, U_{1\perp} = [\widehat U_r, \widehat U_\perp], \Sigma_1 = \widehat D, V_1 = \widehat V,V_{1\perp} = [\widehat V_r, \widehat V_\perp], \tilde A = \Theta^*, \tilde U_1 = \tilde U, \tilde \Sigma_1 = D, \tilde V_1 = \tilde V, q = D_{rr}/2$, we can show that 
$$\fnorm{\widehat U_{1\perp}^\top \tilde U}\fnorm{\widehat V_{1\perp}^\top \tilde V}  \leq \frac{4\fnorm{\widehat \Theta - \Theta^*}^2}{D_{rr}^2}.$$
After we do the same transformation in Algorithm~\ref{alg:lowto}, we know the effective dimension (denoted by $\hat k$) satisfies that $\hat k = d_1d_2 - (d_1 -\hat r)(d_2 - \hat r) \leq d_1d_2 - (d_1 -r)(d_2 - r) = k$. And it holds that
\begin{align*}
    \|\theta^*_{\hat k+1 :p}\|_2 &= \fnorm{
    U_{1\perp}^\top \begin{pmatrix}
        \tilde U & \tilde U_r
    \end{pmatrix} \cdot \begin{pmatrix}
        D_{\hat r} & 0 \\
        0 & D_{r-\hat r}
    \end{pmatrix} \cdot  \begin{pmatrix}
       \tilde V^\top \\
        \tilde V_r^\top
    \end{pmatrix} V_{1\perp}}  \\
    &=\fnorm{U_{1\perp}^\top \tilde U  D_{\hat r}  \tilde V^\top V_{1\perp} + U_{1\perp}^\top \tilde U_r  D_{r-\hat r}  \tilde V_r^\top V_{1\perp}} \\
    &\leq \fnorm{U_{1\perp}^\top \tilde U} \fnorm{\tilde V^\top V_{1\perp}} \cdot \opnorm{D_{\hat r}} + \fnorm{U_{1\perp}^\top \tilde U_r} \fnorm{\tilde V_r^\top V_{1\perp}} \cdot \opnorm{D_{r-\hat r}} \\
    &\leq \opnorm{\Theta^*}  \cdot \frac{4\fnorm{\widehat \Theta - \Theta^*}^2}{D_{rr}^2} + \sqrt{r-\hat r}^2 \cdot 2R(r+1) \\
    &= \widetilde O \left( \frac{rd^2}{D_{rr}^2} \left(\frac{d}{H} \right)^\frac{2\delta}{1+\delta} + r^\frac{3}{2} d \left(\frac{d}{H} \right)^\frac{\delta}{1+\delta}\right) \asymp \widetilde O \left( r^\frac{3}{2} d \left(\frac{d}{H} \right)^\frac{\delta}{1+\delta}\right).
\end{align*}
Note the second term will be dominant for large $H$, s.t. $H \geq \frac{d^\frac{1+2\delta}{\delta}}{r^\frac{1+\delta}{2\delta} D_{rr}^\frac{2+2\delta}{\delta}}$.

By using $T_1 = \min\left\{d \cdot 2^{\frac{i(1+\delta)}{1+2\delta}}, 2^i \right\}$ at each batch in line 3 of Algorithm~\ref{alg:lotus}, we can identically prove Theorem~\ref{thm:lotus2} with the same procedure as the proof of Theorem~\ref{thm:lotus}. And the only slight difference lies in the control of the term $A_{i,5}$. Therefore, we will omit the redundant details here. 

\hfill \qedsymbol
\section{Proof of Theorem~\ref{thm:lower}}\label{app:lower}

In this section, we will present a regret lower bound for the LowHTR. Our proof relies on the following Lemma for the MAB with heavy-tailed rewards:
\begin{lemma}\textup{\citep{xue2020nearly}}\label{lem:xue}
    For any multi-armed bandit algorithm $\mathcal B$ with $T \geq K \geq 4$ where $K$ is the number of arms, an arm $a^* \in \{1,\dots,K\}$ is chosen uniformly at random, this arm pays $1/\gamma$ with probability $p(a^*) = 2 \gamma^{1+\delta}$ and the rest pays $1/\gamma$ with probability $\gamma^{1+\delta}$ ($2\gamma^{1+\delta} < 1$). If we set $\gamma = (K/(T+2K))^\frac{1}{1+\delta}$, and denote $r_{t,a}$ as the observed reward of arm $a$ at round $t$ under algorithm $\mathcal B$, we have
    $$\E\left[\sum_{t=1}^T r_{t,a^*} - \sum_{t=1}^T r_{t,a_t} \right] \geq \frac{1}{8} T^\frac{1}{1+\delta} K^\frac{\delta}{1+\delta}.$$
\end{lemma}
Therefore, we can naturally consider the LowHTR problem with a finite and fixed arm set of size $K$. For simplicity, we set $d_1=d_2=d$ and set $K = (d-1)r \geq 4$. To adapt the results from Lemma~\ref{lem:xue}, we make the reward function of an arm $X_{t,a} \in \R^{d^2}$ as
$$r_{t,a} = \begin{cases}
    \frac{1}{\gamma}, \; \;  &\text{ with probability } \gamma\cdot \inp{X_{t,a}}{\Theta^*} \\
    0, \; \;   &\text{ with probability } 1 - \gamma\cdot\inp{X_{t,a}}{\Theta^*} 
\end{cases},$$
and then we only need to make $\inp{X_{t,a^*}}{\Theta^*} = 2\gamma^\delta$ and $\inp{X_{t,a}}{\Theta^*} = \gamma^\delta$ for any other arm $a$ where $a^*$ is uniformly chosen from $[K]$.

The contextual matrices are designed in the following way. For the first column, the first $r$ entries are set to be $\left[\sqrt{\frac{1}{r(r+1)}}, \sqrt{\frac{2}{r(r+1)}},\dots, \sqrt{\frac{r}{r(r+1)}}\right]$. And for the remaining $(d-1)r$ entries in the first $r$ rows, we flatten them and set the $i$-th entry as $\frac{1}{\sqrt{2}}$ for the $i$-th arm matrix. All the other elements in the last $(d-r)$ rows are set to zero for all arm matrices. We can easily check that the Frobenius norms of all arm matrices are bounded by $1$.

Next, we consider the parameter matrix $\Theta^*$ of rank $r$. For the first column, the first $r$ entries are set to be $\left[\sqrt{\frac{4}{r(r+1)}}\gamma^\delta, \sqrt{\frac{8}{r(r+1)}}\gamma^\delta,\dots, \sqrt{\frac{4r}{r(r+1)}} \gamma^\delta \right]$. And similarly for the rest $(d-1)r$ entries in the first $r$ rows, we flatten them and uniformly choose an index from $[(d-1)r]$, then the corresponding entry is $\sqrt{2}\gamma^\delta$ and all the rest elements in $\Theta^*$ are 0. The norm of $\Theta^*$ can also be bounded with large $T$. By using the feature matrices and the parameter matrix described above, we can recover the scenario in Lemma~\ref{lem:xue}, and thus we have that
$$\E R(T) \geq \frac{1}{8} T^\frac{1}{1+\delta} (d-1)^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} \asymp T^\frac{1}{1+\delta} d^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} \gtrsim T^\frac{1}{1+\delta}.$$
\hfill \qedsymbol

\section{Remarks of Assumption~\ref{assu:bounded}}\label{app:bound}
We will show that when a series of iid random matrices $\{X_i\}_{i=1}^m$ follows a sub-Gaussian distribution with parameter $\sigma \asymp \frac{1}{\sqrt{d_1d_2}}$, then the scale of $\max_{i \in [m]} \fnorm{X_i}$ can be bounded by some constant up to some very small logarithmic terms. The results can be directly deduced from the following Lemma:

\begin{lemma}\label{lem:assump}
    If iid random matrices $\{X_i\}_{i=1}^m \subset \R^{d_1 \times d_2}$ follow a sub-Gaussian distribution with parameter $\sigma$, then with probability at least $1-\delta$ it holds that:
    $$\fnorm{X_i} \leq 4\sigma \sqrt{d_1d_2} + 2\sqrt{2} \sigma \sqrt{\ln{(\frac{m}{\delta})}}, \; \; \forall i \in [m]. $$
\end{lemma}

\proof Denote $\mathcal N_{\frac{1}{2}}$ as the $\frac{1}{2}$-covering of the matrix space $\{X : \,  \fnorm{X} \leq 1 \}$, then it holds that $|\mathcal N_{\frac{1}{2}}| \leq (1+2/0.5)^{d_1d_2} = 5^{d_1d_2}$. And 
for $\fnorm{V} \leq 1$ we define $S(V)$ as the closest point in $\mathcal N_{\frac{1}{2}}$ such that $\fnorm{V - S(V)} \leq \frac{1}{2}$. Next, we can have that
\begin{align*}
    \fnorm{X_i} = \max_{\fnorm{V} = 1} \inp{V}{X_i} &= \max_{\fnorm{V} = 1} \inp{V-S(V)+S(V)}{X_i} \leq \max_{Z \in \mathcal N_{\frac{1}{2}}} \inp{Z}{X_i} + \max_{\fnorm{W} = \frac{1}{2}} \inp{W}{X_i} \\
    & \leq \max_{Z \in \mathcal N_{\frac{1}{2}}} \inp{Z}{X_i} + \frac{1}{2}\max_{\fnorm{W} = 1} \inp{W}{X_i},
\end{align*}
which indicates that $\fnorm{X_i} \leq 2 \max_{Z \in \mathcal N_{\frac{1}{2}}} \inp{Z}{X_i}$. Therefore, it holds that for any $t > 0$
$$\P \left( \fnorm{X_i}  \geq t \right) \leq \P \left( \max_{Z \in \mathcal N_{\frac{1}{2}}} \inp{Z}{X_i} \geq \frac{t}{2} \right) \leq |\mathcal N_{\frac{1}{2}}| \cdot \exp{\left(-\frac{t^2}{8 \sigma^2} \right)} \leq 5^{d_1d_2} \cdot \exp{\left(-\frac{t^2}{8 \sigma^2} \right)}.$$
This fact indicates that
$$\P \left( \fnorm{X_i} \geq 2\sqrt{2} \sigma \sqrt{\ln{\left(\frac{1}{\delta}\right)}} + 4 \sigma \sqrt{d_1d_2} \right) \leq \delta.$$
Therefore, we have that 
\begin{align*}
    \P \left(\max_{i \in [m]} \fnorm{X_i} < 2\sqrt{2} \sigma \sqrt{\ln{\left(\frac{1}{\alpha}\right)}} + 4 \sigma \sqrt{d_1d_2} \right) \geq (1-\alpha)^m = 1 - \delta, \; \; \text{where } \alpha = 1 -(1-\delta)^\frac{1}{m}.
\end{align*}
For any $m > 1$ and $x \in [0,1]$, based on the Taylor series of the function $f(x) = (1-x)^\frac{1}{m} = 1 - \frac{x}{m} - O(x^2)$, it holds that $1 - \frac{x}{m} \geq (1-x)^\frac{1}{m}$, i.e. $\alpha \geq \frac{\delta}{m}$ and hence $\ln\left(\frac{1}{\alpha}\right) \leq \ln\left(\frac{m}{\delta}\right)$. And this fact leads to the final result:
\begin{align*}
    \P \left(\max_{i \in [m]} \fnorm{X_i} < 2\sqrt{2} \sigma \sqrt{\ln{\left(\frac{m}{\delta}\right)}} + 4 \sigma \sqrt{d_1d_2} \right) \geq 1 - \delta,
\end{align*}
which indicates that $\max_{i \in [m]} \fnorm{X_i}$ can be uniformly bounded by a constant scale up to some minimal error.
\hfill \qedsymbol

In our case with $\sigma \asymp \frac{1}{\sqrt{d_1d_2}}$, with probability at least $1-\delta$ it holds that
$$\max_{i \in [m]} \fnorm{X_i} \lesssim \frac{2\sqrt{2}}{\sqrt{d_1d_2}} \sqrt{\ln{\left(\frac{m}{\delta}\right)}} + 4.$$

\section{Alternative Version of LOTUS}\label{app:alter}
\begin{algorithm*}[t]
\caption{Randomized LOTUS} \label{alg:lotus2}
\begin{algorithmic}[1]
\Input Arm set $\mathcal{X}_t$, sampling distribution $\mathcal{D}_t$, $\delta, T_0, \eta, \lambda, 
%\{\lambda_i\}_{i=1}^{+\infty},\{\tau_i\}_{i=1}^{+\infty}, 
\{\lambda_{i,\perp}\}_{i=1}^{+\infty}$. 
\Stage The history buffer index set $\mathcal{H}_1 = \{\}$, the exploration buffer index set $\mathcal{H}_2 = \{\}$.
% \Stage \textbf{1: Subspace Estimation}
\State Pull arm $X_t \in \mathcal{X}_t$ according to $\mathcal{D}_t$ and observe payoff $y_t$. Then add $(X_t, y_t)$ into $\mathcal{H}_1$ and $\mathcal{H}_2$ for $t \leq T_0$.
\For{$i=1,2,\dots$ until the end of iterations}
\State Set the expected exploration length $T_1 = \min\left\{ \left[\frac{d^{2+4\delta} r^{1+\delta}}{D_{rr}^{2+2\delta}} 2^{i(1+\delta)} \right]^{\frac{1}{1+3\delta}}, 2^i \right\}$. 
\For{$t = |\mathcal{H}_1|+1, \dots, |\mathcal{H}_1|+2^i$}
\If{Randomly sample from Bernoulli($T_1/2^i$) and get $1$}
\State Pull arm $X_t \in \mathcal{X}_t$ according to $\mathcal{D}_t$ and observe payoff $y_t$. Then add $(X_t, y_t)$ into $\mathcal{H}_1$ and $\mathcal{H}_2$
\Else
\State Obtain the estimate $\widehat \Theta$ based on Eqn. \eqref{eqn:hestimator} with $\mathcal{H}_2$, where we set $\tau_i \asymp  \left(|\H_2|/(d+ \ln{(2^{i+1}/\epsilon)})\right)^{\frac{1}{1+\delta}} c^{\frac{1}{1+\delta}}, \lambda_i \asymp \sigma \left((d+ \ln{(2^{i+1}/\epsilon)})/|\H_2|\right)^{\frac{\delta}{1+\delta}} c^{\frac{1}{1+\delta}}$.
\State Calculate the full SVD of $\widehat \Theta = [\widehat U, \widehat U_\perp] \, \widehat D \, [\widehat V, \widehat V_\perp]^\top$ where $\widehat U \in \mathbb{R}^{d_1 \times r}, \widehat V \in \mathbb{R}^{d_2 \times r}$.
\State For the next round, invoke LowTO with $\delta, [\widehat U, \widehat U_\perp], [\widehat V, \widehat V_\perp], \lambda, \lambda_{i,\perp}, \mathcal{H}_1$ and obtain the updated $\mathcal{H}_1$.
\EndIf
\EndFor
\EndFor
\end{algorithmic}
\end{algorithm*}


As we mentioned in Subsection~\ref{subsec:lotus}, we also have an alternative version of our LOTUS algorithm in a more randomized manner. Specifically, at each batch, our original version illustrated in Algorithm~\ref{alg:lotus} uses the static explore-then-exploit framework, where it first randomly samples some arms from the distribution $\mathcal D_t$ in Assumption~\ref{assu:subg} and then exploits the recovered low-rank subspaces with our LowTO method. However, we can mix these two exploration and exploitation steps in each batch. Specifically, we can explore by the sampling distribution $\mathcal D_t$ with the probability of $T_1/2^i$ at each time $t$, otherwise we will conduct the subspace transformation and LowTO algorithm based on the current buffers $\H_1$ and $\H_2$. The full pseudocode is presented in Algorithm~\ref{alg:lotus2}. We can expect the same order of regret as in Theorem~\ref{thm:lotus} based on the fact that if we do a series of iid Bernoulli trials with probability $p$ for $n$ times, then with a high probability the number of successes will be close to $np$ for large $n$ up to some logarithmic terms.


\section{Details of the LAMM Algorithm}\label{app:lamm}

\begin{algorithm*}[t]
\caption{LAMM Algorithm for the Solution to Eqn.\eqref{eq:estimator}} \label{alg:lamm}
\begin{algorithmic}[1]
\Input Initial $\widehat \Theta_0$, stopping threshold $\epsilon$, $\alpha_0,\psi,\lambda$.
\For{$i=1,2,\dots$ until $\fnorm{\widehat \Theta_i - \widehat \Theta_{i-1}} \leq \epsilon$}
\State{Initialize $\widehat \Theta_i = \widehat \Theta_{i-1}, \alpha_i = \max(\alpha_0, \alpha_{i-1}/\psi)$ and $s_i = 0$.}
\While{$F(\widehat \Theta_{i}; \widehat \Theta_{i-1}, \alpha_i) < \hat L_\tau(\widehat \Theta_{i})$ \textbf{or} $s_i = 0$}
\State{ $\widehat \Theta_i = S(\widehat \Theta_{i-1} - \alpha_i^{-1} \nabla \hat L_\tau(\widehat \Theta_{i-1}), \alpha_i^{-1}  \lambda )$.}
\State{$s_i = s_i + 1, \alpha_i = \psi \cdot \alpha_i$.}
\EndWhile
\EndFor
\end{algorithmic}
\end{algorithm*}

We implement the LAMM algorithm that was first proposed in~\citep{fan2018lamm} and recently extended to the matrix estimation setting~\cite{yu2023low} for the Huber-type estimator formulated in Eqn.~\eqref{eq:estimator}. Here we use the unified framework proposed in~\citep{yu2023low}, and for the sake of completeness we will still present its details as follows:

LAMM is presented in Algorithm~\ref{alg:lamm}. The LAMM method is a very efficient and scalable algorithm under high-dimensional datasets, and its first crux is establishing an isotropic quadratic function that locally upper bounds the objective function $\hat L_\tau(\Theta)$ at each iteration until convergence. Based on the second-order Taylor expansion, given the previous estimate $\widehat \Theta_{t-1}$ at iteration $t-1$, we can define the quadratic function at iteration $t$ as:
$$F(\Theta; \widehat \Theta_{t-1}, \alpha_t) = \hat L_\tau(\widehat \Theta_{t-1})  + \inp{\nabla \hat L_\tau(\widehat \Theta_{t-1})}{\Theta - \widehat \Theta_{t-1}} + \frac{\alpha_t}{2} \fnorm{\Theta - \widehat \Theta_{t-1}}^2,$$
with some quadratic parameter $\alpha_t > 0$. This parameter needs to be sufficiently large as we illustrated above such that $\hat L_\tau(\widehat \Theta_t) \leq F(\widehat \Theta_t; \widehat \Theta_{t-1}, \alpha_t)$ holds where
$$\widehat \Theta_t = \arg \min_{\Theta \in \R^{d_1 \times d_2}} F(\Theta; \widehat \Theta_{t-1}, \alpha_t) + \lambda \nunorm{\Theta}.$$
We will use an iterative increment approach on $\alpha_t$ with some multiplier $\psi > 1$ to guarantee the quadratic function $F$ majorizes the objective function $\hat L$ at each descent. This fact ensures the descent of the objective function at each iteration with a closed-formed solution. Specifically, to minimize the penalized isotropic quadratic function, we can deduce the solution in the following ways: for $k >0$, define the soft-thresholding operator on a diagonal matrix $\Sigma = \text{diag}(\{\sigma_i\})$ as $S(\Sigma, k) = \text{diag}(\{\max(\sigma_i-k,0)\})$. For any general matrix $\Theta$ with its SVD decomposition as $\Theta = U \Sigma V^\top$, we write $S(\Theta, k) = U S(\Sigma, k) V^\top$. Then the solution of $\widehat \Theta_t$ can be represented as:

$$\widehat \Theta_t = S(\widehat \Theta_{t-1} - \alpha_t^{-1} \nabla \hat L_\tau(\widehat \Theta_{t-1}), \alpha_t^{-1}  \lambda ).$$




\end{document}
