\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{algorithm}% http://ctan.org/pkg/algorithms
\usepackage{algpseudocode}% http://ctan.org/pkg/algorithmicx

\usepackage{xr-hyper}

\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

\myexternaldocument{tang_327-supp}


%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line and replace
% \usepackage{icml2022} with \usepackage[nohyperref]{icml2022} above.
\usepackage{hyperref}

\usepackage{times}
\usepackage{epsfig}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{bbm,nicefrac}
%\usepackage{algorithm,algorithmic}
%\usepackage[noend]{algpseudocode}
\usepackage{multicol, multirow}
%\usepackage{booktabs}
\usepackage{mathtools}
\usepackage{enumitem, xcolor}


% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

% If accepted, instead use the following line for the camera-ready submission:
% \usepackage[accepted]{icml2022}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{High-Probability Bounds for Robust Stochastic Frank-Wolfe Algorithm}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{\href{mailto:<tyitang@ucdavis.edu>?Subject=Your UAI 2022 paper}{Tongyi Tang}{}}
\author{Krishnakumar Balasubramanian}
\author{Thomas C. M. Lee}
% Add affiliations after the authors
\affil{%
    Department of Statistics\\
    University of California\\
    Davis, CA, USA.
}

  
\begin{document}
\maketitle

\begin{abstract}
We develop and analyze robust Stochastic Frank-Wolfe type algorithms for projection-free stochastic convex optimization problems with heavy-tailed stochastic gradients. Existing works on the oracle complexity of such algorithms require a uniformly bounded variance assumption, and hold only in expectation. We develop tight high-probability bounds for robust versions of Stochastic Frank-Wolfe type algorithm under heavy-tailed assumptions, including infinite variance, on the stochastic gradient. Our methodological construction of the robust Stochastic Frank-Wolfe type algorithms leverage techniques from the robust statistic literature. Our theoretical analysis highlights the need to utilize robust versions of Stochastic Frank-Wolfe type algorithm for dealing with heavy-tailed data arising in practice.
\end{abstract}

\section{Introduction}
In this paper, we consider constrained stochastic optimization problems of the form
\begin{align}\label{eq:main_prob}
\underset{x \in \mathcal{X}}{\arg\min}~~ \{ f(x)  = \mathbb{E}_\xi[F(x, \xi)]= \int F(x, \xi) \, d\mathbb{P}(\xi) \},
\end{align}
where $f$ is a smooth convex function, $\mathcal{X}$ is a closed convex subset of $\mathbb{R}^d$, and $\xi$ is a random vector defined according to a distribution $\mathbb{P}$ on the domain of $\xi$. We denote by $x_*$ the minimizer of $f(x)$ in~\eqref{eq:main_prob}. Problems of the form in~\eqref{eq:main_prob} arise frequently in modern machine learning, including matrix completion~\citep{davenport2016overview, freund2017extended}, structured linear inverse problems~\citep{chandrasekaran2012convex, tewari2011greedy}, multi-task learning~\citep{sener2018multi, zhao2020efficient} and particle filtering~\citep{lacoste2015sequential}. Stochastic Frank-Wolfe (SFW) and Stochastic Conditional Gradient Sliding (SCGS) algorithms are widely used iterative first-order algorithms for solving~\eqref{eq:main_prob}. Each iteration of SFW/SCGS involves performing linear minimization over the constraint set $\mathcal{X}$ based on stochastic first-order (or gradient) information, $\nabla F(\cdot,\xi) \in \mathbb{R}^d$. Depending on the geometry of the constraint set, SFW/SCGS is widely used in practice due to its projection-free nature~\citep{besanccon2021frankwolfe}. Despite its wide-spread usage, our understanding of the oracle complexity of SFW/SCGS algorithms is limited.

As each iteration of the SFW/SCGS algorithm requires access to a Stochastic First-order Oracle (SFO) and a Linear Minimization Oracle (LMO), the oracle complexity is measured by the number of calls to SFO and LMO to obtain an $\epsilon$-optimal solution -- that is, a point $\bar{x}$ such that $f(\bar{x}) - f(x_*) \leq \epsilon$. A majority of the existing results on the oracle complexity of SFW/SCGS algorithm are available only in expectation, i.e., on the metric $\mathbb{E}_\xi[f(\bar{x}) - f(x_*)] \leq \epsilon$. Furthermore, to obtain such oracle complexity results in expectation, it is assumed that stochastic gradient used has uniformly bounded variance (i.e., $\mathbb{E}_\xi [ \| \nabla F(x,\xi) - \nabla f(x)\|^2] \leq \sigma^2$), where $\|\cdot\|$ denotes the Euclidean norm on $\mathbb{R}^d$. This characterization of the oracle complexity provides very little understanding regarding the behaviour of the SFW/SCGS algorithms. In particular, the effect of the properties of the distribution $\mathbb{P}$, on the heavy-tail nature of the stochastic gradient, and consequently on the oracle complexity is \emph{not characterized} by the oracle complexity results in expectation. Furthermore, the oracle complexity of SFW/SCGS algorithm or its robust variants under infinite variance assumption is not known. 

To provide a concrete motivating example, consider the problem of sparse linear regression: Given $(a,y)\in \mathbb{R}^d \times \mathbb{R}$, consider the linear regression model $y= \langle a, \bar{x} \rangle+ \epsilon$, where for two vectors $u, v \in \mathbb{R}^d$, $\langle u, v \rangle$ represents the Euclidean inner product. Here, we let $\epsilon \sim N(0,1)$ and the true parameter $\bar{x}$ is assumed to be $s$-sparse (i.e., it has only $s$ non-zero coordinates out of the $d$ coordinates). The $L_1$-constrained least-squares estimator is then given by
\begin{align}\label{eq:sparselinear}
\underset{x \in \mathcal{X}_1(s)}{\arg\min}~~\mathbb{E}[(y-\langle a, x\rangle)^2],
\end{align}
where $\mathcal{X}_1(s)\coloneqq\{x \in \mathbb{R}^d: \sum_{j=1}^d|x_j| \leq s\}$ is the $L_1$ ball of radius $s$.  This problem fits in the setup of~\eqref{eq:main_prob} with $\xi \coloneqq (a,y)$ and $F(x,\xi)\coloneqq (y-a^\top x)^2$. Hence the stochastic gradient is given by \[G(x,\xi) \coloneqq \nabla F(x,\xi) = 2( aa^\top x - y a) \in \mathbb{R}^d.\] Note that as the iterates of SFW/SCGS algorithms (see, for example, Algorithm~\ref{alg_CGDSC} for the description of SCGS) are in the set $\mathcal{X}_1(s)$, $\| x\|$ is always bounded for all $x$ along the trajectory of the algorithm. Hence, the $(1+\alpha)$-th moment of the stochastic gradient, i.e., $\mathbb{E}[\|G(x, \xi)\|^{(1+\alpha)}]$, is controlled by the order of  $\mathbb{E}[\|a\|^{2(1+\alpha)}]$. When the covariate $a$ follows a zero-mean multivariate $t$-distribution with degrees of freedom in the interval $[2,4)$, or a zero-mean multivariate Pareto distribution with parameter in the interval $[2,4)$, the stochastic gradients have infinite variance. Hence, the existing oracle complexity results for SFW/SCGS are not applicable.

Focusing on the SCGS algorithm~\citep{lan2016conditional}, in this work we develop and analyze robust versions of it, and establish high-probability SFO and LMO complexity bounds. First, under a sub-Gaussian tail assumption on the stochastic gradient, we establish high-probability bounds for the standard SCGS algorithm. However, when the stochastic gradient is heavy-tailed or has infinite variance, the standard SCGS algorithm is sub-optimal. This is due to the fact that each iteration of SCGS algorithm requires a sample average of mini-batch stochastic gradients. When the stochastic gradients are heavy-tailed, it is well-known from classical robust statistics literature that sample averages are poor estimates of the true expectation. Hence, to deal with the case when the stochastic gradient is heavy-tailed or has infinite variance, we construct robust versions of mini-batch stochastic gradient estimates which are subsequently used in the SCGS algorithm. Our motivations for developing robust SCGS algorithms are based on the fact that data arising from various real-world problems (for example, finance, networks, linguistics) are modeled efficiently using heavy-tailed distribution~\citep{resnick2007heavy, fan2014challenges, taleb2020statistical, roughgarden2020beyond}. We establish high-probability bounds on the oracle complexity results on robust SCGS algorithm by developing tight concentration inequalities for heavy-tailed martingales, which might be of independent interest. The established high-probability bounds on the SFO and LMO complexity of SCGS, provide a fine-grained characterization of the effect of the moments/tails of the distribution $\mathbb{P}$ on the performance of SCGS algorithms in terms of both the level of solution accuracy and the confidence specified by the practitioner.    %
\vspace{-0.061in}
\subsection{Related Works}\vspace{-0.051in}
\textbf{Robust statistics:} Robust statistics is a classical topic with too large a literature to summarize completely. We refer the reader to~\cite{huber2004robust} for an overview. We emphasize that a majority of the robust estimators developed in the statistics literature are invariably computationally intractable. The revival of robust statistics in modern mathematical statistics and learning theory communities arguably started with the work of~\cite{catoni2012challenging}. Since then, there has been intense work on robust mean and covariance estimation~\citep{minsker2015geometric, cardot2017online, minsker2018sub, lugosi2019sub, lugosi2019mean, hopkins2020mean}, and robust empirical risk minimization~\citep{hsu2016loss, diakonikolas2019robust, geoffrey2020robust, lecue2020robust}. However, such results are mainly statistical in nature and are not directly applicable for stochastic optimization with heavy-tailed gradients. In this work, we leverage the classical trick of \emph{trimmed estimator} from the robust statistics literature, and the recently proposed optimal mean-estimation technique by~\cite{ cherapanamjeri2020optimal} in the context of projection-free stochastic optimization. 

\textbf{Robust stochastic optimization:} Early works on robust stochastic optimization include~\cite{krasulina1969stochastic, martin1975robust, price1979robust, chen1989robustness, nemirovskij1983problem}. Limit theorems for random iterated maps and time-series with heavy-tails were proved in~\cite{mirek2011heavy, buraczewski2012asymptotics, mikosch2016large}. Such results make restrictive assumptions that are not satisfied by stochastic optimization algorithms. In modern stochastic optimization, several works consider the effect of heavy-tails on the performance of the algorithm~\citep{oliveira2017sample, nazin2019algorithms, wang2021convergence, davis2021low, bartl2021monte, anantharam2012stochastic, holland2021robustness}. There has also been overwhelming theoretical and empirical evidence in the modern machine learning literature that shows the noise in stochastic gradient algorithm could easily turn out to have heavy-tails due to the composite or product nature of the random vectors/matrices in the stochastic gradient~\citep{hodgkinson2020multiplicative,panigrahi2019non, simsekli2019tail, simsekli2020fractional, gurbuzbalaban2021fractional, camuto2021asymmetric, scaman2020robustness, simsekli2020hausdorff}. Furthermore, recently robust mean estimation based and trimming based SGD algorithms were analyzed in~\cite{prasad2020robust} and~\cite{bubeck2013bandits, NEURIPS2020_abd1c782, zhang2020adaptive, mai2021stability, NEURIPS2020_b282d173} respectively. However the above works do not deal with projection-free robust stochastic optimization, which is the main focus of our work.  

\textbf{Frank-Wolfe Algorithms:} The Frank-Wolfe method first proposed by~\cite{frank1956algorithm, levitin1966constrained}, has had a renewed interest in the past decade. We refer the reader to~\cite{jaggi2013revisiting,harchaoui2015conditional, lacoste2015global, beck2017linearly, garber2018fast}, for a partial list of recent works in the deterministic setting.  Considering the stochastic convex setup,~\cite{hazan2012projection, hazan2016variance} provided expected oracle complexity results for SFW algorithm. The complexities were further improved by a sliding procedure in~\cite{lan2016conditional}, based on a modified Frank-Wolfe method by Nesterov's acceleration. It is common in SFW/SCGS analysis to require an increasing batch-size in each step, to obtain oracle complexity results. Recently,~\cite{mokhtari2018stochastic, hassani2019stochastic, zhang2019one} addressed this issue of increasing batch size. But these works require restrictive assumptions, and have sub-optimal LMO complexity. In section~\ref{sec:experiments} we empirically compare against the 1-sample SFW method from~\cite{mokhtari2018stochastic} in the heavy-tailed setting. We also highlight that several works~\citep{reddi2016stochastic, yurtsever2019conditional, hazan2016variance} considered variance reduced versions of SFW and provided expected oracle complexities. In comparison to the above works, we focus on high-probability bounds on the oracle complexity in both the light-tailed and heavy-tailed setting (including infinite variance). 
\vspace{-0.1in}
\subsection{Our Contributions}\label{sec:contributinos} \vspace{-0.05in}

\begin{table*}[ht]
%\vspace{0.2in}
%\hskip-0.15cm
    \centering
    \resizebox{\textwidth}{!}{
\begin{tabular}{|c|c|c|c|c|}
\hline
\textbf{Mean-Estimator} & \textbf{Tails} & \textbf{Theorem}  & \textbf{SFO}  & \textbf{LMO}   \\
 \hline
 \hline
Average & Asmp.~\ref{assumption1}   & Thm.~\ref{theorem_light_tail} & ${\cal O}\left(\left(\tfrac{\log(1/\delta)}{\epsilon}\right)^2\right)$ &  $ {\cal O}\left(\left(\tfrac{\log(1/\delta)}{\epsilon}\right)\right)$\\
\hline
Clipped Grad. &   Asmp.~\ref{assumption2}&  Thm.~\ref{theorem_gc} (a)   &  $\mathcal{O}\left(\Big(\frac{(\log(1/\delta))^{\frac{\alpha}{1+\alpha}}}{\epsilon}\Big)^{\frac{3\alpha+2}{2\alpha}}\right)$ &  $ \mathcal{O}\left(\frac{(\log(1/\delta))^{\frac{\alpha}{1+\alpha}}}{\epsilon}\right)$  \\
\hline
Clipped Grad. & Asmp.~\ref{assumption3} & Thm.~\ref{theorem_gc} (b)    &  $\mathcal{O}\left(\Big(\frac{\sqrt{d}(\log(1/\delta))^{\frac{\beta}{1+\beta}}}{\epsilon}\Big)^{\frac{3\beta+2}{2\beta}}\right)
$   &  $ \mathcal{O}\left(\frac{\sqrt{d}(\log(1/\delta))^{\frac{\beta}{1+\beta}}}{\epsilon}\right)$    \\
\hline
CTBJ~\citep{cherapanamjeri2020optimal}  & Asmp.~\ref{assumption3}&  Thm.~\ref{theorem_optimal_mean}    &  $\mathcal{O}\left(\Big(\frac{(\log(1/\delta))^{\frac{\beta}{1+\beta}}}{\epsilon}\Big)^{\frac{3\beta+2}{2\beta}}\right)  
$    & $ \mathcal{O}\left(\frac{(\log(1/\delta))^{\frac{\beta}{1+\beta}}}{\epsilon}\right)$   \\
\hline
BC Clipped Grad. & Asmp.~\ref{assumption2}&  Thm.~\ref{theorem3} (a)   &  $\mathcal{O}\left(\Big(\frac{C(d, \alpha, \delta)}{\epsilon}\Big)^{\frac{5\alpha+3}{4\alpha}}\right)
$   &  $\mathcal{O}\left(\frac{C(d, \alpha, \delta)}{\epsilon}\right)$    \\
\hline
BC Clipped Grad.& Asmp.~\ref{assumption3}&  Thm.~\ref{theorem3} (b)   & $\mathcal{O}\left(\Big(\frac{C(d, \beta, \delta)}{\epsilon}\Big)^{\frac{5\beta+3}{4\beta}}\right)
$   &  $\mathcal{O}\left(\frac{C(d, \beta, \delta)}{\epsilon}\right)$    \\
\hline
% CGD & Deterministic& Convex &  & \\
 %\hline
% CGD & Deterministic& Non-convex &  & \\
%\hline
\end{tabular}
}
\caption{A summary of the obtained high-probability bounds. All results correspond to the notion of $(\epsilon,\delta)$-optimal solution introduced in Definition~\ref{def:optsol}. The parameters $C(d, \alpha, \delta)$ and $C(d, \beta, \delta)$ are defined in Theorem~\ref{theorem3}. BC stands for Bias-Corrected. The last two rows also require the symmetry condition described in Assumption~\ref{assumption5}.}
\label{tab:summary}
\end{table*}


We now provide a list of theoretical contributions to the literature on oracle complexity of SFW algorithm. To do so, we first introduce the notion of optimality that we consider for solving~\eqref{eq:main_prob}.
\begin{definition}[$(\epsilon,\delta)$-optimal solution]\label{def:optsol}
We call $\bar{x}$ an $(\epsilon,\delta)$-optimal solution of the stochastic optimization problem~\eqref{eq:main_prob}, if we have %\begin{align}\label{eq:edoptimalsol}
$\mathbb{P} \left( f(\bar{x}) - f(x_*) \leq \epsilon(\delta) \right) \geq 1-\delta$. Here, the function $\epsilon (\delta)$ stands for the target accuracy and the parameter $\delta$ stands for the required level of confidence.
%\end{align}
\end{definition}
In contrast to the oracle complexity results obtained in the literature, which are only in expectation, the above definition takes into account a user-specified level of confidence $\delta$, with which we could obtain an $\epsilon$-optimal solution of the stochastic optimization problem~\eqref{eq:main_prob}. This helps obtain a fine-grained characterization of the effect of the moments/tails of $\mathbb{P}$ on the oracle complexity of SFW algorithms. We next list the specific light-tailed and heavy-tailed assumptions that we make on the stochastic first-order oracle.

%For this section, we show the high probability bound for the stochastic Frank-Wolfe algorithm from~\citep{balasubramanian2021zeroth} with $G = \nabla F(w_k, \xi_k)$ and different mean estimators. 

\begin{assumption}[Sub-Gaussian]\label{assumption1}
For any $x \in \mathbb{R}^d$, the SFO outputs an estimator $G(x,\xi)$ of $\nabla f(x)$ such that $\mathbb{E}[G(x,\xi)] = \nabla f(x)$, and $\mathbb{E}[\exp\{\|G(x,\xi) - \nabla f(x)\|^2/\sigma^2\}] \leq \exp\{1\}$, for some $\sigma^2>0$.  
\end{assumption} 

\begin{remark}
The sub-Gaussian tail assumption is satisfied by light-tailed distributions arising in practice, and hence has been used widely in statistics and stochastic optimization~\citep{vershynin2018high, wainwright2019high}. It should be noted that Assumption~\ref{assumption1} implies, by Jensen's inequality, that $\mathbb{E}[\|G(x,\xi) - \nabla f(x)\|^2] \leq \sigma^2$, which is used to obtain oracle complexity results for SFW in expectation. However, the converse is not true.
\end{remark}

%\noindent $Assumption$ $2.1$. 
\begin{assumption}[Weak-exponential]\label{assumption2}
For any $x \in \mathbb{R}^d$, the first-order oracle outputs an estimator $G(x,\xi)$ of $\nabla f(x)$ such that $\mathbb{E}[G(x,\xi)] = \nabla f(x)$, and $\mathbb{E}[\|G(x,\xi)\|^{1+\alpha}] \leq \sigma^{1+\alpha}$ where $\alpha\in(0, 1]$.
\end{assumption}

%\noindent $Assumption$ $2.2$. 
\begin{assumption}[Weak-moment] \label{assumption3}
For any $x \in \mathbb{R}^d$, the first-order oracle outputs an estimator $G(x,\xi)$ of $\nabla f(x)$ such that $\mathbb{E}[G(x,\xi)] = \nabla f(x)$, and for all $\|v\| = 1$, we have $\mathbb{E}[|\langle G(x,\xi) - \nabla f(x), v\rangle|^{1+\beta}] \leq 1
$ for some $\beta \in (0,1]$.
\end{assumption}

%\kb{Tongyi: Either it should be $\alpha$ and some other constant (say $\beta$) in the two assumptions above or $\alpha_1$ and $\alpha_2$. Could you please fix it in all places carefully ?}\tty{Done}

\begin{remark}
Firstly, we point out that both Assumptions~\ref{assumption2} and~\ref{assumption3} allow for the stochastic gradient to have infinite variance (when $\alpha,\beta <1$). Note that in Assumption~\ref{assumption3} the vector $v$ is normalized to 1, so $\beta$ does not appear in the RHS. Without loss of generality, we set the constant on the right-hand side to 1, as any finite positive constant can be absorbed in the $\mathcal{O}$ notation. Furthermore, even when $\alpha,\beta=1$, Assumptions~\ref{assumption2} and~\ref{assumption3} allow for tails heavier than the sub-Gaussian tail in Assumption~\ref{assumption1}. For canonical problems like structured linear or multi-class logistic regression, this assumption allows for the covariate to be heavy-tailed including Pareto or $t$-distribution, which do not satisfy Assumption~\ref{assumption1}. Furthermore, Assumption~\ref{assumption2} is stronger than Assumption~\ref{assumption3}. Indeed if  $\zeta \in \mathbb{R}^d$ is a random vector that satisfies Assumption~\ref{assumption3}, then we have $\mathbb{E}[\|\zeta\|^{1+\beta}]\leq (\pi/2)~d^{\frac{1+\beta}{2}}.$ %\kb{clarify later that $\sigma$ is typically an absolute constant but in this remark we kind of assume it is dimension dependent.}
\end{remark}

With the above preliminaries, we now present our \textbf{main contributions} in this work. \vspace{-0.1in}
\begin{itemize}[noitemsep, leftmargin=0.1in]
\item We first characterize the SFO and LMO complexity, in high-probability, when the standard mini-batch average of the stochastic gradients is used, in Theorem~\ref{theorem_light_tail} under the condition that the distribution of the stochastic gradients follows the sub-Gaussian tail Assumption~\ref{assumption1}.
\item We next establish the SFO and LMO complexity, in high-probability, when using the clipped gradient estimators, under the heavy-tailed Assumptions~\ref{assumption2} and~\ref{assumption3} in Theorem~\ref{theorem_gc} (a) and (b) respectively.
\item Next, in Theorem~\ref{theorem_optimal_mean}, we show that the high-probability oracle complexity results under  Assumption~\ref{assumption3} could be further improved by using a gradient estimator based on a recently proposed optimal mean-estimator procedure in~\citep{cherapanamjeri2020optimal}, at the cost of increased per-iteration complexity. 
\item Finally, we propose a computationally efficient bias-corrected clipped gradient procedure. We show in Theorem~\ref{theorem3} that this approach obtains improved high-probability oracle complexity results (over Theorem~\ref{theorem_gc}) under an additional symmetry condition (see Assumption~\ref{assumption5}) on the distribution of the stochastic gradients, for certain regimes of $\epsilon$ and $d$.
\end{itemize}
A summary of our oracle complexity results is provided in Table~\ref{tab:summary} and visual illustrations are provided in Section~\ref{sec:plotsfortheorem}. To our knowledge, our work provides the \emph{first comprehensive high-probability oracle complexity  results} for SCGS algorithm with light and heavy-tailed stochastic gradients (including infinite variance). We discuss the consequences of our theoretical results for practice by reporting simulations for heavy-tailed sparse linear regression (Section~\ref{sec:experiments}) and multi-class logistic regression (Appendix--Section~\ref{sec:multiclass}). 

\section{Robust Stochastic Frank-Wolfe Algorithms}
Recall that our task is to solve constrained stochastic convex optimization problems of the form in~\eqref{eq:main_prob} using projection-free Frank-Wolfe type algorithm, which involves two main steps: the gradient evaluation step and the linear optimization step. The gradient evaluation step is typically based on averaging a mini-batch of gradients~\citep{hazan2016variance, lan2016conditional, balasubramanian2021zeroth}. It is well-known from classical robust statistics that the sample average is not an accurate estimate of true expectation in the heavy-tailed setting. Hence, a natural strategy is to replace the sample average with appropriate robust versions of mean estimators, and incorporate such estimators in Frank-Wolfe type algorithms. Specifically, we consider the version of stochastic conditional gradient sliding algorithm in~\cite{lan2016conditional}, also analyzed in~\cite{balasubramanian2021zeroth} for the case of biased gradients. We first state a subroutine in Algorithm~\ref{alg_CGDsubroutine} that we subsequently use. 


\begin{algorithm} [h]
	\caption{Inexact Conditional Gradient (ICG) method}
	\label{alg_CGDsubroutine}
	\begin{algorithmic}
    \State Input: $(x, g, \gamma, \mu)$.
    \State Set $\bar{y}_0 =x$, $t=1$, and $\kappa=0$.
    \While{$\kappa =0$}
    \State
$    y_t = \underset{u \in \mathcal{X}}{\arg\min} \left\{  h_{\gamma}(u) :=\langle g + \gamma(\bar{y}_{t-1}- x), u - \bar{y}_{t-1} \rangle \right \}$
    \State \textbf{If} $h_{\gamma}(y_t) \geq -\mu$, set $\kappa=1$. \textbf{Else} $\bar{y}_t  = \tfrac{t-1}{t+1} \bar{y}_{t-1} + \frac{2}{t+1} y_t $ and $t=t+1$.
    \EndWhile
    \State Output $\bar{y}_{t-1}$.
    \end{algorithmic}
\end{algorithm}

Note that Algorithm~\ref{alg_CGDsubroutine} is indeed the SFW algorithm for inexactly solving the following quadratic program
\begin{align} \label{qd_subproblem}
P_{\mathcal{X}}(x,g,\gamma)= \underset{u \in \mathcal{X}}{\arg\min} \left\{ \langle g,u \rangle + \frac{\gamma}{2} \| u -x\|^2 \right\},
\end{align}
which is the standard subproblem of stochastic first-order methods applied to a minimization problem when $g$ is an unbiased stochastic gradient of the objective function at $x$. We now present Algorithm~\ref{alg_CGDSC} which applies the Frank-Wolfe method to inexactly solve subproblems of the stochastic accelerated gradient method. It is well known that the above approach can significantly reduce the total number of calls to the stochastic oracle~\citep{lan2016conditional, balasubramanian2021zeroth}.

\begin{algorithm} [t!]
	\caption{Robust Stochastic Accelerated Gradient Method with Inexact Updates }
	\label{alg_CGDSC}
	\begin{algorithmic}

\State \textbf{Input:} $z_0 = x_0 \in \mathcal{X}$, positive integer sequence $m_k$, and sequences $\alpha_k$, $\gamma_k$, $\mu_k$ and iteration limit $N\geq 1$.

\For{$k = 1, \ldots, N$}% } $k = 1, \cdots, N$ \textbf{do}
\State 1. Set $
w_k = (1-\alpha_k) z_{k-1}+\alpha_k x_{k-1}$
\State 2. Call the stochastic oracle $m_k$ times, and compute (robust) stochastic gradients $\bar{G}_k$ as given by~\eqref{eq:sampleavggrad},~\eqref{eq:clippedgrad},~\eqref{eq:optimalgrad} or~\eqref{eq:bcclippedgrad}.
\State 3. Set $x_k = ICG(x_{k-1}, \bar{G}_k, \gamma_k, \mu_k)$, where $ICG(\cdot)$ is the output of Algorithm~\ref{alg_CGDsubroutine} with input $(x_{k-1}, \bar{G}_k, \gamma_k, \mu_k)$.
\State 4. Set $z_k  = (1- \alpha_k) z_{k-1} + \alpha_k x_k$.
\EndFor
\State \textbf{Output:} $z_N$
\end{algorithmic}
\end{algorithm}


\subsection{Robust Mean Estimators}
Note that step 2 in Algorithm~\ref{alg_CGDSC} requires the gradient estimator denoted by $\bar{G}_k$. We now elaborate on the robust stochastic gradient estimators that we propose to use in this work. Recall that the standard analysis of the SCGS algorithm takes the sample average of a mini-batch of i.i.d.\ stochastic gradients (obtained by querying the SFO) in each iteration. In this case, the gradient estimator is given by
\begin{align}\label{eq:sampleavggrad}
\bar{G}_k \coloneqq \frac{1}{m_k} \sum_{j=1}^{m_k}  G(w_k, \xi_{k,j}).
\end{align}
As we will see from our analysis, the above gradient estimator is not robust to heavy-tails, i.e., when the vectors $G(w_k, \xi_{k,j})$ are heavy-tailed. Our first robust stochastic gradient estimator is based on the idea of trimmed or clipped estimators, generalized to the multivariate setting~\citep{tukey1963less, bickel1965some, huber2004robust, stigler1973asymptotic}. More recently such ideas have been used in the context of bandit optimization~\citep{bubeck2013bandits} and stochastic gradient descent algorithms~\citep{NEURIPS2020_abd1c782, NEURIPS2020_b282d173, zhang2020adaptive, mai2021stability}. Formally, in our setting, given i.i.d.\ stochastic gradients $\{G(w_k, \xi_{k,j}) \}_{j=1}^{m_k}$, the \emph{clipped gradient} estimator is defined, for some $\delta\in (0,1)$, as
\begin{align}\label{eq:clippedgrad}
&~ \bar{G}_k \coloneqq \frac{1}{m_k}\sum_{j=1}^{m_k} \left[ G(w_k, \xi_{k,j})~ \mathbbm{1}\left\{ A_j \right\}\right] \\
&~ \text{with}\quad A_j \coloneqq \left\{ \|G(w_k, \xi_{k,j})\| \leq \left(\frac{j \sigma^{1+\alpha} }{\log(1/\delta)}\right)^{\tfrac{1}{1+\alpha}} \right\}. \nonumber
\end{align}

%\begin{align}\label{eq:clippedgrad}
%&~ \bar{G}_k \coloneqq \nonumber\\
%&~ \frac{1}{m_k}\sum_{j=1}^{m_k} \left[ G(w_k, \xi_{k,j})~ %\mathbbm{1}\left\{\|G(w_k, \xi_{k,j})\| \leq \left(\frac{j \sigma^{1+\alpha} }{\log(1/\delta)}\right)^{\tfrac{1}{1+\alpha}}\right\}\right].
%\end{align}


Here, $\mathbbm{1}\{A\}$ denotes the indicator of the event $A$, taking value $1$ when the event $A$ is true and taking value $0$ otherwise. While the above estimator handles heavy-tailed stochastic gradients (including ones with potentially infinite variance, as allowed by Assumption~\ref{assumption2}), it turns out that the oracle complexity under the even weaker condition in Assumption~\ref{assumption3} with the above clipped gradient estimator is sub-optimal. To improve the oracle complexity under Assumption~\ref{assumption3}, we leverage the recent optimal robust mean estimation procedure proposed in~\cite{cherapanamjeri2020optimal}, which we call the CTBJ procedure. Given i.i.d.\ stochastic gradients $\{G(w_k, \xi_{k,j}) \}_{j=1}^{m_k}$, the CTBJ estimator is given by (see Algorithm~\ref{alg:mean_estimation})
\begin{align}\label{eq:optimalgrad}
 \bar{G}_k \coloneqq \textsc{optimalmeanest}\left(\{G(w_k, \xi_{k,j}) \}_{j=1}^{m_k}\right).
\end{align}
Roughly, the idea of~\cite{cherapanamjeri2020optimal} is to use filtering to remove outliers and then compute the median by gradient descent procedures. A full description of the procedure is provided in Appendix--Section~\ref{alg:optimal_mean} for the sake of completeness. We also emphasize that while the estimator for the light-tailed case in~\eqref{eq:sampleavggrad} is unbiased, the robust gradient estimators in~\eqref{eq:clippedgrad} and~\eqref{eq:optimalgrad} are biased. This is another challenge that we handle in our analysis. Finally, we also remark that in Section~\ref{sec:biascorrected}, we introduce a bias-corrected  clipped gradient procedure which achieves improved oracle complexities under an additional symmetry assumption on the distribution on stochastic gradients.



\iffalse
\textbf{Gradient Clipping} Let $\delta\in (0,1)$, consider the truncated empirical mean $\bar{G}$ defined as 
\begin{align}\label{eq:clippedgrad}
\bar{G} = \frac{1}{m}\sum_{t=1}^m G_t\mathbbm{1}\left\{\|G_t\| \leq \left(\frac{t \sigma^{1+\alpha} }{\log(1/\delta)}\right)^{\frac{1}{1+\alpha}}\right\} 
\end{align}
\fi



\section{High-Probability Bounds}\label{sec:hpbounds}
We now provide our main results on high-probability bounds on the oracle  complexity of Algorithm~\ref{alg_CGDSC}. To do so, we also make the following standard smoothness assumption about the stochastic gradient, which is common in the literature of smooth convex optimization~\citep{nesterov2018lectures, lan2016conditional, balasubramanian2021zeroth}.
%\noindent $Assumption$ $3$. 

\begin{assumption}\label{assumption4}
The objective function $f$ has Lipschitz continuous gradient with constant $L>0$, i.e., $\|\nabla f(y) - \nabla f(x)\|\leq L\|y-x\|$ for all $x,y \in \mathbb{R}^d$. %and $\|G(y, \xi)-G(x, \xi)\| \leq L\|y-x\|$ almost surely, for any $x,y \in \mathbb{R}^d$. %\kb{ second condition really needed?}
\end{assumption}
We first state a preliminary result about the iterates of Algorithm~\ref{alg_CGDSC}, under Assumption~\ref{assumption4}. 
\begin{lemma}\label{templemma}
%\kb{Tongyi: can you polish this lemma in our notations ? You can use this lemma and merge it with your analysis for the high-probability bound (i.e., subsequent theorems)}\tty{Done}
Let $\{z_k\}_{k\geq 1}$ be generated by Algorithm \ref{alg_CGDSC} and the function $f$ be convex. Let $\bar{\Delta}_k \coloneqq \bar{G}_k - \nabla f(w_k)$, $\hat\Gamma_k \coloneqq \prod_{i=2}^k (1-\alpha_i)$, $\hat\Gamma_1\coloneqq1$ and $D_0 = \| x_0 - x_* \|^2$. Then under Assumption \ref{assumption4}, we have 
\begin{align}\label{eq:mainlemmaeq}
\frac{f(z_N)-f(x_*)}{\hat\Gamma_N} &~\leq \frac{\gamma_1}{2}\|x_0-x_*\|^2 
+ \sum_{k=1}^N\frac{\alpha_k\mu_k}{\hat\Gamma_k} \nonumber \\ 
&~ \hspace*{-1cm} + \sum_{k=1}^N  \frac{\alpha_k}{\hat\Gamma_k} \langle \bar{\Delta}_k, x_*-x_{k-1}\rangle + \sum_{k=1}^N\frac{\|\bar{\Delta}_k\|^2}{2L\hat\Gamma_k}.
\end{align}
%\begin{align}\label{eq:mainlemmaeq}
%\frac{f(z_N)-f(x_*)}{\hat\Gamma_N} &~\leq \frac{\gamma_1}{2}\|x_0-x_*\|^2 \nonumber
%\\ &~ + \sum_{i=1}^N\frac{\alpha_k\mu_k}{\hat\Gamma_k} + \sum_{i=1}^N  %\frac{\alpha_k}{\hat\Gamma_k} \langle \bar{\Delta}_k, x_*-x_{k-1}\rangle %\nonumber \\
%&~ + \sum_{k=1}^N\frac{\|\bar{\Delta}_k\|^2}{2L\hat\Gamma_k}.
%\end{align}
For our subsequent analysis, we set 
\begin{align}\label{eq:para}
    \alpha_k = \frac{2}{k+1}, \quad \gamma_k = \frac{4L}{k}, \quad\text{and}\quad \mu_k = \frac{LD_0}{kN}.
\end{align}
%\begin{eqnarray*}
%f(z_k) \leq (1-\alpha_k)f(z_{k-1}) + \alpha_k f(x_*) &+& \alpha_k[\mu_k + \frac{2L\alpha_k-\gamma_k}{2}\|x_k-x_{k-1}\|^2 + \langle %\bar{\Delta}_k, x_*-x_{k-1}\rangle] \\
%&+& \frac{\alpha_k\gamma_k}{2}[\|x_{k-1}-x_*\|^2-\|x_k-x_*\|^2] + \frac{\|\bar{\Delta}_k\|^2}{2L}
%\end{eqnarray*}
\end{lemma}
%The above lemma essentially appears as an intermediate result in the analysis of zeroth-order SFW in~\citep{balasubramanian2021zeroth}. We provide a proof in Appendix~\ref{sec:hpbounds} for the sake of completeness. 

The proof of Lemma~\ref{templemma} is provided in Appendix--Section~\ref{sec:proofs}; the lemma is, up to minor changes, an intermediate result in the proof of Theorem 3.1 in~\cite{balasubramanian2021zeroth}. Our high-probability bounds are now based on developing concentration inequalities for the various gradient estimators $\bar{G}_k$ and bounding the terms appearing on the right-hand side of~\eqref{eq:mainlemmaeq} with high probability. To do so, we prove novel user-friendly concentration inequalities for (scalar-valued) martingales with heavy-tails that are discussed in detail in Section~\ref{sec:martingalesection}. It is worth mentioning that~\cite{lesigne2001large} and~\cite{fan2017deviation} also consider tail bounds for heavy-tailed martingales. However, they only provide deviation inequalities and their assumptions do not cover the regimes of heavy-tails that we are interested in.




\subsection{Oracle Complexity with Sample Average Estimator}
We now provide oracle complexity results for Algorithm~\ref{alg_CGDSC} with the sample average gradient estimator in~\eqref{eq:sampleavggrad}, that hold in high-probability. 
\begin{theorem}\label{theorem_light_tail}
Let Algorithm \ref{alg_CGDSC} be run with $\bar{G}_k$ as in~\eqref{eq:sampleavggrad}, and with parameters $\alpha_k, \gamma_k$ and $\mu_k$ as in \eqref{eq:para}. If the stochastic gradients $G(x,\xi)$ satisfy Assumptions~\ref{assumption1} and~\ref{assumption4}, and $m_k = \mathcal{O}(N^3)$, then 
\begin{align*}
\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\sigma^2 \log ({1}/{\delta})}{N(N+1)}\right) \geq 1-\delta,
\end{align*}
and the SFO and LMO complexity are respectively bounded by
\begin{align*}
\mathcal{O}\left(\Big(\frac{\log(1/\delta)}{\epsilon}\Big)^2\right) \quad\text{and}\quad  \mathcal{O}\left(\frac{\log(1/\delta)}{\epsilon}\right).
\end{align*}
\end{theorem}
\begin{remark}
Note that~\citet{lan2016conditional} show that the SFO and LMO complexities for Algorithm~\ref{alg_CGDSC} with the sample average gradient estimator are of order $\mathcal{O}(1/\epsilon^2)$ and $\mathcal{O}(1/\epsilon)$, respectively, in expectation. Our results in Theorem~\ref{theorem_light_tail} generalize this to the high-probability setting, quantifying the effect of the allowed confidence level $\delta$ precisely. 
\end{remark}

\subsection{Oracle Complexity with Clipped Gradient Estimator}
We first provide results on the bias, tail and moment bounds on the clipped gradient estimator $\bar{G}_k$ defined in~\eqref{eq:clippedgrad}. Then, we provide oracle complexity results for Algorithm~\ref{alg_CGDSC} with the clipped gradient estimator, that hold with high-probability. 
\begin{lemma}\label{lemma1}
Let $\delta\in (0,1)$ and $C$ be a positive universal constant. Let $\bar{G}_k$ be as defined in~\eqref{eq:clippedgrad}. 
\begin{enumerate}[noitemsep,leftmargin=0.21in] 
\item [(a)] If the stochastic gradients $G(x,\xi)$ satisfy Assumption~\ref{assumption2}, then we have  
\begin{align*}
\left\|\mathbb{E}[\bar{G}_k] - \nabla f(w_k)\right\| \leq \sigma\left(\frac{\log(1/\delta)}{m_k}\right)^{\tfrac{\alpha}{1+\alpha}} \\
~~~\text{and}~~~\mathbb{P}\left(\left\|\bar{\Delta}_k\right\| \geq 4\sigma\left(\frac{\log(1/\delta)}{m_k}\right)^{\tfrac{\alpha}{1+\alpha}}\right) \leq \delta.
\end{align*}
Consequently, we also have the following moment bound $\mathbb{E}\left[\exp\left\{\left\|\frac{\bar{\Delta}_k}{\sigma}\right\|^\frac{1+\alpha}{\alpha}m_k\right\}\right] \leq C.$
\item [(b)] If the stochastic gradients $G(x,\xi)$ satisfy Assumption~\ref{assumption3}, then we have
\begin{align*}
\left\|\mathbb{E}[\bar{G}_k] - \nabla f(w_k)\right\| \leq \sqrt{d}\left(\frac{\log(1/\delta)}{m_k}\right)^{\tfrac{\beta}{1+\beta}} \\ ~~~\text{and}~~~\mathbb{P}\left(\|\bar{\Delta}_k\| \geq 4\sqrt{d}\left(\frac{\log(1/\delta)}{m_k}\right)^{\tfrac{\beta}{1+\beta}}\right) \leq \delta.
\end{align*}
Consequently, we also have the following moment bound $\mathbb{E}\left[\exp\left\{\left\|\frac{\bar{\Delta}_k}{\sqrt{d}}\right\|^\frac{1 + \beta}{\beta}m_k\right\}\right] \leq C.$
\end{enumerate}
\end{lemma}

%The proof of Lemma~\ref{lemma1} part b follows immediately by Lemma \ref{lemma1} (with  $d=1$) and union bound.

%\begin{remark}
%Using the proof similar as $1\rightarrow 2$ in Lemma~\ref{app_lemma1}, we immediately obtain the following moment bound $\mathbb{E}\left[\exp\left\{\left\|\frac{\bar{G}_k - \nabla f(w_k)}{\sigma}\right\|^\frac{1+\alpha}{\alpha}m_k\right\}\right] \leq C.$
%\begin{align*}
%%\end{align*}
%\end{remark}

\begin{theorem}\label{theorem_gc}
Let Algorithm~\ref{alg_CGDSC} be run with $\bar{G}_k$ as defined in \eqref{eq:clippedgrad}, and with parameters $\alpha_k, \gamma_k$ and $\mu_k$ as defined in \eqref{eq:para}. 
\begin{enumerate}[noitemsep,leftmargin=0.21in] 
\item [(a)]  
%
% a(\delta, \omega) defined by Thomas
%
Define $a(\delta,\omega)=(\log (1/\delta))^{\tfrac{\omega}{1+\omega}}$ and hence $a(\delta,\alpha)=(\log (1/\delta))^{\tfrac{\alpha}{1+\alpha}}$.
If the stochastic gradients $G(x,\xi)$ satisfy Assumptions~\ref{assumption2} and \ref{assumption4} and $m_k = \mathcal{O}(N^{\frac{2(\alpha+1)}{\alpha}})$, then  
%\begin{align*}
%\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\sigma\max\left\{(\log %(1/\delta))^{\tfrac{\alpha}{1+\alpha}}, \frac{\sigma}{N}(\log %(1/\delta))^{\tfrac{2\alpha}{1+\alpha}}\right\}}{N(N+1)}\right) \geq %1-\delta,
%\end{align*}
%{\scriptsize
%\begin{align*}
%\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\sigma\max\left\{a(\delta,\alpha),\frac{\sigma}{N}a(\delta,\alpha)^2\right\}} {N(N+1)}\right) \geq 1-\delta,
%\end{align*}
%}
{\small
\begin{align*}
& \mathbb{P}\left(f(z_N) - f(x_*) \leq 
\frac{D_0\sigma\max\left\{a(\delta,\alpha),\frac{\sigma}{N}a(\delta,\alpha)^2\right\}} {N(N+1)}\right) \\
& \geq 1-\delta,
\end{align*}
}
and the SFO and LMO complexity are respectively bounded by
\begin{align*}
\mathcal{O}\left(\left(\frac{
a(\delta,\alpha)
}
{\epsilon}\right)^{\tfrac{3\alpha+2}{2\alpha}}\right) \quad\text{and}\quad \mathcal{O}\left(\frac{
a(\delta,\alpha)
}
{\epsilon}\right).
%\mathcal{O}\left(\left(\frac{(\log({1}/{\delta}))^{\tfrac{\alpha}{1+\alpha}}}{\epsilon}\right)^{\tfrac{3\alpha+2}{2\alpha}}\right) \quad\text{and}\quad \mathcal{O}\left(\frac{(\log({1}/{\delta}))^{\tfrac{\alpha}{1+\alpha}}}{\epsilon}\right).
\end{align*}
\item [(b)] 
If the stochastic gradients $G(x,\xi)$ satisfy Assumptions \ref{assumption3} and \ref{assumption4} and $m_k =\mathcal{O}( N^{\frac{2(\beta+1)}{\beta}})$, then
{\small
\begin{align*}
& \mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\sqrt{d}\max\left\{
a(\delta, \beta),
\frac{\sqrt{d}}{N}
a(\delta, \beta)^2
\right\}}{N(N+1)}\right) \\
& \geq 1-\delta,
\end{align*}
}
%{\scriptsize
%$$
%\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\sqrt{d}\max\left\{a(\delta, \beta), \frac{\sqrt{d}}{N} a(\delta, \beta)^2 \right\}}{N(N+1)}\right) \geq 1-\delta,
%$$
%}
%$$
%\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\sqrt{d}\max\left\{(\log ({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}, \frac{\sqrt{d}}{N}(\log ({1}/{\delta}))^{\tfrac{2\beta}{1+\beta}}\right\}}{N(N+1)}\right) \geq 1-\delta,
%$$
and the SFO and LMO complexity are, respectively, bounded by
$$
\mathcal{O}\left(\left(\frac{\sqrt{d}
a(\delta, \beta)
%(\log({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}
}
{\epsilon}\right)^{\tfrac{3\beta+2}{2\beta}}\right) \quad\text{and}\quad  \mathcal{O}\left(\frac{\sqrt{d}
a(\delta, \beta)
%(\log({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}
}{\epsilon}\right),
$$
where $a(\delta,\beta)=(\log (1/\delta))^{\tfrac{\beta}{1+\beta}}$.

%$$
%\mathcal{O}\left(\left(\frac{\sqrt{d}(\log({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}}{\epsilon}\right)^{\tfrac{3\beta+2}{2\beta}}\right), \quad\text{and}\quad  \mathcal{O}\left(\frac{\sqrt{d}(\log({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}}{\epsilon}\right).
%$$
\end{enumerate}
\end{theorem}

\begin{remark}
First note that the oracle complexities under the weaker condition in Assumption~\ref{assumption3} have an additional dimension factor $\sqrt{d}$. Hence, for a fixed value of $\delta$, for $\alpha=1$ and $\beta=1$ (i.e., finite variance case), we have the SFO complexity to be of order $\mathcal{O}(\epsilon^{-5/2})$ and $\mathcal{O}(d^{5/4}\epsilon^{-5/2})$ respectively. Furthermore, note that under our assumptions, only the $(1+\alpha)$-th or $(1+\beta)$-th moment of the stochastic gradients is assumed to exist. Consequently, as $\alpha \to 0$ or $\beta \to 0$, for a fixed value of $0 < \epsilon < 1$ and $\delta$, the SFO complexity tends to infinity, highlighting the difficulty of the problem.
\end{remark}


\subsection{Oracle Complexity with CTBJ-based Gradient Estimator}

In this section, we will use the mean estimator procedure proposed recently in~\citep{cherapanamjeri2020optimal}, and show that the dimension factor $\sqrt{d}$ appearing in the SFO complexity in part (b) of Theorem~\ref{theorem_gc} could be removed, even under the weaker condition in Assumption~\ref{assumption3}. 
\begin{theorem}\label{theorem_optimal_mean}
Let Algorithm \ref{alg_CGDSC}  be run with $\bar{G}_k$ as defined in~\eqref{eq:optimalgrad}, and with parameters $\alpha_k$, $\gamma_k$ and $\mu_k$ as defined in~\eqref{eq:para}. If the stochastic gradients $G(x,\xi)$ satisfy Assumptions \ref{assumption3} and \ref{assumption4}, and $m_k = \mathcal{O}(N^{\frac{2(\beta+1)}{\beta}})$, then with a target confidence $\delta > 2^{-\frac{m_k}{16000}}$ and $d\lesssim \log(1/\delta)$, we have
{\small
\begin{align*}
\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\max\left\{
a(\delta, \beta),
\frac{1}{N}
a(\delta, \beta)^2
\right\}}{N(N+1)}\right) \geq 1-\delta,
\end{align*}
}
%\begin{align*}
%\mathbb{P}\left(f(z_N) - f(x_*) \leq \frac{D_0\max\left\{(\log ({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}, \frac{1}{N}(\log ({1}/{\delta}))^{\tfrac{2\beta}{1+\beta}}\right\}}{N(N+1)}\right) \geq 1-\delta,
%\end{align*}
and the SFO and LMO complexity are respectively bounded by
\begin{align*}
\mathcal{O}\left(\left(\frac{
a(\delta,\beta)
}
{\epsilon}\right)^{\tfrac{3\beta+2}{2\beta}}\right) \quad \text{and} \quad  \mathcal{O}\left(\frac{
a(\delta,\beta)
}{\epsilon}\right).
\end{align*}
%\begin{align*}
%\mathcal{O}\left(\left(\frac{(\log({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}}{\epsilon}\right)^{\tfrac{3\beta+2}{2\beta}}\right), \quad \text{and} \quad  \mathcal{O}\left(\frac{(\log({1}/{\delta}))^{\tfrac{\beta}{1+\beta}}}{\epsilon}\right).
%\end{align*}
\end{theorem}

\begin{remark}
The proof of the above theorem is based on a concentration result for the gradient estimator~\eqref{eq:optimalgrad}, established in~\citep{cherapanamjeri2020optimal}. In comparison to part (b) of Theorem~\ref{theorem_gc}, the SFO complexity in Theorem~\ref{theorem_optimal_mean} under Assumption~\ref{assumption3} does not have the additional dimensional factor $\sqrt{d}$, thereby demonstrating the benefits of using the mean-estimation procedure proposed in~\citep{cherapanamjeri2020optimal}. However, this improvement does not come for free, as the per-iteration complexity of using the robust mean-estimator~\eqref{eq:optimalgrad} is significantly higher than that of the clipped gradient based robust mean-estimator in~\eqref{eq:clippedgrad}, although it has a polynomial dependency on the problem parameters. See Section~\ref{alg:optimal_mean} for details regarding the per-iteration computational complexity of~\eqref{eq:optimalgrad}. 
\end{remark}

\subsection{Improving the Oracle Complexity}\vspace{-0.1in}
\label{sec:biascorrected}
%In addition, the CTBJ based robust gradient estimators have increased per-iteration complexity. 
The above oracle complexity results based on clipped gradient based and CTBJ based robust gradient estimators have the following drawbacks. The $\epsilon$-dependency of the SFO complexity in part (a) of Theorem~\ref{theorem_gc} under Assumption~\ref{assumption2} or Theorem~\ref{theorem_optimal_mean} under Assumption~\ref{assumption3} is $\mathcal{O}(\epsilon^{-2.5})$ when $\alpha=1$ and $\beta=1$. In this section, we propose a bias-corrected clipped gradient based robust gradient estimation procedure with which SFO complexity of Algorithm~\ref{alg_CGDSC} could be improved to $\mathcal{O}(d^2 \epsilon^{-2})$ under an additional symmetry assumption on the distribution of the stochastic gradient. Hence, when high-accuracy solutions in low-dimensional settings are required, the bias-corrected clipped gradient based robust SFW algorithm could be preferred. We now introduce the symmetry assumption and the clipped gradient procedure. 

\begin{assumption}\label{assumption5}
Let the distribution of $G(x,\xi)$ be continuous and symmetric about $\mathbb{E}[G(x,\xi)] = \nabla f(x)$, for all $x \in \mathbb{R}^d$. Also, let the probability density function be a  decreasing function with respect to $\|G(x,\xi)-\nabla f(x)\|$, for all $x \in \mathbb{R}^d$.
\end{assumption}

\begin{proposition}\label{proposition1}
For iteration $k$ the \texttt{\textrm{i.i.d.}} stochastic gradients $G(w_k, \xi_{k,j})$ are assumed to satisfy Assumptions~\ref{assumption3} and \ref{assumption5}. Let $\delta\in (0,1)$. Consider the initial estimate defined as 
\begin{align}\label{init_mean}
& \widehat{G}_k \coloneqq \underset{{G(w_k,\xi_{k,j}): j\geq\frac{m_k}{2}}}{\arg\min}~~\min\Bigg\{r\geq 0: \nonumber \\
& \sum_{\ell=\frac{m_k}{2}}^{m_k}\mathbbm{1}\left\{\|G(w_k, \xi_{k,\ell})-G(w_k, \xi_{k,j})\|\leq r\right\}\geq 0.3 m_k\Bigg\},
\end{align}
%\begin{equation}\label{init_mean}
%\widehat{G}_k \coloneqq \underset{{G(w_k,\xi_{k,j}): j\geq\frac{m_k}{2}}}{\arg\min}~~\min\left\{r\geq 0:\sum_{\ell=\frac{m_k}{2}}^{m_k}\mathbbm{1}\left\{\|G(w_k, \xi_{k,\ell})-G(w_k, \xi_{k,j})\|\leq r\right\}\geq 0.3 m_k\right\},
%\end{equation}
and consider the bias-corrected clipped gradient estimator $\bar{G}_k$ defined as 
\begin{align}\label{eq:bcclippedgrad}
& \bar{G}_k \coloneqq \frac{2}{m_k}\sum_{t=1}^{m_k/2} \min\left\{\frac{\bigg[\Big(\frac{ t}{\log(1/\delta)}\Big)^{\tfrac{1}{1+\beta}} + 24\bigg]\sqrt{d}}{\|G(w_k, \xi_{k,t})-\widehat{G}_k \|}, 1 \right\} \times \nonumber \\
& \quad \quad \quad \left[G(w_k, \xi_{k,t}) - \widehat{G}_k\right] + \widehat{G}_k.
\end{align}
%\begin{align}\label{eq:bcclippedgrad}
%\bar{G}_k \coloneqq \frac{2}{m_k}\sum_{t=1}^{m_k/2} \min\left\{\frac{\bigg[\Big(\frac{t}{\log(1/\delta)}\Big)^{\tfrac{1}{1+\beta}} + 24\bigg]\sqrt{d}}{\|G(w_k, %\xi_{k,t})-\widehat{G}_k \|}, 1 \right\}  \left[G(w_k, \xi_{k,t}) - %\widehat{G}_k)\right] + \widehat{G}_k.
%\end{align}
%\begin{align}\label{eq:bcclippedgrad}
%\bar{G}_k \coloneqq \frac{2}{m_k}\sum_{t=1}^{m_k/2} G(w_k, \xi_{k,t})\mathbbm{1}\left\{\|G(w_k, \xi_{k,t})-\widehat{G}_k \| \leq \bigg[\Big(\frac{ t}{\log(1/\delta)}\Big)^{\tfrac{1}{1+\beta}} + 24\bigg]\sqrt{d}\right\} 
%\end{align}
Then, as long as $m_k\geq 2\max\{50, 24^{1+\beta}\}\log(1/\delta)$, by recalling that  $\bar{\Delta}_k = \bar{G}_k - \nabla f(w_k)$, we have
\begin{align*}
\mathbb{E}[\bar{G}_k] & = \nabla f(w_k) %\\
\quad \text{and} \quad \\
\mathbb{P} \Big\{ \| \bar{\Delta}_k \| & \leq 8\pi\sqrt{d}\left(\frac{\log(1/\delta)}{m_k}\right)^{\tfrac{\beta}{1+\beta}} \Big\} \geq 1-\delta.
\end{align*}
\end{proposition}
\begin{remark}
The initial estimate defined in~\eqref{init_mean} is the same as that in the CTBJ estimator in~\eqref{eq:optimalgrad}. We show that this initial step, along with the clipped gradient procedure for a specific choice of clipping parameter (as defined in~\eqref{eq:bcclippedgrad}) helps obtain an unbiased gradient estimator which is sufficiently concentrated.
\end{remark}
We now leverage the result in Proposition~\ref{proposition1} and show that one could obtain improved SFO complexity for certain ranges of $d$ and $\epsilon$ when running Algorithm~\ref{alg_CGDSC} with the robust gradient estimator~\eqref{eq:bcclippedgrad}. 

\begin{figure*}[h]
\includegraphics[scale=0.26]{updatedexperiments/sfw1/plot_abs_value_pareto_dim_100_iter_20000_L_10.png}
\includegraphics[scale=0.26]{updatedexperiments/sfw1/plot_abs_value_stu_t_dim_100_iter_20001_L_10.png}
\includegraphics[scale=0.26]{updatedexperiments/sfw1/plot_abs_value_pareto_dim_500_iter_100000_L_20.png}
\includegraphics[scale=0.26]{updatedexperiments/sfw1/plot_abs_value_stu_t_dim_500_iter_100000_L_20.png}\\
\includegraphics[scale=0.26]{updatedexperiments/sfw1/hist_abs_value_pareto_dim_100_iter_20000_25_one_L_10.png}
\includegraphics[scale=0.26]{updatedexperiments/sfw1/hist_abs_value_stu_t_dim_100_iter_20001_50_one_L_10.png}
\includegraphics[scale=0.26]{updatedexperiments/sfw1/hist_abs_value_pareto_dim_500_iter_100000_100_one_L_20.png}
\includegraphics[scale=0.26]{updatedexperiments/sfw1/hist_abs_value_stu_t_dim_500_iter_100000_100_one_L_20.png}
\caption{The two left and two right columns correspond to Pareto, Student-$t$ distributions with $d=100$ and $d=500$ respectively. \textbf{Top row:} Mean (solid lines) and Median (dotted lines) over 100 trials of iterations versus $f(z_N) - f(x_*)$ for $N=100$. \textbf{Bottom row:} Histogram of $f(z_N) - f(x_*)$ for $N = 100$. Numbers in the legend correspond to \emph{heavy-tail index}/\emph{standard deviation}. \vspace{-0.1in} } %It took 1 hour on a 2011 MacBook Pro to produce the above plots.} 
\label{fig:linearregexp}
\end{figure*}

%\noindent $Theorem$ $3$. 
\begin{theorem}\label{theorem3}
Let Algorithm~\ref{alg_CGDSC} be run with $\bar{G}_k$ as defined in~\eqref{eq:bcclippedgrad}, and with parameters $\alpha_k$, $\gamma_k$ and $\mu_k$ as defined in~\eqref{eq:para}.
\begin{enumerate}[noitemsep,leftmargin=0.21in] 
\item [(a)]  If the stochastic gradients $G(x,\xi)$ satisfy Assumptions \ref{assumption2}, \ref{assumption4} and \ref{assumption5}, and $m_k = \mathcal{O}(N^{\frac{3(\alpha+1)}{2\alpha}})$, we have for $(\log ({1}/{\delta}))^{\frac{1}{1+\alpha}}\geq [\Gamma(\frac{\alpha}{1+\alpha})\frac{1+\alpha}{\alpha}]^{\frac{1}{1-\alpha}}$,
\begin{align}\label{eq:conc}
\mathbb{P}\left(f(z_N) - f(x_*) \geq \frac{C(d, \alpha, \delta)D_0\sigma}{N(N+1)}\right) \leq \delta,
\end{align}
where
\begin{align*}
C(d, \alpha, \delta) = \max\left\{\sqrt{d}
a(\delta,\alpha)^{\tfrac{1}{\alpha}},
a(\delta,\alpha)^2
\right\},
\end{align*}
%\begin{align*}
%C(d, \alpha, \delta) = \max\left\{\sqrt{d}\Big(\log %\big({1}/{\delta}\big)\Big)^{\tfrac{1}{1+\alpha}}, d\Big(\log %\big({1}/{\delta}\big)\Big)^{\tfrac{2\alpha}{1+\alpha}}\right\},
%\end{align*}
and the SFO and LMO complexity are respectively bounded by
\begin{align*}
\mathcal{O}\left(\Big(\frac{C(d, \alpha, \delta)}{\epsilon}\Big)^{\frac{5\alpha+3}{4\alpha}}\right) \quad \text{and} \quad \mathcal{O}\left(\frac{C(d, \alpha, \delta)}{\epsilon}\right).
\end{align*}
\item [(b)] If the stochastic gradients $G(x,\xi)$ satisfy Assumptions \ref{assumption3}, \ref{assumption4} and \ref{assumption5}, and $m_k = \mathcal{O}(N^{\frac{3(\beta+1)}{2\beta}})$, the same conclusion in~\eqref{eq:conc} holds with $\alpha$ replaced by $\beta$, with $C(d, \beta, \delta)\coloneqq C(d, \alpha=\beta, \delta)$, for $(\log ({1}/{\delta}))^{\frac{1}{1+\beta}}\geq [\Gamma(\frac{\beta}{1+\beta})\frac{1+\beta}{\beta}]^{\frac{1}{1-\beta}}$.

\iffalse
\item  If $G$ satisfies Assumption \ref{assumption3} and set $m_k = N^{\frac{3(\beta+1)}{2\beta}}$, we have 
$$
\mathbb{P}\left(f(z_N) - f(x_*) \geq \frac{C(d, \beta, \delta)D_0}{N(N+1)}\right) \leq \delta
$$
where
$$
C(d, \beta, \delta) = \max\left\{\sqrt{d}\Big(\log \big(1/\delta\big)\Big)^{\frac{1}{1+\beta}}, \min\bigg\{d\Big(\log \big(1/\delta\big)\Big)^\frac{2\beta}{1+\beta}, d^\frac{2\beta}{1+\beta}\Big(\log\big(\frac{d}{\delta}\big)\Big)^\frac{2}{1+\beta}\bigg\}\right\}
$$
when $(\log (1/\delta))^{\frac{1}{1+\beta}}\geq [\Gamma(\frac{\beta}{1+\beta})\frac{1+\beta}{\beta}]^{\frac{1}{1-\beta}}$. The SFO and LMO complexity are, respectively, bounded by
$$
\mathcal{O}\left(\Big(\frac{C(d, \beta, \delta)}{\epsilon}\Big)^{\frac{5\beta+3}{4\beta}}\right), \ \ \ \  \mathcal{O}\left(\frac{C(d, \beta, \delta)}{\epsilon}\right)
$$
\fi
\end{enumerate}
\end{theorem}
\begin{remark}
Note that when $\alpha=1$, for any fixed value of $\delta$, $C(d,\alpha, \delta) $ is $\mathcal{O}(d)$. Hence, the SFO complexity is of order $\mathcal{O}(d^2\epsilon^{-2})$. Consequently, when $d^2 = o(\epsilon^{-0.5})$, the SFO complexity of part (a) of Theorem~\ref{theorem3} is better than that of part (a) of Theorem~\ref{theorem_gc}. A similar improvement holds for part (b). In Figure~\ref{fig:comparison} (Appendix--Section~\ref{sec:plotsfortheorem}), we compare Theorem~\ref{theorem_gc} and Theorem~\ref{theorem3} visually. For comparing part (a) of the respective theorems, we set $\alpha=1$, $\epsilon =10^{-10}$ and $\delta=0.05$, and vary $d$ from $200$ to $1000$ in steps of two hundred. For comparing part (b) of the respective theorems, we set $\beta=1$, $\epsilon =10^{-6}$ and $\delta=0.05$, and vary $d$ from $2000$ to $10000$ in steps of two thousand. %A visual illustration is provided in Section~\ref{sec:plotsfortheorem}, Figure~\ref{fig:comparison}.
\end{remark}

\vspace{-0.1in}




\section{Consequences for Practice}\label{sec:experiments}\vspace{-0.1in}

We now demonstrate the consequences of our theoretical results in practice. Among the robust gradient estimators in~\eqref{eq:clippedgrad},~\eqref{eq:optimalgrad} and~\eqref{eq:bcclippedgrad}, the \emph{most practical one} (i.e., the one with the lowest per-iteration complexity) is the clipped gradient estimator in~\eqref{eq:clippedgrad}. Hence, we compare Algorithm~\ref{alg_CGDSC} with the mini-batch average gradient in~\eqref{eq:sampleavggrad} and the clipped gradient estimator in~\eqref{eq:clippedgrad} via experiments on sparse linear regression and multi-class logistic regression (presented in Appendix--Section~\ref{sec:multiclass} due to space limitations) with heavy-tailed covariates. 

\textbf{Sparse Linear Regression:} We now provide simulation results for the regression problem described in \eqref{eq:sparselinear}. For our experiments, we consider the data vector $a \in\mathbb{R}^d$ to follow a Pareto distribution with the exponent being 2.2 (which is asymptotically a $t$-distribution with degrees of freedom 2.2). We ran Algorithm~\ref{alg_CGDSC} with parameters as defined in~\eqref{eq:para} for 100 trials. Here, $L$ could be calculated analytically for our problem. For the choice of batch size, while Theorem~\ref{theorem_gc} suggests $m_k = \mathcal{O}(N^{\frac{2(\alpha+1)}{\alpha}})$, we found that in our experiments setting $m_k=500$ works well already. In Figure~\ref{fig:linearregexp}, we report the performance of Algorithm~\ref{alg_CGDSC} with the clipped gradient estimator~\eqref{eq:clippedgrad} and mini-batch average estimator~\eqref{eq:sampleavggrad}. We also compare against the 1-sample SFW method from~\cite{mokhtari2018stochastic}. From the top row, we see that the clipped gradient estimator has faster convergence, i.e., it achieves higher accuracy with fewer iterations compared to the standard mini-batch averaging or the 1-sample SFW method. Furthermore, from the histogram in the bottom row, we see that the distribution of the last iterate has more fluctuations for the mini-batch average estimator and the 1-sample SFW method, compared to the clipped gradient estimator. We quantify this statement by reporting the \emph{standard deviation} and also the \emph{heavy-tailed index}, a widely used metric to quantify fluctuations~\citep{hoaglin2000understanding}, which is defined as
\begin{align*}
\tau(F) = \frac{F^{-1}(0.95)-F^{-1}(0.5)}{F^{-1}(0.75)-F^{-1}(0.5)}\bigg/ \frac{\Phi^{-1}(0.95)-\Phi^{-1}(0.5)}{\Phi^{-1}(0.75)-\Phi^{-1}(0.5)},
\end{align*}
where $\Phi$ is the CDF of the standard normal distribution and $F$ is the empirical CDF obtained from the histogram. The results in Figure~\ref{fig:linearregexp} \textbf{confirm our theoretical results and highlight the benefits of using robust versions of SFW algorithms for dealing with heavy-tailed data arising in practice}.

%Given $(a,y)\in \mathbb{R}^d \times \mathbb{R}$, consider the linear regression model $y= a^\top \bar{x} + \epsilon$. Here, we let $\epsilon \sim N(0,1)$ and the true parameter $\bar{x}$ is assumed to be $s$-sparse (i.e., it has only $s$ non-zero coordinates out of the $d$ coordinates). The $L_1$-constrained least-squares estimator is then given by $\arg\min_{x \in \mathcal{X}_1(s)} \mathbb{E}[(y-a^\top x)^2]$, where $\mathcal{X}_1(s)\coloneqq\{x \in \mathbb{R}^d: \sum_{j=1}^d|x_j| \leq s\}$ is the $L_1$ ball of radius $s$.  This problem fits in the setup of~\eqref{eq:main_prob} with $\xi \coloneqq (a,y)$ and $F(x,\xi)\coloneqq (y-a^\top x)^2$. Hence the stochastic gradient $G(x,\xi) = \nabla F(x,\xi) = 2( aa^\top x - y a) \in \mathbb{R}^d$. Note that as the iterates of Algorithm~\ref{alg_CGDSC} is in the set $\mathcal{X}_1(s)$, we have $\| x\|$ is to be always bounded for all $x$ along the trajectory of Algorithm~\ref{alg_CGDSC}. Hence, the $(1+\alpha)$-th moment of the stochastic gradient, i.e., $\mathbb{E}[\|G(x, \xi)\|^{(1+\alpha)}]$, is controlled by the order of  $\mathbb{E}[\|a\|^{2(1+\alpha)}]$. When the covariate $a$ is a zero-mean multivariate $t$-distribution with degrees of freedom in the interval $[2,4)$, or is a zero-mean multivariate Pareto distribution with parameter in the interval $[2,4)$, the stochastic gradients have infinite variance but finite $(1+\alpha)$-th moment. In other words, Assumption~\ref{assumption2} is satisfied, while Assumption~\ref{assumption1} is not. 





 %Developing oracle complexity results for robust SFW method under nonconvexity, and developing more practical versions of robust SFW algorithms are left as future work.

%\textbf{Multiclass logistic regression.} 
\textbf{Summary and Outlook:} %\vspace{-0.1in}
We proposed and analyzed robust versions of stochastic Frank-Wolfe type algorithms and established high-probability oracle complexity results. Our theoretical results are supported by numerical experiments on the problem of sparse linear regression and multi-class logistic regression with heavy-tailed data. Developing oracle complexity results for robust projection-free algorithms under non-convexity, and developing more practical versions of robust projection-free algorithms are interesting problems that we plan to examine as future work.
 
\bibliography{tang_327}
%\bibliographystyle{icml2022}

%\section{Additional Notation}\label{sec:addnotation}
%We define below some standard notations that we use in main paper, and in the appendix:
%\begin{enumerate}[noitemsep,leftmargin=0.1in]
%\item \textbf{Indicator function}: $\mathbbm{1}\{A\}$, for some event $A$ is defined as taking value 1 when the event A is true and taking value $0$ otherwise.
%\item \textbf{Big-$\mathcal{O}$ notation}: $T(n)$ is $\mathcal{O}(P(n))$ if and only if for some constants $c$ and $n_0$, $T(n) \leq c~ P(n)$, for all $n\geq n_0$.  \item \textbf{$\Gamma(x)$ function}: 
%\end{enumerate}


% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite
% \nocite{langley00}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\iffalse

\newpage
\appendix
\onecolumn

\section{Simulation Results for Multiclass Logistic Regression Experiments}\label{sec:multiclass}

In this section, we consider the problem of multi-class logistic regression. Given $(a,y)\in \mathbb{R}^{d} \times \mathbb{R}$, consider the multi-class logistic model $$\mathbb{P}(y|a)= \exp(a^\top \bar{x}_{y})/\sum\exp(a^\top \bar{x}_l).$$ Here, the true parameter $\bar{X} = \{\bar{x}_1, \cdots, \bar{x}_L\}^\top \in \mathbb{R}^{L\times d}$ is assumed to be  with bounded trace norm. The trace norm constrained estimator is then given by $$\arg\min_{x \in \mathcal{X}_{TR}(\tau)} \mathbb{E}[\log\big(\sum_l \exp(a^\top x_{l} - a^\top x_{y})\big)],$$ 
where $\mathcal{X}_{TR}(\tau)\coloneqq\{X \in \mathbb{R}^{L\times d}: \sum_{j=1}^d \sigma_j(X) \leq \tau\}$ is the $\|.\|_{tr}$ ball of radius $\tau$.  This problem fits in the setup of~\eqref{eq:main_prob} with $\xi \coloneqq (a,y)$ and $F(X,\xi)\coloneqq \log\big(\sum_l \exp(a^\top x_{l} - a^\top x_{y})\big)$. Hence the stochastic gradient $G(X,\xi) = \nabla F(X,\xi) = \{\nabla_1 F(X, \xi), \cdots, \nabla_L F(X, \xi)\} \in \mathbb{R}^{L\times d}$ where $$\nabla_l F(X, \xi) = [\exp(a^\top \bar{x}_{y})/\sum\exp(a^\top \bar{x}_l)]\mathbbm{1}\{y\neq l\}a.$$ Note that as the iterates of Algorithm~\ref{alg_CGDSC} are in the set $\mathcal{X}_{TR}(\tau)$, $\| x\|$ is always bounded for all $x$ along the trajectory of Algorithm~\ref{alg_CGDSC}. Hence, the $(1+\alpha)$-th moment of the stochastic gradient, i.e., $\mathbb{E}[\|G(x, \xi)\|^{(1+\alpha)}]$, is controlled by the order of  $\mathbb{E}[\|a\|^{(1+\alpha)}]$. When the covariate $a$ follows a zero-mean multivariate $t$-distribution with degrees of freedom in the interval $[1,2)$, or a zero-mean multivariate Pareto distribution with parameter in the interval $[1,2)$, the stochastic gradients have infinite variance but finite $(1+\alpha)$-th moment. In other words, Assumption~\ref{assumption2} is satisfied, while Assumption~\ref{assumption1} is not. 




For our experiments, we select the degrees of freedom of the $t$-distribution and the parameter of the Pareto distribution to be 1.1. We ran Algorithm~\ref{alg_CGDSC} with parameters as defined in~\eqref{eq:para} for 100 trials. We report the results in Figure~\ref{fig:multiregexp}. We report the performance of Algorithm~\ref{alg_CGDSC} with the clipped gradient estimator~\eqref{eq:clippedgrad} and mini-batch average estimator~\eqref{eq:sampleavggrad}.  In our experiments with multi-class logistic regression, we observe a similar performance as in the linear regression setting---the clipped gradient method performed the best. 



\begin{figure*}[ht]
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_plot_pareto_dim_20_iter_100_2_three_L_2.png}
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_plot_stu_t_dim_20_iter_100_2_three_L_1.png}
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_plot_pareto_dim_100_iter_100_1_three_L_5.png}
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_plot_stu_t_dim_100_iter_100_6_three_L_10.png}\\
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_hist_pareto_dim_20_iter_100_2_three_L_2.png}
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_hist_stu_t_dim_20_iter_100_2_three_L_1.png}
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_hist_pareto_dim_100_iter_100_1_three_L_5.png}
\includegraphics[scale=0.26]{updatedexperiments/fig/multi_hist_stu_t_dim_100_iter_100_6_three_L_10.png}\\
\caption{The two left and two right columns correspond to Pareto, Student-$t$ distributions with $d=20/L=10$ and $d=100/L=20$ respectively. \textbf{Top row:} Mean (solid lines) over 100 trials of iterations versus $f(z_N) - f(x_*)$ for $N=100$. \textbf{Bottom row:} Histogram of $f(z_N) - f(x_*)$ for $N = 100$. Numbers in the legend correspond to \emph{heavy-tail index}/\emph{standard deviation}. } %It took 1 hour on a 2011 MacBook Pro to produce the above plots.} 
\label{fig:multiregexp}
\end{figure*}


\section{Plots Illustrating Main Theoretical Results}\label{sec:plotsfortheorem}
In Figure~\ref{fig:theorem32}, we provide an illustration of the SFO complexity from Theorem~\ref{theorem_gc}. We set $\delta=0.05$. We split the scale of $\epsilon$ and $\alpha$ from $(0.2,0.5)$ and $(0.4,0.9)$ for better visualization with the large difference in the scale of the vertical (SFO) axis. We set $d=100$ for part (b).
\begin{figure}[h!]
\centering
\includegraphics[scale=0.4]{thm32a1.png}
\includegraphics[scale=0.4]{thm32a2.png} \\
\includegraphics[scale=0.4]{thm32b1.png}
\includegraphics[scale=0.4]{thm32b2.png}
\caption{Visualization of the SFO complexity from Theorem~\ref{theorem_gc} part (a) \textbf{(top row)} and part (b) \textbf{(bottom row)}. Note that for smaller values of $\epsilon$ (i.e., higher accuracy) and lower values of $\alpha$ (i.e., when only a smaller order moment exists for the stochastic gradients) the SFO complexity increases rapidly. Note the effect of dimension is pronounced in the bottom row corresponding to part (b). \vspace{-0.1in}}
\label{fig:theorem32}
\end{figure}

\begin{figure*}[t]
\centering
\includegraphics[scale=0.5]{comparison1.png}
\includegraphics[scale=0.5]{comparison2.png}
%\includegraphics[scale=0.38]{thm32a2.png} 
\caption{Comparing the SFO complexity in Theorem~\ref{theorem_gc} and Theorem~\ref{theorem3}. Part (a) on the left and part (b) on the right. Note that in both cases, there is a certain threshold in dimension below which the SFO complexity of Theorem~\ref{theorem3} is better than that of Theorem~\ref{theorem_gc}.}
\label{fig:comparison}
\end{figure*}
\section{Concentration Inequality for Martingales with Heavy-Tails}\label{sec:martingalesection}
 We start with two assumptions that turn out to be equivalent.
 
\begin{assumption}\label{app_assumption1} The random variable $X \in \mathbb{R}$ satisfies 
 $$\mathbb{P}(|X|\geq t) \leq 2\exp\left(-\frac{C_1}{\vartheta^2}~t^{\tfrac{1+\alpha}{\alpha}}\right),$$
  for some $\vartheta^2,C_1 >0$  with $\alpha \in (0, 1]$, for all $t \geq 0$.
\end{assumption}

\begin{assumption}\label{app_assumption2} The random variable $X \in \mathbb{R}$ satisfies 
$$
\mathbb{E}\left[\exp\left(C_2|X|^{\tfrac{1+\alpha}{\alpha}}\frac{1}{\vartheta^2}\right)\right]\leq 2,
$$  
for some $\vartheta^2,C_2 >0$  with $\alpha \in (0, 1]$.\end{assumption}

\begin{lemma}\label{app_lemma1}
Assumptions \ref{app_assumption1} and \ref{app_assumption2} are equivalent.
\end{lemma}
\begin{proof}
Suppose $X$ satisfies Assumption \ref{app_assumption1} and assume $C_2<C_1$, we have
\begin{eqnarray*}
\mathbb{E}\left[\exp\left(C_2|X|^{\tfrac{1+\alpha}{\alpha}}\frac{1}{\vartheta^2}\right)\right] &\leq& 1 + C_2\int_0^\infty
\frac{1+\alpha}{\alpha}t^{\tfrac{1}{\alpha}}\frac{1}{\vartheta^2}\exp\left(C_2t^{\tfrac{1+\alpha}{\alpha}}/\vartheta^2\right)\mathbb{P}(|X|>t)dt\\
&\leq& 1 + 2C_2\int_0^\infty \frac{1+\alpha}{\alpha}t^{\tfrac{1}{\alpha}}\frac{1}{\vartheta^2}\exp\left(-(C_1-C_2)t^{\tfrac{1+\alpha}{\alpha}}/\vartheta^2\right)dt\\
&=& 1 + 2\frac{C_2}{C_1-C_2}.
\end{eqnarray*}
Then, by taking $C_2\leq C_1/3$, we obtain $\mathbb{E}[\exp(C_2|X|^{\frac{1+\alpha}{\alpha}}\frac{1}{\vartheta^2})] \leq 2$. This completes one direction of the equivalence. 

Now, suppose $X$ satisfies Assumption \ref{app_assumption2} and assume $C_2 = 1$, then
\begin{eqnarray*}
\mathbb{P} (|X|\geq t) = \mathbb{P}\left(\exp\left(|X|^{\tfrac{1+\alpha}{\alpha}}/\vartheta^2\right) \geq \exp\left(t^{\frac{1+\alpha}{\alpha}}/\vartheta^2 \right) \right)\leq 2\exp\left(-t^{\frac{1+\alpha}{\alpha}}/\vartheta^2\right).
\end{eqnarray*}
This proves the other direction of the equivalence, thereby completing the proof. 
\end{proof}
\begin{lemma}\label{app_lemma2} Let $\Gamma(x)$ denote the gamma function which is defined via a convergent improper integral:~$\Gamma(x) \coloneqq  \int_0^\infty t^{x-1}e^{-t} dt$. For a random variable that satisfies Assumption~\ref{app_assumption1}, the following properties hold: 
\begin{enumerate}[leftmargin=0.2in]
    \item [(a)] For some positive constant $C_3$, the moments satisfy 
    \begin{align*}
    \mathbb{E} |X|^k \leq (2\vartheta^2)^{\tfrac{\alpha k}{\alpha+1}} \frac{\alpha k}{\alpha+1} \Gamma\left(\frac{\alpha k}{\alpha+1}\right),\quad\text{and}\quad (\mathbb{E} |X|^k)^{{1}/{k}} \leq C_3(\vartheta^{2}k)^{\tfrac{\alpha}{1+\alpha}}, \quad\text{for}~k\geq 1.
    \end{align*}
    %\item $\mathbb{E}(\exp(t^\frac{\alpha+1}{\alpha}X^2))\leq \exp(C_4^2 t)$ for all $t$ such that $|t|\leq \frac{1}{C_4}$
    \item[(b)] For some positive constant $C_4$, when $(\alpha+1)/\alpha\in\mathbb{N}$, we have $$\mathbb{E}[\exp(t X)]\leq \left(1 + C_4(t^\frac{1+\alpha}{\alpha}\vartheta^2)^\frac{\alpha}{1+\alpha}\right)\exp\left(t^{\tfrac{1+\alpha}{\alpha}}\vartheta^2\right).$$ 
    Furthermore, if $\mathbb{E}[X] = 0$, we have
    $$\mathbb{E}[\exp(t X)]\leq \left(1 + C_4(t^{\tfrac{1+\alpha}{\alpha}}\vartheta^2)^{\tfrac{2\alpha}{1+\alpha}}\right)\exp\left(t^{\tfrac{1+\alpha}{\alpha}}\vartheta^2\right).$$
\end{enumerate}
\end{lemma}

\begin{proof}
For part (a), without loss of generality assume that $C_1 = 1$. Then, we have
\begin{eqnarray*}
\mathbb{E}[|X|^k] &=& \int_0^\infty \mathbb{P}(|X|^k \geq t)dt \\
&=& \int_0^\infty \mathbb{P}(|X| \geq t^{\tfrac{1}{k}})dt \\
&\leq& 2\int_0^\infty \exp \left(-t^{\tfrac{1+\alpha}{\alpha k}}/\vartheta^2\right)dt \\
&=& (2\vartheta^2)^{\tfrac{\alpha k}{\alpha+1}}\frac{\alpha k}{\alpha+1}\int_0^\infty e^{-u}u^{\tfrac{\alpha k}{\alpha +1}-1} du \\
&=& (2\vartheta^2)^{\tfrac{\alpha k}{\alpha+1}} \frac{\alpha k}{\alpha+1} \Gamma\left(\frac{\alpha k}{\alpha+1}\right).
\end{eqnarray*}

Then, by the elementary facts that 
\begin{align*}
\Gamma\big(\frac{\alpha k}{\alpha+1}\big) \leq (\frac{\alpha k}{\alpha+1})^\frac{\alpha k}{\alpha+1}~\quad\text{and}\quad k^{1/k} \leq e^{1/e}, 
\end{align*}
we have that for any $k\geq 2$,
$$
(\mathbb{E}[|X|^k])^{1/k} \leq (\vartheta^2)^{\tfrac{\alpha}{\alpha+1}} \left(\frac{\alpha k}{\alpha+1}\right)^\frac{\alpha}{\alpha +1} e^{1/e} \leq C_3(\vartheta^2k)^\frac{\alpha}{\alpha+1},
$$
which completes the proof of part (a). We now show that part (a) implies part (b). To do so, first note that 
\begin{eqnarray*}
\mathbb{E}[\exp(t X)] &\leq& 1 + \sum_{k=1}^\infty \frac{t^k\mathbb{E}[|X|^k]}{k!}\\
&\leq& 1 + \sum_{k=1}^\infty \frac{t^k(\vartheta^2)^\frac{\alpha k}{\alpha+1}\frac{\alpha k}{\alpha+1}\Gamma(\frac{\alpha k}{\alpha+1})}{k!}\\
&\leq& 1 + \sum_{k=1}^\infty \frac{(t^\frac{\alpha+1}{\alpha}\vartheta^2)^\frac{\alpha k}{\alpha+1} k \Gamma(\frac{\alpha k}{\alpha+1})}{k!}\\
&=& 1 + \sum_{k=1}^{\frac{\alpha + 1}{\alpha} - 1} \frac{(t^\frac{\alpha+1}{\alpha}\vartheta^2)^\frac{\alpha k}{\alpha+1} k \Gamma(\frac{\alpha k}{\alpha+1})}{k!} + \sum_{j=0}^{\frac{\alpha + 1}{\alpha} - 1}\sum_{k=1}^\infty \frac{(t^\frac{\alpha+1}{\alpha}\vartheta^2)^{k + \frac{\alpha j}{1+\alpha}}(\frac{(\alpha + 1) k}{\alpha} + j)\Gamma(k + \frac{\alpha j}{1+\alpha})}{(\frac{(\alpha + 1) k}{\alpha} + j)!}\\
&\leq& 1 + \Gamma\big(\frac{\alpha}{1+\alpha}\big)\sum_{k=1}^{\frac{\alpha + 1}{\alpha} - 1} (t^\frac{\alpha+1}{\alpha}\vartheta^2)^{\frac{\alpha k}{1+\alpha}} + \sum_{j=0}^{\frac{\alpha + 1}{\alpha} - 1} (t^\frac{\alpha+1}{\alpha}\vartheta^2)^{\frac{\alpha j}{1+\alpha}}\sum_{k=1}^\infty \frac{(t^\frac{\alpha+1}{\alpha}\vartheta^2)^{k}k!}{(\frac{(\alpha + 1) k}{\alpha})!}\\
&\leq& 1 + \Gamma\big(\frac{\alpha}{1+\alpha}\big)\frac{(t^\frac{1+\alpha}{\alpha}\vartheta^2)^\frac{\alpha}{1+\alpha}(1 - t^\frac{1+\alpha}{\alpha}\vartheta^2)}{1 - (t^\frac{1+\alpha}{\alpha}\vartheta^2)^\frac{\alpha}{1+\alpha}}   + \frac{\alpha}{1+\alpha}\sum_{j=0}^{\frac{\alpha + 1}{\alpha} - 1} (t^\frac{\alpha+1}{\alpha}\vartheta^2)^{\frac{\alpha j}{1+\alpha}}\sum_{k=1}^\infty \frac{(t^\frac{\alpha+1}{\alpha}\vartheta^2)^{k}}{k!}\\
&\leq& 1 + \Gamma\big(\frac{\alpha}{1+\alpha}\big)\frac{1+\alpha}{\alpha}(t^\frac{1+\alpha}{\alpha}\vartheta^2)^\frac{\alpha}{1+\alpha} + (\exp(t^\frac{\alpha + 1}{\alpha}\vartheta^2) - 1) \\
&\leq& (1 + C_4(t^\frac{1+\alpha}{\alpha}\vartheta^2)^\frac{\alpha}{1+\alpha})\exp(t^\frac{\alpha + 1}{\alpha}\vartheta^2),
\end{eqnarray*}
where $C_4 = \Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}$, thereby completing the proof. The second claim in part (b) follows immediately. 
\end{proof}
We now state our concentration inequality for heavy-tailed martingales. 
\begin{proposition}\label{app_proposition1}
Suppose a sequence of random variables $\{X_k\}_{k=1}^\infty$ satisfies, for $\alpha \in (0,1]$,
$$
\mathbb{E}\left[\exp(t X_k)|X_1, \dots, X_{k-1}\right]\leq \left(1 + C\left(t^{\tfrac{1+\alpha}{\alpha}}\vartheta_{k-1}^2\right)^{\tfrac{\alpha}{1+\alpha}}\right)\exp \left(t^{\tfrac{\alpha + 1}{\alpha}}\vartheta_{k-1}^2\right).
$$
If we assume that $\vartheta^2_i \leq n^{-\tfrac{\alpha+1}{\alpha}}$ for all $i$, then, we have
$$
\mathbb{P}\left(\sum_{k=1}^n X_k \geq \lambda \right) \leq \exp\left(-\tfrac{1}{\alpha + 1}\left(\tfrac{\alpha}{\alpha + 1}\right)^\alpha(\lambda-C)^{1+\alpha}n\right).
$$
If we further have $\mathbb{E}[X_k|X_1,\dots,X_{k-1}] = 0$, and $\vartheta_i^2 \leq n^{-\frac{\alpha+1}{2\alpha}}$ for all $i$, then
\begin{align*}
\mathbb{P}\left(\sum_{k=1}^n X_k \geq \lambda \right) \leq \exp(-C_\alpha\lambda^{1+\alpha})\quad\text{when}\quad\lambda \geq \left[\Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}\right]^{\tfrac{1}{1-\alpha}},
\end{align*}
where $C_\alpha$ is as defined in~\eqref{eq:calpha}.
\end{proposition} 

\begin{proof}
First note that we have the following expression for the moment generating function for the sum:
\begin{eqnarray*}
&&\mathbb{E}_{X_1,\dots, X_n}\left[\exp \left(t \sum_{k=1}^n X_k \right)\right]\\
 &=& \mathbb{E}_{X_1, \dots, X_{n-1}} \left[\mathbb{E}_{X_n}\Big[\exp\left(t \sum_{k=1}^n X_k\right)\Bigg|X_1, \dots, X_{n-1}\Big]\right] \\
&=&  \mathbb{E}_{X_1, \dots, X_{n-1}} \left[\exp\left(t \sum_{k=1}^{n-1} X_k\right)\mathbb{E}_{X_{n}}\left[\exp(t X_n)|X_1, \dots, X_{n-1}\right]\right] \\
&\leq&  \left(1 + C\left(t^{\tfrac{1+\alpha}{\alpha}}\vartheta_n^2\right)^{\tfrac{\alpha}{1+\alpha}}\right)\exp\left(t^\frac{1+\alpha}{\alpha}\vartheta_n^2 \right)\mathbb{E}_{X_1, \dots, X_{n-1}} \left[\exp\left(t \sum_{k=1}^{n-1} X_k\right)\right].
\end{eqnarray*}
By repeatedly performing the above calculation for the term on the right hand side of the last inequality, we obtain 
\begin{align*}
\mathbb{E}_{X_1,\dots, X_n}\left[\exp \left(t \sum_{k=1}^n X_k \right)\right]  \leq \exp\left(t^\frac{1+\alpha}{\alpha}\sum_{k=1}^n\vartheta_k^2\right)\prod_{k=1}^n (1 + C(t^\frac{1+\alpha}{\alpha}\vartheta_k^2)^\frac{\alpha}{1+\alpha}). 
\end{align*}
Hence, by Markov's inequality and by our assumption that $\vartheta^2_i \leq n^{-\frac{\alpha+1}{\alpha}}$ for all $i$, we obtain
\begin{eqnarray*}
\mathbb{P}\left(\sum_{k=1}^n X_k \geq  \lambda\right) &=& \mathbb{P}\left(\exp\left(t \sum_{k=1}^n X_k\right) \geq \exp(\lambda t)\right) \\
&\leq& \exp\left(t^\frac{1+\alpha}{\alpha}\sum_{k=1}^n\vartheta_k^2 - \lambda t\right)\prod_{k=1}^n \left(1 + Ct\vartheta_k^{\tfrac{2\alpha}{1+\alpha}}\right)  \\
&\leq& \exp\left(t^{\tfrac{1+\alpha}{\alpha}}n^{-\frac{1}{\alpha}} - \lambda t + Ct\right) \\
&\leq& \exp\left(-\tfrac{1}{\alpha + 1}\big(\tfrac{\alpha}{\alpha + 1}\big)^\alpha(\lambda-C)^{1+\alpha}n\right) \\
\end{eqnarray*}
where in the last step we set $t = \left(\frac{\alpha}{\alpha+1}(\lambda-C)n^\frac{1}{\alpha}\right)^\alpha$. This proves the first claim. Now, when $\mathbb{E}[X_k|X_1,\ldots, X_{k-1}] = 0$ and $\vartheta^2_i\leq n^{-\frac{\alpha+1}{2\alpha}}$, we have
\begin{eqnarray*}
\mathbb{P}\left(\sum_{k=1}^n X_k \geq  \lambda\right) &=& \mathbb{P}\left(\exp\big(t \sum_{k=1}^n X_k\big) \geq \exp(\lambda t )\right) \\
&\leq& \exp\Big(t^\frac{1+\alpha}{\alpha}\sum_{k=1}^n\vartheta_k^2 - \lambda t\Big)\prod_{k=1}^n \left(1 + Ct^2\vartheta_k^{\tfrac{4\alpha}{1+\alpha}}\right)  \\
&\leq& \exp\left(t^{\tfrac{1+\alpha}{\alpha}} - \lambda t + Ct^2\right)  \\
&\leq& \exp(-C_\alpha\lambda^{1+\alpha}) \\
\end{eqnarray*}
where in the penultimate step, we set $t = \left(\frac{\alpha\lambda}{\alpha+1}\right)^\alpha$, and $C_\alpha$ is defined in \eqref{eq:calpha}. Clearly, the last inequality holds when $\lambda > \left[ \Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}\right]^{\frac{1}{1-\alpha}}$. %\kb{Not clear to me why this condition on $\lambda$ matches the condition stated in the proposition statement. Can you add an explanation ?}\tty{In the proposition, $C$ is replaced with $C_4$ from Lemma \ref{app_lemma2}, as defined in line 672. As the condition of this proposition can be derived from random variables satisfying assumption \ref{app_assumption1} (which is how we used this proposition), I specified the constant clearly in the statement. Do you think we should just leave it as $C$ or replace it with $C_4 = \Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}$ in this proposition statement?}
\end{proof}


\section{Proofs for Section~\ref{sec:hpbounds}}\label{sec:proofs}


\begin{proof}[\textbf{Proof of Lemma~\ref{templemma}}]
First, note that by Assumption \ref{assumption4}, we have
\begin{align}
f(z_k) &\leq f(w_k)  + \langle \nabla f(w_k), z_k-w_k\rangle + \frac{L}{2} \|z_k - w_k \|^2 \nonumber \\
&\le (1-\alpha_k) f(z_{k-1})  + \alpha_k \left[f(w_k)+\langle \nabla f(w_k), x_k-w_k\rangle \right]+ \frac{L \alpha_k^2}{2} \|x_k - x_{k-1} \|^2, \label{eq:cgdeq1}
\end{align}
where the second inequality follows from the convexity of $f$, and the definition of the sequence $w_k$ and $z_k$ from Algorithm~\ref{alg_CGDSC}. Also note that by definition of the sequence $x_k$ from Algorithm~\ref{alg_CGDSC} (based on Algorithm~\ref{alg_CGDsubroutine}), we have
\begin{align}\label{eq:cgdeq2}
- \mu_k \leq \langle \bar{G}_k  + \gamma_k(x_k - x_{k-1}), u - x_k\rangle \quad \forall u\in \mathcal{X}.
\end{align}
Letting $u=x_*$ in the above inequality and multiplying it by $\alpha_k$, summing it up with \eqref{eq:cgdeq1}, and denoting $\bar \Delta_{k} = \bar{G}_k - \nabla f(w_k)$, we obtain
\[
f(z_k) \le (1-\alpha_k) f(z_{k-1})  + \alpha_k f(x_*)+ \alpha_k \left[\mu_k+ \langle \bar \Delta_k+\gamma_k(x_k-x_{k-1}), x_*-x_k \rangle\right]+ \frac{L \alpha_k^2}{2} \|x_k - x_{k-1} \|^2,
\]
which together with the facts that
\begin{align*}
&\| x_{k-1} - x_* \|^2 = \| x_k - x_{k-1}\|^2 + \| x_k - x_*\|_2^2 + 2 \langle x_{k-1} - x_{k}, x_k - x_*\rangle, \\
&\alpha_k \langle \bar \Delta_k, x_*-x_k \rangle  \le \alpha_k \langle \bar \Delta_k, x_*-x_{k-1} \rangle + \frac{\|\bar \Delta_k\|^2}{2L}+\frac{L \alpha_k^2}{2}\|x_k - x_{k-1}\|^2,
\end{align*}
imply
\begin{align*}
f(z_k) \le (1-\alpha_k) f(z_{k-1})  + \alpha_k f(x_*) &+ \alpha_k \left[\mu_k+\frac{2L\alpha_k-\gamma_k}{2}\|x_k - x_{k-1} \|^2+\langle \bar \Delta_k, x_*-x_{k-1} \rangle\right] \nonumber \\
&+  \frac{\alpha_k\gamma_k}{2}\left[\| x_{k-1} - x_* \|^2 - \| x_k - x_* \|^2\right]+\frac{\|\bar \Delta_k\|^2}{2L}.
\end{align*}
Recalling the definition of $\hat\Gamma_k$ and $\hat\Gamma_1$,  
%$$
%\hat\Gamma_k = \prod _{i=2}^k(1-\alpha_i), \ \ \Gamma_1 =1
%$$
subtracting $f(x_*)$ from both sides, dividing by $\hat\Gamma_k$,  summing them up, we obtain
\begin{equation}\label{alg_ineq}
\frac{f(z_N)-f(x_*)}{\hat\Gamma_N} \leq \frac{\gamma_1}{2}\|x_0-x_*\|^2 + \sum_{k=1}^N\frac{\alpha_k\mu_k}{\hat\Gamma_k} + \sum_{k=1}^N  \frac{\alpha_k}{\hat\Gamma_k} \langle \bar{\Delta}_k, x_*-x_{k-1}\rangle + \sum_{k=1}^N\frac{\|\bar{\Delta}_k\|^2}{2L\hat\Gamma_k}, 
\end{equation}
which completes the proof.
\end{proof}

\begin{proof}[\textbf{Proof of Theorem~\ref{theorem_light_tail}}]
By Lemma~\ref{templemma}, we have~\eqref{alg_ineq} where we recall that $\bar{\Delta}_k = \bar{G}_k - \nabla f(w_k)$ with $\bar{G}_k$ as defined in~\eqref{eq:sampleavggrad}. Now, note that the first two terms on the right hand side of~\eqref{alg_ineq}  are bounded by the constant $3LD_0$.

Hence, we proceed to getting a handle on the third and fourth terms in the right hand side of~\eqref{alg_ineq} with high probability. Considering the fourth term, note that according to Assumption \ref{assumption1}, we have
\begin{align}\label{eq:momentbound0}
\mathbb{E}\left[\exp\Big\{\big\|\frac{\bar{\Delta}_k}{\sigma}\big\|^2m_k\Big\}\bigg|\mathcal{F}_{k-1}\right] \leq \exp\{1\},
\end{align}
where $\mathcal{F}_{k-1} = \sigma(\xi_1, \cdots, \xi_{k-1})$ is the $\sigma$-algebra generated by the random sequence $\xi_1, \cdots, \xi_{k-1}$. Now, by defining
$$
\pi_k \coloneqq \frac{1}{\hat{\Gamma}_km_k},\quad\text{and}\quad \theta_k \coloneqq \frac{\pi_k}{\sum_{i=1}^N \pi_i},
$$
we obtain the inequality corresponding to the fourth term on the right hand side of~\eqref{alg_ineq}:
$$
\exp\left\{\sum_{k=1}^N \frac{\theta_k \|\bar{\Delta}_k\|^2m_k}{\sigma^2}\right\}\leq \sum_{k=1}^N  \theta_k \exp\left\{ \frac{\|\bar{\Delta}_k\|^2m_k}{\sigma^2}\right\}.
$$
Taking expectations on both sides, and using~\eqref{eq:momentbound0} we then obtain 
\begin{align*}
\mathbb{E}\left[\exp\left\{  \frac{\sum_{k=1}^N \frac{1}{\hat{\Gamma}_km_k}\|\bar{\Delta}_k\|^2m_k}{\left(\sigma^2\sum_{k=1}^N \frac{1}{\hat\Gamma_km_k}\right)}\right\}\right] &\leq \sum_{k=1}^N \theta_k \mathbb{E}\left[\exp\left\{\frac{\|\bar{\Delta}_k\|^2m_k}{\sigma^2}\right\}\right]\\
&=  \sum_{k=1}^N \theta_k \mathbb{E}\left[\mathbb{E}\Big[\exp\left\{\frac{\|\bar{\Delta}_k\|^2m_k}{\sigma^2}\right\}\Big|\mathcal{F}_{k-1}\Big]\right] \\
&\leq \exp\{1\}.
\end{align*}
It then follows by Markov's inequality that for all $\lambda \geq 0$, we have
$$
 \mathbb{P}\left(\sum_{k=1}^N \frac{\|\bar{\Delta}_k\|^2}{\hat\Gamma_k} \geq \lambda \Big(\sigma^2\sum_{k=1}^N \frac{1}{\hat\Gamma_km_k}\Big)  \right) \leq \exp\{-\lambda\}.
$$
Note that, by our choice of $\alpha_k$ and $m_k$, we have $\frac{1}{\hat\Gamma_km_k} \leq \frac{D_0}{N}$, where $D_0 = \|x_0 - x_*\|^2$. Substituting this fact in the above bound, we hence obtain for all $\lambda \geq 0$,
\begin{equation}\label{norm_ineq}
\mathbb{P}\left(\sum_{k=1}^N \frac{\|\bar{\Delta}_k\|^2}{\hat\Gamma_k} \geq \lambda \sigma^2 D_0 \right) \leq \exp\{-\lambda\}.
\end{equation}
This completes the high-probability bound for the fourth term on the right hand side of~\eqref{alg_ineq}. In order to bound the third term on the right hand side of~\eqref{alg_ineq}, we first let $$\zeta_k = \frac{\alpha_k}{\hat\Gamma_k} \langle \bar{\Delta}_k, x_*-x_{k-1}\rangle.$$ 
Then Assumption \ref{assumption1} implies that,
\begin{align*}
\mathbb{E}\left[\exp\left\{\frac{\zeta_k^2m_k}{[\alpha_k\hat{\Gamma}_k^{-1}D_0\sigma]^2}\right\}\bigg|\mathcal{F}_{k-1}\right] &\leq \mathbb{E}\left[\exp \left\{\frac{m_k(\|\bar{\Delta}_k\|\|x_{k-1}-x_*\|)^2}{[\sigma D_0]^2}\right\}\bigg|\mathcal{F}_{k-1}\right] \\& \leq \exp\{1\}.
\end{align*}
As $\mathbb{E} [\langle \bar{\Delta}_k, x_*-x_{k-1}\rangle|\mathcal{F}_{k-1}] = 0$, it follows that $\{\zeta_k\}_{k\geq 1}$ is a martingale difference sequence. Then by exponential concentration inequalities for sums of martingale difference sequences (specifically by \cite[Lemma 2]{lan2012validation}), we have for all $\lambda \geq 0$ 
$$
 \mathbb{P}\left(\sum_{k=1}^N \zeta_k \geq \lambda \sigma D_0\Big[\sum_{k=1}^N (\hat\Gamma_k^{-1}m_k^{-\frac{1}{2}}\alpha_k)^2\Big]^{\frac{1}{2}}\right)\leq \exp\{-\lambda^2/3\}.
$$
We remark that while more refined exponential inequalities exist in the literature (for example,~\cite{fan2015exponential}) in our above calculation, it suffices to use the version from~\cite{lan2012validation}. Now, note that, by our choice of $\alpha_k$ and $m_k$ we have $\hat\Gamma_k^{-1}m_k^{-\frac{1}{2}}\alpha_k \leq  (N/D_0)^{-1/2}$. Substituting this fact in the above inequality, we obtain
\begin{equation}\label{martingale_ineq}
 \mathbb{P}\left(\sum_{k=1}^N \zeta_k \geq \lambda \sigma D_0 \right)\leq \exp\{-\lambda^2/3\}.
\end{equation}
Combining \eqref{alg_ineq}, \eqref{norm_ineq} and \eqref{martingale_ineq}, we get the high probability bound stated in Theorem~\ref{theorem_light_tail}. 

For the total number of iterations in Algorithm \ref{alg_CGDsubroutine}, from the classical analysis of the CG method, one can show that the FW-gap ($-h_\gamma$) of problem \eqref{qd_subproblem} is bounded by $LD_\mathcal{X}^2/T$ (where $L$ is the Lipschitz constant and $\max_{x, y \in\mathcal{X}}\|y-x\| \leq D_\mathcal{X}$) if the CG method runs for $T$ iterations; see, for example~\cite{balasubramanian2021zeroth}. Since the gradient of the objective function in \eqref{qd_subproblem} is Lipschitz continuous with constant $\gamma$, we have
$$
-h_{\gamma_k}(\bar{y}_{T_k}) \leq \frac{\gamma_k D_\mathcal{X}^2}{T},
$$
which, together with the choice of $\mu_k$ and $\gamma_k$ in \eqref{eq:para}, implies that at iteration $k$ of Algorithm \ref{alg_CGDSC}, we need to run Algorithm \ref{alg_CGDsubroutine} for at most $T_k = 4D_\mathcal{X}^2N/D_0$ iterations. Therefore, the total number of iterations of Algorithm \ref{alg_CGDsubroutine} to find an $\epsilon$-stationary point of problem \eqref{eq:main_prob} is bounded by $\sum_{k=1}^N T_k \leq 48LD_\mathcal{X}^2/\epsilon$. Hence, we obtain the oracle complexity stated in Theorem~\ref{theorem_light_tail}. 
\end{proof}



\begin{proof}[\textbf{Proof of Lemma~\ref{lemma1}}]
We first prove part (a). For $\bar{G}_k$ as defined in~\eqref{eq:clippedgrad}, we let 
$$ 
G_t \coloneqq G(w_k, \xi_{k, t})\quad\text{and}\quad B_t = \left(\frac{ \sigma^{1+\alpha} t}{\log(1/\delta)}\right)^{\tfrac{1}{1+\alpha}}.
$$ 
Now, by Assumption \ref{assumption2}, we obtain
\begin{align}\label{eq:expbound}
\left\|\mathbb{E}[\bar{G}_k - \nabla f(w_k)]\right\| =&~ \frac{1}{m_k}\left\|\sum_{t=1}^{m_k} \left( \mathbb{E}[G_t\mathbbm{1}\{\|G_t\|\leq B_t\}] - \nabla f(w_k)\right)\right\| \nonumber\\
\leq&~ \frac{1}{m_k}\sum_{t=1}^{m_k} \mathbb{E}[\|G_t\|\mathbbm{1}\{\|G_t\|\geq B_t\}] \nonumber\\
\leq&~\frac{1}{m_k}\sum_{t=1}^{m_k}\frac{\sigma^{1+\alpha}}{B_t^{\alpha}}.
\end{align}

Now, note that we have
\begin{align*}
\left\|\bar{\Delta}_k\right\| 
\leq&~\frac{1}{m_k}\left\|\sum_{t=1}^{m_k} (\nabla f(w_k) - \mathbb{E}[G_t\mathbbm{1}\{\|G_t\|\leq B_t\}])\right\| \\ &~~~ + \frac{1}{m_k}\left\|\sum_{t=1}^{m_k} (\mathbb{E}[G_t\mathbbm{1}\{\|G_t\|\leq B_t\}] - G_t\mathbbm{1}\{\|G_t\|\leq B_t\})\right\| \\
=&~\|\mathbb{E}[\bar{G}_k - \nabla f(w_k)]\| + \frac{1}{m_k}\left\|\sum_{t=1}^{m_k} \mathbb{E}[G_t\mathbbm{1}\{\|G_t\|\leq B_t\}] - G_t\mathbbm{1}\{\|G_t\|\leq B_t\}\right\|.
%\leq&~\frac{1}{m_k}\sum_{t=1}^{m_k}\frac{\sigma^{1+\alpha}}{B_t^{\alpha}} + \sqrt{\frac{2B_{m_k}^{1-\alpha}\sigma^{1+\alpha}\log(1/\delta)}{m_k}} + \frac{B_{m_k}\log(1/\delta)}{3m_k}
\end{align*}

Furthermore, we also have that $$\mathbb{E}(\|G_t\|^2\mathbbm{1}\left\{\|G_t\|\leq B_t\right\}) \leq \sigma^{1+\alpha}B_t^{1-\alpha}.$$ 
Hence, by~\eqref{eq:expbound} and by vector-valued Bernstein's inequality for bounded independent random vectors (see, for example~\cite[Corollary 4.1]{minsker2017bernstein}), we have with probability at least $1-\delta$,
\begin{align*}
\left\|\bar{\Delta}_k\right\| 
\leq&~\frac{1}{m_k}\sum_{t=1}^{m_k}\frac{\sigma^{1+\alpha}}{B_t^{\alpha}} + \sqrt{\frac{2B_{m_k}^{1-\alpha}\sigma^{1+\alpha}\log(1/\delta)}{m_k}} + \frac{B_{m_k}\log(1/\delta)}{3m_k}.
\end{align*}
Plugging in the expression for $B_t$ concludes the proof. 

The proof of part (b) follows verbatim the proof of part (a) and by noting the fact that $G_t/\sqrt{d}$ satisfies Assumption \ref{assumption2}.
\end{proof}


\begin{proof}[\textbf{Proof of Theorem~\ref{theorem_gc}}]
We first prove part (a). Note that by Lemma~\ref{templemma}, we can obtain the inequality \eqref{alg_ineq}. As before, we note that the first two terms on the right hand side of~\eqref{alg_ineq}  are bounded by the constant $3LD_0$. Hence, we proceed to bound the last two terms on the right hand side of~\eqref{alg_ineq} with a high probability bound. 

For the last term, according to Lemma \ref{lemma1}, we have
\begin{align}\label{eq:momentbound2}
\mathbb{E}\left[\exp\bigg\{\Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^{\tfrac{1+\alpha}{\alpha}}m_k\bigg\}\bigg|\mathcal{F}_{k-1}\right] \leq C,
\end{align}
where $\mathcal{F}_{k-1} = \sigma(\xi_1, \cdots, \xi_{k-1})$. Now, by defining 
\begin{align*}
\pi_k \coloneqq \frac{1}{\hat{\Gamma}_km_k^\frac{2\alpha}{\alpha+1}}, \quad\text{and}\quad \theta_k \coloneqq \frac{\pi_k}{\sum_{i=1}^N \pi_i},
\end{align*}
we obtain the following inequality: 
\begin{align*}
\exp\left\{\bigg(\sum_{k=1}^N \theta_k \Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^2m_k^{\tfrac{2\alpha}{\alpha+1}}\bigg)^{\tfrac{\alpha+1}{2\alpha}}\right\} \leq \exp\left\{\sum_{k=1}^N \theta_k \Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^{\tfrac{1+\alpha}{\alpha}}m_k\right\}\leq \sum_{k=1}^N \theta_k \exp\left\{\Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^{\tfrac{1+\alpha}{\alpha}}m_k\right\}.
\end{align*}
By taking expectation on both sides and using~\eqref{eq:momentbound2} we obtain 
\begin{align*}
&~~\mathbb{E}\left[\exp\left\{\left(\frac{\sum_{k=1}^N {\hat{\Gamma}^{-1}_k m_k^{-\tfrac{2\alpha}{\alpha+1}}}\Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^2 m_k^{{\tfrac{2\alpha}{\alpha+1}}}}{\left(\sum_{k=1}^N {\hat\Gamma^{-1}_k m_k^{-\tfrac{2\alpha}{\alpha+1}}}\right)}\right)^{\tfrac{1+\alpha}{2\alpha}}\right\}\right] \\
\leq &~~\sum_{k=1}^N \theta_k \mathbb{E}\left[\exp\Big\{\Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^{\tfrac{1+\alpha}{\alpha}}m_k\Big\}\right]\\
  =&~~\sum_{k=1}^N \theta_k \mathbb{E}\left[\mathbb{E}\bigg[\exp\Big\{\Big\|\frac{\bar{\Delta}_k}{\sigma}\Big\|^{\tfrac{1+\alpha}{\alpha}}m_k\Big\}\bigg|\mathcal{F}_{k-1}\bigg]\right]\leq C.
\end{align*}
Hence, by Markov's inequality we have for all $\lambda \geq 0$ that 
\begin{align*}
\mathbb{P}\left\{\sum_{k=1}^N \frac{\big\|\bar{\Delta}_k\big\|^2}{\hat\Gamma_k} \geq \lambda \left(\sum_{k=1}^N ~\frac{\sigma^2}{\hat\Gamma_km_k^{\tfrac{2\alpha}{\alpha+1}}}\right)  \right\} \leq C \exp\{-\lambda^{\tfrac{1+\alpha}{2\alpha}}\}.
\end{align*}
If we set $m_k = N^{\tfrac{2\alpha+2}{\alpha}}$, then we obtain
$$
{\hat\Gamma^{-1}_k m_k^{-\tfrac{2\alpha}{\alpha+1}}} \leq \frac{D_0}{N}.
$$
Hence, we have for all $\lambda \geq 0$ that
\begin{align*}
\mathbb{P}\left(\sum_{k=1}^N \frac{\|\bar{\Delta}_k\|^2}{\hat\Gamma_k} \geq \lambda \sigma^2 D_0 \right) \leq \exp\{-\lambda^{\tfrac{1+\alpha}{2\alpha}}\}
\end{align*}
which equivalently leads to
\begin{align}\label{eq:temp44}
\mathbb{P}\left(\sum_{k=1}^N \frac{\|\bar{\Delta}_k\|^2}{\hat\Gamma_k} \geq \sigma^2D_0\log \left(\frac{1}{\delta}\right)^{\tfrac{2\alpha}{1+\alpha}}\right) \leq \delta,
\end{align}
corresponding to the fourth term on the right hand side of~\eqref{alg_ineq}.

For the third term, again by setting $m_k = N^{\tfrac{2\alpha+2}{\alpha}}$ and applying Lemma \ref{lemma1}, we have with probability $1-\delta$
\begin{align}\label{eq:temp43}
\sum_{k=1}^N \frac{\alpha_k}{\hat\Gamma_k} \langle \bar{\Delta}_k, x_*-x_{k-1}\rangle\leq  D_0\sigma\sum_{k=1}^N \frac{\alpha_k}{\hat\Gamma_k} \left(\frac{\log(1/\delta)}{m'_k}\right)^{\tfrac{\alpha}{\alpha+1}} \leq  \sigma D_0 \log \left(\frac{1}{\delta}\right)^{\tfrac{\alpha}{1+\alpha}}.
\end{align}
Now, the claim in Theorem~\ref{theorem_gc} follows by~\eqref{alg_ineq},~\eqref{eq:temp44} and~\eqref{eq:temp43}. The oracle complexity result then follows from our choice of $m_k$ and an argument similar to that used in the proof of Theorem~\ref{theorem_light_tail}.
%Observing that $m'_k \geq m_k$, we will take batch size $m'_k$ as the third term is dominant in terms of sample complexity.

Furthermore, the proof of part (b) follows verbatim the proof of part (a) as $G/\sqrt{d}$ satisfies Assumption~\ref{assumption2}.
\end{proof}

\begin{proof}[\textbf{Proof of Theorem~\ref{theorem_optimal_mean}}]
We first require a concentration result from~\cite{cherapanamjeri2020optimal} for the estimator in~\eqref{eq:optimalgrad}, which we restate below in our notation.

\begin{lemma}\label{lemma3}
Suppose $G$ satisfies Assumption~\ref{assumption3} and $d \lesssim \log(1/\delta)$, then the estimator $\bar{G}_k$ in~\eqref{eq:optimalgrad}, with $\bar{\Delta}_k \coloneqq \bar{G}_k - \nabla f(w_k)$ satisfies 
$$
\mathbb{P}\left(\| \bar{\Delta}_k \| \gtrsim \left(\frac{d}{n}\right)^{\tfrac{\beta}{1+\beta}} + \left(\frac{\log(1/\delta)}{m_k}\right)^{\tfrac{\beta}{1+\beta}}\right) \leq \delta, \quad\text{and}\quad
\mathbb{E}\left[\exp\left\{\|\bar{\Delta}_k\|^{\tfrac{1+\beta}{\beta}}m_k\right\}\right] \leq C.
$$
\end{lemma}

Note that by Lemma~\ref{lemma3}, we also have 
\begin{align}\label{eq:temp222}
\mathbb{P}\left(\| \bar{\Delta}_k\| \geq \lambda\right) \leq C\exp \left(-\lambda^{\tfrac{1+\beta}{\beta}}m_k\right).
\end{align}
With~\eqref{eq:temp222} in hand, the proof of Theorem~\ref{theorem_optimal_mean} follows verbatim the proof of Theorem \ref{theorem_gc}.
\end{proof}

\begin{proof}[\textbf{Proof of Proposition \ref{proposition1}}]

Before proving Proposition \ref{proposition1}, we introduce an intermediate result regarding the initial estimator in~\eqref{init_mean}, which is essentially \cite[ Lemma B.1]{cherapanamjeri2020optimal}, restated in our notation.

\begin{lemma}\label{lemma4}
For a given $k$, let $G(w_k, \xi_{k,t})$, for $t=1, \ldots, m_k$, be i.i.d.\ random vectors satisfying Assumption~\ref{assumption3} for some $\beta\in (0, 1]$. Then the estimator $\widehat{G}_k$ as defined by~\eqref{init_mean}, with probability at least $1-e^{-{\tfrac{m_k}{50}}}$,
 satisfies
$$
\| \widehat{G}_k - \nabla f(w_k)\| \leq 24\sqrt{d}.
$$
\end{lemma}
Now we are ready to prove Proposition \ref{proposition1}. First, we recall the definition of $\bar{G}_k$ from~\eqref{eq:bcclippedgrad}:
\begin{align*}
\bar{G}_k \coloneqq \frac{2}{m_k}\sum_{t=1}^{m_k/2} \min\left\{\frac{\bigg[\Big(\frac{ t}{\log(1/\delta)}\Big)^{\tfrac{1}{1+\beta}} + 24\bigg]\sqrt{d}}{\|G(w_k, \xi_{k,t})-\widehat{G}_k \|}, 1 \right\} \left[G(w_k, \xi_{k,t}) - \widehat{G}_k\right] + \widehat{G}_k.
\end{align*}
Now, under Assumption~\ref{assumption5}, it is straightforward to see that $\widehat{G}_k$ is an unbiased estimator of $\nabla f(w_k)$ and the distribution of $\widehat{G}_k$ is symmetric about $\nabla f(w_k)$. 

Now, we proceed to first prove that $\bar{G}_k$ is unbiased, i.e., $\mathbb{E}[\bar{G}_k] = \nabla f(w_k)$ by showing
\begin{align}\label{eq:unbiased}
\mathbb{E}\left[\min\left\{\frac{B}{\|G(w_k, \xi) - \widehat{G}_k\|}, 1\right\}[G(w_k, \xi) - \widehat{G}_k ] + \widehat{G}_k\right] = \nabla f(w_k)
\end{align}
for any $B\geq 0$ when the distribution of $G(w_k, \xi)$ is symmetric about $\nabla f(w_k)$. Note that, without loss of generality, one can assume that $\nabla f(w_k) = 0$. Indeed, if this is not the case, we can define $U(w_k, \xi) = G(w_k, \xi) - \nabla f(w_k)$, for which we have $\mathbb{E}[U(w_k, \xi)] = 0$, and $\widehat{U}_k = \widehat{G}_k - \nabla f(w_k)$, which leads to
$$
\mathbb{E}\left[\min\left\{\frac{B}{\|G(w_k, \xi) - \widehat{G}_k\|}, 1\right\}[G(w_k, \xi) - \widehat{G}_k ] +\widehat{U}_k\right] = 0
$$
as we have $G(w_k, \xi) - \widehat{G}_k = U(w_k, \xi) - \widehat{U}_k$, which would prove~\eqref{eq:unbiased} for the general case.



Denoting the distribution of $G = G(w_k, \xi)$ by $g(x)$, we immediately have that $g(x) = g(-x)$ and $-\widehat{G}_k\overset{d}{\sim} \widehat{G}_k$ (i.e., $-\widehat{G}_k $ has the same distribution as $\widehat{G}_k$). Hence, we have for any $B\geq 0$, 
\begin{align*}
&~\mathbb{E}\left[G(w_k, \xi)\mathbbm{1}\{\|G(w_k, \xi) - \widehat{G}_k\|\leq B\}\right] \\
=&~ \mathbb{E}_{\widehat{G}_k}\left[\int_{\|x-\widehat{G}_k\|\leq B} xg(x) dx   \right]        \\
=& ~\mathbb{E}_{\widehat{G}_k}\left[\int_{\|x\|\leq B} (x+\widehat{G}_k)g(x+\widehat{G}_k) dx \right]  \\
=&~\mathbb{E}_{\widehat{G}_k}\left[\int_{\|x\|\leq B} (\widehat{G}_k-x)g(\widehat{G}_k-x) dx \right]    \\
=&~\mathbb{E}_{\widehat{G}_k}\left[\int_{\|x\|\leq B} (\widehat{G}_k-x)g(x-\widehat{G}_k) dx \right]    \\
=&~\mathbb{E}_{\widehat{G}_k}\left[\int_{\|x\|\leq B} (-\widehat{G}_k-x)g(x+\widehat{G}_k) dx \right] .
\end{align*}

Comparing the expressions in the second and last lines, we immediately obtain that 
\begin{align}\label{temp09}
\mathbb{E}[G(w_k, \xi)\mathbbm{1}\{\|G(w_k, \xi) - \widehat{G}_k\|\leq B\}] = 0.
\end{align}
Similarly, we can show that 
$$
\mathbb{E}\left[\mathbbm{1}\left\{\|G(w_k, \xi) - \widehat{G}_k\|\geq B\right\}\left[\frac{B}{\|G(w_k, \xi) - \widehat{G}_k\|}(G(w_k, \xi) - \widehat{G}_k) + \widehat{G}_k\right]\right] = 0,
$$ 
which together with~\eqref{temp09} proves~\eqref{eq:unbiased}.



Next, we will prove the concentration of $\bar{G}_k$. As $G(w_k, \xi)$ satisfies Assumption \ref{assumption3}, we have $\mathbb{E}[\|G(w_k, \xi)\|^{1+\beta}]\leq \frac{\pi}{2}d^{\frac{1+\beta}{2}}$. Let 
$$B_t = \left[\left(\frac{t}{\log(1/\delta)}\right)^{\tfrac{1}{1+\beta}}+24\right]\sqrt{d}, \quad\quad G_t = G(w_k, \xi_{k, t}),$$ 
and 
$$
\widetilde{G}_t = \widetilde{G}(w_k, \xi_{k,t}) = \min\left\{\frac{B_t}{\|G(w_k, \xi_{k,t}) - \widehat{G}_k\|}, 1\right\}[G(w_k, \xi_{k,t}) - \widehat{G}_k ] + \widehat{G}_k.
$$

As each $\widetilde{G}_t$, for $t=1, \ldots, {m_k}/2$, depends on $\widehat{G}_k$, they are independent only conditionally on $\widehat{G}_k$. Hence, conditioned on $\widehat{G}_k$, we have the following:


\begin{align*}
\left\|\mathbb{E}[\bar{G}_k - \nabla f(w_k)]\right\| =&~ \frac{2}{m_k}\left\|\sum_{t=1}^{m_k/2} \left( \mathbb{E}[\widetilde{G}_t] - \nabla f(w_k)\right)\right\| \\
\leq&~ \frac{2}{m_k}\sum_{t=1}^{m_k/2} \mathbb{E}[(\|G_t\| + \|\widehat{G}_k\|)\mathbbm{1}\{\|G_t\|\geq B_t - \|\widehat{G}_k\|\}] \\
\leq&~\frac{2}{m_k}\sum_{t=2}^{m_k/2}\frac{(\frac{\pi}{2}+24)d^\frac{1+\beta}{2}}{(B_t-\|\widehat{G}_k\|)^{\beta}}.
\end{align*}

Now, again conditioned on  $\widehat{G}_k$, note that we obtain
\begin{align*}
\left\|\bar{\Delta}_k\right\| 
\leq&~\left\|\mathbb{E}[\bar{G}_k - \nabla f(w_k)]\right\|+ \frac{2}{m_k}\Big\|\sum_{t=1}^{m_k/2} (\mathbb{E}[\widetilde{G}(w_k, \xi_{k,t})] - \widetilde{G}(w_k, \xi_{k,t}))\Big\| \\
\leq&~\frac{2}{m_k}\sum_{t=2}^{m_k/2}\frac{(\frac{\pi}{2}+24)d^\frac{1+\beta}{2}}{(B_t-\|\widehat{G}_k\|)^{\beta}} + \frac{2}{m_k}\Big\|\sum_{t=1}^{m_k/2} (\mathbb{E}[\widetilde{G}(w_k, \xi_{k,t})] - \widetilde{G}(w_k, \xi_{k,t}))\Big\|.
%\leq&~\frac{1}{m_k}\sum_{t=1}^{m_k}\frac{\sigma^{1+\alpha}}{B_t^{\alpha}} + \sqrt{\frac{2B_{m_k}^{1-\alpha}\sigma^{1+\alpha}\log(1/\delta)}{m_k}} + \frac{B_{m_k}\log(1/\delta)}{3m_k}
\end{align*}
Now, by vector-valued Bernstein's inequality for bounded independent random vectors (see, for example~\cite[Corollary 4.1]{minsker2017bernstein}), and by noting that conditioned on $\widehat{G}_k$ we have 
$$
\mathbb{E}\left[\|\widetilde{G}(w_k, \xi_{k,t})\|^2 \,\Big|\, \widehat{G}_k\right] \leq \frac{\pi}{2}d^{\frac{1+\beta}{2}}(B_t+\|\widehat{G}_k\|)^{1-\beta},
$$
when ${m_k}\geq 100\log(1/\delta)$, we have, with probability at least $1-\delta$, 
\begin{align*}
\left\|\bar{\Delta}_k\right\|  
\leq &~\left(\frac{2}{m_k}\sum_{t=2}^{m_k/2}\frac{(\frac{\pi}{2}+24)d^\frac{1+\beta}{2}}{(B_t-\|\widehat{G}_k\|)^{\beta}} + 
\sqrt{\frac{4(B_{m_k}+\|\widehat{G}_k\|)^{1-\beta}\frac{\pi}{2}d^\frac{1+\beta}{2}\log(1/\delta)}{m_k}} + \frac{2(B_{m_k}+\|\widehat{G}_k\|)\log(1/\delta)}{3m_k}\right).
\end{align*}
The high-probability statement from Proposition~\ref{proposition1} then follows by substituting the definition of $B_t$ and noting that the norm of $\widehat{G}_k$ is bounded, i.e., $\|\widehat{G}_k\|\leq 24\sqrt{d}$.
\end{proof}

\begin{proof}[\textbf{Proof of Theorem~\ref{theorem3}}]

We first prove part (a). Note that by Lemma~\ref{templemma}, we can obtain the inequality \eqref{alg_ineq}. As before, we note that the first two terms in the right hand side of~\eqref{alg_ineq}  are bounded by the constant $3LD_0$. Hence, we proceed to bound the last two terms on the right hand side of~\eqref{alg_ineq} with a high probability bound. 

For the fourth term on the right hand side of~\eqref{alg_ineq}, using the same approach as in the proof of Theorem \ref{theorem_gc} we have for all $\lambda \geq 0$ that
$$
\mathbb{P}\left(\sum_{k=1}^N \frac{\|\bar{\Delta}_k\|^2}{\hat\Gamma_k} \geq \lambda D_0d \right) \leq \exp\left\{-\lambda^{\tfrac{1+\alpha}{2\alpha}}\right\}.
$$
Next, by setting $m_k = N^{\frac{3\alpha+3}{2\alpha}}$, we obtain
$$
{\hat\Gamma^{-1}_k m_k^{-\tfrac{2\alpha}{\alpha+1}}} \leq \frac{D_0}{N}.
$$
Hence, taking $\lambda = \log(1/\delta)^{\tfrac{2\alpha}{1+\alpha}}$, we have that
\begin{align}\label{temp345}
\mathbb{P}\left(\sum_{k=1}^N \frac{\|\bar{\Delta}_k\|^2}{\hat\Gamma_k} \geq D_0d\log \left(\frac{1}{\delta}\right)^{\tfrac{2\alpha}{1+\alpha}}\right) \leq \delta.
\end{align}

For the third term in the right hand side of~\eqref{alg_ineq}, define $$
\zeta_k \coloneqq \frac{\alpha_k}{\hat\Gamma_k} \langle \bar{\Delta}_k, x_*-x_{k-1}\rangle.
$$ 
As we have that Assumption \ref{assumption2} is stronger than Assumption \ref{assumption3}, Proposition \ref{proposition1} and  Lemma \ref{app_lemma1} imply that,
\begin{align*}
&~\mathbb{E}\left[\exp\left\{\left(\frac{\zeta_k}{[\alpha_k\hat{\Gamma}_k^{-1}D_0\sqrt{d}]}\right)^{\tfrac{1+\alpha}{\alpha}}m_k\right\}\Bigg|\mathcal{F}_{k-1}\right] \\
\leq&~ \mathbb{E}\left[\exp\left\{\frac{m_k\left(\|\bar{\Delta}_k\|\|x_{k-1}-x_*\|\right)^{\tfrac{1+\alpha}{\alpha}}}{(D_0\sqrt{d})^{\tfrac{1+\alpha}{\alpha}}}\right\}\Bigg|\mathcal{F}_{k-1}\right]\leq  2,
\end{align*}
which indicates that $\zeta_k$ satisfies Assumption~\ref{app_assumption2}. Consequently, according to Lemma~\ref{app_lemma1}, it also satisfies Assumption~\ref{app_assumption1}. By setting $m_k = N^{\tfrac{3\alpha+3}{2\alpha}}$, we have
\begin{align*}
\frac{1}{m_k}(\hat\Gamma_k^{-1}\alpha_k)^{\tfrac{1+\alpha}{\alpha}} \leq  N^{-\tfrac{\alpha+1}{2\alpha}}.
\end{align*}
Now, part (b) of Lemma~\ref{app_lemma2} and Proposition~\ref{app_proposition1} imply that we have
\begin{align*}
 \mathbb{P}\left(\sum_{k=1}^N \zeta_k \geq \lambda D_0\sqrt{d} \right)\leq \exp\{-C_\alpha\lambda^{1+\alpha}\},\quad\text{for all}\quad\lambda \geq \left[\Gamma\left(\tfrac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}\right]^{\tfrac{1}{1-\alpha}},
\end{align*}
where
\begin{align}\label{eq:calpha}
C_\alpha =  \left(\frac{\alpha}{1+\alpha}\right)^{\alpha} - \left(\frac{\alpha}{1+\alpha}\right)^{1+\alpha} -  \left(\frac{\alpha}{1+\alpha}\right)^{2\alpha} \geq 0.
\end{align}
The above probability bound, in turn, leads to
\begin{align}\label{eq:temp2324}
\mathbb{P}\left(\sum_{k=1}^N \zeta_k \geq D_0\sqrt{d}\left[\log \left(\frac{1}{\delta}\right)^{\tfrac{1}{1+\alpha}}\right]\right) \leq \delta,\quad \text{when}\quad \left(\log (1/\delta)\right)^{\tfrac{1}{1+\alpha}}\geq \left[\Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}\right]^{\tfrac{1}{1-\alpha}}.
\end{align}
Combining~\eqref{alg_ineq} with the high probability bounds in~\eqref{temp345} and~\eqref{eq:temp2324} proves the claim in Theorem~\ref{theorem3}. The oracle complexity results then follow by our choice of $m_k$ and the argument similar to that used in the proof of Theorem~\ref{theorem_light_tail}.

Furthermore, the proof of part (b) follows verbatim the proof of part (a) with $\alpha$ replaced by $\beta$.

\end{proof}

\fi

\iffalse
\subsection{Vector Case}

\begin{lemma}
 For a random vector $X$ satisfying $\mathbb{P}(\|X\|\geq t)\leq C\exp(-t^{\frac{1+\alpha}{\alpha}}/\sigma^2)$, we have
$$
(\mathbb{E} \|X\|^k)^{\frac{1}{k}} \leq C(\sigma^2k)^{\frac{\alpha}{1+\alpha}}.
$$
Moreover, there is an absolute constant $c$ such that, if a zero-mean random vector $X$ satisfies the above condition, then, letting
$$
Y :=  
\begin{pmatrix}
0 & X^T \\
X & 0
\end{pmatrix}
\in \mathbb{R}^{(d+1)\times(d+1)}
$$
we have $\mathbb{E}e^{\theta Y} \preceq  (1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma^2)^\frac{2\alpha}{1+\alpha})e^{\theta^\frac{\alpha+1}{\alpha}\sigma^2}\mathbf{I}$ for any $\theta \in \mathbb{R}$.
\end{lemma}
\begin{proof}To prove first claim:
\begin{eqnarray*}
\mathbb{E}[\|X\|^k] &=& \int_0^\infty \mathbb{P}(\|X\|^k \geq t)dt \\
&=& \int_0^\infty \mathbb{P}(\|X\| \geq t^{\frac{1}{k}})dt \\
&\leq& C\int_0^\infty \exp (-t^{\frac{1+\alpha}{\alpha k}}/\sigma^2)dt \\
&=& C(\sigma^2)^{\frac{\alpha k}{\alpha+1}}\frac{\alpha k}{\alpha+1}\int_0^\infty e^{-u}u^{\frac{\alpha k}{\alpha +1}-1} du \\
&=& C (\sigma^2)^{\frac{\alpha k}{\alpha+1}} k \Gamma\left(\frac{\alpha k}{\alpha+1}\right) \\
\end{eqnarray*}
Then, following the fact that $\Gamma(\frac{\alpha k}{\alpha+1}) \leq (\frac{\alpha k}{\alpha+1})^\frac{\alpha k}{\alpha+1}$ and $k^{1/k} \leq e^{1/e}$ for any $k\geq 2$, we have
$$
(\mathbb{E}[\|X\|^k])^{1/k} \leq (\sigma^2)^\frac{\alpha}{\alpha+1} \left(\frac{\alpha k}{\alpha+1}\right)^\frac{\alpha}{\alpha +1} e^{1/e} \leq C(\sigma^2k)^\frac{\alpha}{\alpha+1}
$$

To prove second claim:

Note that $Y$ is a rank-2 matrix whose eigenvalues are $\|X\|, -\|X\|$, and $\mathbb{E}Y^{2k+1} = 0$ for any $k\in\mathbb{N}$. On the other hand, we have $\|Y^{k}\| \leq \|X\|^{k}$. Therefore, by Lemma \ref{app_lemma1} and the first claim, for any $\theta\in\mathbb{R}$:
$$
\mathbb{E}e^{\theta Y} = \mathbf{I} + \sum_{k=2}^\infty \frac{\theta^{k}\mathbb{E}Y^{k}}{k!} \preceq \left(1 + \sum_{k=2}^\infty \frac{\theta^{k}\mathbb{E}\|X\|^k}{k!}\right) \mathbf{I}  \preceq  (1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma^2)^\frac{2\alpha}{1+\alpha})e^{\theta^\frac{\alpha+1}{\alpha}\sigma^2}\mathbf{I}
$$

where $c = \Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha}$.
\end{proof}




\begin{lemma}\label{app_lemma3}
Let random vectors $X_1,\dots, X_n\in\mathbb{R}^d$, and the corresponding filtration $\mathcal{F}_i = \sigma(X_1, \dots, X_i)$ for $i \in [n]$ satisfies that
\begin{align*}
\mathbb{E}[X_i|\mathcal{F}_{i-1}] = 0, \ \ \mathbb{P}(\|X_i\|\geq t|\mathcal{F}_{i-1})\leq ce^{-t^{\frac{1+\alpha}{\alpha}}/\sigma_i^2}.
\end{align*}
Then for any fixed $\delta>0$, $\theta>0$, with probability at least $1-\delta$:
\begin{align}\label{eq:firstresult}
\left \|\sum_{i=1}^n X_i \right\| \leq \frac{1}{\theta}\sum_{i=1}^n \log\left(1 + c\left(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2\right)^\frac{2\alpha}{1+\alpha}\right) + \theta^\frac{1}{\alpha}\sum_{i=1}^n\sigma_i^2 + \frac{1}{\theta}\log\left(\frac{2d}{\delta}\right).
\end{align}
Thus, when 
\begin{align*}
\sigma_i^2 \leq n^{-\frac{\alpha+1}{2\alpha}}~\quad\text{and}\quad~\left(\log\left(\frac{2d}{\delta}\right)\right)^\frac{1-\alpha}{\alpha+1} >  \Gamma\left(\frac{\alpha}{1+\alpha}\right)\frac{1+\alpha}{\alpha},
\end{align*}
we obtain 
\begin{align}\label{eq:secondresult}
\left\|\sum_{i=1}^n X_i\right\|^2 \leq C\left(\log\frac{2d}{\delta}\right)^{\frac{2}{\alpha+1}}.
\end{align}
\end{lemma}

\begin{proof}
According to Lemma \ref{app_lemma3}, we have
\begin{align*}
&~\mathbb{E}\text{tr}\exp\bigg(-\big(\sum_{i=1}^n\log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \theta^\frac{\alpha+1}{\alpha}\sigma_i^2\big)\mathbf{I}  \theta\sum_{i=1}^n Y_i\bigg) \\
=&~\mathbb{E}\text{tr}\exp\bigg(-\big(\sum_{i=1}^n\log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \theta^\frac{\alpha+1}{\alpha}\sigma_i^2\big)\mathbf{I} + \theta\sum_{i=1}^n Y_i\bigg)\\
\overset{(\ast)}{\leq}&~\mathbb{E}\text{tr}\exp\bigg(-\big(\sum_{i=1}^n\log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \theta^\frac{\alpha+1}{\alpha}\sigma_i^2\big)\mathbf{I} + \theta\sum_{i=1}^{n-1} Y_i + \log\mathbb{E}[e^{\theta Y_n}|\mathcal{F}_{n-1}]\bigg)\\
\leq&~\mathbb{E}\text{tr}\exp\bigg(-\big(\sum_{i=1}^{n-1}\log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \theta^\frac{\alpha+1}{\alpha}\sigma_i^2\big)\mathbf{I} + \theta\sum_{i=1}^{n-1} Y_i\bigg) \\\leq&~ \cdots \leq \text{tr}\exp(0\mathbf{I}) = d
\end{align*}
where step $(\ast)$ is due to $\mathbb{E}\text{tr}(\exp(A+Y))\leq \text{tr}\exp(A+\log(\mathbb{E}e^Y))$ when $A$ is a fixed symmetric matrix and $Y$ is a random symmetric matrix; see, for example,~\cite{tropp2012user}. On the other hand, since the identity matrix commutes with any matrix, we have
\begin{align*}
&\exp\bigg(-\big(\sum_{i=1}^n\log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \theta^\frac{\alpha+1}{\alpha}\sigma_i^2\big)\mathbf{I} + \theta\sum_{i=1}^n Y_i\bigg) \\
=&\exp\bigg(-\big(\sum_{i=1}^n\log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \theta^\frac{\alpha+1}{\alpha}\sigma_i^2\big)\bigg)\cdot \exp(\theta\sum_{i=1}^n Y_i)
\end{align*}
Therefore, for any $t\geq 0$, $\theta\geq 0$, by Markov's inequality, we have
\begin{align*}
&~\mathbb{P}\left[  \Big\|\sum_{i=1}^n X_i\Big\|\geq  \frac{1}{\theta}\sum_{i=1}^n \log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \sum_{i=1}^n\theta^\frac{1}{\alpha}\sigma_i^2 + t/\theta      \right] \\
= &~\mathbb{P}\left[  \Big\|\sum_{i=1}^n Y_i\Big\|\geq  \frac{1}{\theta}\sum_{i=1}^n \log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \sum_{i=1}^n\theta^\frac{1}{\alpha}\sigma_i^2 + t/\theta      \right]  \\
=&~ 2\mathbb{P}\left[ \lambda_{max}\left(\sum_{i=1}^n Y_i\right)\geq  \frac{1}{\theta}\sum_{i=1}^n \log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \sum_{i=1}^n\theta^\frac{1}{\alpha}\sigma_i^2 + t/\theta      \right] \\
=&~ 2\mathbb{P}\left[ \lambda_{max}\left(e^{\theta\sum_{i=1}^n Y_i}\right)\geq  e^{\sum_{i=1}^n \log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \sum_{i=1}^n\theta^\frac{1+\alpha}{\alpha}\sigma_i^2 + t}      \right] \\
\leq&~ 2\mathbb{P}\left[ \text{tr}\left(e^{\theta\sum_{i=1}^n Y_i}\right)\geq  e^{\sum_{i=1}^n \log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \sum_{i=1}^n\theta^\frac{1+\alpha}{\alpha}\sigma_i^2 + t}      \right] \\
\leq&~ 2e^{-t}\mathbb{E}\text{tr}\bigg(e^{- (\sum_{i=1}^n \log(1 + c(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)^\frac{2\alpha}{1+\alpha}) + \sum_{i=1}^n\theta^\frac{1+\alpha}{\alpha}\sigma_i^2)\mathbf{I} + \theta\sum_{i=1}^n Y_i }\bigg) \\
\leq&~ 2de^{-t}
\end{align*}
By setting the right hand side of the above expression equal to $\delta$, we obtain~\eqref{eq:firstresult}. To prove~\eqref{eq:secondresult}, note that with $\sigma_i^2\leq n^{-\frac{\alpha+1}{2\alpha}}$,
\begin{eqnarray*}
\Big\|\sum_{i=1}^n X_i\Big\| &\leq& \frac{1}{\theta}\sum_{i=1}^n \log\left(1 + c\left(\theta^\frac{1+\alpha}{\alpha}\sigma_i^2\right)^\frac{2\alpha}{1+\alpha}\right) + \theta^\frac{1}{\alpha}\sum_{i=1}^n\sigma_i^2 + \frac{1}{\theta}\log\left(\frac{2d}{\delta}\right)\\
&\leq& c\theta + \theta^\frac{1}{\alpha}n^\frac{\alpha-1}{2\alpha} + \frac{1}{\theta}\log\left(\frac{2d}{\delta}\right)\\
&\leq& C\left(\log\left(\frac{2d}{\delta}\right)\right)^\frac{\alpha}{1+\alpha}\\
\end{eqnarray*}
when $\theta = \left(\log\left(\frac{2d}{\delta}\right)\right)^\frac{\alpha}{\alpha+1}$ and $\left(\log\left(\frac{2d}{\delta}\right)\right)^\frac{1-\alpha}{\alpha+1} > c$.
\end{proof}



\subsection{Bernstein's Inequality}
\begin{lemma}\label{lemma:bernstein_ineq}
Let $X_1,\cdots, X_n$ be independent zero-mean random variables. Suppose that $|X_i|\leq M$ almost surely, for all $i$. Then, for all positive $t$,
$$
\mathbb{P}\left(\sum_{i=1}^n X_i \geq t\right) \leq \exp\left(-\frac{\frac{1}{2}t^2}{\sum_{i=1}^n\mathbb{E}[X_i^2] + \frac{1}{3}Mt}\right)
$$
\end{lemma}

\fi

\iffalse
\section{Summary of the Robust Mean Estimation procedure from~\cite{cherapanamjeri2020optimal}}\label{alg:optimal_mean}

For the sake of completeness, we now describe the robust mean-estimation procedure from~\cite{cherapanamjeri2020optimal}. The main algorithm from~\cite{cherapanamjeri2020optimal} is provided in Algorithm~\ref{alg:mean_estimation}. 
\begin{algorithm} [H]
	\caption{\textsc{optimalmeanest}($\{G(w_k, \xi_{k,j}) \}_{j=1}^{m_k}$)}
	\label{alg:mean_estimation}
	\begin{algorithmic}
\State Input: Data Points $\{G(w_k, \xi_{k,j}) \}_{j=1}^{m_k} \in \mathbb{R}^{d}$, Target Confidence $\delta$

\State $G^+ \leftarrow$ Initial Mean Estimate$(\{G(w_k, \xi_{k,j}) \}_{j=1}^{m_k/2})$
\State $Z \leftarrow$ Produce Bucket Estimates$(\{G(w_k, \xi_{k,j}) \}_{j=m_k/2}^{m_k}, G^+, \delta)$
\State $T \leftarrow 10^6\log dn$
\State $\bar{G}_k = $ Gradient Descent$(Z, G^+, T)$
\State Return: $\bar{G}_k$
\end{algorithmic}
\end{algorithm}

Algorithm~\ref{alg:mean_estimation} consists of the following three sub-steps.



\subsubsection*{ \textbf{1. Data Pruning Step}}
The following algorithms correspond to the first sub-step. Algorithm~\ref{alg:initial_est} computes an initial estimate of the mean which is within $\mathcal{O}(\sqrt{d})$ of the mean, and Algorithm~\ref{alg:prune_data} uses this estimate to filter out data points which are far away from the estimate.

\begin{algorithm} [H]
	\caption{Initial Mean Estimate}
	\label{alg:initial_est}
	\begin{algorithmic}

\State Input: Set of Data Points $\{G_i\}_{i=1}^n$
\State $\hat\mu\leftarrow \arg\min_{G_i: i \leq n}\min\left\{r>0, \sum_{j=1}^n \mathbbm{1}\{\|G_j - G_i\|\leq r\} \geq 0.6n\right\}$
\State Return: $\hat\mu$
\end{algorithmic}
\end{algorithm}

\begin{algorithm} [H]
	\caption{Prune Data}
	\label{alg:prune_data}
	\begin{algorithmic}

\State Input: Set of Data Points $\{G_i\}_{i=1}^n$, Mean Estimate $G^+$
\State $\tau\leftarrow \max \left(100n^{\frac{1}{1+\beta}}d^{-\frac{1-\beta}{2(1+\beta)}}, 100\sqrt{d}\right)$
\State $\mathcal{C}\leftarrow \{G_i: \|G_i - G^+\|\leq \tau\}$
\State Return: $\mathcal{C}$
\end{algorithmic}
\end{algorithm}

 \subsubsection*{\textbf{2. Data Batching Step}}
The following algorithm corresponds to the second sub-step.  The data points that survive the truncation procedure in the data pruning stage are then divided into $k$ bins and mean estimates are computed based on sample-averaging in each bin by Algorithm \ref{alg:bucket_est}.
 
\begin{algorithm} [H]
	\caption{Produce Bucket Estimates}
	\label{alg:bucket_est}
	\begin{algorithmic}

\State Input: Set of Data Points $\{G_i\}_{i=1}^n$, Mean Estimate $G^+$, Target Confidence $\delta$

\State $Y \leftarrow$ Prune Data$(\{G_i\}, G^+)$
\State $m\leftarrow|Y|$
\State $k\leftarrow 4000\log 1/\delta$
\State Split data points into $k$ buckets with bucket $\mathcal{B}_i$ consisting of the points $G_{(i-1)\frac{m}{k}+1}, \cdots, G_{i\frac{m}{k}}$
\State $Z_i\leftarrow$ Mean$(\mathcal{B}_i)$ $\forall i \in [k]$ and $Z\leftarrow(Z_1, \cdots, Z_k)$
\State Return: $Z$
\end{algorithmic}
\end{algorithm}

 \subsubsection*{\textbf{3. Median Computation Step}}
 
The following algorithms correspond to the third sub-step. The bucket estimates from the previous stage are aggregated to produce the final estimate following the testing-to-estimation framework. The testing program is defined in $\textbf{MT}$ below. Algorithms \ref{alg:distance_est} and \ref{alg:gradient_est} display the estimation of distance and gradient. 
 
\begin{algorithm} [H]
	\caption{Distance Estimation}
	\label{alg:distance_est}
	\begin{algorithmic}
\State Input: Data Points $Z\in\mathbb{R}^{k\times d}$, Current point $x$
\State $d = \arg\max_{r>0} \textbf{MT}(x, r, Z) \geq 0.9k$
\State Return: $d$
\end{algorithmic}
\end{algorithm}

\begin{algorithm} [H]
	\caption{Gradient Estimation}
	\label{alg:gradient_est}
	\begin{algorithmic}

\State Input: Data Points $Z\in\mathbb{R}^{k\times d}$,  Current point $x$
\State $d^* =$ Distance Estimation$(Z, x)$
\State $(v, X) = \textbf{MT}(x, d^*, Z)$
\State $g\leftarrow$ Top Singular Vector$(X_v)$
\State Return: $g$
\end{algorithmic}
\end{algorithm}

The following polynomial and its semidefinite optimization $\textbf{MT}(x, r, Z)$ play a key role in the subsequent analysis. Intuitively, given a test point $x$, the problem searches for a direction ($v$) such that a large proportion of the bucket estimates, $Z_i$, are far away from $x$ along $v$. Formally, the polynomial optimization problem, parameterized by $x$, $r$ and $Z$, is defined below:
\begin{align*}
\max \sum_{i=1}^k &b_i\\
\text{Subject to}~~~~~b_i^2 &= b_i\\
\|v\|^2 &= 1, \\
b_i(\langle v, Z_i-x\rangle -r ) &\geq 0 \quad \forall i\in[k]
\end{align*}

The binary variables $b_i$ indicate whether the $i$-th bucket mean $Z_i$ is far away along $v$. However, the binary constraints on $b_i$, the restriction of $v$ and the final constraint make this problem nonconvex and hard to optimize efficiently. Therefore, they work with the semidefinite relaxation defined as follows:

\begin{align*}
\max \sum_{i=1}^k &X_{1,b_i}\\
\text{Subject to}~~~~~~~X_{1, b_i} &= X_{b_i, b_i}\\
\sum_{j=1}^d X_{v_j, v_j} &= 1, \\
\langle v_{b_i}, Z_i-x\rangle &\geq X_{b_i, b_i}r \quad \forall i\in[k]\\
X_{1,1} &= 1\\
X&\succeq 0
\end{align*}
where $v_{b_i} = [X_{b_i, v_1}, \cdots, X_{b_i, v_d}]$. The matrix $X \in \mathcal{S}_{+}^{(k+d+1)}$ is symbolically indexed by 1 and the variables $b_1, \dots, b_k$ and $v_1, \dots, v_d$. Here, $(v, X) = \textbf{MT}(x, r, Z)$ refers to the optimal value $v$ and solution $X$ of the above semidefinite optimization problem initialized with $x$, $r$ and $Z$.

The estimate above is then used in Algorithm \ref{alg:GD} to obtain an improved estimate.
\begin{algorithm} [H]
	\caption{Gradient Descent}
	\label{alg:GD}
	\begin{algorithmic}

\State Input: Bucket Means $Z \in \mathbb{R}^{k\times d}$, Initialization $G^+$, Number of Iterations $T$
\State $x^*$, $x_0 \leftarrow G^+$ and $d^*$, $d_0\leftarrow\infty$
\For{$t = 0, \ldots, T$}% } $k = 1, \cdots, N$ \textbf{do}
\State $d_t\leftarrow$ Distance Estimation$(Z, x_t)$
\State $g_t \leftarrow$ Gradient Estimation$(Z, x_t)$
\If{$d_t < d^*$}
\State $x^*\leftarrow x_t$
\State $d^*\leftarrow d_t$
\EndIf
\State  $x_{t+1}\leftarrow x_t + \frac{1}{20}d_tg_t$
\EndFor
\State Output: $x^*$
\end{algorithmic}
\end{algorithm}
The overall computational complexity of the algorithm is polynomial in the problem parameters~\cite{cherapanamjeri2020optimal}. However, from the perspective of implementation, especially on large scale datasets, it is perhaps not efficient. 

\fi

\iffalse
\section{Introduction}\label{sec:intro}
UAI 2022 papers have to be prepared using \LaTeX.
To start writing your paper, copy \texttt{tang\_327.tex} and replace title, authorship, and content with your own.

The UAI 2022 paper style is based on a custom \textsf{uai2022} class.
The class file sets the page geometry and visual style.\footnote{%
    The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
}
The class file also loads basic text fonts.\footnote{%
    Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
}
\emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
(Also do not use \verb|\vspace| for this.)
Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
The class enables hyperlinking by loading the \textsf{hyperref} package.

You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
(Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
    See \url{https://ctan.org/pkg/l2tabu}.
}

\swap[ ]{in the header of your source file.}{Feel free to include your own macros}

\section{General Formatting Instructions}
As a general rule: \emph{follow the template}.

\subsection{Authorship}
Reviewing is double-blind.
However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

\subsection{Sectioning}
Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
One unnumbered sectioning command is provided, \verb|\paragraph|.
It can be used directly below any numbered section level.
Do not use any other sectioning commands.

\subsubsection{Typing the Section Titles}
The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
Please type them in title case.
(This is used in the PDF bookmarks.)
Please also write the \verb|\subsubsection| titles in title case.

\paragraph{What is title case?}
\href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
\begin{quote}
    Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
    When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
\end{quote}

\subsection{References, Citations, Footnotes}\label{sec:etc}
\subsubsection{Cross-Referencing}
Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
For example, this subsection is Section~\ref{sec:etc}.

\subsubsection{Citations}
Citations should include the author's last name and year.
They should be part of the sentence.
An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
Do not use a parenthetical citation where a textual one is appropriate.
An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
The reference style you use should be compatible with the author-year citations.
Both the citation style and reference style used should be consistent.

For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
For example, writing
“I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

\subsubsection{Footnotes}
You can include footnotes in your text.\footnote{
    Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
}
The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
    A footnote is material put at the foot of a page.
}
for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

\section{Math}\label{sec:math}
The class file does not load any math support package like \textsf{amsmath}\footnote{%
  See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
}.
We advise using the \textsf{mathtools}\footnote{%
  See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
}
package, which extends \textsf{amsmath} with fixes and even more useful commands.
Feel free to load other support packages for symbols, theorems, etc.

Use the \textsf{amsmath} environments for displayed equations.
So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
An \texttt{equation}:
\begin{equation}\label{eq:example}
  0 = 1 - 1.
\end{equation}
Two \texttt{align}'ed equations:
\begin{align*} % no numbers with starred version
  1 + 2 &= 3,\\
  1 - 2 &= -1.
\end{align*}
Equations can also be put inline, of course.
For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
(Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

\section{Floats}\label{sec:floats}
Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
Please do not force them to go in the middle of a paragraph.
They must respect the column width.

Two-column floats are possible.
They appear at the top of the next page, so strategic placement may be necessary.
For an example, see Figure~\ref{fig:tikz}.
They may not enter the margins.
\begin{figure*}
    \centering
    \begin{tikzpicture}[xscale=1.5]
        \coordinate (origin);
        \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
        \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
        \fill[gray] (45:1cm) circle[radius=.2cm];
    \end{tikzpicture}
    \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
\end{figure*}

All material in floats should be legible and of good quality.
So avoid very small or large text and pixelated or fuzzy lines.

\subsection{Figures}\label{sec:figures}
Figures should go in the \texttt{figure} environment and be centered therein.
The caption should go below the figure.
Use \verb|\includegraphics| for external graphics files but omit the file extension.
Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
Do not use \verb|\epsfig| or \verb|\psfig|.
If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
For example, see Figure~\ref{fig:Eindhoven}.
\begin{figure}
  \centering
  \includegraphics[width=0.7\linewidth,page=3]{Eindhoven}
  \caption{A View of a Nice City.}\label{fig:Eindhoven}
\end{figure}

Do not use \verb|\graphicspath|.
If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

\subsection{Tables}\label{sec:tables}
Tables should go in the \texttt{table} environment and be centered therein.
The caption should go above the table and be in title caps.
For an example, see Table~\ref{tab:data}.
\begin{table}
    \centering
    \caption{An Interesting Table.}\label{tab:data}
    \begin{tabular}{rl}
      \toprule % from booktabs package
      \bfseries Dataset & \bfseries Result\\
      \midrule % from booktabs package
      Data1 & 0.12345\\
      Data2 & 0.67890\\
      Data3 & 0.54321\\
      Data4 & 0.09876\\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\subsection{Algorithms}\label{sec:algorithms}
You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
Use the environment defined in the package to create a centered float with an algorithm inside.

\section{Back Matter}
There are some final, special sections that come at the back of the paper, in the following order:
\begin{itemize}
  \item Author Contributions
  \item Acknowledgements
  \item References
\end{itemize}
They all use an unnumbered \verb|\subsubsection|.

For the first two, special environments are provided.
(These sections are automatically removed for the anonymous submission version of your paper.)
The third is the ‘References’ section.
(See below.)

(This ‘Back Matter’ section itself should not be included in your paper.)

\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    Briefly list author contributions.
    This is a nice way of making clear who did what and to give proper credit.

    H.~Q.~Bovik conceived the idea and wrote the paper.
    Coauthor One created the code.
    Coauthor Two created the figures.
\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    Briefly acknowledge people and organizations here.

    \emph{All} acknowledgements go in this section.
\end{acknowledgements}

\bibliography{tang_327}

\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}
\section{Math font exposition}
How math looks in equations is important:
\begin{equation*}
  F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
\end{equation*}
However, one should not ignore how well math mixes with text:
The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.
\fi
\end{document}
