%\documentclass{uai2021} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2021} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2021} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
%\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
%\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%\newcommand{\swap}[3][-]{#3#1#2} % just an example



%###########################################################
% My customized notations and environment -- Shaocong
% \renewcommand\labelitemii{$\circ$}

% Draft-annotation helpers. Both rely on \color from xcolor, which is loaded
% further down in this preamble (before either macro is ever expanded).
\newcommand*{\TODO}{{\color{red}TODO}}        % red "TODO" marker
\newcommand{\blue}[1]{{\color{blue} {#1}}}    % typesets #1 in blue

% BASIC PACKAGES %

\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{mathrsfs}
%\usepackage{fullpage}
\usepackage{tikz} % drawing commutative diagrams
%\usepackage{amscd} % drawing commutative diagrams
\usepackage{comment} % block comments
\usepackage{xcolor}
\usepackage{soul}
%\usepackage{enumerate}
\usepackage{graphicx}
\usepackage{float} % figure insertion
% NOTE(review): ulem is loaded without the [normalem] option, in which case it
% normally makes \emph / \em underline instead of italicize -- confirm this is
% intended.
\usepackage{ulem} % wavy underlines
\usepackage{tcolorbox}
\usepackage{enumerate}
\usepackage[inline]{enumitem}


% EMPHASIZE BOX %

\usepackage{empheq}
% \widefbox{<content>}: frames <content> with 0.33in of horizontal padding on
% each side.
\newcommand{\widefbox}[1]{%
  \fbox{\hspace{0.33in}#1\hspace{0.33in}}%
}

 
\usepackage[utf8]{inputenc}
%\usepackage{ctex} % Chinese-language support
%\usepackage{hyperref} 
\usepackage{attrib}

%\usepackage{algorithm2e}
\usepackage[ruled,vlined]{algorithm2e}
% Font used for comments inside algorithm2e listings: footnote size,
% typewriter family, blue.
\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}
\SetCommentSty{mycommfont}

% \mathunderline{<color>}{<content>}: underlines <content> with the rule drawn
% in <color> while the content itself is set in black.
% NOTE(review): the trailing \color{black} switches to black rather than
% restoring whatever color was active before -- confirm this is only used
% inside black text.
\def\mathunderline#1#2{\color{#1}\underline{{\color{black}#2}}\color{black}}
% \hlc[<color>]{<text>}: highlights <text> with soul's \hl using <color> as the
% highlight color (yellow when the optional argument is omitted). The color is
% aliased to the scratch name `foo' inside a group.
\newcommand{\hlc}[2][yellow]{{%
		\colorlet{foo}{#1}%
		\sethlcolor{foo}\hl{#2}}%
}
% Blackboard-bold, script, and calligraphic letter shorthands.
% (\FF and \calF expand to the same symbol, \mathcal{F}.)
\newcommand*{\EE}{\mathbb{E}}
\newcommand*{\FF}{\mathcal{F}}
\newcommand*{\MM}{\mathscr{M}}
\newcommand*{\calA}{\mathcal{A}}
\newcommand*{\calF}{\mathcal{F}}
\newcommand*{\calO}{\mathcal{O}}
\newcommand*{\calP}{\mathcal{P}}
\newcommand*{\calR}{\mathcal{R}}
\newcommand*{\calS}{\mathcal{S}}
\newcommand*{\calT}{\mathcal{T}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % Bold upright letters (Latin and Greek) and Fraktur index letters.
 \newcommand*{\bfQ}{\mathbf{Q}}
 \newcommand*{\bfI}{\mathbf{I}}
 \newcommand*{\bfP}{\mathbf{P}}
 \newcommand*{\bfV}{\mathbf{V}}
 \newcommand*{\bfD}{\mathbf{D}}
 \newcommand*{\bfLambda}{\mathbf{\Lambda}}
 \newcommand*{\bfDelta}{\mathbf{\Delta}}
 \newcommand*{\fraki}{\mathfrak{i}}
 \newcommand*{\frakj}{\mathfrak{j}}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\newcommand*{\frakX}{\mathfrak{X}}


% Single-letter shorthands in blackboard-bold, upright, script, and bold faces.
% (\QQ expands to \mathbf{Q}, the same symbol as \bfQ.)
\newcommand*{\PP}{\mathbb{P}}
\newcommand*{\dd}{\mathrm{d}}
\newcommand*{\RR}{\mathbb{R}}
\newcommand*{\NN}{\mathbb{N}}
\newcommand*{\TT}{\mathscr{T}}
\newcommand*{\QQ}{\mathbf{Q}}


% Calligraphic index symbol, followed by multi-letter names set upright.
\newcommand*{\Idx}{\mathcal{I}}
\newcommand*{\isom}{\mathrm{Isom}}
\newcommand*{\aff}{\mathrm{Aff}}
\newcommand*{\Span}{\mathrm{Span}}
\newcommand*{\Proj}{\mathrm{Proj}}
\newcommand*{\prox}{\mathrm{prox}}

\makeatletter
% \dotp: a \bullet scaled by 0.5 and vertically centered, spaced as a binary
% operator. \mathpalette hands the current math style to \dotp@ so the bullet
% is built in the surrounding style.
\newcommand*\dotp{\mathpalette\dotp@{.5}}
% \dotp@{<math style>}{<scale factor>}: builds the scaled, centered bullet.
\newcommand*\dotp@[2]{\mathbin{\vcenter{\hbox{\scalebox{#2}{$\m@th#1\bullet$}}}}}
\makeatother

% Upright / bold names used in probability statements.
\newcommand*{\Leb}{\mathrm{Leb}}
\newcommand*{\Cov}{\mathrm{Cov}}
\newcommand*{\Var}{\mathbf{Var}}
\newcommand*{\Lc}{\mathcal{L}^2_C}
\newcommand*{\IndOne}{\boldsymbol{1}}
% Relations with a label stacked on top.
\newcommand*{\DistEq}{\stackrel{D}{=}}
\newcommand*{\SimIID}{\stackrel{iid}{\sim}}
% Labelled arrows for the different modes of convergence.
\newcommand*{\ConvInProb}{\xrightarrow{\ \PP \ }}
\newcommand*{\ConvAS}{\xrightarrow{a.s.}}
%\newcommand\ConvWeak{\xrightarrow{\ w \ }}
\newcommand*{\ConvWeak}{\xrightarrow{\ D \ }}
\newcommand*{\ConvInLp}{\xrightarrow{L^p}}
\newcommand*{\Normal}{N}

 
% BASIC MATH %

% Separator symbols, each surrounded by fixed horizontal padding.
\newcommand*{\bst}{ \hspace{1.5pt} | \hspace{1.5pt} }
\newcommand*{\cst}{ \hspace{0.5pt} : \hspace{0.5pt} }
\newcommand*{\sst}{ \hspace{2pt} ; \hspace{0.5pt} }
% A \left| closed by an invisible \right. delimiter.
\newcommand*{\evl}{ \left| \right. }

% Auto-sized paired delimiters around #1.
\newcommand{\abs}[1]{\left| {#1} \right|}
\newcommand{\ceil}[1]{\left\lceil #1 \right\rceil}
\newcommand{\floor}[1]{\left\lfloor #1 \right\rfloor}

\newcommand*{\dist}{\mathrm{dist}}

% Starred operators: subscripts are placed underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Algebra
\newcommand*{\permutation}{\mathrm{S}}
% Geometry
\newcommand*{\sphere}{\mathbb{S}}

% Theorem-like environments (amsthm).
% Every numbered environment shares the `definition' counter, which is reset at
% each \section, so all statements are numbered in a single sequence.
\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]
\newtheorem*{definition*}{Definition}
\newtheorem{example}[definition]{Example}

\theoremstyle{plain}
\newtheorem{theorem}[definition]{Theorem}
\newtheorem{conjecture}[definition]{Conjecture}
\newtheorem{lemma}[definition]{Lemma}
\newtheorem{proposition}[definition]{Proposition}
\newtheorem{corollary}[definition]{Corollary}
\newtheorem{assumption}[definition]{Assumption}
\newtheorem{condition}[definition]{Condition}
\newtheorem{exercise}[definition]{Exercise}

% Unnumbered (starred) variants. All of them are declared with \newtheorem* so
% that a starred environment name consistently means "no number"; previously
% corollary*, assumption*, condition* and exercise* were declared with the
% unstarred \newtheorem and were therefore numbered on their own counters.
\newtheorem*{theorem*}{Theorem}
\newtheorem*{lemma*}{Lemma}
\newtheorem*{proposition*}{Proposition}
\newtheorem*{corollary*}{Corollary}
\newtheorem*{assumption*}{Assumption}
\newtheorem*{condition*}{Condition}
\newtheorem*{exercise*}{Exercise}
\newtheorem*{example*}{Example}

\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\usepackage{hyperref}       % hyperlinks
% Links are rendered as colored text (colorlinks); citation links are blue and
% urlcolor is set to an empty value.
% NOTE(review): linkbordercolor presumably has no visible effect while
% colorlinks = true -- confirm and drop it if so.
\hypersetup{
    colorlinks = true,
    linkbordercolor = {blue},
    citecolor = {blue},
    urlcolor = {}
}
% cleveref is loaded after hyperref.
% NOTE(review): the \newtheorem declarations earlier in this preamble precede
% this line; cleveref's documentation asks for theorem environments to be
% declared after cleveref is loaded so that \cref can name them -- verify if
% \cref is ever used on theorem-like environments.
\usepackage{cleveref} 
\usepackage{booktabs}   
\usepackage{makecell}
\usepackage{mathtools}
\usepackage{multirow,wrapfig}
% Add the table of content to appendix
%\usepackage[toc,page,header]{appendix}
%\usepackage{minitoc} 
%\renewcommand \thepart{} 
%\renewcommand \partname{}

%##########################################################



\title{Data Sampling Affects the Complexity of Online SGD over Dependent Data}

%Mini-batch Sampling Improves the Complexity of Online SGD over Highly Dependent Data

%
% Add authors
\author[1]{\href{mailto:<s.ma@utah.edu>?Subject=Your UAI 2022 paper}{Shaocong~Ma}{}}
\author[1]{\href{mailto:<zc286@cornell.edu>?Subject=Your UAI 2022 paper}{Ziyi~Chen}{}}
\author[1]{\href{mailto:<yi.zhou@utah.edu>?Subject=Your UAI 2022 paper}{Yi~Zhou}{}}
\author[2]{\href{mailto:<ji.367@buckeyemail.osu.edu>?Subject=Your UAI 2022 paper}{Kaiyi~Ji}{}}
\author[3]{\href{mailto:<yingbinliang@gmail.com>?Subject=Your UAI 2022 paper}{Yingbin~Liang}{}} 
% Add affiliations after the authors
\affil[1]{%
   Department of Electrical and Computer Engineering\\
    University of Utah 
} 
\affil[2]{%
    Electrical Engineering and Computer Science Department\\
   University of Michigan, Ann Arbor 
  }
\affil[3]{%
    Department of Electrical and Computer Engineering\\
   The Ohio State University 
  }




\begin{document}
  
  
%################################################
%used to make content
%\doparttoc % Tell to minitoc to generate a toc for the parts
%\faketableofcontents % Run a fake tableofcontents command for the partocs 
%\part{Paper} 
%\parttoc
%################################################

  
\maketitle

\begin{abstract}
Conventional machine learning applications typically assume that data samples are independently and identically distributed (i.i.d.). However, practical scenarios often involve a data-generating process that produces highly dependent data samples, which are known to heavily bias the stochastic optimization process and slow down the convergence of learning. In this paper, we conduct a fundamental study on how different stochastic data sampling schemes affect the sample complexity of online stochastic gradient descent (SGD) over highly dependent data. 
Specifically, with a $\phi$-mixing process of data, we show that online SGD with proper periodic data-subsampling achieves an improved sample complexity over the standard online SGD in the full spectrum of the data dependence level. Interestingly, even subsampling a subset of data samples can accelerate the convergence of online SGD over highly dependent data. 
Moreover, we show that online SGD with mini-batch sampling can further substantially improve the sample complexity over online SGD with periodic data-subsampling over highly dependent data. Numerical experiments validate our theoretical results.  
\end{abstract}
\section{Introduction}
Stochastic optimization algorithms have attracted great attention in the past decade due to their successful applications to a broad range of research areas, including deep learning \citep{Goodfellow-et-al-2016}, reinforcement learning \citep{sutton2018}, online learning \citep{Bottou2010,hazan2019introduction}, control \citep{Marti2017}, etc. In the conventional analysis of stochastic optimization algorithms, it is usually assumed that all queried data samples are independently and identically distributed (i.i.d.). For example, data samples in the traditional empirical risk minimization framework are assumed to be queried independently from the underlying data distribution, while data samples in reinforcement learning are assumed to be queried from the stationary distribution of the underlying Markov chain. 
     
Although the i.i.d.\ data assumption leads to a comprehensive understanding of the statistical limit and computation complexity of SGD, it violates the nature of many practical data-generating stochastic processes, which generate highly correlated samples that depend on the history. In fact, dependent data can be found almost everywhere, e.g., daily stock price \citep{onalan2009financial, fort2005subgeometric}, weather/climate data, state transitions in Markov chains, etc. To understand the impact of data dependence on the convergence and complexity of stochastic algorithms, there is a growing number of recent works that introduce various definitions to quantify data dependence. 
Specifically, to analyze the finite-time convergence of various stochastic reinforcement learning algorithms, recent studies assume that the dependent samples queried from the Markov decision process satisfy a geometric mixing property \citep{dalal2018finite,zou2019finite,xu2020finite,qu2020finite}, which requires the underlying Markov chain to be uniformly ergodic or to have a finite mixing time \citep{even2003learning}. On the other hand, to analyze the convergence of stochastic optimization algorithms over dependent data, \cite{karimi2019non} assumed the existence of a solution to the Poisson equation associated with the underlying Markov chain, which is a weaker condition than the uniform ergodicity condition \citep{glynn1996poisson}. Moreover, \cite{agarwal2012generalization} introduced a $\phi$-mixing process that quantifies how fast the distribution of future data samples (conditioned on a fixed filtration) converges to the underlying stationary data distribution. In particular, the $\phi$-mixing process is more general than the previous two notions of data dependence \citep{douc2018markov}. 

While the aforementioned works leveraged the above notions of data dependence to characterize the sample complexity of various stochastic algorithms over dependent data, there is still a lack of theoretical understanding of how the algorithm structure affects the sample complexity of stochastic algorithms under different levels of data dependence. In particular, a key algorithm structure is the stochastic data sampling scheme, which critically affects the bias and variance of the stochastic learning process.
In fact, under i.i.d.\ data and convex geometry, it is well known that SGD achieves the sample complexity lower bound under various stochastic data sampling schemes \citep{Bottou2010}, e.g., single-sample sampling and mini-batch sampling. However, these schemes may lead to substantially different convergence behaviors over highly dependent data, as they are no longer unbiased.
Therefore, it is of vital importance to understand the interplay among data dependence, stochastic data sampling and sample complexity of stochastic learning algorithms, and we want to ask the following fundamental question. 
%that can effectively reduce the data dependence in the stochastic optimization process, and this critically depends on the 
\begin{itemize}
    \item {\bf Q:} {\em  How does stochastic data sampling affect the convergence rate and sample complexity of stochastic learning algorithms over dependent data?}
\end{itemize}

%Particularly, an important class of stochastic algorithms with dependent data is reinforcement learning algorithms \citep{sutton2018}, which utilize dependent Markovian samples queried from the Markov decision process to learn the optimal policy. In the existing literature, numerous stochastic reinforcement learning algorithms have been developed, among which the Q-learning algorithm \citep{watkins1992q,baird1995residual} and Fitted Q-Iteration algorithm \citep{mnih2013playing,mnih2015human} receive much popularity. However, the existing analysis of these stochastic reinforcement learning algorithms rely on a strong assumption of the data dependence, e.g., the Markovian data satisfies a fast geometric $\phi$-mixing property, which essentially implies that the data samples have weak dependence and are close to being independent from each other \citep{dalal2018finite,zou2019finite}. Therefore, there lacks understanding of convergence of $Q$-learning-type algorithms with highly dependent data. Having answered the aforementioned question, we aim to further provide an answer to the following question. 
% \begin{itemize}
%     \item {\em How does the level of data dependence affect the convergence of Fitted Q-Iteration with stochastic optimization algorithms?}
% \end{itemize}

%Here we specially note that the existence of solution to Poisson equation associated its underlying Markov chain is a weaker condition than the uniform ergodicity \citep{glynn1996poisson}; 
%however, it is a stronger condition than the convergence of Markov chain admitting a polynomial rate (called polynomially ergodic). It is because the existence of solution to Poisson equation implies the central limit theorem (CLT) (Theorem 21.2.5 from \cite{douc2018markov}), but there exists a polynomially ergodic chain such that the CLT does not hold (Remark 21.4.6 from \cite{douc2018markov}). Therefore, we consider the assumption made in \cite{agarwal2012generalization} is a more suitable setup on studying the data dependence. 
%Besides it, the $\phi$-mixing coefficient gives a more concrete characterization on the data processes; it leads to many interesting topics. For example, how does the convergence rate of data process influence the optimization algorithm? This question has been answered in \cite{agarwal2012generalization}; it says that when the data process slowly converges to its stationary distribution, it requires a larger sample size to achieve the same error compared to the case with a faster convergence rate. Then it naturally leads to the next question: How can we handle the case when the data process admits a slow convergence rate?     

In this paper, we provide comprehensive answers to this fundamental question. Specifically, we conduct a comprehensive study of the convergence rate and sample complexity of the online SGD algorithm over a wide spectrum of data dependence levels under various stochastic data sampling schemes, including periodic subsampling and mini-batch sampling. Our results show that online SGD with both data sampling schemes achieves a substantially improved sample complexity over the standard online SGD over highly dependent data. We summarize our contributions as follows.

 
\subsection{Our Contributions}
We consider the following stochastic optimization problem.
\begin{align}
	\min_{w \in \mathcal{W}}  f(w) : = \EE_{\xi\sim \mu} ~ \big[F(w; \xi)\big], \tag{P}
\end{align}
where the objective function $f$ is convex and Lipschitz continuous, and the expectation is taken over the stationary distribution $\mu$ of the underlying data-generating process $\mathbf{P}$. To perform online learning, we query a stream of dependent data samples from the underlying data-generating process.
Specifically, we adopt the $\phi$-mixing process to quantify the data dependence via a decaying mixing coefficient function $\phi_\xi(k)$ (see Definition \ref{def: mix}) \citep{agarwal2012generalization}. We study the convergence of the online stochastic gradient descent (SGD) algorithm over a $\phi$-mixing data stream under various stochastic data sampling schemes, including periodic subsampling and mini-batch sampling. We summarize and compare the sample complexities of online SGD with these data sampling schemes under different $\phi$-mixing data dependence models in Table \ref{table: 1}.

We first study the convergence of online SGD over $\phi$-mixing dependent data samples under the data subsampling scheme. In particular, the data subsampling scheme utilizes only one data sample per $r$ consecutive data samples by periodically skipping $r-1$ samples. With this data subsampling scheme, the subsampled data samples are less dependent for a larger subsampling period $r$. Consequently, we show that online SGD with proper data subsampling achieves an improved sample complexity over the standard online SGD in the full spectrum of the convergence rate of the $\phi$-mixing coefficient (see Table~\ref{table: 1}). In particular, the improvement is substantial when the data is highly dependent with an algebraically decaying $\phi$-mixing coefficient. 

Moreover, we study the sample complexity of online SGD over $\phi$-mixing dependent data samples under the mini-batch sampling scheme. Compared to the data subsampling scheme, mini-batch sampling substantially reduces the mini-batch data dependence without skipping data samples. Consequently, the mini-batch update leverages the sample average over a mini-batch of data samples to reduce both the bias (caused by the data dependence) and the variance (caused by stochastic sampling). Specifically, we show that online SGD with mini-batch sampling achieves an orderwise lower sample complexity than both the standard online SGD and the online SGD with data subsampling in the full spectrum of the convergence rate of the $\phi$-mixing coefficient. Our study reveals that the widely used mini-batch sampling scheme can effectively reduce the bias caused by data dependence without sacrificing data efficiency. 



\begin{table*}[ht]
%\vspace{-2mm}
	\caption{Comparison of sample complexities of SGD, SGD with subsampling and mini-batch sampling under different data dependence models for achieving $f(w) \!-\! f(w^*) \!\le\! \epsilon$. Note that $\theta$ parameterizes the convergence rate of the $\phi$-mixing coefficient.}\label{table: 1}
	\centering
	\begin{footnotesize}
	%\vspace{0mm}
	\begin{tabular}{ccccc}
		\toprule
		{Data dependence model} & $\phi_\xi(k)$ & {SGD} & {SGD w/ subsampling} & {Mini-batch SGD}\\ \midrule
		Geometric $\phi$-mixing  & $\exp(-k^{\theta}),$ &\multirow{2}{*}{$\mathcal{O}(\epsilon^{-2}(\log \epsilon^{-1})^{\frac{2}{\theta}})$}  & \multirow{2}{*}{$\mathcal{O}(\epsilon^{-2}(\log \epsilon^{-1})^{\frac{1}{\theta}})$} &\multirow{2}{*}{{\color{blue}$\mathcal{O}(\epsilon^{-2})$}} \\
		(Weakly dependent)   & $\theta>0$ &  \\
		\midrule
		Fast algebraic $\phi$-mixing  & $k^{-\theta},$ & \multirow{2}{*}{$\mathcal{O}(\epsilon^{-2-\frac{2}{\theta}})$} & \multirow{2}{*}{$\mathcal{O}(\epsilon^{-2-\frac{1}{\theta}})$} & \multirow{2}{*}{{\color{blue}$\widetilde{\mathcal{O}}(\epsilon^{-2})$}} \\
		(Medium dependent)  & $\theta\ge 1$ &   \\
		\midrule
		Slow algebraic $\phi$-mixing  & $k^{-\theta},$ & \multirow{2}{*}{$\mathcal{O}(\epsilon^{-2-\frac{2}{\theta}})$} & \multirow{2}{*}{$\mathcal{O}(\epsilon^{-2-\frac{1}{\theta}})$} & \multirow{2}{*}{{\color{blue}$\mathcal{O}(\epsilon^{-1-\frac{1}{\theta}})$}} \\
		(Highly dependent)  & $0<\theta<1$ &   \\
		\bottomrule
	\end{tabular}
	\end{footnotesize}
\end{table*}

%We build the generalization error bound for online convex optimization problem under dependent data with considering the mini-batch averaging. Though it is a direct extension of \cite{agarwal2012generalization}, we provide a more accurate upper bound in the mini-batch setting. The improved result gives us two new understandings on the stationary $\phi$-mixing data processes: First, when the convergence rate of data processes is fast enough (geometric $\phi$-mixing or fast algebraic $\phi$-mixing), the data independence has no too much influences on the performance of online convex optimization algorithm with or without taking mini-batch averaging. Second, applying a large batch size can reduce the batch-level data correlation, especially when the convergence rate of data process is slow (slow algebraic $\phi$-mixing). Based on these understandings, we build the non-asymptotic analysis of mini-batch online SGD algorithm with a large batch size and obtain an improved sample complexity compared to the original results from \cite{agarwal2012generalization} that takes the batch size as $1$. 

%We further apply our results of SGD with dependent data to study the convergence of the fitted Q-iteration algorithm that uses a linear model to approximate the value function and applied mini-batch SGD to solve the regression problems. We characterize the sample complexity of this algorithm for obtaining an output policy $\pi_K$ that achieves $\epsilon$ convergence error (i.e., $V^* - V^{\pi_K} \le \epsilon$) under different levels of data dependence. In particular, we note that our analysis of the fitted Q-iteration relies on weaker assumptions than those adopted in the existing convergence analysis of Q-learning with linear function approximation \citep{melo2008analysis,chen2019finite}, which additionally assume that the underlying Markov chain mixes geometrically fast and that the feature vectors satisfy certain strong non-singularity conditions.  

%linear value function approximation. For the standard convergence analysis of Q-learning with linear value function approximation \citep{melo2008analysis,chen2019finite} or neural tangent kernel approximation \citep{xu2020finite}, it is usually made some restrictive assumptions on the features and discount factor. From the perspective of online convex optimization, we can build the non-asymptotic upper bound without making those assumptions.


%will study this question by extending the results of \cite{agarwal2012generalization} to the mini-batch setting. Surprisingly, compared to the upper bound provided in \cite{agarwal2012generalization}, we find that simply applying a large batch size leads to a better sample complexity when the convergence rate of Markov chain to its stationary distribution is slow enough. We note that \cite{agarwal2012generalization} can include the mini-batch setting as a special case; however, its analysis does not take the advantage of mini-batch averaging, which makes some key results cannot achieve the desired upper bound. Also, there exists related work considering the streaming-batches data \citep{godichon2021non}; it only considers the unbiased update while the stochastic update is usually biased in the general $\phi$-mixing data stream. Lastly, we apply the analysis to the fitted Q-iteration algorithm \citep{mnih2013playing,mnih2015human}; it directly gives the non-asymptotic upper bound of the fitted Q-iteration with the linear function approximation without assuming linear independent features.   



\subsection{Related Work}
\paragraph{Stochastic Algorithms over Dependent Data} {\cite{steinwart2009fast} and \cite{modha1996minimum} established the convergence analysis of online stochastic algorithms for streaming data with geometric ergodicity.  \cite{duchi2011ergodic} proved that the stochastic subgradient method has a strong convergence guarantee if the mixing time is uniformly bounded.}  \cite{agarwal2012generalization} studied the convex/strongly convex stochastic optimization problem and proved high-probability convergence bounds for general stochastic algorithms under general stationary mixing processes. \cite{godichon2021non} provided the non-asymptotic analysis of stochastic algorithms with strongly convex objective functions over streaming mini-batch data. In a more general setting, the stochastic approximation (SA) problem was studied by \cite{karimi2019non} by assuming the existence of a solution to a Poisson equation. Recently, \cite{debavelaere2021} developed the asymptotic convergence analysis of the SA problem for sub-geometric Markov dynamic noises. 

\paragraph{Finite-Time Convergence of Reinforcement Learning} Recently, a series of works has studied the finite-time convergence of many stochastic reinforcement learning algorithms over Markovian dependent samples, including TD learning \citep{dalal2018finite,xu2019two,kaledin2020finite}, Q-learning \citep{qu2020finite,li2021q,melo2008analysis, chen2019finite,xu2020finite}, fitted Q-iteration \citep{mnih2013playing,mnih2015human,agarwal2021rlbook}, actor-critic algorithms \citep{wang2019neural,yang2019provably,kumar2019sample,qiu2019finite,wu2020finite,xu2020improving}, etc. In these studies, the dependent Markovian samples are assumed to be generated from a geometric $\phi$-mixing process, which is satisfied when the underlying Markov chain is uniformly ergodic or time-homogeneous with finite states. 


\paragraph{Regret of Stochastic Convex Optimization} There are many known regret bounds for online convex optimization problems. \cite{hazan2019introduction} established the standard $\calO(\sqrt{T})$ regret bound for the online SGD algorithm under a bounded gradient assumption. \cite{xiao2009dual} established a regret bound for the online dual averaging method. To the best of our knowledge, there is no high-probability regret bound for mini-batch SGD that accounts for data dependence. 



\section{Formulation and Assumptions}
In this section, we introduce the problem formulation and some basic assumptions. 
Consider a model with parameters $w$. For any data sample $\xi$, denote $F(w;\xi) \in \mathbb{R}$ as the sample loss of this data sample under the model $w$. In this paper, we consider the following standard stochastic optimization problem that has broad applications in machine learning.
\begin{align}
	\min_{w \in \mathcal{W}}  f(w) : = \EE_{\xi\sim \mu} ~ \big[F(w; \xi)\big]. \tag{P}
\end{align}
Here, the expectation is taken over the randomness of the data sample $\xi$, which is drawn from an underlying distribution $\mu$. We make the following standard assumptions regarding the problem (P) \citep{agarwal2012generalization}.

\begin{assumption}\label{ass:lipschitz}
	The optimization problem (P) satisfies
\begin{itemize}
	\item[1.] For every $\xi$, function $F(\cdot;\xi)$ is $G$-Lipschitz continuous over the domain $\mathcal{W}$.
% 	, i.e., for all $w,v\in \mathcal{W}$,
% 	$$|F(w;\xi) - F(v;\xi)|\leq G \|w - v\|.$$
	\item[2.] Function $f(\cdot)$ is convex and bounded below, i.e., $f(w^*) := \inf_{w\in \mathcal{W}} f(w) > -\infty$.
	\item[3.] $\mathcal{W}$ is convex and compact with bounded diameter $R$. %, i.e., $\sup_{w,v\in \mathcal{W}} \|w - v\| \leq R.$
\end{itemize}
\end{assumption} 

To solve this stochastic optimization problem, one often needs to query a stream of data samples from the distribution $\mu$ to perform optimization. Unlike traditional stochastic optimization that usually assumes that the data samples are i.i.d., we consider a more general and practical dependent data-generating process as we elaborate below. 

{\bf Dependent data-generating process:} 
We consider a stochastic process $\mathbf{P}$ that generates a stream of data samples $\{\xi_1, \xi_2, \dots\}$, which are not necessarily independent. In particular, the stochastic process $\mathbf{P}$ has an underlying stationary distribution $\mu$. 
To quantify the dependence of the data generation process, we introduce the following standard $\phi$-mixing process \citep{agarwal2012generalization}, where we denote $\{\calF_t \}_{t}$ as the   filtration generated by $\{\xi_t\}_{t}$.

\begin{definition}[$\phi$-mixing process]\label{def: mix}
	Consider a stochastic process $\{\xi_t\}_{t}$ with a stationary distribution $\mu$. 
	Let $\PP( \xi_{t+k} \in \cdot | \calF_t )$ be the distribution of the $(t+k)$-th sample conditioned on $\calF_t$, and denote $d_{\text{TV}}$ as the total variation distance. Then, the process $\{\xi_t\}_t$ is called $\phi$-mixing if the following mixing coefficient $\phi_\xi(\cdot)$ converges to $0$ as $k$ tends to infinity. 
	\begin{align*}
		\phi_\xi(k):= \sup_{t\in \NN, A\in \calF_t} 2 d_{\text{TV}} \big( \PP( \xi_{t+k} \in \cdot | A), \mu \big).
	\end{align*}
\end{definition}
Intuitively, the $\phi$-mixing coefficient describes how fast the distribution of sample $\xi_{t+k}$ converges to the stationary distribution $\mu$ when conditioned on the filtration $\calF_t$, as the time gap $k\to \infty$. The $\phi$-mixing process can be found in many applications, which involve mixing coefficients that converge to zero at different convergence rates. Below we mention some representative examples. 
\begin{itemize}
    \item {\bf Geometric $\phi$-mixing process.} Such a type of process has a geometrically diminishing mixing coefficient, i.e., $\phi_\xi(k) \le \phi_0 \exp(-ck^{\theta})$ for some $\phi_0, c, \theta>0$. Examples include finite-state ergodic Markov chains and some aperiodic Harris-recurrent Markov processes \citep{modha1996minimum,agarwal2012generalization,meyn2012markov};
    \item {\bf Algebraic $\phi$-mixing process.} Such a type of process has a polynomially diminishing mixing coefficient, i.e., $\phi_\xi(k) \le \phi_0 k^{-\theta}$ for some $\phi_0, \theta>0$. Examples include a large class of Metropolis-Hastings samplers \citep{jarner2002polynomial} and some queuing systems \citep{agarwal2012generalization}.
\end{itemize}



 

\section{Complexity of Online SGD over Dependent Data}

%{\color{red} Note that excessive risk is not generalization error bound. We need a final result relating to Proposition 1 of Agarwal. It is a simple generalization to mini-batch setting.}

In this section, we recap the convergence results of online SGD over dependent data established in \citep{agarwal2012generalization}. Throughout, we define the sample complexity as the total number of samples required for the algorithm to output a model $w$ that achieves an $\epsilon$ convergence error with a certain probability, i.e., $f(w) - f(w^*) \le \epsilon$ with probability $1-\delta$.
Also, the standard regret of an online learning algorithm is defined as
\begin{align*}
    \text{(Regret):}\quad \mathfrak{R}_n := \sum_{t=1}^n \big[ F(w(t);\xi_t) - F(w^*; \xi_t) \big],
\end{align*}
where the models $\{w(1),w(2),\dots,w(n)\}$ are generated using the data samples $\{\xi_1,\xi_2,\dots,\xi_n\}$, respectively, and $w^*$ is the minimizer of $f(w)$.
For this sequence of models $\{w(1),w(2),\dots,w(n)\}$, we make the following mild assumption, which is satisfied by many SGD-type algorithms. 
\begin{assumption}\label{ass:kappa}
    There is a non-increasing sequence $\{\kappa(t)\}_t$ such that
    $\|w(t+1) - w(t)\| \leq \kappa(t).$ 
\end{assumption}

%\subsection{Stochastic gradient descent}\label{subsec: SGD}
Online SGD is a popular and standard algorithm for solving the problem (P). In every iteration $t$, online SGD queries a sample $\xi_t$ from the data-generating process and performs the following SGD update.
\begin{align}
\text{(SGD):}\quad	w(t+1) = w(t) - \eta_t \nabla F(w(t);\xi_t), \label{eq:sgd}
\end{align}
where $\eta_t$ is the learning rate. 
In Theorem 2 of \citep{agarwal2012generalization}, the authors established a high probability convergence error bound for a generic class of stochastic algorithms. Specifically, under the Assumptions \ref{ass:lipschitz} and \ref{ass:kappa},
they showed that for any $\tau \in \mathbb{N}$ with probability at least $1-\delta$, the averaged predictor $\widehat{w}_n := \frac{1}{n}\sum_{t=1}^n w(t)$ satisfies
\begin{align}
    &f(\widehat{w}_n) - f(w^*) \nonumber \\
    &\le \frac{\mathfrak{R}_n}{n}  + \frac{(\tau \!-\!1)G}{n} \sum_{t=1}^n \kappa(t) \label{eq: SGD_GE}\\
    &\quad+  \frac{2(\tau \!-\! 1)GR}{n} + 2GR \sqrt{\frac{2\tau}{n} \log \frac{\tau}{\delta}} + \phi_\xi(\tau)GR. \nonumber
\end{align}
Here, {$\mathfrak{R}_n$ is the regret of the algorithm of interest, $G$ is the Lipschitz constant of the loss function $F(\cdot;\xi)$, $R$ is the diameter of the parameter domain,} and $\tau\in \mathbb{N}$ is an auxiliary parameter that is introduced to decouple the dependence of the data samples. From the above bound, one can see that the optimal choice of $\tau$ depends on the convergence rate of the mixing coefficient $\phi_{\xi}(\tau)$.
Specifically, consider the online SGD algorithm in \eqref{eq:sgd}. It can be shown that it achieves the regret $\mathfrak{R}_n = \mathcal{O}(\sqrt{n})$ and satisfies $\kappa(t) = \mathcal{O}(1/\sqrt{t})$ under a proper diminishing learning rate.  Consequently, the above high-probability convergence bound for online SGD reduces to
\begin{align}
    &f(\widehat{w}_n) - f(w^*) \nonumber\\
    &\le \mathcal{O}\Big(\frac{1}{\sqrt{n}} + \inf_{\tau \in \mathbb{N}} \Big\{\frac{\tau-1}{\sqrt{n}} + \sqrt{\frac{\tau}{n} \log \frac{\tau}{\delta}} + \phi_\xi(\tau) \Big\} \Big). \nonumber
\end{align}
Such a bound further implies the following sample complexity results of online SGD under different $\phi$-mixing models. 

\begin{corollary}\label{coro: sgd}
The sample complexity of online SGD for achieving an $\epsilon$ convergence error over $\phi$-mixing data is as follows.
\begin{itemize}
    \item If the data is geometric $\phi$-mixing with parameter $\theta>0$, then we set $\tau=\calO\big( (\log \frac{1}{\epsilon})^{\frac{1}{\theta}}  \big)$. The resulting sample complexity is in the order of $n = \mathcal{O}\big(\epsilon^{-2} (\log \frac{1}{\epsilon})^{\frac{2}{\theta}} \big)$.
    \item If the data is algebraic $\phi$-mixing with parameter $\theta>0$, then we set $\tau= \calO( \epsilon^{-\frac{1}{\theta}} )$. The resulting sample complexity is in the order of $n = \mathcal{O}(\epsilon^{-2-\frac{2}{\theta}})$.
    % \item If the data is slow algebraic $\phi$-mixing, then we choose $\tau=, n= $. The resulting sample complexity is in the order of $\mathcal{O}()$.
\end{itemize}
\end{corollary}

It can be seen that if the data-generating process has a fast geometrically diminishing mixing coefficient, i.e., the data samples are close to being independent from each other, then the resulting sample complexity is almost the same as that of SGD with i.i.d.\ samples. On the other hand, if the data-generating process mixes slowly with an algebraically diminishing mixing coefficient, i.e., the data samples are highly dependent, then the data dependence increases the sample complexity by a non-negligible factor of $\epsilon^{-\frac{2}{\theta}}$. In particular, such a factor is substantially large if the mixing rate parameter $\theta$ is close to zero.

%\blue{Comment: 
% \begin{itemize}
%     \item $n$ is the sample complexity so I delete all $n$ above (choose $\tau= ,n= \dots$). 
%     \item And since $\tau\geq  c\log \frac{1}{\epsilon}$, $ \frac{\tau-1}{\sqrt{n}}$ will dominate  $\sqrt{\frac{\tau}{n} \log \frac{\tau}{n}}$. Also, the ``$\inf$" term is always larger than the first term. Then  Eq.(3) can be further simplified as
% $$f(\widehat{w}_n) - f(w^*) \le  \inf_{\tau \in \mathbb{N}} \Big\{\frac{\tau-1}{\sqrt{n}}  + \phi(\tau) \Big\} .$$
% \item For SGD algorithm, both fast and slow cases are same. So I commented the third case.
% \end{itemize}
%}


\section{Complexity of Online SGD with Data Subsampling}\label{subsec: SGDsub}

When applying online SGD to solve stochastic optimization problems over dependent data, the key challenge is that the data dependence introduces non-negligible bias that slows down the convergence of the algorithm. Hence, a straightforward solution is to reduce data dependence before performing stochastic optimization, and data subsampling is such a simple and effective approach \citep{Nagaraj2020,Kotsalis2020}. 
%Next, we show that such an approach leads to an improved convergence bound and sample complexity of SGD over highly dependent data. 

Specifically, consider a stream of $\phi$-mixing data samples $\{\xi_1, \xi_{2}, \xi_{3}, \dots \}$. Instead of utilizing the entire stream of data, we subsample a subset of this data stream with period $r\in\mathbb{N}$ and obtain the following subsampled data stream 
\begin{align*}
	\{\xi_1, \xi_{r+1}, \xi_{2r+1}, \dots \}.
\end{align*}
In particular, let $\{\calF_t\}_t$ be the canonical filtration generated by $\{\xi_{tr+1}\}_t$. Since the consecutive subsampled samples are $r$ time steps away from each other, it is easy to verify that the subsampled data stream $\{\xi_{tr+1}\}_t$ is also a  $\phi$-mixing process with mixing coefficient given by $\phi_{\xi}^r(t) = \phi_\xi(r t)$, 
where $\phi_\xi^r$ denotes the mixing coefficient of the subsampled data stream $\{\xi_{tr+1}\}_t$. Therefore, by periodically subsampling the data stream, the resulting subsampled process has a faster-converging mixing coefficient. Then, we can apply online SGD to such subsampled data, i.e., 
\begin{align}
\text{(SGD with subsampling):}& \nonumber\\	w(t+1) = w(t) &- \eta_t \nabla F(w(t);\xi_{tr+1}). \label{eq:sgdsubsample}
\end{align}
In particular, the convergence error bound in \cref{eq: SGD_GE} still holds by replacing $\phi_\xi(\tau)$ with $\phi_\xi(r\tau)$, and we obtain the following bound for online SGD with subsampling.
\begin{align}\label{eq: subsampling}
    &f(\widehat{w}_n) - f(w^*)  \\
    &\le \mathcal{O}\Big(\frac{1}{\sqrt{n}} + \inf_{\tau \in \mathbb{N}} \Big\{\frac{(\tau-1)}{\sqrt{n}} + \sqrt{\frac{\tau}{n} \log \frac{\tau}{\delta}} + \phi_{\xi}(r\tau) \Big\} \Big) \nonumber. 
\end{align}
Such a bound implies the following sample complexity results of online SGD with subsampling under different convergence rates of the mixing coefficient $\phi_\xi$. 

\begin{corollary}\label{coro:sub}
The sample complexity of online SGD with subsampling for achieving an $\epsilon$ convergence error over a $\phi$-mixing data process is as follows.
\begin{itemize}
    \item If the data is geometric $\phi$-mixing with parameter $\theta>0$, then we choose $r = \calO\big( (\log \frac{1}{\epsilon})^{\frac{1}{\theta}} \big)$ and $\tau = \calO(1)$. The resulting sample complexity is  $r n =  \mathcal{O}\big(\epsilon^{-2} (\log \frac{1}{\epsilon})^{\frac{1}{\theta}} \big)$.
    \item If the data is  algebraic $\phi$-mixing with parameter $\theta>0$, then we choose $r = \calO\big( \epsilon^{-\frac{1}{\theta}}  \big)$ and $\tau = \calO(1)$. The resulting sample complexity is $r n =  \mathcal{O}\big(\epsilon^{-2-\frac{1}{\theta}}   \big)$.
\end{itemize}
\end{corollary}
Comparing the above sample complexity results with those of the standard online SGD in Corollary \ref{coro: sgd}, we conclude that data-subsampling can improve the sample complexity by a factor of $(\log \frac{1}{\epsilon})^{\frac{1}{\theta}}$ and $\epsilon^{-\frac{1}{\theta}}$ for geometric $\phi$-mixing and algebraic $\phi$-mixing data processes, respectively. Intuitively, this is because with data subsampling, we can choose a sufficiently large subsampling period $r$ to decouple the data dependence in the term $\phi_{\xi}(r\tau)$, as opposed to choosing a large $\tau$ in Corollary \ref{coro: sgd}. In this way, the order of the dominant term $\sqrt{\frac{\tau}{n} \log \frac{\tau}{\delta}}$ is reduced.
Therefore, when the data is highly dependent, it is beneficial to subsample the dependent data before performing SGD. We also note another advantage of using data-subsampling, i.e., it only requires computing the stochastic gradients of the subsampled data, and therefore can substantially reduce the computation complexity. 


\section{Complexity of Online SGD with Mini-batch Sampling}

Although the data-subsampling scheme studied in the previous section helps improve the sample complexity of online SGD, it does not leverage the full information of all the queried data. In particular, when the data is highly dependent, we need to choose a large period $r$ to reduce data dependence, and this will throw away a huge amount of valuable samples.  
In this section, we study online SGD with another popular data sampling scheme that leverages the full information of all the sampled data, i.e., the mini-batch sampling scheme. We show that this simple and widely used scheme can effectively reduce data dependence without skipping data samples, and can achieve an improved sample complexity over online SGD with subsampling. 

Specifically, consider a data stream $\{\xi_t\}_t$ with $\phi$-mixing dependent samples. We rearrange the data samples into a stream of mini-batches $\{x_t\}_t$, where each mini-batch $x_t$ contains $B$ samples, i.e., $x_t= \{\xi_{(t-1)B+1}, \xi_{(t-1)B+2}, \dots, \xi_{tB}\}$.
Then, we perform mini-batch SGD update as follows.
\begin{align}
\text{(SGD with mini-batch}~ &\text{sampling):}\nonumber\\	w(t+1) = w(t) & - \frac{\eta_t}{B} \sum_{\xi \in x_t} \nabla F(w(t);\xi). \label{eq:alg}
\end{align}
Performing online learning with mini-batch sampling has several advantages. 
First, it substantially reduces the optimization variance and allows one to use a large learning rate to facilitate the convergence of the algorithm. As a comparison, SGD with subsampling suffers from a large optimization variance. Second, unlike subsampling, mini-batch sampling utilizes the information of all the queried data samples to improve the performance of the model. Moreover, as we show in the following lemma, mini-batch sampling substantially reduces the stochastic bias caused by the data dependence. 
In the sequel, we denote $F(w;x):= \frac{1}{B}\sum_{\xi \in x} F(w;\xi)$ as the average loss on a mini-batch of samples. With a slight abuse of notation, we also define $\{\calF_t\}_t$ as the canonical filtration generated by the mini-batch samples $\{x_t\}_t$.

\begin{lemma} \label{lemma:0}
	Let Assumption \ref{ass:lipschitz} hold and consider the mini-batch data stream $\{x_t\}_t$. Then, for any $w,v\in \mathcal{W}$ measurable with respect to $\mathcal{F}_t$ and any $\tau \in \NN$, it holds that
	\begin{align}
		&\EE \big[ F(w;x_{t+\tau})  - F(v;x_{t+\tau}) | \calF_t \big]  - \big(f(w) - f(v)\big)\nonumber \\
		&\leq \frac{ GR}{B} \sum_{i=1}^{B}  {\phi_{\xi}(\tau B + i)}. \label{eq: lemma1}
	\end{align}  
\end{lemma}
With dependent data, the above lemma shows that we can approximate the population risk $f(w)$ by the conditional expectation $\mathbb{E}[F(w;x_{t+\tau})|\calF_t]$, which involves the mini-batch $x_{t+\tau}$ that is $\tau$ steps ahead of the filtration $\calF_t$. Intuitively, by the definition of $\phi$-mixing process, as $\tau$ gets larger, the distribution of $x_{t+\tau}$ {conditional on $\calF_t$} gets closer to the stationary distribution $\mu$. In general, the estimation bias $\frac{ GR}{B} \sum_{i=1}^{B}  {\phi_{\xi}(\tau B + i)}$ depends on both the batch size and the accumulated mixing coefficient over the corresponding batch of samples. To provide a concrete understanding, below we calculate the estimation bias in \cref{eq: lemma1} for various $\phi$-mixing processes. 
\begin{itemize}
	\item {\bf Geometric $\phi$-mixing:} In this case, $\sum_{i=1}^B \phi_\xi(\tau B+i) \le \sum_{i=1}^\infty \phi_\xi(i) = \calO(1)$. Hence, the estimation bias is in the order of $\mathcal{O}(\frac{GR}{B})$.
	
	\item {\bf Fast algebraic $\phi$-mixing ($\theta\ge 1$):} In this case, $\sum_{i=1}^B \phi_\xi(\tau B+i) \le \sum_{i=1}^\infty \phi_\xi(i) = \widetilde{\calO}(1)$. Hence, the estimation bias is in the order of $\widetilde{\mathcal{O}}(\frac{GR}{B})$, where $\widetilde{\mathcal{O}}$ hides all logarithm factors.
	
	
	\item {\bf Slow algebraic $\phi$-mixing} ($0<\theta < 1$): In this case, $\sum_{i=1}^B \phi_\xi(\tau B+i) \le \mathcal{O}((\tau B)^{1-\theta})$. Hence, the estimation bias is in the order of $\mathcal{O}(\frac{GR\tau^{1-\theta}}{ B^{\theta}})$.
\end{itemize}

It can be seen that if the mixing coefficient converges fast, i.e., either geometrically or fast algebraically, then the data dependence has a negligible impact on the estimation error. 
On the other hand, when the mixing coefficient converges only at a slow algebraic rate, the estimation bias increases substantially, but it is still beneficial to use a large batch size.  

We obtain the following convergence error bound for online SGD with mini-batch sampling over dependent data. 

\begin{theorem} \label{thm:convex}
Let Assumptions \ref{ass:lipschitz} and \ref{ass:kappa} hold. Apply SGD with mini-batch sampling to solve the stochastic optimization problem (P) over a $\phi$-mixing dependent data process and assume that it achieves regret $\mathfrak{R}_n$. Then, for any $\tau \in \mathbb{N}$ and any minimizer $w^*$, with probability at least $1-\delta$, the averaged predictor $\widehat{w}_n := \frac{1}{n}\sum_{t=1}^n w(t)$ satisfies
\begin{align}
    &f(\widehat{w}_n) - f(w^\ast) \nonumber\\
    &\le \frac{\mathfrak{R}_n}{n} + \frac{G (\tau - 1)}{n} \sum_{t=1}^{n-\tau+1}\kappa(t) +\frac{2GR(\tau - 1)}{n} \nonumber\\
    & \quad+ \calO\bigg(\frac{1}{nB}\sum_{i=1}^B \phi(\tau B + i)  \nonumber \\ 
    & \quad+  \sqrt{\frac{\tau }{nB} \log \frac{\tau}{\delta} } \log\frac{n}{\delta}  \Big(B^{-\frac{1}{4}} + \Big[\sum_{i=1}^{B}  \phi(i) \Big]^{\frac{1}{4}} \Big) \bigg). \label{eq:main1}
\end{align}
 
\end{theorem}  
 

To further understand the order of the above bound, a standard regret analysis shows that mini-batch SGD achieves the regret {$\frac{\mathfrak{R}_n}{n} = \widetilde{\calO}(\sqrt{\frac{\sum_{j=1}^{n} \phi(j)}{n B}})$} and $\kappa(t) \equiv \mathcal{O}(\sqrt{\frac{B}{n}})$ (see Theorem C.3 for the proof). Consequently, the above convergence error bound reduces to the following bound.
%where we hide all logarithm factors for simplicity of presentation.
\begin{align}
    &f(\widehat{w}_n) - f(w^\ast) \nonumber \\
    &\le \widetilde{\calO}\bigg( \sqrt{\frac{\sum_{j=1}^{n} \phi(j)}{nB}} +\frac{GR(\tau - 1)}{n} \nonumber \\
    & + \frac{1}{nB}\sum_{i=1}^B \phi(\tau B + i) +  \sqrt{\frac{\tau }{nB}} \Big( B^{-\frac{1}{4}} + \Big[\sum_{i=1}^{B}  \phi(i) \Big]^{\frac{1}{4}} \Big) \bigg). \nonumber
\end{align}
Such a bound further implies the following sample complexity results of online SGD with mini-batch sampling under different convergence rates of the mixing coefficient $\phi_\xi$. 

\begin{corollary}\label{coro: minibatch}
The sample complexity of online SGD with mini-batch sampling for achieving an $\epsilon$ convergence error over $\phi$-mixing dependent data is as follows.
\begin{itemize}
    \item If the data is geometric $\phi$-mixing with parameter $\theta>0$, then we choose $\tau = 1, B = \mathcal{O}(\epsilon^{-1}), n = \mathcal{O}(\epsilon^{-1})$. The overall sample complexity is $nB = \mathcal{O}(\epsilon^{-2})$.
    
    \item If the data is fast algebraic $\phi$-mixing with parameter $\theta\ge 1$, then we choose $\tau = 1, B = \mathcal{O}(\epsilon^{-1}), n = \mathcal{O}(\epsilon^{-1})$. The overall sample complexity is $nB = \widetilde{\mathcal{O}}(\epsilon^{-2})$.   %{\color{red} here should possibly have a log factor if $\theta = 1$.}
    
    \item If the data is slow algebraic $\phi$-mixing with parameter $0<\theta < 1$, then we choose $\tau = 1, B = \mathcal{O}(\epsilon^{-\frac{1}{\theta}}), n = \mathcal{O}(\epsilon^{-1})$. The overall sample complexity is $nB = \mathcal{O}(\epsilon^{-1-\frac{1}{\theta}})$.
\end{itemize}
\end{corollary} 
{ 
\begin{remark}
This corollary provides a potential way to set the optimal batch size $B$ with respect to the mixing rate $\theta$. Specifically, we can leverage Lemma \ref{lemma:0} to estimate the dependence parameter $\theta$. Choosing batch size $B=1$, the upper bound of Lemma \ref{lemma:0} becomes $GR\phi_\xi(\tau+1)$, which is proportional to the mixing coefficient $\phi_\xi(\tau+1)$. Therefore, the left-hand side $\mathbb{E}\big[ F(w;x_{t+\tau}) - F(v;x_{t+\tau}) | \mathcal{F}_t \big] - \big(f(w) - f(v)\big)$ of Lemma \ref{lemma:0} serves as a proxy for the mixing coefficient, and it can be estimated by a (conditional) sample average queried at any fixed points $w, v$. Once we estimate this quantity with various values of $\tau$, we can use regression to find out the type of convergence for $\phi_\xi(\tau)$ and estimate the parameter $\theta$. 
With the estimated $ \theta$, we then follow this corollary to choose the batch size. 
\end{remark}}
It can be seen that  online SGD with mini-batch sampling improves the sample complexity of online SGD with subsampling by a factor of $\mathcal{O}((\log \frac{1}{\epsilon})^{\frac{1}{\theta}})$, $\widetilde{\mathcal{O}}(\epsilon^{-\frac{1}{\theta}})$ and $\mathcal{O}(\epsilon^{-1})$ for geometric $\phi$-mixing, fast algebraic $\phi$-mixing and slow algebraic $\phi$-mixing data samples, respectively. This shows that mini-batch sampling can effectively reduce the bias caused by data dependence and leverage the full information of all the data samples to improve the learning performance.


To provide an intuitive explanation, this is because with mini-batch sampling, we can choose a sufficiently large batch size $B$ to reduce the bias caused by the data dependence and then choose a small auxiliary parameter $\tau = 1$. As a comparison, to control the bias caused by data dependence, the standard online SGD needs to choose a very large $\tau$ and the online SGD with subsampling needs to choose a large subsampling period $r$ that skips a huge amount of valuable data samples, especially when the mixing coefficient converges slowly. Therefore, our result proves that it is beneficial to use mini-batch data sampling when the data samples are highly dependent.

Our proof of the high-probability bound in Theorem \ref{thm:convex} for SGD with mini-batch sampling involves substantial new developments compared with the proof of \citep{agarwal2012generalization}. Next, we elaborate on our technical novelty. 

\begin{itemize}
\item In \citep{agarwal2012generalization}, they defined the following random variable 
\begin{align*}
    X_t^i := &f\big(w((t-1)\tau + i) \big) - f(w^\ast)  \\ & + F\big(w((t-1)\tau + i); \xi_{t+\tau -1} \big) - F\big(w^\ast; \xi_{t+\tau -1}\big).
\end{align*}
As this random variable involves only one sample $\xi_{t+\tau -1}$, they bound the bias term $X_t^i - \EE[X_t^i|\calF_{t-1}^i]$ by a universal constant. As a comparison, the random variable $X_t^i$ would involve a mini-batch of samples $x_{t+\tau -1}$ in our analysis.  With the mini-batch structure, the bias $X_t^i - \EE[X_t^i|\calF_{t-1}^i]$ can be written as an average of $B$ zero-mean dependent random variables, which is close to zero with high probability due to the concentration phenomenon. Consequently, we are able to apply a Bernstein-type inequality developed in \citep{delyon2009exponential} for dependent stochastic processes to obtain an improved bias bound from $\calO(1)$ to $\widetilde{\calO}(1/{\sqrt{B}})$. This is critical for obtaining the improved bound.

\item Second, with the improved high-probability bias bound mentioned above, the remaining proof of \citep{agarwal2012generalization} no longer holds. 
Specifically, we can no longer apply Azuma's inequality to bound the accumulated bias $\sum_t (X_t^i - \EE[X_t^i|\calF_{t-1}^i])$, as each bias term is no longer bounded with probability one. To address this issue, we develop a generalized Azuma's inequality {for martingale differences} in Lemma B.3 based on Proposition 34 of \citep{tao2015random} {for independent zero-mean random variables}. 
%\blue{As far as I know, it is the first time to apply the Bernstein's inequality of \cite{delyon2009exponential} to the optimization research, while there are other types of Bernstein's inequalities used in this field.}

\item Third, we develop a high-probability regret bound for online SGD with mini-batch sampling over dependent data so that it can be integrated with the high-probability convergence bound in Theorem \ref{thm:convex}. To the best of our knowledge, the regret of SGD over dependent data has not been studied before. 

%As a comparison, the regret bounds mentioned in \citep{agarwal2012generalization} are in-expectation \blue{when generalizing them into the mini-batch setting}, which cannot be directly integrated into the high-probability convergence error bound.

%\blue{Comment: there are two deterministic regret bounds I just found: \cite{hazan2019introduction} and \cite{xiao2009dual}; they can be directly applied to \cite{agarwal2012generalization}'s result. So I think our regret bound is just novel for mini-batch setting with data dependence.}
\end{itemize}
 
    


%from Bernstein's inequality,  $X_t^i - \EE[X_t^i|\calF_{t-1}^i]$ could be bounded of the level $\calO(\frac{1}{\sqrt{B}})$ with a high probability.   

% In Eq.(14) of \cite{agarwal2012generalization}, 
%$\sum_t [X_t^i - \EE[X_t^i|\calF_{t-1}^i]]$   is  bounded using Azuma's inequality with the order $\tilde{\calO}(\sqrt{n \tau})$, where $X_t^i$ is defined as 
        %$$X_t^i := f(w((t-1)\tau + i) ) - f(w^\ast) + F(w((t-1)\tau + i); x_{t+\tau -1} ) - F(w^\ast; x_{t+\tau -1}).$$ 


\section{Experiments}\label{sec: numerical}

In this section, we examine our SGD theory via two experiments on stochastic quadratic programming and neural network training with dependent data. 

\subsection{Stochastic Quadratic Programming}
We consider the following stochastic convex quadratic optimization problem.
\[\min_{w \in \RR^d} f(w) := \EE_{\xi \sim \mu} \big[ (w - \xi)^\top A (w - \xi) \big],\]
where $A\succeq 0$ is a fixed positive semi-definite matrix and  $\mu$ is the uniform distribution on $[0,1]^d$. 
Then, following the construction in \citep{jarner2002polynomial}, we generate an algebraic $\phi$-mixing Markov chain that has the stationary distribution $\mu$. In particular, its mixing coefficient $\phi_\xi(k)$ converges at a sublinear convergence rate $k^{-\frac{1}{r}}$, where $r>0$ is a parameter that controls the speed of convergence. Please refer to Appendix D for more details of the experiment setup. 


We first estimate the following stochastic bias at the fixed origin point $w=\boldsymbol{0}_d$. 
\[\text{(Bias):}\quad \Big|\mathbb{E} \big[ F(w ; x_\tau) | x_0 = \boldsymbol{0}_d\big] - f(w)\Big|,\]
where the expectation is taken over the randomness of the mini-batch of samples queried at time $\tau\in \mathbb{N}$.
Such a bias is affected by several factors, including the time gap $\tau$, the batch size $B$ and the convergence rate parameter $r$ of the mixing coefficient. 

In \Cref{fig: 1}, we investigate the impact of these factors on the stochastic bias, and we use 10k Monte Carlo samples to estimate the stochastic bias. The top two figures fix the batch size, and it can be seen that the bias decreases as $\tau$ increases, which matches the definition of the $\phi$-mixing process. Also, a faster-mixing Markov chain (i.e., smaller $r$) leads to a smaller bias. In particular, with batch size $B=1$ and a slow-mixing chain $r=2$, it takes an unacceptably large $\tau$ to achieve a relatively small bias. This provides an empirical justification for Corollary \ref{coro: sgd} and explains why the standard SGD suffers from a high sample complexity over highly dependent data. Moreover, as the batch size gets larger, one can achieve a numerically smaller bias, which matches our Lemma \ref{lemma:0}. The bottom two figures fix the convergence rate parameter of the mixing coefficient, and it can be seen that increasing the batch size significantly reduces the bias. Consequently, instead of choosing a large $\tau$ to reduce the bias, one can simply choose a large batch size $B=100$ and set $\tau = 1$. This observation matches and justifies our theoretical results in Corollary \ref{coro: minibatch}.

\begin{figure}[tbh]
%\vspace{-2mm}
	\centering
\includegraphics[width=0.2\textwidth]{fig/tau-1.png}
%\includegraphics[width=0.32\textwidth]{uai2022/fig/tau-10.png}
\includegraphics[width=0.2\textwidth]{fig/tau-100.png}
\includegraphics[width=0.2\textwidth]{fig/bs-0.5.png}  
%\includegraphics[width=0.32\textwidth]{iclr2022/fig/bs-1.png}  
\includegraphics[width=0.2\textwidth]{fig/bs-2.png}  
%\vspace{-4mm}
	\caption{Impact of $\tau$, batch size $B$ and convergence rate of mixing coefficient on the bias in quadratic programming.} 
	\label{fig: 1}
%\vspace{-2mm}
\end{figure}

% \begin{figure}[tbh]
% \vspace{-4mm}
% 	\centering
% \includegraphics[width=0.32\textwidth]{iclr2022/fig/bs-0.5.png}  
% %\includegraphics[width=0.32\textwidth]{iclr2022/fig/bs-1.png}  
% \includegraphics[width=0.32\textwidth]{iclr2022/fig/bs-2.png}  
% \vspace{-4mm}
% 	\caption{Impact of $\tau$ and convergence rate of mixing coefficient on the bias under different batch sizes.} 
% 	\label{fig: 2}
% 	\vspace{-4mm}
% \end{figure}

\begin{figure}
	\centering
	%\vspace{-6mm}
	\includegraphics[width=0.2\textwidth]{fig/loss_curve.png} 
	% \vspace{-6mm}
	\caption{Comparison of sample complexity of different SGD algorithms in quadratic programming.}\label{fig:2} 
\end{figure}
We further compare the convergence of SGD, SGD with subsampling and mini-batch SGD. {Here, we set $r=2$ to generate highly dependent data samples. We set learning rate $\eta = 0.01$ for both SGD and SGD with subsampling, and set learning rate $\eta = 0.01 \times \sqrt{\frac{B}{\sum_{j=1}^B \phi_\xi(j)}} = 0.01 \times 100^{1/4} $ for mini-batch SGD with batch size $B=100$, as suggested by Theorem C.3 in the appendix. The results are plotted in Figure \ref{fig:2}, where each curve corresponds to the mean of $100$ independent trials.} It can be seen that {SGD with subsampling achieves a lower loss than the standard SGD asymptotically, due to the use of less dependent data. Moreover, mini-batch SGD achieves the smallest asymptotic loss.} All these observations are consistent with our theoretical results.

\subsection{Neural Network Training}
We further apply these online SGD algorithms to train a convolutional neural network with the MNIST dataset \citep{lecun98}. The network consists of two convolution blocks followed by two fully connected layers. Specifically,
each convolution block contains a convolution layer, a
max-pooling layer with stride step $2$, and a ReLU activation layer. The convolution layers in the two blocks have input channel $1$, $10$ and output channel $10$, $20$, respectively, and both of them have kernel size $5$, stride step $1$ and with no
padding. The two fully connected layers have input dimensions $320$, $50$ and output dimensions $50$, $10$, respectively.

To generate a stream of dependent data, we first generate an algebraic $\phi$-mixing Markov chain $\{X_t\}_t$ with the construction provided in \citep{jarner2002polynomial}. Then, we map each $X_t$ to a label of the MNIST dataset $\{0,1,2,\dots,9\}$, and uniformly sample an image at random from the corresponding image class. This data-generating process generates a dependent data stream whose mixing coefficient $\phi_\xi(k)$ decays approximately as $k^{-\frac{1}{r}}$.  


We first test the performance of SGD with a fixed batch size and different correlation coefficients. Specifically, we choose batch size $B=1000$ and consider different correlation coefficients $r \in \{1.0,1.25,1.5,1.75,2.0\}$. Here, a larger $r$ implies higher data dependency. \Cref{fig:3} (left) plots the experiment results. It can be seen that with an increasing correlation coefficient, the convergence of SGD is slower.  We further fix the correlation coefficient $r = 2.0$ and vary the batch size $B\in\{8, 16, 32, 64, 128\}$. \Cref{fig:3} (right) plots the experiment results. It can be seen that SGD with the largest batch size $B=128$ achieves the smallest asymptotic loss among all choices of batch sizes. In particular, SGD with a larger batch size tends to converge faster over such dependent data. This also matches our theoretical analysis and it implies that mini-batch SGD with a large batch size can benefit neural network training over dependent data.  

\begin{figure}[tbh]
	\centering
	% \vspace{-6mm}
	\includegraphics[width=0.2\textwidth]{fig/r.png} 
	\includegraphics[width=0.2\textwidth]{fig/bs.png} 
	% \vspace{-6mm}
	\caption{Comparison of SGD over dependent data with different mixing coefficients and batch sizes.}\label{fig:3} 
\end{figure}




















% Particularly, when $\tau = 1$, there is the probability at least $1-\delta$, 
% \begin{align}
%     \sum_{t=1}^{n} [f(w(t)) - f(w^\ast)] = \tilde{\calO}\left(\frac{n}{B}\sum_{i=1}^B \phi(B + i) +  \sqrt{\frac{ n}{B}  }\cdot  (\sum_{i=1}^{B}  \phi(i) )^{\frac{1}{4}} \right)+  R_n  . \label{eq:main2}
% \end{align}
 

%\section{Application to Analysis of Fitted Q-Iteration}
%\input{./tex/body/main_rl.tex}

%\section{Experiments}
% moved to Section 3.
%\input{./tex/body/experiment.tex}
 
\section{Conclusion}
In this study, we investigate the convergence property of SGD under various popular stochastic update schemes over highly dependent data. Unlike the conventional i.i.d.\ data setting in which the stochastic update schemes do not affect the sample complexity of SGD, the convergence of SGD in the data-dependent setting critically depends on the structure of the stochastic update scheme. In particular, we show that both data subsampling and mini-batch sampling can substantially improve the sample complexity of SGD over highly dependent data. 
Our study takes one step forward toward understanding the theoretical limits of stochastic optimization over dependent data, and it opens many directions for future study. For example, it is interesting to further explore the impact of algorithm structure on the sample complexity of stochastic reinforcement learning algorithms. Also, it is important to develop advanced algorithm update schemes that can facilitate the convergence of learning over highly dependent data.


%\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
%    Briefly list author contributions.
%    This is a nice way of making clear who did what and to give proper credit.

%    H.~Q.~Bovik conceived the idea and wrote the paper.
%    Coauthor One created the code.
%    Coauthor Two created the figures.
%\end{contributions}

\begin{acknowledgements}  
The work of Shaocong Ma, Ziyi Chen and Yi Zhou was supported in part by the U.S.\ National Science Foundation under the grants CCF-2106216 and DMS-2134223.

The work of Y.~Liang was supported in part by the U.S.\ National Science Foundation under the grants CCF-1909291 and ECCS-2113860.
\end{acknowledgements}

\bibliography{uai2022-template}
 

\end{document}
