\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}


\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{mathtools}
\newcommand{\R}{ \mathbb{R}}
\newcommand{\squeeze}{\textstyle}
\newcommand{\eqdef}{\coloneqq}
\usepackage{colortbl}
\definecolor{bgcolor}{rgb}{1, 1, 0.8}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{mathtools}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{remark}{Remark}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{amsthm}
\newtheorem{definition}{Definition}[section]

\usepackage[flushleft]{threeparttable} % http://ctan.org/pkg/threeparttable


\definecolor{bgcolor}{rgb}{1, 1, 0.8}

\usepackage{xspace}
\newcommand{\DS}{{\sf \footnotesize \color{blue} Det-Shuffle}\xspace}
\newcommand{\RS}{{\sf \footnotesize \color{orange} Rand-Shuffle}\xspace}
\newcommand{\RR}{{\sf \footnotesize \color{red} Rand-Reshuffle}\xspace}


%\newcommand{\algname}[1]{{\sf \footnotesize \color{cyan}#1}\xspace}
%\newcommand{\algnameSMALL}[1]{{\sf\color{cyan}#1}\xspace}

\newcommand{\algname}[1]{{\sf \footnotesize #1}\xspace}
\newcommand{\algnameSMALL}[1]{{\sf #1}\xspace}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Random Reshuffling with Variance Reduction: New Analysis and Better Rates}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Grigory Malinovsky}
\author[1]{Alibek Sailanbayev}
\author[1]{Peter Richt\'arik}
% Add affiliations after the authors
\affil[1]{%
    AI Initiative\\
    King Abdullah University of Science and Technology\\
    Saudi Arabia
}
  
  \begin{document}
\maketitle

\begin{abstract}
	Virtually all state-of-the-art methods for training supervised machine learning models are variants of Stochastic Gradient Descent (\algname{SGD}), enhanced with a number of additional tricks, such as minibatching, momentum, and adaptive stepsizes. However, one of the most basic questions in the design of  successful \algname{SGD} methods, one that is orthogonal to the aforementioned tricks, is the choice of the {\em next} training data point to learn from. Standard variants of \algname{SGD} employ a {\em sampling with replacement} strategy, which means that the next training data point is sampled from the entire data set, often independently of all previous samples. While standard \algname{SGD} is well understood theoretically,  virtually all widely used machine learning software is based on  {\em sampling without replacement} as this is often empirically superior. That is, the training data is randomly shuffled/permuted, either only once at the beginning, a strategy known as {\em random shuffling} (\RS), or before every epoch, a strategy known as {\em random reshuffling} (\RR),  and  training proceeds in the data order dictated by the shuffling.  \RS and \RR strategies  have for a long time remained beyond the reach of  theoretical analysis that would satisfactorily explain their success. However, very recently, \citet{mishchenko2020random} provided tight {\em sublinear} convergence rates through a novel analysis, and showed that these strategies can improve upon standard \algname{SGD} in certain regimes. Inspired by these results, we seek to further  improve the rates of shuffling-based methods. In particular, we show that it is possible to enhance them with a variance reduction mechanism, obtaining  {\em linear} convergence rates. To the best of our knowledge, our linear convergence rates are the best for any method based on sampling without replacement. 
\end{abstract}

\section{Introduction}\label{sec:intro}
The main paradigm for training supervised machine learning models---Empirical Risk Minimization (ERM)---is an optimization problem of the finite sum structure \begin{equation}
	\label{eq:main_finite_sum}
	\min \limits_{x \in \R^d} \left[ f(x) \eqdef  \frac{1}{n}\sum \limits_{i=1}^n f_i(x)\right],
\end{equation}
where $x \in \R^d$ is a vector representing the parameters (model weights, features) of a model we wish to train, $n$ is the number of training data points, and $f_i(x)$ represents the (smooth) loss of the model $x$ on data point $i$. The goal of ERM is to train a model whose average loss on the training data is minimized. This abstraction allows us to encode virtually all supervised models trained in practice, including linear and logistic regression, and neural networks. 

The gigantic size of modern training data sets necessary to train models with good generalization poses severe issues for the designers of methods for solving \eqref{eq:main_finite_sum}. Over the last decade, stochastic first-order methods have emerged as the methods of choice, and for this reason, their importance in machine learning remains exceptionally high~\citep{Bottou2018}. 
Of these, stochastic gradient descent (\algname{SGD}) is perhaps the best known, but also the most basic. \algname{SGD} has a long history~\citep{robbins1951stochastic,bertsekas1996neuro} and is therefore well-studied and well-understood~\citep{Rakhlin2012,hardt2016train,Drori2019,gower2019sgd,Nguyen2020}.

{\bf Training data order.} Standard and even variance-reduced variants of \algname{SGD} employ a {\em sampling with replacement} strategy~\citep{sigma_k}, which means that the next training data point in each epoch is sampled from the entire data set, independently of all previous samples. However,  virtually all widely used machine learning software is based on  {\em sampling without replacement} as this is often empirically superior~\citep{bottou2009curiously,recht2013parallel}, and therefore acts as the de-facto default sampling mechanism in deep learning~\citep{Bengio2012,Sun2020}. With this latter strategy, in each epoch we sample each training data point exactly once, and this can be performed by generating a random permutation of the training data. 

There are three commonly used variants of sampling without replacement. 
\begin{itemize}
	\item [(i)] In the first, which we call {\em deterministic shuffling} (\DS) in this paper, the training data is processed in some natural order in a cyclic manner. That is, a deterministic permutation is used  throughout the entire training process. This idea is the basis of 
	the \algname{Cyclic-GD} method~\citep{Luo1991, Grippo1994}. While this strategy is not effective in practice, it is perhaps the simplest strategy conceptually, and has been studied repeatedly. However, it is notoriously difficult to obtain good guarantees for it.
	
	\item [(ii)] In the second variant, which we call {\em random shuffling} (\RS) in this paper\footnote{This method is called ``shuffle once'' in some papers.}, the training data is instead  shuffled/permuted randomly. This is done only once, before the start of the training process, and the selection of training data then follows a cyclic pattern dictated by this single random permutation~\citep{Nedic2001}. The purpose of this procedure is to break the potentially adversarial default ordering of the data that could negatively affect training speed. Almost no non-trivial analyses exist for this method~\citep{mishchenko2020random}.  This strategy works very well in practice.
	
	\item [(iii)] In the third variant, known as {\em random reshuffling} (\RR),  the training data is randomly  reshuffled before the start of each epoch. This is perhaps the most common and relatively most studied approach. Its empirical performance is, however, often very similar to \RS, and the current  best theoretical bounds for both are the same~\citep{mishchenko2020random}.
\end{itemize}





{\bf Difficulties with analyzing shuffling-based methods.} The main difficulty in analyzing methods based on sampling without replacement is that each gradient step within an epoch is {\em biased}, and performing a sharp analysis of methods based on biased estimators is notoriously difficult. While \algname{Cyclic-GD} was studied already a few decades ago~\citep{Mangasarian1994,Bertsekas2000}, convergence rates were established relatively recently~\citep{Li2019,Ying2019,Gurbuzbalaban2019IG,Nguyen2020}. For the \RS method, the situation is more complicated, and non-vacuous theoretical analyses were only performed recently~\citep{safran2020good, Rajput2020}. \RR is well understood for twice-smooth~\citep{gurbuzbalaban2019random,haochen2018random} and smooth~\citep{Nagaraj2019} objectives. Moreover, lower bounds for \RR and similar methods were also recently established~\citep{safran2020good, Rajput2020}. \citet{mishchenko2020random} recently performed an in-depth analysis of \DS, \RS and \RR with novel and simpler proof techniques, leading to improved and new convergence rates. Their rate for \RS, for example, tightly matches the lower bound of \citet{safran2020good} in the case when each $f_i$ is strongly convex. Further, \RR can be accelerated~\citep{gurbuzbalaban2019random}, and for small constant step-sizes, the neighborhood of solution can be controlled~\citep{sayed2014adaptation}.  However, despite these advances, \RR and related methods described above still suffer from the same problem as \algname{SGD}, i.e., we do not have variants that would have a fast linear convergence rate to the exact minimizer. 	

{\bf Variance reduction.} Despite its simplicity and elegance, \algname{SGD} has a significant disadvantage: the variance of naive stochastic gradient estimators of the true gradient remains high throughout the training process, which causes issues with convergence. When a constant learning rate is used in the smooth and strongly convex regime, \algname{SGD} converges linearly to a neighborhood of the optimal solution of size proportional to the learning rate and to the variance of the stochastic gradients at the optimum~\citep{gower2020variance}. While a small or a decaying learning schedule restores convergence, the convergence speed suffers as a result. Fortunately, there is a remedy for this ailment: {\em variance-reduction} (VR) \citep{johnson2013accelerating}. The purpose of VR mechanisms is to steer away from the naive gradient estimators. Instead, VR mechanisms iteratively construct and apply a gradient estimator whose variance would eventually vanish. This allows for larger learning rates to be used safely, which accelerates training. Among the early VR-empowered \algname{SGD} methods belong \algname{SAG}~\citep{roux2012stochastic}, \algname{SVRG}~\citep{johnson2013accelerating}, \algname{SAGA}~\citep{defazio2014saga}, and \algname{Finito}~\citep{pmlr-v32-defazio14}. For a recent survey of VR methods, see \citep{gower2020variance}.



{\bf Related work.}	
Some cyclic and random reshuffling versions of variance-reduced methods were shown to obtain linear convergence. Incremental Average Gradient (\algname{IAG})---a cyclic version of the famous \algname{SAG} method---was analyzed by \citet{Gurbuzbalaban2017}. Based on this, the Doubly Incremental Average Gradient (\algname{DIAG}) method was introduced, and it has a significantly better rate if each $f_i$ is strongly convex~\citep{mokhtari2018surpassing}. A linear rate for  \algname{Cyclic-SAGA} was established by \citet{park2020linear}. The first analysis of \RR with variance reduction was done by \citet{ying2020variance}. Firstly, they establish a linear rate for \algname{SAGA} under random reshuffling, and then they introduce a new method called Amortized Variance-Reduced Gradient (\algname{AVRG}), which is similar to \algname{SAGA}. Stochastic Variance Reduced Gradient (\algname{SVRG}) using \RR was introduced by \citet{shamir2016without}, and their theoretical analysis was conducted for the Least Squares problem. The promising result of \algname{Prox-DFinito} is introduced in~\citet{huang2021improved} for the composite optimization problem.


\section{Approach and Contributions}





Let us now briefly outline our approach and key contributions.	

\subsection{Controlled linear perturbations}

In the design of our methods we employ a simple but powerful tool: the idea of introducing a sequence of carefully crafted reformulations of the original finite sum problem, and applying vanilla shuffling-based methods on these reformulations instead of the original formulation. As the sequence is designed to have progressively better conditioning properties, our methods will behave progressively better as well, and this is what results in variance-reduced shuffling methods. 

The main idea is to perturb the objective function with zero written as the average of $n$ nonzero linear functions. This perturbation is performed at the beginning of each epoch, and stays fixed within each epoch. Let us consider the finite sum problem \eqref{eq:main_finite_sum} and vectors $a^1_t, \dots, a^n_t \in \R^d$ summing up to zero: $\sum_{i=1}^{n}a^i_t = 0$. Let $a^t = (a^1_t, \dots, a^n_t)$. Adding this {\em structured} zero to $f$, we reformulate problem~\eqref{eq:main_finite_sum} into the equivalent form
\begin{align}
	\label{reform}
	 	f(x)  &= \frac{1}{n}  \sum \limits_{i=1}^n f_i(x) = \frac{1}{n}  \sum \limits_{i=1}^n \left( f_i(x)+\left\langle a^i_t, x \right\rangle \right)\notag\\
	& = \frac{1}{n} \sum \limits_{i=1}^{n}f_i^t(x),
\end{align}
where $f_i^t(x) \eqdef f_i(x)+\left\langle a^i_t, x \right\rangle$. Note that \begin{equation}\label{eq:09yh9f8hdfd}
	\nabla f_i^t(x) = \nabla f_i(x)+ a^i_t.\end{equation} Next, we establish a simple but important property of this reformulation.

\begin{proposition}
	\label{prop-reform}
	Assume that each $f_i$ is $\mu$-strongly convex (resp.\ convex) and $L$-smooth. Then $f_i^t$ 		is $\mu$-strongly convex (resp.\ convex) and $L$-smooth.
\end{proposition}

In our methods, the vectors $a^1_t, \dots, a^n_t$ depend on two objects:
\begin{itemize}
	\item a {\em control vector} $y_t\in \R^d$, which is updated at the start of each epoch,
	\item the {\em permutation} $\pi = \left\{\pi_{0}, \pi_{1}, \ldots, \pi_{n-1}\right\}$ chosen at the beginning of the current epoch.
\end{itemize}

In particular, we choose
\begin{equation}\label{eq:a_t^i} a_t^i\eqdef -\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right).\end{equation}

Note that by plugging \eqref{eq:a_t^i} into \eqref{eq:09yh9f8hdfd},  the gradient of $f_{\pi_i}^t$ at $x\in \R^d$ is given by
\begin{equation} \label{eq:SG-reform} g^i_t(x,y_t) \eqdef  \nabla f_{\pi_i} (x)-\nabla f_{\pi_i} (y_t)+\nabla f (y_t).\end{equation}

At the start of each epoch, the control vector $y_t$ is set to the latest iterate $x_t$.

\subsection{New algorithms: improvement of shuffling based methods}

\begin{algorithm}[t]
	\caption{Algorithms {\DS, \RS, \RR}}
	\label{alg:GENERIC-SVRG}
	\begin{algorithmic}
		\STATE \textbf{Input:} Stepsize $\gamma>0$, initial iterate $x_0  \in \mathbb{R}^{d}$, number of epochs $T$
		\STATE {\color{blue} {\bf Option  \DS:} Choose a deterministic permutation $\{\pi_0, \dots, \pi_{n-1}\}$ of $\{1, \dots, n\}$} 
		\STATE {\color{orange} {\bf Option  \RS:} Choose a random permutation $\{\pi_0, \dots, \pi_{n-1}\}$ of $\{1, \dots, n\}$} 
		\FOR{$t =  0, 1, \dots, T-1$ }
		\STATE  {\color{red}{\bf Option \RR:} Choose a random permutation $\{\pi_0, \dots, \pi_{n-1}\}$ of $\{1, \dots, n\}$} 
		\STATE $x_t^0 = x_t$, $y_t=x_t$
		\FOR{$i= 0, \dots, n-1$ }
		\STATE $g^i_t(x_t^i,y_t) =  \nabla f_{\pi_i} (x_t^i)-\nabla f_{\pi_i} (y_t)+\nabla f (y_t) $
		\STATE $x^{i+1}_t = x^i_t - \gamma g_t^i(x_t^i, y_t)$
		\ENDFOR
		\STATE $x_{t+1} = x^n_t$
		\ENDFOR
	\end{algorithmic}
\end{algorithm}

Our key proposal is to run {\em standard} \DS, \RS and \RR methods, for example as described in \citep{mishchenko2020random}, but in each epoch to apply them to the current reformulated problem
\[ \min \limits_{x\in \R^d} \frac{1}{n} \sum \limits_{i=1}^n f_i^t(x). \]
This leads to our variance-reduced algorithms, all described compactly in Algorithm~\ref{alg:GENERIC-SVRG}. Hoping that this will not cause confusion, we do not give the methods a different name. 

\begin{itemize}
	\item Note that as mentioned in the introduction,  in \DS we only use a single deterministic permutation at the start of the method. The steps in the inner loop are then performed incrementally through all data points (individual functions), in the same order in each epoch. 
	
	\item	In contrast, in \RS we shuffle the data points randomly instead, but otherwise proceed as in \DS, using this one permutation in all subsequent epochs. 
	
	\item Finally, \RR is similar to \RS, with the exception that a new  permutation is resampled at the start of each epoch.\footnote{Note that \RR can be seen as a version of \algname{SVRG} in which the number of inner steps $m$ is equal to $n$, and in which sampling {\em without} replacement is used. \citet{johnson2013accelerating} remarked that $m = \mathcal{O}(n)$ works well in practice, but a theoretical analysis of this was not provided.}
	
\end{itemize}


Besides Algorithm~\ref{alg:GENERIC-SVRG}, we also propose a generalized version of \RR  (Algorithm~\ref{alg:RR_VR}), which differs from \RR in that at the end of each epoch we flip a biased coin to decide whether to update the control vector $y_t$ or not. While in \RR the control vector $y_{t+1}$ is updated to the latest iterate $x_{t+1}$, in Algorithm~\ref{alg:RR_VR} we use the previous point $x_t$. We do this as it slightly simplifies the analysis. However, it makes sense to use the newest point $x_{t+1}$ instead of $x_t$ to update the control vector in practice. This method is described in the appendix only.

\subsection{Analysis technique: the basic idea}

Since in view of Proposition~\ref{prop-reform} the reformulated problem satisfies all assumptions of the original problem, in a single epoch it is possible to apply results that hold for vanilla \DS, \RS and \RR methods -- variants that are not variance-reduced. In particular, we rely on some results of \citet{mishchenko2020random}, and complement them with new analysis that handles the changing nature of the reformulations through the change in the control vectors $\{y_t\}$.

In particular, a key insight of our paper is the observation that by updating the control vector, we can control the  variance of shuffling based methods.\footnote{While this was known for methods based on sampling with replacement, this is a new observation for methods based on sampling without replacement, and our control strategy.} 

%The critical requirement for variance reduction mechanism is updating the control vector. We update it after each epoch, which means that the problem's reformulation happens at the beginning of the next epoch. However, \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) allows to do this probabilistically.


We are now ready to formulate the core lemma of our work. %While this lemma is simple, it allows us to trasnform shuffling  algorithms variance-reduced.
\begin{lemma}
	\label{main_lemma_lemma}
	Assume that each $f_i$ is $L$-smooth and convex. If we apply the linear perturbation reformulation~\eqref{reform} 
	using vectors of the form \eqref{eq:a_t^i}, then the gradient variance of the reformulated problem  at the optimum $x_*$ can be bounded via the distance of the control vector $y_t$ to $x_*$ as follows:
	\begin{equation}
		\label{main_lemma} 
		\left(\sigma^t_{*}\right)^{2} \eqdef \frac{1}{n} \sum \limits_{i=1}^{n}\left\|\nabla f_i^t \left(x_{*}\right)\right\|^{2}\leq 4L^2\|y_t-x_*\|^2.
	\end{equation}
\end{lemma}





% We also provide  better rates for \DS based methods in the strongly convex and convex cases (Section~\ref{subsection 6}).  We also provide the first convergence analysis of \RS and \algname{Cyclic-SVRG}. 



\subsection{Complexity results}

%Table~\ref{Tab:mainresults}

{\footnotesize
	
	\begin{table*}[t]
		\centering
		\caption{Complexity of shuffling based methods (in all expressions we ignore constant terms). }\label{Tab:mainresults}
		%\renewcommand{\arraystretch}{1.0}
		\begin{threeparttable}			
			%\begin{tabular}{|p{22mm}|p{22mm}|p{22mm}|p{12mm}|p{10mm}|p{24mm}|}		
			\footnotesize
			\begin{tabular}{| c | c | c | c | c | c | c |}
				\hline
				\bf Algorithm & \begin{tabular}{c}\bf $\mu$-strongly \\ \bf convex $f_i$ \end{tabular}& \begin{tabular}{c}\bf $\mu$-strongly   \\ \bf convex $f$ \end{tabular} & \begin{tabular}{c}\bf convex \\ $f$  \end{tabular} &\bf \begin{tabular}{c}\bf non-convex \\ $f$  \end{tabular}  &\bf memory& \bf reference\\
				\hline
				\algname{RR-SAGA}
				& --
				&  $\kappa^2\log \nicefrac{1}{\epsilon}$& -- &--&$dn$&\citet{ying2020variance}\\ \hline
				
				\algname{AVRG}
				& --
				& $\kappa^2\log \nicefrac{1}{\epsilon}$&--&--& $d$&\citet{ying2020variance}\\ \hline
				\rowcolor{bgcolor}
				\begin{tabular}{c}\RS \\  \RR \end{tabular} & $\kappa\sqrt{\frac{\kappa}{n}} \log \nicefrac{1}{\epsilon}$\tnote{\color{red} (1)}  \; & \begin{tabular}{c}$ \kappa \log \nicefrac{1}{\epsilon} $\tnote{\color{red} (2)}  \\ $ \kappa\sqrt{\kappa} \log \nicefrac{1}{\epsilon} $\tnote{\color{red} (3)} \end{tabular} &$\nicefrac{L}{\epsilon}$&$\nicefrac{L}{\epsilon^2}$\tnote{\color{red} (6)} &$d$& this paper\\
				\hline
				\algname{Prox-DFinito}& $\kappa \log \nicefrac{1}{\epsilon} $&--&$\nicefrac{L^2}{\epsilon}$&--&$dn$&\citet{huang2021improved} \\
				\hline
				\algname{Cyclic-SAGA}& $\kappa^2\log \nicefrac{1}{\epsilon}$&--&--&--&$dn$&\citet{park2020linear}\\
				\hline
				\algname{IAG}\tnote{\color{red} (4)} & -- & $n\kappa^2\log \nicefrac{1}{\epsilon}$&--&--&$dn$& \citet{Gurbuzbalaban2017}\\
				\hline
				\algname{DIAG}\tnote{\color{red} (5)}  &  $\kappa\log \nicefrac{1}{\epsilon}$&--&--&--&$dn$&\citet{mokhtari2018surpassing}\\
				\hline
				\rowcolor{bgcolor}
				\DS & -- & $\kappa\sqrt{\kappa}\log \nicefrac{1}{\epsilon}$ & $\nicefrac{L}{\epsilon}$ &--& $d$ &  this paper\\
				\hline
			\end{tabular}
			\begin{tablenotes}
				{\footnotesize
					\item [{\color{red}(1)}] First Big data regime: $n>\log \left(1-\delta^2\right) / \log (1-\gamma \mu)$.
					\item [{\color{red}(2)}] Second Big data regime: $n \geq 2 \kappa /\left(1-\frac{1}{\sqrt{2} \kappa}\right)$.
					\item [{\color{red}(3)}] General regime.
					\item [{\color{red}(4)}] Cyclic version of the Stochastic Average Gradient (\algnameSMALL{SAG}) method; \algnameSMALL{IAG} was the original inspiration for \algnameSMALL{SAG}.
					\item [{\color{red}(5)}] Cyclic version of the \algnameSMALL{Finito} algorithm.
					\item [{\color{red}(6)}] The result is applied to \RR  .
				}
			\end{tablenotes}			
		\end{threeparttable}										
	\end{table*}
}	

Our theory leads to improved rates for shuffling-based methods using all three sampling strategies: \DS, \RS and \RR. We provide theoretical guarantees in Section~\ref{sec:theory}; a summary  is presented in Table~\ref{Tab:mainresults}.

$\diamond$ \textbf{Strongly convex case.} If $f$ is strongly convex, we obtain $\mathcal{O}\left(\kappa^{3/2}\log \nicefrac{1}{\varepsilon}\right)$ iteration (epoch-by-epoch) complexity for \RR, where $\kappa$ is the condition number. This rate is better than the $\mathcal{O}\left(\kappa^{2}\log \nicefrac{1}{\varepsilon}\right)$ rate of \algname{RR-SAGA} and \algname{AVRG} introduced by~\citet{ying2020variance}. Moreover, if $n> \mathcal{O}(\kappa)$, we improve this rate for \RR and get $\mathcal{O}\left(\kappa\log \nicefrac{1}{\varepsilon}\right)$ complexity. If each $f_i$ is strongly convex and the number of functions is sufficiently large (Theorem~\ref{th3}), then the rate of \RR can be further improved to $\mathcal{O}(\kappa\sqrt{\nicefrac{\kappa}{n}}\log \nicefrac{1}{\varepsilon})$. For \DS we prove similar convergence results under the assumption of strong convexity of $f$. The iteration complexity of the \DS method is $\mathcal{O}\left(\kappa^{3/2}\log \nicefrac{1}{\varepsilon}\right)$, which is noticeably better than the $\mathcal{O}\left(n\kappa^2\log \nicefrac{1}{\varepsilon}\right)$ rate of \algname{IAG}~\citep{Gurbuzbalaban2017}. Furthermore, it is better than the $\mathcal{O}\left(\kappa^2\log\nicefrac{1}{\varepsilon}\right)$ rate of  \algname{Cyclic-SAGA}~\citep{park2020linear}. It is worth mentioning that \citet{mokhtari2018surpassing} obtain a better complexity, $\mathcal{O}\left(\kappa\log \nicefrac{1}{\varepsilon}\right)$, for their  \algname{DIAG} method. However, their analysis requires a much stricter assumption: strong convexity of each $f_i$. 


$\diamond$ \textbf{Convex case.} In the general convex setting we give the first analysis and convergence guarantees for \DS, \RS, and  \RR. After applying variance reduction, we obtain fast convergence to the exact solution. As expected, these methods have the sublinear rate $\mathcal{O}(\frac{1}{\varepsilon})$ in an ergodic sense. 


%$\diamond$ \textbf{Generalized version of \RR.} Finally, in the appendix we introduce a new \RR-based method, Algorithm~\ref{alg:RR_VR}, employing variance reduction ideas similar to those behind \algname{L-SVRG}~\citep{kovalev2020don}. In this method the control vectors are updated in a randomized manner, which allows for less frequent updates of the control vector.

\subsection{Shuffling-based variants of variance reduced methods}
While, as we argue, our methods should be seen as improvements over existing shuffling-based methods via variance reduction, it is possible to alternatively see them as shuffling-based variants of variance reduced methods. However, when seen that way, we do not observe an improvement in complexity. The reason for this is that there is a large gap in our understanding of shuffling based methods, especially for variance reduced variants, which does not yet allow for theoretical speedups compared to their sampling-with-replacement cousins.  For example, from the latter viewpoint, and to the best of our knowledge, we provide the first convergence analysis of \algname{SVRG} under random reshuffling. However, the rate of classical variance reduced methods, such as \algname{SVRG}, is still superior in some regimes.

%	\item We resolved the open convergence problem and provided the first theoretical proof and guarantee of linear convergence to the exact minimizer under random reshuffling for \algname{SVRG}. 
%	\item For strongly convex case we obtained $\tilde{\mathcal{O}}(\kappa\sqrt{\kappa})$ complexity. And in big data regime $n>\mathcal{O}(\kappa)$ we obtained $\tilde{\mathcal{O}}(\kappa)$ complexity. If each $f_i$ is strongly convex function, we get even better complexity $\tilde{\mathcal{O}}(\kappa\sqrt{\frac{\kappa}{n}})$ for significatly big number of data $n$. However, the condition for this number is quite complicated. These convergence quarantiies are better than guaranties of existed Variance Reduction algorithms under Random Reshuffling ($\tilde{\mathcal{O}}(\kappa^2)$). Citation needed.
%	\item We introduced a generalized version of \algname{SVRG} under Random Reshuffling. We also provided the convergence guarantees of this method.
%	\item We were the first to obtain convergence analysis of a cyclic version of \algname{SVRG}. It can be thought as Variance reduction for Incremental Gradient method. Our obtained complexity $\tilde{\mathcal{O}}(\kappa\sqrt{\kappa})$ is better than complexity of  \algname{Cyclic-SAGA} method. ($\tilde{\mathcal{O}}(\kappa^2)$). Citation needed.
%	\item We provided an analysis of \algname{SVRG-RR} and \algname{Cyclic-SVRG} (IG-VR) in the convex setting. We showed that the complexity in this case is $\mathcal{O}(\frac{1}{\varepsilon})$.

%\subsection{Generalized version of \RR}

%This algorithm has similarities with \algname{L-SVRG}~\citep{kovalev2020don}, where outer loop was removed and coin flip was added. Due to structure of random hhuffling we cannot remove outer loop fully, but we can add randomization using coin flip after each epoch. 




%
%\textbf{\RR and \RS} We show that the rates of \algname{SVRG} under random reshuffling (\RR) and \algname{SVRG} with shuffling once in the beginning (\RS)  for the strongly convex objective in large data regime ($n > \kappa$) is $O(\kappa)$, which is similar to the rate of original \algname{SVRG} algorithm. We also show that in small data regime these algorithms have known $O(\kappa^2)$ linear rate. In addition, we prove that \RR and \RS converge sublinearly in convex case.
%
%\textbf{General Variance Reduced Random Reshuffling} We also develop the variance reduced version of the Random Reshuffling algorithm (RR-VR) and we obtain theoretical rate for it. We show that under particular conditions, \algname{RR-VR} converges linearly in strongly-convex case.
%
%% TODO
%\textbf{Deep Learning}. We also experimentally show that \RR outperforms general \algname{SVRG}, \algname{SAGA} in neural networks. 

%\medskip

\section{Main Theoretical Results}\label{sec:theory}

Having described the methods and the idea of controlled linear perturbations, we are ready to proceed to the formal statement of our convergence results.


\subsection{Assumptions and Notation}
Before introducing our convergence results, let us first formulate the definitions and assumptions we use throughout the work.  Function $f: \mathbb{R}^d \rightarrow \mathbb{R}$ is $L$-smooth if 
\begin{equation*}
	f(y) \leq f(x)+\left<\nabla f(x),y-x\right> +\frac{L}{2}\|y-x\|^{2} \quad \forall x, y\in \R^d,
\end{equation*}
convex if 
\begin{equation*}
	f(x)+\left<\nabla f(x),y-x\right> \leq f(y)   \quad \forall x,y\in \R^d,
\end{equation*}
and $\mu$-strongly convex if 
\begin{equation*} 
	f(x)+\left<\nabla f(x),y-x\right> + \frac{\mu}{2}\|y-x\|^2 \leq	f(y)  \quad \forall x,y\in \R^d.
\end{equation*}

The Bregman divergence with respect to $f$ is the mapping $D_f:\R^d\times \R^d\to \R$ defined as follows:
\begin{equation*}
	D_{f}(x, y) \eqdef f(x)-f(y)-\langle\nabla f(y), x-y\rangle.
\end{equation*}
Note that if $y=x_*$, where $x_*$ is a minimizer of $f$, then $\nabla f(x_*)=0$ and hence $D_{f}(x, x_*) = f(x) - f(x_*)$. Also note that $L\geq \mu$, so that $\kappa = \frac{L}{\mu} \geq 1$.

Lastly, we define an object that plays the key role in our analysis. 
\begin{definition}[Variance at optimum]
	\label{def:sigma}
	Gradient variance at optimum is the quantity
	\begin{equation}
		\sigma_{*}^{2} \eqdef \frac{1}{n} \sum \limits_{i=1}^{n}\left\|\nabla f_{i}\left(x_{*}\right)\right\|^{2}.
	\end{equation}
\end{definition}
This quantity is used in several recent papers on stochastic gradient-type methods. Particularly, it is a version of gradient noise introduced in \citet{gower2019sgd} for finite sum problems. 

We also need an analogous notation for permutation-based algorithms.
\begin{definition}[Shuffling variance]
	\label{def:sigma_rad}
Given a stepsize $\gamma>0$ and a random permutation $\pi$ of $1,2, \ldots, n$, define $x_*^i$ as $$x_*^i=x_*-\gamma \sum_{j=0}^{i-1} \nabla f_{\pi_j}\left(x_*\right), \quad i=1, \ldots, n-1.$$ Then, the shuffling variance is given by
$$
\sigma_{\text {Shuffle }}^2=\max _{i=1, \ldots, n-1}\left(\frac{1}{\gamma} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*^i, x_*\right)\right]\right),
$$
where the expectation is taken with respect to the randomness in the permutation $\pi$.
\end{definition}

For all theorems in this paper the following assumption is used. 
\begin{assumption}
	\label{L-smooth}
	The objective $f$ and the individual losses $f_1, \ldots , f_n$ are all $L$-smooth, and $f$ is bounded from below by $f_*$. In the convex setting we also assume the existence of a minimizer $x_* \in \mathbb{R}^d$.
\end{assumption}
This assumption is classical in the literature, and it is necessary for us to get convergence results for all the methods described above. 

We measure progress via the relative error criterion $\Psi_T \leq \varepsilon \Psi_0$, where $\Psi_T$ is a loss criterion that depends on the setting.
\begin{itemize}
\item In the strongly convex case, we use $$\Psi_T=\mathbb{E}\left[\left\|x_T-x_*\right\|^2\right]$$
\item In the general convex case, we use $$\Psi_T=\mathbb{E}\left[f\left(\hat{x}_T\right)-f\left(x_*\right)\right]$$
\item In the general non-convex case, we use $$\Psi_T=\frac{1}{T} \sum_{t=0}^{T-1} \mathbb{E}\left[\left\|\nabla f\left(x_t\right)\right\|^2\right]$$
\item In the non-convex Polyak-Łojasiewicz (PL) case, we use $$\Psi_T=\mathbb{E}\left[f\left(x_T\right)-f_*\right].$$
\end{itemize}




\subsection{Convergence Analysis of Rand-Shuffle and Rand-Reshuffle}\label{subsection 5}

We provide two different rates in the strongly convex case. 
\begin{theorem}[Strongly convex case: $f$]
	\label{th1}
	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption~\ref{L-smooth}
	holds. If the stepsize satisfies $0<\gamma \leq (2\sqrt{2} L n \sqrt{\kappa})^{-1}$,
	the iterates generated by \RS and \RR satisfy
	\begin{align*} 
		\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
	\end{align*}
	This result means that the iteration complexity of these methods is
$$
	T = \mathcal{O}\left(\kappa\sqrt{\kappa}\log \frac{1}{\varepsilon}\right).
	$$
	
\end{theorem}

If we are in the big data regime characterized by the inequality $n>\mathcal{O}(\kappa)$, then we can use a larger step-size, which leads to an improved rate. This is captured by our next theorem.
\begin{theorem}[Strongly convex case: $f$]
	\label{th2}
	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex and Assumption~\ref{L-smooth} holds. Additionally assume we are in the ``big data'' regime characterized by $n \geq \nicefrac{2 \kappa}{(1-\frac{1}{\sqrt{2} \kappa})}$. Then provided the stepsize satisfies $\gamma \leq \nicefrac{1}{(\sqrt{2}Ln)},$
	the iterates generated by \RS and \RR satisfy
	\begin{align*} 
		\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
	\end{align*}
	This means that the iteration complexity of  these methods is
	$$		T = \mathcal{O}\left(\kappa\log \frac{1}{\varepsilon}\right).
	$$
\end{theorem}
% 	This additional assumption allowed us to make a significant improvement in the iteration complexity.
As we shall see next, we obtain an even better rate in the case when each function $f_i$ is strongly convex. 	
\begin{theorem}[Strongly convex case: $f_i$]\label{th3}
	Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex and Assumption~\ref{L-smooth} holds. Fix constant $0<\delta<1$. If the stepsize satisfies $\gamma\leq \nicefrac{\delta}{(L \sqrt{2n\kappa})}$, and if number of functions is sufficiently big, $n>\nicefrac{\log\left(1-\delta^2\right)}{  (\log\left(1-\gamma\mu\right))}$, then the iterates generated by \RS and \RR satisfy
	\begin{align*} 
		\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( \left(1 - \gamma \mu\right)^n +\delta^2 \right)^T \|x_0 - x_*\|^2.
	\end{align*}
	If we further assume that $\delta^2 \leq (1-\gamma\mu)^{n/2}\left(1-(1-\gamma\mu)^{n/2}\right)$, then the iteration complexity of these methods is	$$
	T = \mathcal{O}\left(\kappa\sqrt{\frac{\kappa}{n}}\log \frac{1}{\varepsilon} \right).
	$$
	
\end{theorem}
\begin{figure*}[t]
	\centering
	\begin{tabular}{ccc}
		\includegraphics[scale=0.37]{./plots/ridge_saga_svrg_theoretical_bodyfat_normalized_lambd_1_nneurips.pdf}&
		\includegraphics[scale=0.37]{./plots/ridge_saga_svrg_theoretical_a7a_normalized_lambd_10_nneurips.pdf} &
		\includegraphics[scale=0.37]{./plots/ridge_saga_svrg_theoretical_ijcnn1_normalized_lambd_1_nneurips.pdf}
		% \includegraphics[scale=0.27]{./plots/ridge_saga_svrg_theoretical_phishing_normalized_lambd_1_n.pdf}
	\end{tabular}
	\caption{Comparison of \RR and \algname{RR-SAGA} with theoretical stepsizes on \texttt{bodyfat}, \texttt{a7a}, and \texttt{ijcnn1} datasets (from left to right). }
	\label{fig:rr_vs_rrsaga_theory}
\end{figure*}	
In our work we provide the first bounds for \algname{SVRG} under random reshuffling without strong convexity.
\begin{theorem}[Convex case]
	\label{th4}
	Suppose the functions $f_1, f_2, \ldots, f_n$ are convex and Assumption~\ref{L-smooth} holds. Then for \RS and \RR with stepsize $\gamma \leq \nicefrac{1}{(\sqrt{2}Ln)},$ the average iterate $\hat{x}_{T} \eqdef \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies 
	\begin{align*} 
		\mathbb{E}\left[f\left(\hat{x}_{T}\right) \right] -f\left(x_{*}\right) \leq \frac{3\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma n T}.
	\end{align*}
	This means that the iteration complexity of these methods is
	$$	T = \mathcal{O}\left(\frac{L\left\|x_{0}-x_{*}\right\|^{2}}{\varepsilon}\right).
	$$
\end{theorem}

We also obtained a first convergence result for \RR in the non-convex case.
\begin{theorem}[General non-convex case]
	\label{non-convex-1}
	Suppose that Assumption \ref{L-smooth} holds. Then for Algorithm~\ref{alg:GENERIC-SVRG} (\RR) run for $T$ epochs with a stepsize $\gamma\leq \frac{1}{2Ln}$ we have 
	$$\frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \frac{4(f(x_0) - f_*)} {\gamma n T}.$$
	Choose $\gamma = \frac{1}{2nL}$. Then the mean of gradient norms satisfies 
	$ \frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \varepsilon^2 $
	provided the number of iterations satisfies 
	$$T = \mathcal{O}\left( \frac{8 \delta_{0} L }{\varepsilon^{2}}\right), \qquad \text{where } \delta_0 \eqdef f(x_0) - f_*.$$
\end{theorem}

\begin{theorem}[Polyak-Łojasiewicz condition]
	\label{PL}
	Suppose that Assumption \ref{L-smooth} holds and $f$ satisfies the Polyak-Łojasiewicz inequality with $\mu>0$, i.e., $\left\Vert\nabla f(x) \right\Vert^{2} \geq 2 \mu(f(x)-f_*)$ for any $x \in \mathbb{R}^{d}$. Then for Algorithm \RR run for $T$ epochs with a stepsize $\gamma\leq \frac{1}{2Ln}$ we have 
	$$ \mathbb{E}\left[f(x_T)-f_*\right] \leq\left(1-\frac{\gamma \mu n}{2}\right)^{T}\left(f(x_0)-f_*\right),  $$
	then the relative error satisfies 
	$\frac{\mathbb{E}\left[f\left(x_{T}\right)-f_*\right]}{f\left(x_0\right)-f_*} \leq \varepsilon $
	provided the number of iterations satisfies $$T = \mathcal{O} \left(\kappa \log \frac{1}{\varepsilon}\right).$$
\end{theorem}



\subsection{Convergence Analysis of Det-Shuffle}\label{subsection 6}
In this section we present results for \DS. They are very similar to the previous bounds. However, the lack of randomization does not allow us to improve convergence in the big data regime.


\begin{theorem}[Strongly convex case: $f$]
	\label{th5}
	Suppose that each $f_i$ is convex function, $f$ is $\mu$-strongly convex function, and Assumption~\ref{L-smooth}
	holds. If the stepsize satisfies $\gamma \leq \nicefrac{1}{(4 L n \sqrt{\kappa})}$,
	the iterates generated by \DS satisfy
	\begin{align*} 
		\|x_T - x_* \|^2  \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
	\end{align*}
	This means that the iteration complexity of this method is
	$$		T = \mathcal{O}\left(\kappa\sqrt{\kappa}\log \frac{1}{\varepsilon}\right).$$
\end{theorem}
Note that this is the same rate as that of \RS and \RR.



Our rate for \DS is better than the rate of  \algname{Cyclic-SAGA}~\citep{park2020linear}. We remark that the convergence rate of  \algname{DIAG}~\citep{mokhtari2018surpassing} is better still; however, their result requires strong convexity of each $f_i$. 


Similarly, we can establish convergence results for \DS in the convex case. 
\begin{theorem}[Convex case]
	\label{th6}
	Suppose the functions $f_1, f_2, \ldots, f_n$ are convex and Assumption~\ref{L-smooth} holds. 
	If the stepsize satisfies $\gamma \leq \nicefrac{1}{(2\sqrt{2}Ln)}$,
	the average iterate $\hat{x}_{T} \eqdef \frac{1}{T} \sum_{j=1}^{T} x_{j} $ generated by \DS satisfies
	\begin{align*} 
		\mathbb{E}\left[f\left(\hat{x}_{T}\right)\right] -f\left(x_{*}\right) \leq \frac{2\left\|x_{0}-x_{*}\right\|^{2}}{ \gamma n T}.
	\end{align*}
	This means that the iteration complexity of this method is
	$$ 
	T = \mathcal{O}\left(\frac{L\left\|x_{0}-x_{*}\right\|^{2}}{\varepsilon}\right).
	$$
	
\end{theorem}

Up to a constant factor, the complexity of \DS is the same as that of  \RS and \RR. 



\section{Experiments}
\begin{figure*}[t]
	\centering
	\begin{tabular}{ccc}
		\includegraphics[scale=0.36]{./plots/ridge_svrg_versions_cadata_normalized_lambd_10_n_best.pdf}&		\includegraphics[scale=0.36]{./plots/ridge_svrg_versions_abalone_normalized_lambd_10_n_averages_neurips.pdf}&
		\includegraphics[scale=0.36]{./plots/ridge_svrg_versions_a1a_normalized_lambd_10_n_averages_neurips.pdf}\\
		(a)&(b)&(c)
	\end{tabular}
	
	\caption{(a) Comparison of methods on \texttt{cadata} dataset, we set the regularization constant $\lambda = 10/n$ and carefully chosen stepsizes. (b, c) Comparison of  \algname{SVRG}, \algname{L-SVRG}, \RR, \DS and \RS on \texttt{abalone} and \texttt{a1a} datasets. For each dataset we run 5 experiments and use average errors for each algorithm. }
	\label{fig:rr_vs_rrvr}
\end{figure*}







\begin{figure*}[t]
	\centering
	\begin{tabular}{ccc}
		\includegraphics[scale=0.36]{./plots/ridge_saga_svrg_rr_bodyfat_normalized_lambd_10_nneurips.pdf} &
		\includegraphics[scale=0.36]{./plots/ridge_saga_svrg_rr_bodyfat_normalized_lambd_1_nneurips.pdf} &
		\includegraphics[scale=0.36]{./plots/ridge_saga_svrg_rr_a1a_normalized_lambd_10_nneurips.pdf} 
		%   \includegraphics[scale=0.34]{../Experiments/plots/ridge_saga_svrg_rr_a1a_normalized_lambd_1_n.pdf} 
		
		
	\end{tabular}
	\caption{Comparison of \algname{SAGA}, \algname{RR-SAGA}, \RR, \algname{L-SVRG} and \algname{SVRG} with optimal stepsizes on \texttt{bodyfat} dataset with different regularization constants (on the left and middle) and \texttt{a1a} (on the right).}
	\label{fig:saga_svrg_rrvr}
\end{figure*}




In our experiments we solve the regularized ridge regression problem, which has the form \eqref{eq:main_finite_sum} with 
$$	f_i(x) = \frac{1}{2}\|A_{i,:}x-y_i\|^2 + \frac{\lambda}{2} \|x\|^2,   
$$	where $A \in \mathbb{R}^{n \times d}, y \in \mathbb{R}^n$ and $\lambda>0$ is a regularization parameter. Note that this problem is strongly convex and satisfies Assumption~\ref{L-smooth} with $$L = \max_i \|A_{i,:}\|^2 + \lambda $$ and $$\mu = \lambda_{\min} (A^\top A)/n + \lambda,$$ where $\lambda_{\min}$ is the smallest eigenvalue. To have a tighter bound on the $L$-smoothness constant we normalize the rows of the data matrix $A$. We use datasets from the open LIBSVM corpus~\citep{chang2011libsvm}. In the plots, the $x$-axis is the number of single-data gradient computations divided by $n$, and the $y$-axis is the normalized error of the argument $\|x_k - x_*\|^2/\|x_0 - x_*\|^2$. Details and additional experiments can be found in the appendix.


\begin{figure*}[t]
	\centering
	\begin{tabular}{ccc}
		\includegraphics[scale=0.36]{./plots/logistic_saga_svrg_rr_bodyfat_normalized_lambd_10_ngrad_neurips.pdf} &
		\includegraphics[scale=0.36]{./plots/logistic_saga_svrg_rr_bodyfat_normalized_lambd_1_ngrad_neurips.pdf} &
		
		\includegraphics[scale=0.36]{./plots/logistic_saga_svrg_rr_a1a_normalized_lambd_1_ngrad_neurips.pdf} \\
		\includegraphics[scale=0.36]{./plots/logistic_saga_svrg_rr_abalone_normalized_lambd_1_ngrad_neurips.pdf} &
		\includegraphics[scale=0.36]{./plots/logistic_saga_svrg_rr_a3a_normalized_lambd_1_ngrad_neurips.pdf} &
		\includegraphics[scale=0.36]{./plots/logistic_saga_svrg_rr_a5a_normalized_lambd_1_ngrad_neurips.pdf}
		%   \includegraphics[scale=0.34]{../Experiments/plots/ridge_saga_svrg_rr_a1a_normalized_lambd_1_n.pdf} 
		
	\end{tabular}
	\caption{Comparison of \algname{SAGA}, \algname{RR-SAGA}, \RR, \algname{L-SVRG} and \algname{SVRG} with optimal stepsizes on \texttt{bodyfat} dataset with different regularization constants (upper left and middle), \texttt{a1a} (upper right), \texttt{abalone} (lower left), \texttt{a3a} (lower middle) and \texttt{a5a} (lower right).}
	\label{fig:logistic_saga_svrg_rrvr}
\end{figure*}


\subsection{Rand-Reshuffle vs RR-SAGA}
In this experiment, we compare \RR and \algname{RR-SAGA} in an academic setting, i.e., we choose the stepsizes that are suggested by theory. For \RR we take the stepsize $$\gamma = \frac{1}{\sqrt{2} L n}$$ when $$n \geq \frac{2L}{\mu} \frac{1}{1 - \frac{\mu}{\sqrt{2}L}}$$ and $$\gamma = \frac{1}{2 \sqrt{2} L n} \sqrt{\frac{\mu}{L}}$$ otherwise, and for \algname{RR-SAGA} we take $$\gamma = \frac{\mu}{11 L^2 n}.$$ We can see that \RR outperforms \algname{RR-SAGA} in terms of the number of epochs and the number of gradient computations. Although the per-iteration cost of \RR is twice that of \algname{RR-SAGA}, the larger stepsize significantly impacts the total complexity. In addition, \algname{RR-SAGA} needs ${\cal O}(nd)$ extra storage to maintain the table of gradients, which makes the \algname{RR-SAGA} algorithm hard to use in the big data regime.

\subsection{Variance Reduced Random Reshuffling Algorithms}
This section compares the variance reduced algorithms with and without random reshuffling: \algname{SAGA}, \algname{RR-SAGA}, \algname{SVRG}, \algname{L-SVRG} and \RR. For each algorithm, we choose its optimal stepsize using grid search. To make the comparison fair, in \algname{SVRG} we set the length of the inner loop to $m=n$, and in \algname{L-SVRG} we set the control update probability to $1/n$. Also, we consider only the uniform sampling version of \algname{SVRG} and \algname{L-SVRG}. The results are shown in Figure~\ref{fig:saga_svrg_rrvr}. We can see that the variance reduced algorithms perform well in this experiment, and there is no obvious leader. However, note that \algname{SAGA} and \algname{RR-SAGA} need an additional ${\cal O}(nd)$ space to store the table of gradients, which is a serious issue in the big data regime.

\subsection{Different versions of \algname{SVRG}}
In this section, we compare different types of the \algname{SVRG} algorithm: \algname{SVRG}, \algname{L-SVRG}, \RR, \RS and \DS. For each algorithm we run five experiments with different random seeds with optimal stepsizes found by grid search, then we plot the best of the errors in Figure~\ref{fig:rr_vs_rrvr}. We can see that \RR on average outperforms the other algorithms, while in some random cases \algname{L-SVRG} can perform better. Also, we can see that \RS is better than \DS, which coincides with our theoretical findings. If sampling in each epoch is problematic, one can shuffle the data once before training.

\subsection{Experiments with logistic regression}

We also run experiments for the regularized logistic regression problem; i.e., for problem \eqref{eq:main_finite_sum} with
\begin{align*}
	f(x) = \frac{1}{n} \sum \limits_{i=1}^n  \log \left(1+\exp(-y_i a_i^\top x) \right)+ \frac{\lambda}{2} \|x\|^2.
\end{align*}
Note that the problem is $L$-smooth and $\mu$-strongly convex for $$L = \frac{1}{4 n} \lambda_{\max} (A^\top A) + \lambda,$$ and $\mu = \lambda$. In these experiments (also in the ridge regression experiments) when we choose optimal stepsize, we choose the best one among $\{\frac{1}{L}, \frac{1}{2L}, \frac{1}{3L}, \frac{1}{5L}, \frac{1}{10L}\}$. For the logistic regression we do not have an explicit formula for the optimum $x_*$ as in the ridge regression, thus in this case we compare the norm of the gradients instead. In Figure~\ref{fig:logistic_saga_svrg_rrvr} we can see the performance of the variance reduced algorithms: \algname{SAGA}, \algname{RR-SAGA}, \algname{SVRG}, \algname{L-SVRG} and \RR.




\section{Conclusion}	In this paper, we consider variance-reduced algorithms under random reshuffling. Our results are predominantly theoretical because these algorithms are already widely used in practice and perform excellently. We have proposed a new approach for analysis using inner product reformulation, which leads to better rates. Experimental results confirm our theoretical discoveries. Thus, we obtain a deeper theoretical understanding of how these algorithms work, and we hope that this will inspire researchers to further develop and analyze these methods. Understanding the variance reduction mechanism is essential for constructing accelerated versions of stochastic algorithms. We also believe that our theoretical results can be applied to other aspects of machine learning, leading to improvements in the state of the art for current or future applications. 

\bibliography{biblio.bib}
\clearpage
\end{document}
\appendix
\onecolumn
\part*{Appendix}


\tableofcontents

\clearpage



\section{Basic Facts}\label{seca1}

\subsection{Elementary Inequalities}

\begin{proposition}
	For all $a, b \in \mathbb{R}^{d}$  and $t > 0$ the following inequalities hold
	\begin{align}
		\langle a, b\rangle &\leq \frac{\|a\|^{2}}{2 t}+\frac{t\|b\|^{2}}{2},\notag \\
		\|a+b\|^{2} &\leq 2\|a\|^{2}+2\|b\|^{2},\label{eq:young}\\
		\frac{1}{2}\|a\|^{2}-\|b\|^{2} &\leq\|a+b\|^{2}.\notag
	\end{align}
	
\end{proposition}


\subsection{Convexity and smoothness}\label{sec:convex_smoothness}

\begin{proposition}
	\label{eq:prop3}
	Let $f : \mathbb{R}^d \to \mathbb{R}$ be continuously differentiable and let $L\geq 0$. Then the following statements are equivalent:
	\begin{itemize}
		\item $f$ is $L$-smooth,
		\item $2 D_{f}(x, y) \leq L\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\langle\nabla f(x)-\nabla f(y), x-y\rangle \leq L\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$.
	\end{itemize}
\end{proposition}

\begin{proposition}
	Let $f : \mathbb{R}^d \to \mathbb{R}$ be continuously differentiable and let $\mu\geq 0$. Then the following statements are equivalent:
	\begin{itemize}
		\item $f$ is $\mu$-strongly convex,
		\item $2 D_{f}(x, y) \geq \mu\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\langle\nabla f(x)-\nabla f(y), x-y\rangle \geq \mu\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$.
	\end{itemize}
\end{proposition}
Note that the $\mu = 0$ case reduces to convexity.

\begin{proposition}
	Let $f : \mathbb{R}^d \to \mathbb{R}$ be continuously differentiable and $L > 0$. Then the following statements are equivalent:
	\begin{itemize}
		\item $f$ is convex and $L$-smooth
		\item $0 \leq 2 D_{f}(x, y) \leq L\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\frac{1}{L}\|\nabla f(x)-\nabla f(y)\|^{2} \leq 2 D_{f}(x, y) \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\frac{1}{L}\|\nabla f(x)-\nabla f(y)\|^{2} \leq\langle\nabla f(x)-\nabla f(y), x-y\rangle \text { for all } x, y \in \mathbb{R}^{d}$.
	\end{itemize}
\end{proposition}

\begin{proposition}[Jensen's inequality]
	Let $f: \mathbb{R}^{d} \to \mathbb{R}$ be a convex function, $x_1,\ldots,x_m \in \mathbb{R}^{d}$, and $\lambda_1,\ldots, \lambda_m$ be nonnegative real numbers adding up to 1. Then  
	$$f\left(\sum_{i=1}^{m} \lambda_{i} x_{i}\right) \leq \sum_{i=1}^{m} \lambda_{i} f\left(x_{i}\right).$$
\end{proposition}




\subsection{From convergence rate to iteration complexity}


We implicitly use the following standard result to derive iteration complexity results in our theorems. We include the statement and proof, for completeness.


\begin{lemma}\label{lem:itercomplex}
	Consider a randomized algorithm producing a sequence of random iterates $\{x_t\}_{t\geq 0}$. Let $S_t$ be some nonnegative function of $x_t$ (example: $S_t=\|x_t-x_*\|^2$). Assume that there exists $q\in (0,1)$ such that the following inequality holds for  all $t\geq 0$:
	\begin{align}
		\mathbb{E} \left[ S_t \right] \leq \left( 1 - q \right)^t S_0. \label{eq:nuh8g9f8d_98y8fhdf}
	\end{align}
	Fix any $\varepsilon>0$. Then as long as 
	$$T \geq \frac{1}{q} \ln \left(\frac{1}{\varepsilon}\right),$$ 
	we have $$\mathbb{E} \left[ S_T  \right] \leq \varepsilon S_0.
	$$
\end{lemma}

\begin{proof}
	Since $e^{q} \geq 1+q$ for all $q\in \R$, we have $e^{-q} \geq 1-q$ for all $ q \in(0,1).$ Since the logarithm is an increasing function over $\mathbb{R}_{+}$, it follows that
	$
	-q \geq \ln (1-q)$ for all $q \in(0,1)$. Therefore, the inequality
	\begin{align*}
		-t q \geq t \ln \left(1-q\right)
	\end{align*}
	holds for all	 $t\geq 0$ and all $q \in(0,1)$. 		Now if we have $T\geq \frac{1}{q}\ln\left(\frac{1}{\varepsilon}\right),$
	which is equivalent to
	$-T\cdot q \leq \ln (\varepsilon),$
	we obtain
	$T \ln \left(1-q\right) \leq \ln (\varepsilon).$
	Taking exponential on both sides, we get	\begin{equation} \label{eq:8yfd98gf8df}0<\left(1-q\right)^{T} \leq \varepsilon.\end{equation}
	Finally, we have 
	$$\mathbb{E} \left[ S_T \right] \overset{\eqref{eq:nuh8g9f8d_98y8fhdf}}{ \leq} \left(1-q\right)^{T} S_0 \overset{\eqref{eq:8yfd98gf8df}}{\leq} \varepsilon S_0.$$
	
\end{proof}
\begin{lemma}
Consider a randomized algorithm producing a sequence of random iterates $x_t$.
	Let $S_t$ be some nonnegative function of $x_t$ (example: $S_t=\left\|x_t-x_*\right\|^2$ ).
	Assume that there exist $q \in(0,1)$ and $\beta>0$ such that the following inequality holds for all $t \geq 0$ :
	$$
	\mathbb{E}\left[S_t\right] \leq(1-q)^{\beta t} S_0.
	$$
	Fix any $\varepsilon>0$. Then as long as
	$$
	T \geq \frac{1}{q \beta} \ln \left(\frac{1}{\varepsilon}\right)
	$$
	we have
	$$
	\mathbb{E}\left[S_T\right] \leq \varepsilon S_0.
	$$
\end{lemma}
\begin{proof}
	Since $e^q \geq 1+q$ for all $q \in \mathbb{R}$, we have $e^{-q} \geq 1-q$ for all $q \in(0,1)$. Since logarithm is an increasing function over $\mathbb{R}_{+}$, it follows that $-q \geq \ln (1-q)$ for all $q \in(0,1)$. Therefore, the inequality
	$-\beta t q \geq \beta t \ln (1-q)$
	holds for all $t \geq 0$ and all $q \in(0,1)$.
	Now, if we have $T \geq \frac{1}{\beta q} \ln \left(\frac{1}{\varepsilon}\right)$, which is equivalent to $-T \beta \cdot q \leq \ln (\varepsilon)$, we obtain $\beta T \ln (1-q) \leq \ln (\varepsilon)$.
	Taking exponential on both sides, we get
	$$
	0<(1-q)^{\beta T} \leq \varepsilon .
	$$
	Finally, we have
	$$
	\mathbb{E}\left[S_T\right] \leq(1-q)^{\beta T} S_0 \leq \varepsilon S_0 .
	$$
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%

\clearpage	
\section{Proof of Proposition~\ref{prop-reform}}
Assume that each $f_i$ is $\mu$-strongly convex (resp.\ convex) and $L$-smooth. Then the functions $$f^t\eqdef \frac{1}{n}\sum_{i=1}^n f_i^t,$$ and  
\begin{equation}
	f_i^t (x) \eqdef f_i(x)+\left\langle a_i^t,x \right\rangle,
\end{equation}
are $\mu$-strongly convex (resp.\ convex) and $L$-smooth.
\begin{proof}
	Let us compute Bregman divergence with respect to the new function $f^t_i(x):$
	\begin{align*}
		D_{f^t_i}(x, y) = f^t_i(x)-f^t_i(y)-\langle\nabla f^t_i(y), x-y\rangle.
	\end{align*}
	Note that $\nabla f^t_i(y) = \nabla f_i(y)+a_i^t$. Now we have 
	\begin{align*}
		D_{f^t_i}(x, y) &= f^t_i(x)-f^t_i(y)-\langle\nabla f^t_i(y), x-y\rangle\\
		&= f_i(x)+\left\langle a_i^t, x \right\rangle - \left( f_i(y)+\left\langle a_i^t, y \right\rangle\right) - \langle \nabla f_i(y)+a_i^t, x-y\rangle\\
		&=f_i(x)+\left\langle a_i^t, x \right\rangle - f_i(y)-\left\langle a_i^t, y \right\rangle - \langle \nabla f_i(y), x-y\rangle - \langle a_i^t, x-y \rangle\\
		&=f_i(x)+\left\langle a_i^t, x \right\rangle - f_i(y)-\left\langle a_i^t, y \right\rangle - \langle \nabla f_i(y), x-y\rangle - \langle a_i^t, x \rangle+\langle a_i^t, y \rangle\\
		&=f_i(x) - f_i(y) - \langle \nabla f_i(y), x-y\rangle\\
		& = D_{f_i}(x, y).
	\end{align*}
	Since the Bregman divergence is not changed, the new function $f^t_i(x)$ has the same properties ($\mu$-strong convexity or convexity and $L$-smoothness) as the initial function $f_i(x)$. 
\end{proof}




%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%

\clearpage	
\section{Proof of Lemma~\ref{main_lemma_lemma}}
\begin{proof}
We start from the definition of $\left(\sigma_*^t\right)^2$ and of $a_i^t$ from equation~\eqref{eq:a_t^i}:
$$
\left(\sigma_*^t\right)^2:=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i^t\left(x_*\right)\right\|^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_*\right)-\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right)\right\|^2.
$$
Using the fact that $\nabla f\left(x_*\right)=0$ we have
$$
\left(\sigma_*^t\right)^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_*\right)-\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right)-\nabla f\left(x_*\right)\right\|^2.
$$
Applying Young's inequality~\eqref{eq:young} we obtain
$$
\left(\sigma_*^t\right)^2 \leq \frac{1}{n} \sum_{i=1}^n\left(2\left\|\nabla f_i\left(y_t\right)-\nabla f_i\left(x_*\right)\right\|^2+2\left\|\nabla f\left(y_t\right)-\nabla f\left(x_*\right)\right\|^2\right).
$$
Now we apply Proposition 5 for the squared norms of gradient differences:
$$
\left(\sigma_*^t\right)^2 \leq \frac{1}{n} \sum_{i=1}^n 4 L D_{f_i}\left(y_t, x_*\right)+\frac{1}{n} \sum_{i=1}^n 4 L D_f\left(y_t, x_*\right).
$$
We need to use the fact that $\frac{1}{n} \sum_{i=1}^n D_{f_i}\left(y_t, x_*\right)=D_f\left(y_t, x_*\right)$. It is true since 
$f(x)=\frac{1}{n} \sum_{i=1}^n f_i(x)$. So, $$\left(\sigma_*^t\right)^2 \leq 4 L D_f\left(y_t, x_*\right)+4 L D_f\left(y_t, x_*\right)=8 L D_f\left(y_t, x_*\right).$$
Finally, we apply the $L$-smoothness property from Proposition \ref{eq:prop3}:
$$
\left(\sigma_*^t\right)^2 \leq 4 L^2\left\|y_t-x_*\right\|^2.
$$
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%

\clearpage	
\section{Analysis of  Rand-Shuffle and Rand-Reshuffle}

\subsection{Proof of Theorems~\ref{th1} and~\ref{th2} }
\label{D.1}
%	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption~\ref{L-smooth}
%	holds. Then provided the stepsize satisfies $\gamma \leq \frac{1}{2\sqrt{2} L n}\sqrt{\frac{\mu}{L}},$
%	the iterates generated by \RR (Algorithm~\ref{alg:RRSVRG}) or by \RS (Algorithm~\ref{alg:SOSVRG}) satisfy
%	\begin{align*}
%	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
%	\end{align*}
%	
%	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex and Assumption~\ref{L-smooth} holds. Additionally assume we are in the ``big data'' regime characterized by $n \geq \frac{2 L}{\mu} \cdot \frac{1}{1-\frac{\mu}{\sqrt{2} L}}$. Then provided the stepsize satisfies $\gamma \leq \frac{1}{\sqrt{2}Ln},$
%	the iterates generated by \RR (Algorithm~\ref{alg:RRSVRG}) or by \RS (Algorithm~\ref{alg:SOSVRG}) satisfy
%	\begin{align*}
%	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
%	\end{align*}

\begin{proof}
	We start from Lemma 3 in paper of~\citet{mishchenko2020random}. 
	
\begin{lemma}
Assume that functions $f_1, \ldots, f_n$ are convex and that Assumption~\ref{L-smooth} is satisfied. If
	Random Reshuffling or Shuffle-Once is run with a stepsize satisfying
	$\gamma\leq\frac{1}{\sqrt{2}Ln}$, then
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f(x_{*})\right]+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}.
	\end{align*}
\end{lemma}

 The proof of the analogous inequality with conditional expectations follows that of \citet{mishchenko2020random} with only very minor changes. We provide it below.
	
	We denote by $\mathcal{F}_t$ the $\sigma$-algebra generated by the collection of $(\mathcal{X} \times \mathcal{Y})$-valued random variables $\left(x_0, y_0\right), \ldots,\left(x_t, y_t\right)$, for every $t \geq 0$. In this work, we consider unbiased random estimates. If the method does not depend on $y_\tau$ we can still use such notation because of the independence property for conditional expectations.
	We define the forward per-epoch deviation over the $t$-th epoch, $\mathcal{V}_t$, as
	$$
	\mathcal{V}_t=\sum_{i=0}^{n-1}\left\|x_t^i-x_{t+1}\right\|^2
	$$
	
	Lemma 2. Consider the iterates of Random Reshuffling or Shuffle-Once. If the functions $f_1, \ldots, f_n$ are convex and Assumption~\ref{L-smooth} is satisfied, then
	$$
	\mathbb{E}\left[\mathcal{V}_t \mid \mathcal{F}_t\right] \leq 4 \gamma^2 n^2 L \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+\frac{1}{2} \gamma^2 n^2 \sigma_*^2,
	$$
	where $\mathcal{V}_t$ is defined above, and $\sigma_*^2$ is the variance at the optimum given by $\sigma_*^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_*\right)\right\|^2$.
	We will follow the steps from \citet{mishchenko2020random}.
\begin{proof} For any fixed $k \in \{0, \ldots, n-1\}$, by definition of $x_t^k$ and $x_{t+1}$ (according to Algorithm 1 or 2 of \citet{mishchenko2020random}) we get the decomposition
	$$
	x_t^k-x_{t+1}=\gamma \sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_t^i\right)=\gamma \sum_{i=k}^{n-1}\left(\nabla f_{\pi_i}\left(x_t^i\right)-\nabla f_{\pi_i}\left(x_*\right)\right)+\gamma \sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)
	$$
	Applying Young's inequality to the sums above yields
	$$
	\left\|x_t^k-x_{t+1}\right\|^2 \leq 2 \gamma^2\left\|\sum_{i=k}^{n-1}\left(\nabla f_{\pi_i}\left(x_t^i\right)-\nabla f_{\pi_i}\left(x_*\right)\right)\right\|^2+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
	$$
	Using Jensen's inequality we have
	$$
	\left\|x_t^k-x_{t+1}\right\|^2 \leq 2 \gamma^2 n \sum_{i=k}^{n-1}\left\|\nabla f_{\pi_i}\left(x_t^i\right)-\nabla f_{\pi_i}\left(x_*\right)\right\|^2+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
	$$
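	Using convexity and $L$-smoothness of each $f_i$ (see the characterization of convex and $L$-smooth functions in Section~\ref{sec:convex_smoothness}) we have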
	$$
	\left\|x_t^k-x_{t+1}\right\|^2 \leq 4 \gamma^2 L n \sum_{i=k}^{n-1} D_{f_{\pi_i}}\left(x_*, x_t^i\right)+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
	$$

Further, we have
$$
\left\|x_t^k-x_{t+1}\right\|^2 \leq 4 \gamma^2 L n \sum_{i=0}^{n-1} D_{f_{\pi_i}}\left(x_*, x_t^i\right)+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
$$
Summing up and taking conditional expectations leads to
$$
\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|x_t^k-x_{t+1}\right\|^2 \mid \mathcal{F}_t\right] \leq 4 \gamma^2 L n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+2 \gamma^2 \sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2 \mid \mathcal{F}_t\right]
$$
Since $\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2 \mid \mathcal{F}_t\right]$ does not depend on $\mathcal{F}_t$ but only on permutations we have
$$
\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|x_t^k-x_{t+1}\right\|^2 \mid \mathcal{F}_t\right] \leq 4 \gamma^2 L n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+2 \gamma^2 \sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]
$$
We now bound the second term on the right-hand side. First, using Lemma 1 from \citet{mishchenko2020random}, we get
$\mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]=(n-k)^2 \mathbb{E}\left[\left\|\frac{1}{n-k} \sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]=(n-k)^2 \frac{k}{(n-k)(n-1)} \sigma_*^2=\frac{k(n-k)}{n-1} \sigma_*^2$.
Next, by summing this for $k$ from 0 to $n-1$, we obtain
$$
\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]=\sum_{k=0}^{n-1} \frac{k(n-k)}{n-1} \sigma_*^2=\frac{1}{6} n(n+1) \sigma_*^2 \leq \frac{n^2 \sigma_*^2}{4}
$$
where in the last step we also used $n \geq 2$. The result follows.
\end{proof}
Let us provide an analogue of Lemma 3 from \citet{mishchenko2020random}.

Lemma $3^*$. Assume that the functions $f_1, \ldots, f_n$ are convex and that Assumption 1 is satisfied. If Random Reshuffling (Algorithm 1) or Shuffle-Once (Algorithm 2) is run with a stepsize satisfying $\gamma \leq \frac{1}{\sqrt{2} L n}$, then
$$
\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f_* \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2 \sigma_*^2}{2}
$$
\begin{proof}
Define the sum of gradients used in the $t$-th epoch as $g_t=\sum_{i=0}^{n-1} \nabla f_{\pi_i}\left(x_t^i\right)$. We will use $g_t$ to relate the iterates $x_t$ and $x_{t+1}$. By definition of $x_{t+1}$, we can write
$$
x_{t+1}=x_t^n=x_t^{n-1}-\gamma \nabla f_{\pi_{n-1}}\left(x_t^{n-1}\right)=\ldots=x_t^0-\gamma \sum_{i=0}^{n-1} \nabla f_{\pi_i}\left(x_t^i\right)
$$
Further, since $x_t^0=x_t$, we see that $x_{t+1}=x_t-\gamma g_t$, which leads to
$$
\left\|x_t-x_*\right\|^2=\left\|x_{t+1}+\gamma g_t-x_*\right\|^2=\left\|x_{t+1}-x_*\right\|^2+2 \gamma\left\langle g_t, x_{t+1}-x_*\right\rangle+\gamma^2\left\|g_t\right\|^2
$$
Since $\gamma^2\left\|g_t\right\|^2 \geq 0$ we have
$$
\left\|x_t-x_*\right\|^2 \geq\left\|x_{t+1}-x_*\right\|^2+2 \gamma\left\langle g_t, x_{t+1}-x_*\right\rangle=\left\|x_{t+1}-x_*\right\|^2+2 \gamma \sum_{i=0}^{n-1}\left\langle\nabla f_{\pi_i}\left(x_t^i\right), x_{t+1}-x_*\right\rangle
$$

Observe that for any $i$, we have the following decomposition
$$
\left\langle\nabla f_{\pi_i}\left(x_t^i\right), x_{t+1}-x_*\right\rangle=\left(f_{\pi_i}\left(x_{t+1}\right)-f_{\pi_i}\left(x_*\right)\right)+D_{f_{\pi_i}}\left(x_*, x_t^i\right)-D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right)
$$
Summing the first quantity over $i$ from 0 to $n-1$ gives
$$
\sum_{i=0}^{n-1}\left(f_{\pi_i}\left(x_{t+1}\right)-f_{\pi_i}\left(x_*\right)\right)=n\left(f\left(x_{t+1}\right)-f_*\right)
$$
Now, we can bound the third term in the decomposition above using $L$-smoothness as follows:
$$
D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right) \leq \frac{L}{2}\left\|x_{t+1}-x_t^i\right\|^2
$$
By summing the right-hand side over $i$ from 0 to $n-1$ we get the forward deviation over an epoch $\mathcal{V}_t$, which we bound by the analogue of Lemma 2 to get
$$
\sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right) \mid \mathcal{F}_t\right] \leq \frac{L}{2} \mathbb{E}\left[\mathcal{V}_t \mid \mathcal{F}_t\right] \leq 2 \gamma^2 L^2 n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+\frac{\gamma^2 L n^2 \sigma_*^2}{4}
$$
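Therefore, we can lower-bound the sum of the second and the third term as
$$
\begin{aligned}
	& \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right)-D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right) \mid \mathcal{F}_t\right] \geq \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right] \\
	& -2 \gamma^2 L^2 n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]-\frac{\gamma^2 L n^2 \sigma_*^2}{4} \\
	& =\left(1-2 \gamma^2 L^2 n^2\right) \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]-\frac{\gamma^2 L n^2 \sigma_*^2}{4} \geq-\frac{\gamma^2 L n^2 \sigma_*^2}{4},
\end{aligned}
$$
where the last inequality uses $\gamma \leq \frac{1}{\sqrt{2} L n}$, so that $1-2 \gamma^2 L^2 n^2 \geq 0$, together with the nonnegativity of the Bregman divergence of a convex function.
Taking the conditional expectation of the lower bound on $\left\|x_t-x_*\right\|^2$ derived above and plugging in the decomposition together with the last bound, we obtain
$$
\left\|x_t-x_*\right\|^2 \geq \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right]+2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f_* \mid \mathcal{F}_t\right]-\frac{\gamma^3 L n^2 \sigma_*^2}{2},
$$
and rearranging the terms gives the claim.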

\end{proof}


Proof. We start from the analogue of Lemma 3 of \citet{mishchenko2020random}, which we proved above:
$$
\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_*\right) \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2 \sigma_*^2}{2}
$$
Now we can apply this inequality to the reformulated problem (2). Using strong convexity, we obtain
$$
\begin{aligned}
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_*\right) \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2\left(\sigma_*^t\right)^2}{2} \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-\gamma n \mu \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2\left(\sigma_*^t\right)^2}{2}
\end{aligned}
$$
Since we update $y_t=x_t$ after each epoch, this leads to
$$
\begin{aligned}
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_t-x_*\right\|^2+\frac{\gamma^3 L n^2\left(\sigma_*^t\right)^2}{2}\right) \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_t-x_*\right\|^2+\frac{\gamma^3 L n^2 \cdot 4 L^2\left\|y_t-x_*\right\|^2}{2}\right) \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_t-x_*\right\|^2+2 \gamma^3 n^2 L^3\left\|x_t-x_*\right\|^2\right) \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(1+2 \gamma^3 n^2 L^3\right)\left\|x_t-x_*\right\|^2
\end{aligned}
$$
We can use the tower property of conditional expectation to obtain
$$
\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2\right] \leq \frac{1+2 \gamma^3 L^3 n^2}{1+\gamma \mu n} \mathbb{E}\left[\left\|x_t-x_*\right\|^2\right]
$$
Since $\gamma \leq \frac{1}{2 \sqrt{2} L n} \sqrt{\frac{\mu}{L}}$, $n \geq 1$ and $\mu \leq L$, we have
$$
\frac{1}{4 n}+\frac{1}{4 \sqrt{2}} \frac{\mu}{L} \sqrt{\frac{\mu}{L}} \leq \frac{1}{2}.
$$
Multiplying both sides by $\mu$, we obtain
$$
\frac{1}{4 n} \mu+\frac{1}{4 \sqrt{2}} \frac{\mu}{L} \sqrt{\frac{\mu}{L}} \mu \leq \frac{\mu}{2},
$$
which can be written as
$$
2 \cdot \frac{1}{8 L^2 n^2} \cdot \frac{\mu}{L} L^3 n+\frac{1}{2 \sqrt{2} L n} \sqrt{\frac{\mu}{L}} \cdot \frac{n \mu^2}{2} \leq \frac{\mu}{2}.
$$
We continue to derive inequalities:
$$
\begin{aligned}
	& 2 \gamma^2 L^3 n+\frac{\gamma n \mu^2}{2} \leq \frac{\mu}{2} \\
	& 2 \gamma^2 L^3 n \leq \frac{\mu}{2}-\frac{\gamma n \mu^2}{2} \\
	& 2 \gamma^2 L^3 n^2 \leq \frac{n \mu}{2}-\frac{\gamma n^2 \mu^2}{2} \\
	& 1+2 \gamma^3 L^3 n^2 \leq 1+\frac{\gamma n \mu}{2}-\frac{\gamma^2 n^2 \mu^2}{2}
\end{aligned}
$$
Finally, we obtain
$$
\frac{1+2 \gamma^3 L^3 n^2}{1+\gamma \mu n} \leq 1-\frac{\gamma n \mu}{2}
$$
Plugging this inequality into $\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2\right] \leq \frac{1+2 \gamma^3 L^3 n^2}{1+\gamma \mu n} \mathbb{E}\left[\left\|x_t-x_*\right\|^2\right]$, we unroll the recursion and obtain the final result:
$$
\mathbb{E}\left[\left\|x_T-x_*\right\|^2\right] \leq\left(1-\frac{\gamma n \mu}{2}\right)^T\left\|x_0-x_*\right\|^2
$$
\end{proof}



\subsection{Proof of Theorem~\ref{th3}}
%	Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex and Assumption~\ref{L-smooth} holds. Fix constant $0<\delta<1$. If the stepsize satisfies $\gamma\leq\frac{\delta}{L}\sqrt{\frac{\mu}{2nL}}$ and if number of functions is sufficiently big, $$n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1},$$ then the iterates generated by \RR (Algorithm~\ref{alg:RRSVRG}) or by \RS (Algorithm~\ref{alg:SOSVRG}) satisfy
%	\begin{align*}
%	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( \left(1 - \gamma \mu\right)^n +\delta^2 \right)^T \|x_0 - x_*\|^2.
%	\end{align*}

% \begin{proof}

We start from the conditional analogue of Theorem 1 in \citet{mishchenko2020random} (similarly to Section \ref{D.1}), which states that
$$\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t \right] \leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+2 \gamma^{2} \sigma_{\text {Shuffle }}^{2}\left(\sum_{i=0}^{n-1}(1-\gamma \mu)^{i}\right).$$
Using Proposition 1 from \citet{mishchenko2020random}, which says that
$$\frac{\gamma \mu n}{8} \sigma_{*}^{2} \leq \sigma_{\text {Shuffle }}^{2} \leq \frac{\gamma L n}{4} \sigma_{*}^{2},$$
we get 
\begin{align*} \mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t \right] & \leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+ \frac{\gamma^3 L n}{2} \sigma_{*}^{2}\left(\sum_{i=0}^{n-1}(1-\gamma \mu)^{i}\right) \\
	& \leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+ \frac{\gamma^2 L n}{2\mu} \sigma_{*}^{2}. \end{align*}

Now we can apply Lemma~\ref{main_lemma_lemma} and Reformulation \ref{reform}. Using $y_t = x_t$ we have the following inequality:
\begin{align*}
	\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t \right] &\leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+ \frac{2\gamma^2 L^3 n}{\mu}\|x_t - x_*\|^2\\
	&=\left((1-\gamma \mu)^{n} + \frac{2\gamma^2 L^3 n}{\mu}\right)\|x_t - x_*\|^2.
\end{align*}
Applying the tower property, we get
\begin{align*}
	\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] &\leq\left((1-\gamma \mu)^{n} + \frac{2\gamma^2 L^3 n}{\mu}\right)\mathbb{E}\left[\|x_t - x_*\|^2\right],
\end{align*}
and after	unrolling this recursion, we get
\begin{align*}
	\mathbb{E}\left[\left\|x_{T}-x_{*}\right\|^{2}\right] &\leq\left((1-\gamma \mu)^{n} + \frac{2\gamma^2 L^3 n}{\mu}\right)^T\mathbb{E}\left[\|x_0 - x_*\|^2\right]\\
	&\leq\left((1-\gamma \mu)^{n} + \frac{\delta^2}{L^2}\frac{\mu}{2nL}\frac{2 L^3 n}{\mu}\right)^T\mathbb{E}\left[\|x_0 - x_*\|^2\right]\\
	&\leq\left((1-\gamma \mu)^{n} +\delta^2\right)^T\mathbb{E}\left[\|x_0 - x_*\|^2\right],
\end{align*}
where we used the stepsize restriction $\gamma\leq\frac{\delta}{L}\sqrt{\frac{\mu}{2nL}}$. In order for this to lead to convergence, we need to assume that
$(1-\gamma \mu)^{n} +\delta^2 <1.$	This is satisfied, for example, if $n$ is large enough. In particular, this holds when
$$n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1}.$$


Finally, 
using the additional assumption 
$\delta^2 \leq (1-\gamma\mu)^{\frac{n}{2}}\left(1-(1-\gamma\mu)^{\frac{n}{2}}\right),$
we get
\begin{align*}
	\delta^2+(1-\gamma\mu)^{n} \leq (1-\gamma\mu)^{\frac{n}{2}}.
\end{align*} 
Now we can apply Theorem~\ref{th3} and get
\begin{align*}
	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq  \left(1 - \gamma \mu\right)^{\frac{nT}{2}} \|x_0 - x_*\|^2.
\end{align*}
Finally, we apply Lemma~\ref{lem:itercomplex} with $\gamma = \frac{\delta}{L}\sqrt{\frac{\mu}{2nL}}$ and get iteration complexity
$
T = \mathcal{O}\left(\kappa\sqrt{\frac{\kappa}{n}}\log \left(\frac{1}{\varepsilon}\right)\right).
$

% \end{proof}



\subsection{Proof of Theorem~\ref{th4}}
Suppose the functions $f_1, f_2, \ldots, f_n$ are convex and Assumption~\ref{L-smooth} holds. Then for \RR  or \RS  with stepsize $\gamma \leq \frac{1}{\sqrt{2}Ln},$ the average iterate $\hat{x}_{T} \eqdef \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies 
\begin{align*}
	\mathbb{E}\left[f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right] \leq \frac{3\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma n T}.
\end{align*}
\begin{proof}
	We start with conditional analogue of Lemma 3 from~\citet{mishchenko2020random} (similarly to Section \ref{D.1}), which says that
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t\right] \leq \left\|x_{t}-x_{*}\right\|^{2} -2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\mid \mathcal{F}_t\right]+\frac{\gamma^{3} Ln^{2} \sigma_{*}^{2}}{2}.\end{align*}
	
	Applying this inequality to the reformulated problem~\eqref{reform}, we get
	\begin{equation}\label{eq:bug87gdfd_8y9fd}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\mid \mathcal{F}_t\right] \leq \left\|x_{t}-x_{*}\right\|^{2}-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t\right]+\frac{\gamma^{3} L n^{2} \left(\sigma_{*}^t\right)^{2}}{2}.
	\end{equation}
	Using Lemma~\ref{main_lemma_lemma} and the fact that $y_t=x_t$ and $f=f^t$, we get  
	\begin{equation}\label{eq:u987g9fdf}\left(\sigma_*^t\right)^2 \leq 8LD_{f^t}(x_t,x_*) = 8LD_{f}(x_t,x_*) =8L(f(x_t) - f(x_*)),\end{equation}
	where the last identity follows from Proposition~\ref{prop-reform}.
	
	
	Plugging \eqref{eq:u987g9fdf} into \eqref{eq:bug87gdfd_8y9fd}, we obtain
	\begin{align*}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\mid \mathcal{F}_t\right] \leq \left\|x_{t}-x_{*}\right\|^{2}-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t\right]+\frac{\gamma^{3} L n^{2}}{2} \cdot 8L(f(x_t) - f(x_*)),
	\end{align*}
	which after using the tower property turns into
	\begin{align*}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]+4\gamma^{3} L^2 n^{2} \mathbb{E}\left[f(x_t) - f(x_*)\right].
	\end{align*}
	Now we subtract $4\gamma^3L^2n^2\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]$ from both sides:
	\begin{align*}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] - 4\gamma^3L^2n^2\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] & \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\\
	&	\qquad+4\gamma^{3} L^2 n^{2} \mathbb{E}\left[f(x_t) - f(x_*)\right]\\
		&\qquad- 4\gamma^3L^2n^2\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\\
		\left(2 \gamma n - 4\gamma^3L^2n^2\right)\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] &\leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\\
	&	\qquad+ 4\gamma^{3} L^2 n^{2}\left( \mathbb{E}\left[f(x_t) - f(x_*)\right] - \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\right)\\
		2 \gamma n\left(1 - 2\gamma^2L^2n\right)\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]& \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\\
	&	\qquad + 4\gamma^{3} L^2 n^{2}\left( \mathbb{E}\left[f(x_t) - f(x_*)\right] - \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\right).
	\end{align*}
	Summing these inequalities for $t=0,1,\ldots,T-1$ gives
	\begin{align*}
		2 \gamma n\left(1 - 2\gamma^2L^2n\right)\sum_{t=0}^{T-1}\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] &\leq \sum_{t=0}^{T-1}\left(\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\right)\\
		&\qquad + 4\gamma^{3} L^2 n^{2}\sum_{t=0}^{T-1}\left( \mathbb{E}\left[f(x_t) - f(x_*)\right] - \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\right)\\
		&= \mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right] - \mathbb{E}\left[\left\|x_{T}-x_{*}\right\|^{2}\right]\\
		&\qquad+4\gamma^{3} L^2 n^{2}\mathbb{E}\left[f\left(x_{0}\right)-f\left(x_{*}\right)\right] - 4\gamma^{3} L^2 n^{2}\mathbb{E}\left[f\left(x_{T}\right)-f\left(x_{*}\right)\right]\\
		&\leq \mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right] +4\gamma^{3} L^2 n^{2}\mathbb{E}\left[f\left(x_{0}\right)-f\left(x_{*}\right)\right]\\
		&\leq \mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right] + 2\gamma^{3} L^3 n^{2}\mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right]\\
		&= (1+2\gamma^{3} L^3 n^{2})\mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right],
	\end{align*}
	and dividing both sides by $2 \gamma n\left(1 - 2\gamma^2L^2n\right)T$, we get
	\begin{align*}
		\frac{1}{T}\sum_{t=0}^{T-1}\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] &\leq \frac{1+2\gamma^{3} L^3 n^{2}}{1 - 2\gamma^2L^2n}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
	\end{align*}
	Using the convexity of $f$, the average iterate $\hat{x}_{T} \stackrel{\text { def }}{=} \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies
	\begin{align*}
		\mathbb{E}\left[f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right] \leq \frac{1}{T} \sum_{t=1}^{T} \mathbb{E}\left[f\left(x_{t}\right)-f\left(x_{*}\right)\right]\leq \frac{1+2\gamma^{3} L^3 n^{2}}{1 - 2\gamma^2L^2n}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
	\end{align*}
	Let us show that 
	$$\frac{1+2\gamma^{3} L^3 n^{2}}{1 - 2\gamma^2L^2n} \leq 3.$$
	Applying $\gamma\leq \frac{1}{\sqrt{2}Ln}$ we have 
	$$\frac{1+2\frac{1}{2\sqrt{2}L^3n^3}L^3n^2}{1-2\frac{1}{2L^2n^2}L^2n} = \frac{1+\frac{1}{\sqrt{2}n}}{1-\frac{1}{n}}\leq 3.$$
	The last inequality is equivalent to $4n \geq 6+\sqrt{2}$, which holds since $n \in \mathbb{N}$ and $n>1$. Finally, we have 
	$$ \mathbb{E}\left[f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right] \leq \frac{3\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.$$
\end{proof}


%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%
\clearpage
\subsection{Proof of Theorems~\ref{non-convex-1} and~\ref{PL}}
We provide analysis for non-convex settings.

Let us recall our reformulation:
$$f(x)=\frac{1}{n} \sum_{i=1}^{n} f_{i}(x)=\frac{1}{n} \sum_{i=1}^{n}\left(f_{i}(x)+\left\langle a_{t}^{i}, x\right\rangle\right):=\frac{1}{n} \sum_{i=1}^{n} f_{i}^{t}(x),$$

where $f_{i}^{t}(x):=f_{i}(x)+\left\langle a_{t}^{i}, x\right\rangle$ and $\sum_{i=1}^{n} a_{t}^{i}=0$. Note that
$$
\nabla f_{i}^{t}(x)=\nabla f_{i}(x)+a_{t}^{i}.
$$ 
In particular, we choose
$$a_{t}^{i}:=-\nabla f_{\pi_{i}}\left(y_{t}\right)+\nabla f\left(y_{t}\right).$$
Finally, we have 
$$\nabla f_{\pi_i}^{t}(x) = \nabla f_{\pi_i}(x)-\nabla f_{\pi_{i}}\left(y_{t}\right)+\nabla f\left(y_{t}\right).$$

Now we need to establish an analogue of Lemma 1 for gradient variance. Let us define 
$$ \sigma^2(x_t) = \frac{1}{n} \sum_{i=1}^{n}\left\Vert\nabla f_{i}(x_t)-\nabla f(x_t)\right\Vert^{2}.$$
\begin{lemma} If we apply the linear perturbation reformulation, then the gradient variance of the reformulated problem $\left(\sigma_t^2\right)$ is equal to zero.
\end{lemma}

\begin{proof}
	
	$$
	\sigma_t^2\left(x_t\right)=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i^t\left(x_t\right)-\nabla f\left(x_t\right)\right\|^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_t\right)-\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right)-\nabla f\left(x_t\right)\right\|^2
	$$
	In Algorithm \ref{alg:GENERIC-SVRG} (\RR) we set $x_t=y_t$, and hence we have
	$$
	\sigma_t^2\left(x_t\right)=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_t\right)-\nabla f_i\left(x_t\right)+\nabla f\left(x_t\right)-\nabla f\left(x_t\right)\right\|^2=0.
	$$
	

\end{proof}

Suppose that Assumption \ref{L-smooth} holds. Then for Algorithm \RR run for $T$ epochs with a stepsize $\gamma\leq \frac{1}{2Ln}$ we have 
$$\squeeze\frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \frac{4(f(x_0) - f_*)} {\gamma n T}.$$
Choose $\gamma = \frac{1}{2nL}$. Then the mean of gradient norms satisfies 
$ \frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \varepsilon^2 $
provided the number of iterations satisfies 
$T = \mathcal{O}\left( \frac{8 \delta_{0} L }{\varepsilon^{2}}\right).$

Suppose that Assumption \ref{L-smooth} holds and $f$ satisfies the Polyak-Łojasiewicz inequality with $\mu>0$, i.e., $\left\Vert\nabla f(x) \right\Vert^{2} \geq 2 \mu(f(x)-f_*)$ for any $x \in \mathbb{R}^{d}$. Then for Algorithm \RR run for $T$ epochs with a stepsize $\gamma\leq \frac{1}{2Ln}$ we have 
$$\squeeze \mathbb{E}\left[f(x_T)-f_*\right] \leq\left(1-\frac{\gamma \mu n}{2}\right)^{T}\left(f(x_0)-f_*\right),  $$
then the relative error satisfies 
$\frac{\mathbb{E}\left[f\left(x_{T}\right)-f_*\right]}{f\left(x_0\right)-f_*} \leq \varepsilon $
provided the number of iterations satisfies $T = \mathcal{O} (\kappa \log \frac{1}{\varepsilon}).$

\begin{proof}
	We start from conditional analogues of Lemmas 4 and 5 from \citet{mishchenko2020random} (similarly to Section \ref{D.1})
	\begin{align*}
		\mathbb{E} \left[f(x_{t+1})|\mathcal{F}_t\right] \leq f(x_t) - \frac{\gamma n}{2}\left\Vert \nabla f(x_t) \right\Vert^2 + \frac{\gamma L^2}{2}\left( \gamma^2 n^3 \left\Vert \nabla f(x_t) \right\Vert^2 + \gamma^2 n^2 \sigma^2(x_t)\right)
	\end{align*}
	This lemma also holds for the reformulated problem. Since we do not change the initial function $f(x)$, the gradient $\nabla f(x_t)$ remains the same. The only thing that changes is the variance of the gradient. According to the lemma proved above, this variance is equal to zero. Now we have the following inequality:
	\begin{align*}
		\mathbb{E} \left[f(x_{t+1})|\mathcal{F}_t\right] &\leq f(x_t) - \frac{\gamma n}{2}\left\Vert \nabla f(x_t) \right\Vert^2 + \frac{\gamma L^2}{2} \gamma^2 n^3 \left\Vert \nabla f(x_t) \right\Vert^2\\
		&\leq f\left(x_{t}\right)-\frac{\gamma n}{2}\left(1-\gamma^{2} L^{2} n^{2}\right)\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}
	\end{align*}
	Let $\delta_t = f(x_t) - f_*$. Adding $-f_*$ to both sides,
	\begin{align*}
		\mathbb{E} \left[\delta_{t+1}|\mathcal{F}_t\right] \leq \delta_{t}-\frac{\gamma n}{2}\left(1-\gamma^{2} L^{2} n^{2}\right)\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}
	\end{align*}
	Taking unconditional expectations and using that $\gamma \leq \frac{1}{2Ln}$ we have $1 - \gamma^2 L^2 n^2 \geq \frac{1}{2}$, we get 
	$$ \mathbb{E}\left[\delta_{t+1}\right] \leq\mathbb{E}\left[\delta_{t}\right]-\frac{\gamma n}{4} \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right]. $$
	It leads to
	
	\begin{align*}
		\frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \frac{4} {\gamma n }\frac{1}{T}\sum_{t = 0}^{T-1}\left(\mathbb{E}\left[\delta_{t}\right] - \mathbb{E}\left[\delta_{t+1}\right] \right) = \frac{4\left(\delta_0 - \mathbb{E}\left[\delta_{T}\right]\right)} {\gamma n T} \leq  \frac{4\delta_0} {\gamma n T}
	\end{align*}
	
	If we have PL condition, then we start from
	$$ \mathbb{E}\left[\delta_{t+1}\right] \leq\mathbb{E}\left[\delta_{t}\right]-\frac{\gamma n}{4} \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right]. $$
	Applying $\frac{1}{2}\|\nabla f(x)\|^{2} \geq \mu(f(x)-f_*)$ leads to 
	$$ \mathbb{E}\left[\delta_{t+1}\right] \leq \mathbb{E}\left[\delta_{t}\right]-\frac{\gamma \mu n}{2} \mathbb{E}\left[f\left(x_{t}\right)-f_{*}\right].$$
	Unrolling this recursion, we get 
	$$ \mathbb{E}\left[\delta_{T}\right] \leq\left(1-\frac{\gamma \mu n}{2}\right)^{T} \delta_{0}.$$
	
	Suppose that Assumption 1 holds. Choose the stepsize $\gamma$ as $\frac{1}{2nL}$. Then the mean of gradient norms satisfies 
	$$ \frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \varepsilon^2 $$
	provided the number of iterations satisfies 
	$$T \geq \frac{8 \delta_{0} L }{\varepsilon^{2}}.$$
	If $f$ satisfies the Polyak-Łojasiewicz inequality, then the relative error satisfies 
	$$\frac{\mathbb{E}\left[f\left(x_{T}\right)-f_*\right]}{\left(f\left(x_0\right)-f_*\right)} \leq \varepsilon $$
	provided the number of iterations satisfies 
	$$T = \mathcal{O} \left(\kappa \log \frac{1}{\varepsilon}\right).$$
\end{proof}

\section{Analysis of Det-Shuffle}

%\peter{The method is deterministic. Why do we have expectations in the theorems and proofs?}

\subsection{Proof of Theorem~\ref{th5}}

%		Suppose that each $f_i$ is convex function, $f$ is $\mu$-strongly convex function, and Assumption~\ref{L-smooth}
%		holds. Then provided the stepsize satisfies $\gamma \leq \frac{1}{4 L n}\sqrt{\frac{\mu}{L}},$
%		the iterates generated by \DS satisfy
%		\begin{align*}
%		\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
%		\end{align*}

We start from Lemma 8 in~\citet{mishchenko2020random}
\begin{align}
	\left\|x_{t+1}-x_{*}\right\|^{2} \leq\left\|x_{t}-x_{*}\right\|^{2}-2 \gamma n\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)+\gamma^{3} L n^{3} \sigma_{*}^{2}.
\end{align}

Now we can apply it to the reformulated problem~\eqref{reform}. Using strong convexity we obtain
\begin{align*}
	\left\|x_{t+1}-x_{*}\right\|^{2} &\leq \left\|x_{t}-x_{*}\right\|^{2}-2 \gamma n \left(f\left(x_{t+1}\right)-f(x_{*})\right)+\gamma^{3} L n^{3} \left(\sigma_*^t\right)^2\\
	&\leq\left\|x_{t}-x_{*}\right\|^{2}- \gamma n \mu\left(\left\|x_{t+1}-x_{*}\right\|^{2} \right) +\gamma^{3} L n^{3} \left(\sigma_*^t\right)^2.
\end{align*}
Since we update $y_t = x_t$ after each epoch, this leads to
\begin{align*}
	\left\|x_{t+1}-x_{*}\right\|^{2} &\leq \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+\gamma^{3} L n^{3} \left(\sigma_*^t\right)^2\right)\\
	&\leq \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+\gamma^{3} L n^{3} \cdot 4 L^2\|y_t-x_*\|^2\right)\\
	&= \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+4\gamma^{3} n^{3} L^3\|x_t-x_*\|^2\right)\\
	&= \frac{1}{1+\gamma \mu n}\left(1+4\gamma^{3} n^{3} L^3\right)\|x_t-x_*\|^2.\\
\end{align*}
We obtain
\begin{equation*}
	\left\|x_{t+1}-x_{*}\right\|^{2} \leq \frac{1+4 \gamma^{3} L^3 n^{3} }{1+\gamma \mu n} \left\|x_{t}-x_{*}\right\|^{2}.
\end{equation*}
Since we have $\mu\leq L$ we obtain 
\begin{align*}
		\frac{1}{4} + \frac{1}{8}\frac{\mu}{L}\sqrt{\frac{\mu}{L}} &\leq \frac{1}{2}\\
			\frac{1}{4}\mu + \frac{1}{8}\frac{\mu}{L}\sqrt{\frac{\mu}{L}}\mu &\leq \frac{\mu}{2}\\
	4\cdot\frac{1}{16L^2n^2}\cdot\frac{\mu}{L}L^3n^2+\frac{1}{4Ln}\sqrt{\frac{\mu}{L}}\cdot\frac{n\mu^2}{2}&\leq \frac{\mu}{2}.
\end{align*}
Now as $\gamma \leq \frac{1}{4 L n}\sqrt{\frac{\mu}{L}}$, we have
\begin{align*}
		4 \gamma^{2} L^3 n^2 + \frac{\gamma n \mu^2}{2}  &\leq  \frac{\mu}{2}\\
			4 \gamma^{2} L^3 n^2  &\leq  \frac{\mu}{2} - \frac{\gamma n \mu^2}{2}   \\
		4 \gamma^{2} L^3 n^{3}  &\leq  \frac{n \mu}{2} - \frac{\gamma n^2 \mu^2}{2}   \\
		1+4 \gamma^{3} L^3 n^{3}  &\leq 1 + \frac{\gamma n \mu}{2} - \frac{\gamma^2 n^2 \mu^2}{2}.
\end{align*}
Let us simplify it: 
\begin{align*}
	\frac{1+4 \gamma^{3} L^3 n^{3} }{1+\gamma \mu n} \leq 1 - \frac{\gamma n \mu}{2}.
\end{align*}



We can unroll the recursion and obtain
\begin{equation*}
	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
\end{equation*}




\subsection{Proof of Theorem~\ref{th6}}
Suppose the functions $f_1, f_2, \ldots, f_n$ are convex and Assumption~\ref{L-smooth} holds. Then for Algorithm~\ref{alg:GENERIC-SVRG} (\DS) with a stepsize $\gamma \leq \frac{1}{2\sqrt{2}Ln}$, the average iterate $\hat{x}_{T} \eqdef \frac{1}{T} \sum_{j=1}^{T} x_{j}$ satisfies 
\begin{align*}
	f\left(\hat{x}_{T}\right)-f\left(x_{*}\right) \leq \frac{2\left\|x_{0}-x_{*}\right\|^{2}}{ \gamma n T}.
\end{align*}
We start with Lemma 8 from~\citet{mishchenko2020random}:
\begin{align*}
	\left\|x_{t+1}-x_{*}\right\|^{2} \leq \left\|x_{t}-x_{*}\right\|^{2} -2 \gamma n \left( f\left(x_{t+1}\right)-f\left(x_{*}\right) \right)+\gamma^{3} Ln^{3} \sigma_{*}^{2}\\
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right) \right) \leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}+\gamma^{3} L n^{3} \sigma_{*}^{2}.
\end{align*}
Using Lemma~\ref{main_lemma_lemma} and considering $y_t=x_t$, we have  
$$\left(\sigma_*^t\right)^2 \leq 8LD_{f^t}(x_t,x_*).$$
Applying Proposition~\ref{prop-reform} we get 
$$\left(\sigma_*^t\right)^2 \leq 8LD_{f}(x_t,x_*) = 8L(f(x_t) - f(x_*)).$$
Next, we utilize the inner product reformulation and get 
\begin{align*}
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) \leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2} +\gamma^{3} L n^{3} \cdot 8L(f(x_t) - f(x_*)).
\end{align*}
Simplifying the last term, we have
\begin{align*}
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) \leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}+8\gamma^{3} L^2 n^{3} \left((f(x_t) - f(x_*))\right).
\end{align*}
Now we subtract $8\gamma^3L^2n^3\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)$ from both sides:
\begin{align*}
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) - 8\gamma^3L^2n^3\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)  &\leq \left(\left\|x_{t}-x_{*}\right\|^{2}\right)-\left\|x_{t+1}-x_{*}\right\|^{2}\\
	&+8\gamma^{3} L^2 n^{3} \left((f(x_t) - f(x_*))\right)\\
	&- 8\gamma^3L^2n^3\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\\
	\left(2 \gamma n - 8\gamma^3L^2n^3\right)\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}\\
	&\qquad+ 8\gamma^{3} L^2 n^{3}\left( \left(f(x_t) - f(x_*)\right) - \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\right)\\
	2 \gamma n\left(1 - 4\gamma^2L^2n^2\right)\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}\\
	&\qquad+ 8\gamma^{3} L^2 n^{3}\left( \left(f(x_t) - f(x_*)\right) - \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\right).
\end{align*}
Summing these inequalities for $t=0,1,\ldots,T-1$ gives
\begin{align*}
	2 \gamma n\left(1 - 4\gamma^2L^2n^2\right)\sum_{t=0}^{T-1}\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \sum_{t=0}^{T-1}\left(\left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}\right)\\
	&\qquad + 8\gamma^{3} L^2 n^{3}\sum_{t=0}^{T-1}\left( \left(f(x_t) - f(x_*)\right) - \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\right)\\
	&= \left\|x_{0}-x_{*}\right\|^{2} - \left\|x_{T}-x_{*}\right\|^{2}\\
	&\qquad+8\gamma^{3} L^2 n^{3}\left(f\left(x_{0}\right)-f\left(x_{*}\right)\right) - 8\gamma^{3} L^2 n^{3}\left(f\left(x_{T}\right)-f\left(x_{*}\right)\right)\\
	&\leq \left\|x_{0}-x_{*}\right\|^{2} +8\gamma^{3} L^2 n^{3}\left(f\left(x_{0}\right)-f\left(x_{*}\right)\right)\\
	&\leq \left\|x_{0}-x_{*}\right\|^{2} + 4\gamma^{3} L^3 n^{3}\left\|x_{0}-x_{*}\right\|^{2}\\
	&= (1+4\gamma^{3} L^3 n^{3})\left\|x_{0}-x_{*}\right\|^{2},
\end{align*}
and dividing both sides by $2 \gamma n\left(1 - 4\gamma^2L^2n^2\right)T$, we get
\begin{align*}
	\frac{1}{T}\sum_{t=0}^{T-1}\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
\end{align*}
Using the convexity of $f$, the average iterate $\hat{x}_{T} \stackrel{\text { def }}{=} \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies
\begin{align*}
	\left(f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right) \leq \frac{1}{T} \sum_{t=1}^{T} \left(f\left(x_{t}\right)-f\left(x_{*}\right)\right)\leq \frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
\end{align*}
Let us show that 
$$\frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2} \leq 4.$$
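Since the left-hand side is increasing in $\gamma$, applying $\gamma\leq \frac{1}{2\sqrt{2}Ln}$ we have
\[\frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2} \leq \frac{1+4\frac{1}{16\sqrt{2}L^3n^3}L^3n^3}{1-4\frac{1}{8L^2n^2}L^2n^2} = \frac{1+\frac{1}{4\sqrt{2}}}{1-\frac{1}{2}}\leq 4.\]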
Finally, we have 
$$f\left(\hat{x}_{T}\right)-f\left(x_{*}\right) \leq \frac{2\left\|x_{0}-x_{*}\right\|^{2}}{\gamma nT}.$$
This ends the proof. 



%%%%%%%%%%%%%%	
\clearpage	
\section{One More Algorithm: \algname{RR-VR}}


\subsection{New Algorithm: \algname{RR-VR}}

\begin{algorithm}[h]
	\caption{Random Reshuffling with Variance Reduction}
	\label{alg:RR_VR}	
	\begin{algorithmic}[1]
		\STATE \textbf{Input:} Stepsize $\gamma>0$, probability $p$, $x_0 = x_0^0 \in \mathbb{R}^{d}, y_0 \in \mathbb{R}^{d}$, number of epochs $T$.
		\FOR{$t =  0, 1, \dots, T-1$ }
		\STATE {\color{red}Choose a random permutation $\{\pi_0, \dots, \pi_{n-1}\}$ of $\{1, \dots, n\}$}
		\STATE $x_t^0 = x_t$
		\FOR{$i= 0, \dots, n-1$ }
		\STATE $g^i_t(x_t^i,y_t) =  \nabla f_{\pi_i} (x_t^i)-\nabla f_{\pi_i} (y_t)+\nabla f (y_t) $
		\STATE $x^{i+1}_t = x^i_t - \gamma g^i_t(x_t^i,y_t)$
		\ENDFOR
		\STATE $x_{t+1} = x^n_t$
		\STATE  $y_{t+1}=\begin{cases} y_t & \text{with probability } 1-p \\ x_t & \text{with probability } p \end{cases}$
		\ENDFOR
	\end{algorithmic}
	
\end{algorithm}


In this section we formulate convergence results for a generalized version of \algname{SVRG} under random reshuffling. Analysis of \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) is more complicated. 


\subsection{Convergence Theory}	

To analyze this method, we introduce Lyapunov functions. 
\begin{theorem}
	\label{th7}
	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption~\ref{L-smooth}
	holds. Then provided the parameters satisfy $n>\kappa$, $\frac{\kappa}{n}<p<1$ and $\gamma \leq \frac{1}{2\sqrt{2}Ln}$, 
	the final iterate generated by \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) satisfies
	$
	V_{T} \leq \max \left( q_1,q_2 \right)^{T} V_{0},
	$
	where
	$
	q_1 = 1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right)$,		$q_2 = 1-p+\frac{8}{\mu} \gamma^{2} L^{3} n$, 
	and the Lyapunov function is defined via
	\begin{align*}
		V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\left(\frac{4}{\gamma\mu n}\right)^{-1}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
	\end{align*}
	This means that the iteration complexity of Algorithm~\ref{alg:RR_VR} is
	$
	T = \mathcal{O}\left(\kappa\log \left(\frac{1}{\varepsilon}\right)\right).
	$
\end{theorem}
Note that the probability $p$ should not be too small. 
We obtain the same complexity as that of \RR. 

\begin{theorem}
	\label{th_last}
	Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex, and that Assumption~\ref{L-smooth} holds. Then for \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) with parameters that satisfy $\gamma \leq \frac{1}{2L}\sqrt{\frac{\mu}{2nL}}$, $\frac{1}{2}<\delta<\frac{1}{\sqrt{2}}$, $0<p<1$, and for a sufficiently large number of functions, $n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1}$, the iterates generated by the \algname{RR-VR} algorithm satisfy
	$	
	V_{T} 
	\leq \max \left(q_1,q_2\right)^{T} V_{0},$
	where $ \squeeze q_1 
	= (1-\gamma \mu)^{n}+\delta^2$, 
	$q_2 	
	= 1-p\left(1-\frac{2\gamma^2L^3n}{\mu\delta^2}\right),$ and 
	$$
	V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{\delta^2}{p}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
	$$
	This means that the iteration complexity of Algorithm~\ref{alg:RR_VR} is
	$
	T = \mathcal{O}\left(\max\left(\kappa\sqrt{\frac{\kappa}{n}},\frac{1}{2\log (2\delta)}\right)\log \left(\frac{1}{\varepsilon}\right)\right).
	$
\end{theorem}

We get almost the same rate as that of \RR, with one difference: the complexity also depends on a term involving $\delta$. However, the first term dominates in most cases. 


\subsection{Proof of Theorem~\ref{th7}}
Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption~\ref{L-smooth}
holds. Then provided the parameters satisfy $n>\kappa$, $\frac{\kappa}{n}<p<1$ and $\gamma \leq \frac{1}{2\sqrt{2}Ln}$, 
the final iterate generated by \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) satisfies
\begin{align*}
	V_{T} \leq \max \left( q_1,q_2 \right)^{T} V_{0},
\end{align*}
where
\begin{align*}
	q_1 = 1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right), \quad
	q_2 = 1-p+\frac{8}{\mu} \gamma^{2} L^{3} n,
\end{align*}
and the Lyapunov function is defined via
\begin{align*}
	V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{\gamma\mu n}{4}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
\end{align*}
\begin{proof}
	For the problem $\frac{1}{n}\sum_{i=1}^{n}f^t_i(x)$ we will use an inequality from \citet{mishchenko2020random}:
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t\right] & \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}\right) \\
		&=\frac{1}{1+\gamma \mu n} \left\|x_{t}-x_{*}\right\|^{2}+\frac{1}{1+\gamma \mu n} \frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2} \\
		& \leq\left(1-\frac{\gamma \mu n}{2}\right) \left\|x_{t}-x_{*}\right\|^{2}+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}.
	\end{align*}
	Now we bound the variance term using the inequality $\sigma_{*}^{2} \leq 4L^2\left\|y_t-x_*\right\|^2$: 
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t, y_t\right] &\leq\left(1-\frac{\gamma \mu n}{2}\right) \left\|x_{t}-x_{*}\right\|^{2}+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}\\
		&\leq\left(1-\frac{\gamma \mu n}{2}\right) \left\|x_{t}-x_{*}\right\|^{2} + 2\gamma^3L^3n^2\|y_t-x_*\|^2.
	\end{align*}
	Using tower property we have 
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] &= \mathbb{E}\left[\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t, y_t\right]\right] \\
		&\leq\left(1-\frac{\gamma \mu n}{2}\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +2\gamma^3L^3n^2\mathbb{E}\left[\|y_t-x_*\|^2\right].
	\end{align*}
	Now we look at 
	\begin{align*}
		y_{t+1}=\left\{\begin{array}{ll} y_t & \text{with probability } 1-p \\
			x_t & \text{with probability } p
		\end{array}\right. .
	\end{align*}
	We get
	\begin{align*}
		\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right] = (1-p)\|y_t - x_*\|^2+p\| x_t - x_* \|^2.
	\end{align*}
	Using tower property 
	\begin{align*}
		\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right] &= \mathbb{E}\left[\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right]\right]\\
		&= (1-p)\mathbb{E}\left[\|y_t - x_*\|^2\right]+p\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Finally, for any constant $M>0$, we have 
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right]  & \leq\left(1-\frac{\gamma \mu n}{2}\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +2\gamma^3L^3n^2\mathbb{E}\left[\|y_t-x_*\|^2\right]\\
		&\qquad+(1-p)M\mathbb{E}\left[\|y_t - x_*\|^2\right] +pM\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Denote $V_{t} = \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t} - x_*\|^2\right].$ Using this we obtain
	\begin{align*}
		V_{t+1}&\leq\left(1-\frac{\gamma \mu n}{2}\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +2\gamma^3L^3n^2\mathbb{E}\left[\|y_t-x_*\|^2\right]\\\notag
		&\qquad+(1-p)M\mathbb{E}\left[\|y_t - x_*\|^2\right]+pM\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Thus,
	\begin{align*}
		V_{t+1}&\leq\left(1-\frac{\gamma \mu n}{2}+pM\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +\left(1-p+\frac{1}{M}2\gamma^3L^3n^2\right)M\mathbb{E}\left[\|y_t-x_*\|^2\right].
	\end{align*}
	To have contraction we use 
	\begin{align*}
		M = \frac{\gamma\mu n}{4}, \qquad \gamma \leq \frac{1}{2\sqrt{2}Ln}.
	\end{align*}
	We have the final rate
	\begin{align*}
		V_{t+1} &\leq \max \left(1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right),  1-p+\frac{8}{\mu} \gamma^{2} L^{3} n \right)V_t\\
		V_{T} &\leq \max\left(1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right),  1-p+\frac{8}{\mu} \gamma^{2} L^{3} n \right)^TV_0.
	\end{align*}
\end{proof} 
%	The proof of Corollary~\ref{corollary7} is an application of the lemma from Section~\ref{compl_lemma}.

\subsection{Proof of Theorem~\ref{th_last}}
Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex, and that Assumption~\ref{L-smooth} holds. Then for \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) with parameters that satisfy $\gamma \leq \frac{1}{2L}\sqrt{\frac{\mu}{2nL}}$, $\frac{1}{2}<\delta<\frac{1}{\sqrt{2}}$, $0<p<1$, and for a sufficiently large number of functions, $n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1}$, the iterates generated by the \algname{RR-VR} algorithm satisfy
\begin{align*}	
	V_{T} &
	\leq \max \left(q_1,q_2\right)^{T} V_{0},	
\end{align*}
where $$  q_1 
= (1-\gamma \mu)^{n}+\delta^2, \quad
q_2 	
= 1-p\left(1-\frac{2\gamma^2L^3n}{\mu\delta^2}\right),$$ and 
$$
V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{\delta^2}{p}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
$$
\begin{proof}
	For the problem $\frac{1}{n}\sum_{i=1}^{n}f^t_i(x)$ we will use two inequalities from \citet{mishchenko2020random}:
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t\right]  &\leq\left(1-\gamma \mu\right)^n \left\|x_{t}-x_{*}\right\|^{2}+2\gamma^2\sigma_{\text {Shuffle }}^2\left(\sum_{i=0}^{n-1}(1-\gamma\mu)^i\right)\\
		\sigma_{\text {Shuffle }}^{2} &\leq \frac{\gamma L n}{4} \sigma_{*}^{2}.
	\end{align*}
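	Using this result together with $\sum_{i=0}^{n-1}(1-\gamma\mu)^i \leq \frac{1}{\gamma\mu}$ and $\sigma_{*}^{2} \leq 4L^2\left\|y_t-x_*\right\|^2$, we have 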
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t,y_t\right]  &\leq\left(1-\gamma \mu\right)^n \left\|x_{t}-x_{*}\right\|^{2}+\frac{1}{2}\gamma^3Ln\sigma_{*}^2\left(\sum_{i=0}^{n-1}(1-\gamma\mu)^i\right)\\
		&\leq\left(1-\gamma \mu\right)^n\left\|x_{t}-x_{*}\right\|^{2}+\frac{2}{\mu}\gamma^2L^3n\|y_t - x_*\|^2.
	\end{align*}
	Using tower property
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] &= \mathbb{E}\left[\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t,y_t\right]\right]\\
		&  \leq\left(1-\gamma \mu\right)^n\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{2}{\mu}\gamma^2L^3n\mathbb{E}\left[\|y_t - x_*\|^2\right].
	\end{align*}
	Now we look at 
	\begin{align*}
		y_{t+1}=\left\{\begin{array}{ll} y_t & \text{with probability } 1-p \\
			x_t & \text{with probability } p
		\end{array}\right..
	\end{align*}
	Thus,
	$
	\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right] = (1-p)\|y_t - x_*\|^2+p\| x_t - x_* \|^2.
	$
	Using tower property 
	\begin{align*}
		\squeeze
		\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right] &= \mathbb{E}\left[\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right]\right]\\
		&= (1-p)\mathbb{E}\left[\|y_t - x_*\|^2\right]+p\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Denote $V_{t} = \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t} - x_*\|^2\right]$ and we have
	\begin{align*}\squeeze
		V_{t+1} &= \mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right]\\
		& \leq\left(1-\gamma \mu\right)^n\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{2}{\mu}\gamma^2L^3n\mathbb{E}\left[\|y_t - x_*\|^2\right]+(1-p)M\mathbb{E}\left[\|y_t - x_*\|^2\right]+pM\mathbb{E}\left[\| x_t - x_* \|^2\right]\\
		&= \left(\left(1-\gamma \mu\right)^n+pM\right)\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +\left((1-p)+\frac{2\gamma^2L^3n}{\mu M}\right)M\mathbb{E}\left[\| y_t - x_* \|^2\right]\\
		&\leq\max\left(\left(\left(1-\gamma \mu\right)^n+pM\right),\left((1-p)+\frac{2\gamma^2L^3n}{\mu M}\right)\right)V_t.
	\end{align*}
	Unrolling the recursion, we have 
	\begin{align*}
		\squeeze
		V_T \leq \max\left(\left((1-\gamma\mu)^n+pM\right), \left(1-p + \frac{2\gamma^2L^3n}{\mu M}\right)\right)^TV_0.
	\end{align*}
	Applying $M = \frac{\delta^2}{p}$ and $\gamma \leq \frac{1}{2L}\sqrt{\frac{\mu}{2nL}}$ we get 
	\begin{align*}	
		\squeeze
		V_{T} &	
		\leq \max \left((1-\gamma \mu)^{n}+\delta^2,1-p\left(1-\frac{2\gamma^2L^3n}{\mu\delta^2}\right)\right)^{T} V_{0}.
	\end{align*}
\end{proof}



