\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}


\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{mathtools}
\usepackage{zref-xr,zref-user}
\zexternaldocument*{malinovsky_196}
\newcommand{\R}{ \mathbb{R}}
\newcommand{\squeeze}{\textstyle}
\newcommand{\eqdef}{\coloneqq}
\usepackage{colortbl}
\definecolor{bgcolor}{rgb}{1, 1, 0.8}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{mathtools}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{remark}{Remark}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{amsthm}
\newtheorem{definition}{Definition}[section]

\usepackage[flushleft]{threeparttable} % http://ctan.org/pkg/threeparttable


\definecolor{bgcolor}{rgb}{1, 1, 0.8}

\usepackage{xspace}
\newcommand{\DS}{{\sf \footnotesize \color{blue} Det-Shuffle}\xspace}
\newcommand{\RS}{{\sf \footnotesize \color{orange} Rand-Shuffle}\xspace}
\newcommand{\RR}{{\sf \footnotesize \color{red} Rand-Reshuffle}\xspace}


%\newcommand{\algname}[1]{{\sf \footnotesize \color{cyan}#1}\xspace}
%\newcommand{\algnameSMALL}[1]{{\sf\color{cyan}#1}\xspace}

\newcommand{\algname}[1]{{\sf \footnotesize #1}\xspace}
\newcommand{\algnameSMALL}[1]{{\sf #1}\xspace}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Random Reshuffling with Variance Reduction: New Analysis and Better Rates (Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Grigory Malinovsky}
\author[1]{Alibek Sailanbayev}
\author[1]{Peter Richt\'arik}
% Add affiliations after the authors
\affil[1]{%
    AI Initiative\\
    King Abdullah University of Science and Technology\\
    Saudi Arabia
}
  
  \begin{document}
  	\onecolumn
\maketitle



\appendix
\tableofcontents

\part*{Appendix}







\section{Basic Facts}\label{seca1}

\subsection{Elementary Inequalities}

\begin{proposition}
	For all $a, b \in \mathbb{R}^{d}$  and $t > 0$ the following inequalities hold
	\begin{align}
		\langle a, b\rangle &\leq \frac{\|a\|^{2}}{2 t}+\frac{t\|b\|^{2}}{2},\notag \\
		\|a+b\|^{2} &\leq 2\|a\|^{2}+2\|b\|^{2},\label{eq:young}\\
		\frac{1}{2}\|a\|^{2}-\|b\|^{2} &\leq\|a+b\|^{2}.\notag
	\end{align}
	
\end{proposition}


\subsection{Convexity and smoothness}\label{sec:convex_smoothness}

\begin{proposition}
	\label{eq:prop3}
	Let $f : \mathbb{R}^d \to \mathbb{R}$ be continuously differentiable and let $L\geq 0$. Then the following statements are equivalent:
	\begin{itemize}
		\item $f$ is $L$-smooth,
		\item $2 D_{f}(x, y) \leq L\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\langle\nabla f(x)-\nabla f(y), x-y\rangle \leq L\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$.
	\end{itemize}
\end{proposition}

\begin{proposition}
	Let $f : \mathbb{R}^d \to \mathbb{R}$ be continuously differentiable and let $\mu\geq 0$. Then the following statements are equivalent:
	\begin{itemize}
		\item $f$ is $\mu$-strongly convex,
		\item $2 D_{f}(x, y) \geq \mu\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\langle\nabla f(x)-\nabla f(y), x-y\rangle \geq \mu\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$.
	\end{itemize}
\end{proposition}
Note that the $\mu = 0$ case reduces to convexity.

\begin{proposition}
	Let $f : \mathbb{R}^d \to \mathbb{R}$ be continuously differentiable and $L > 0$. Then the following statements are equivalent:
	\begin{itemize}
		\item $f$ is convex and $L$-smooth,
		\item $0 \leq 2 D_{f}(x, y) \leq L\|x-y\|^{2} \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\frac{1}{L}\|\nabla f(x)-\nabla f(y)\|^{2} \leq 2 D_{f}(x, y) \text { for all } x, y \in \mathbb{R}^{d}$,
		\item $\frac{1}{L}\|\nabla f(x)-\nabla f(y)\|^{2} \leq\langle\nabla f(x)-\nabla f(y), x-y\rangle \text { for all } x, y \in \mathbb{R}^{d}$.
	\end{itemize}
\end{proposition}

\begin{proposition}[Jensen's inequality]
	Let $f: \mathbb{R}^{d} \to \mathbb{R}$ be a convex function, $x_1,\ldots,x_m \in \mathbb{R}^{d}$, and $\lambda_1,\ldots, \lambda_m$ be nonnegative real numbers adding up to 1. Then  
	$$f\left(\sum_{i=1}^{m} \lambda_{i} x_{i}\right) \leq \sum_{i=1}^{m} \lambda_{i} f\left(x_{i}\right).$$
\end{proposition}




\subsection{From convergence rate to iteration complexity}


We implicitly use the following standard result to derive iteration complexity results in our theorems. We include the statement and proof, for completeness.


\begin{lemma}\label{lem:itercomplex}
	Consider a randomized algorithm producing a sequence of random iterates $\{x_t\}_{t\geq 0}$. Let $S_t$ be some nonnegative function of $x_t$ (example: $S_t=\|x_t-x_*\|^2$). Assume that there exists $q\in (0,1)$ such that the following inequality holds for  all $t\geq 0$:
	\begin{align}
		\mathbb{E} \left[ S_t \right] \leq \left( 1 - q \right)^t S_0. \label{eq:nuh8g9f8d_98y8fhdf}
	\end{align}
	Fix any $\varepsilon>0$. Then as long as 
	$$T \geq \frac{1}{q} \ln \left(\frac{1}{\varepsilon}\right),$$ 
	we have $$\mathbb{E} \left[ S_T  \right] \leq \varepsilon S_0.
	$$
\end{lemma}

\begin{proof}
	Since $e^{q} \geq 1+q$ for all $q\in \R$, we have $e^{-q} \geq 1-q$ for all $ q \in(0,1).$ Since the logarithm is an increasing function over $\mathbb{R}_{+}$, it follows that
	$
	-q \geq \ln (1-q)$ for all $q \in(0,1)$. Therefore, the inequality
	\begin{align*}
		-t q \geq t \ln \left(1-q\right)
	\end{align*}
	holds for all	 $t\geq 0$ and all $q \in(0,1)$. 		Now if we have $T\geq \frac{1}{q}\ln\left(\frac{1}{\varepsilon}\right),$
	which is equivalent to
	$-T\cdot q \leq \ln (\varepsilon),$
	we obtain
	$T \ln \left(1-q\right) \leq \ln (\varepsilon).$
	Taking exponential on both sides, we get	\begin{equation} \label{eq:8yfd98gf8df}0<\left(1-q\right)^{T} \leq \varepsilon.\end{equation}
	Finally, we have 
	$$\mathbb{E} \left[ S_T \right] \overset{\eqref{eq:nuh8g9f8d_98y8fhdf}}{ \leq} \left(1-q\right)^{T} S_0 \overset{\eqref{eq:8yfd98gf8df}}{\leq} \varepsilon S_0.$$
	
\end{proof}
\begin{lemma}
Consider a randomized algorithm producing a sequence of random iterates $x_t$.
	Let $S_t$ be some nonnegative function of $x_t$ (example: $S_t=\left\|x_t-x_*\right\|^2$ ).
	Assume that there exist $q \in(0,1)$ and $\beta>0$ such that the following inequality holds for all $t \geq 0$:
	$$
	\mathbb{E}\left[S_t\right] \leq(1-q)^{\beta t} S_0.
	$$
	Fix any $\varepsilon>0$. Then as long as
	$$
	T \geq \frac{1}{q \beta} \ln \left(\frac{1}{\varepsilon}\right)
	$$
	we have
	$$
	\mathbb{E}\left[S_T\right] \leq \varepsilon S_0.
	$$
\end{lemma}
\begin{proof}
	Since $e^q \geq 1+q$ for all $q \in \mathbb{R}$, we have $e^{-q} \geq 1-q$ for all $q \in(0,1)$. Since logarithm is an increasing function over $\mathbb{R}_{+}$, it follows that $-q \geq \ln (1-q)$ for all $q \in(0,1)$. Therefore, the inequality
	$-\beta t q \geq \beta t \ln (1-q)$
	holds for all $t \geq 0$ and all $q \in(0,1)$.
	Now, if we have $T \geq \frac{1}{\beta q} \ln \left(\frac{1}{\varepsilon}\right)$, which is equivalent to $-T \beta \cdot q \leq \ln (\varepsilon)$, we obtain $\beta T \ln (1-q) \leq \ln (\varepsilon)$.
	Taking exponential on both sides, we get
	$$
	0<(1-q)^{\beta T} \leq \varepsilon .
	$$
	Finally, we have
	$$
	\mathbb{E}\left[S_T\right] \leq(1-q)^{\beta T} S_0 \leq \varepsilon S_0 .
	$$
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%

\clearpage	
\section{Proof of Proposition 1}
Assume that each $f_i$ is $\mu$-strongly convex (resp.\ convex) and $L$-smooth. Then the functions $$f^t\eqdef \frac{1}{n}\sum_{i=1}^n f_i^t,$$ and  
\begin{equation}
	f_i^t (x) \eqdef f_i(x)+\left\langle a_i^t,x \right\rangle,
\end{equation}
are $\mu$-strongly convex (resp.\ convex) and $L$-smooth.
\begin{proof}
	Let us compute the Bregman divergence with respect to the new function $f^t_i(x):$
	\begin{align*}
		D_{f^t_i}(x, y) = f^t_i(x)-f^t_i(y)-\langle\nabla f^t_i(y), x-y\rangle.
	\end{align*}
	Note that $\nabla f^t_i(y) = \nabla f_i(y)+a_i^t$. Now we have 
	\begin{align*}
		D_{f^t_i}(x, y) &= f^t_i(x)-f^t_i(y)-\langle\nabla f^t_i(y), x-y\rangle\\
		&= f_i(x)+\left\langle a_i^t, x \right\rangle - \left( f_i(y)+\left\langle a_i^t, y \right\rangle\right) - \langle \nabla f_i(y)+a_i^t, x-y\rangle\\
		&=f_i(x)+\left\langle a_i^t, x \right\rangle - f_i(y)-\left\langle a_i^t, y \right\rangle - \langle \nabla f_i(y), x-y\rangle - \langle a_i^t, x-y \rangle\\
		&=f_i(x)+\left\langle a_i^t, x \right\rangle - f_i(y)-\left\langle a_i^t, y \right\rangle - \langle \nabla f_i(y), x-y\rangle - \langle a_i^t, x \rangle+\langle a_i^t, y \rangle\\
		&=f_i(x) - f_i(y) - \langle \nabla f_i(y), x-y\rangle\\
		& = D_{f_i}(x, y).
	\end{align*}
	Since the Bregman divergence is not changed, the new function $f^t_i(x)$ has the same properties ($\mu$-strong convexity or convexity and $L$-smoothness) as the initial function $f_i(x)$. 
\end{proof}




%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%

\clearpage	
\section{Proof of Lemma 1}
\begin{proof}
We start from the definition of $\left(\sigma_*^t\right)^2$ and $a_i^t$:
$$
\left(\sigma_*^t\right)^2:=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i^t\left(x_*\right)\right\|^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_*\right)-\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right)\right\|^2.
$$
Using the fact that $\nabla f\left(x_*\right)=0$ we have
$$
\left(\sigma_*^t\right)^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_*\right)-\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right)-\nabla f\left(x_*\right)\right\|^2.
$$
Applying Young's inequality~\eqref{eq:young} we obtain
$$
\left(\sigma_*^t\right)^2 \leq \frac{1}{n} \sum_{i=1}^n\left(2\left\|\nabla f_i\left(y_t\right)-\nabla f_i\left(x_*\right)\right\|^2+2\left\|\nabla f\left(y_t\right)-\nabla f\left(x_*\right)\right\|^2\right).
$$
Now we apply Proposition 5 for the squared norms of gradient differences:
$$
\left(\sigma_*^t\right)^2 \leq \frac{1}{n} \sum_{i=1}^n 4 L D_{f_i}\left(y_t, x_*\right)+\frac{1}{n} \sum_{i=1}^n 4 L D_f\left(y_t, x_*\right).
$$
We need to use the fact that $\frac{1}{n} \sum_{i=1}^n D_{f_i}\left(y_t, x_*\right)=D_f\left(y_t, x_*\right)$. It is true since 
$f(x)=\frac{1}{n} \sum_{i=1}^n f_i(x)$. So, $$\left(\sigma_*^t\right)^2 \leq 4 L D_f\left(y_t, x_*\right)+4 L D_f\left(y_t, x_*\right)=8 L D_f\left(y_t, x_*\right).$$
Finally, we apply the $L$-smoothness property from Proposition \ref{eq:prop3}:
$$
\left(\sigma_*^t\right)^2 \leq 4 L^2\left\|y_t-x_*\right\|^2.
$$
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%

\clearpage	
\section{Analysis of  Rand-Shuffle and Rand-Reshuffle}

\subsection{Proof of Theorems 1 and 2 }
\label{D.1}
%	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption~\ref{L-smooth}
%	holds. Then provided the stepsize satisfies $\gamma \leq \frac{1}{2\sqrt{2} L n}\sqrt{\frac{\mu}{L}},$
%	the iterates generated by \RR (Algorithm~\ref{alg:RRSVRG}) or by \RS (Algorithm~\ref{alg:SOSVRG}) satisfy
%	\begin{align*}
%	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
%	\end{align*}
%	
%	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex and Assumption~\ref{L-smooth} holds. Additionally assume we are in the ``big data'' regime characterized by $n \geq \frac{2 L}{\mu} \cdot \frac{1}{1-\frac{\mu}{\sqrt{2} L}}$. Then provided the stepsize satisfies $\gamma \leq \frac{1}{\sqrt{2}Ln},$
%	the iterates generated by \RR (Algorithm~\ref{alg:RRSVRG}) or by \RS (Algorithm~\ref{alg:SOSVRG}) satisfy
%	\begin{align*}
%	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
%	\end{align*}

\begin{proof}
	We start from Lemma 3 in the paper of~\citet{mishchenko2020random}. 
	
\begin{lemma}
Assume that functions $f_1, \ldots, f_n$ are convex and that Assumption~1 is satisfied. If
	Random Reshuffling or Shuffle-Once is run with a stepsize satisfying
	$\gamma\leq\frac{1}{\sqrt{2}Ln}$, then
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f(x_{*})\right]+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}.
	\end{align*}
\end{lemma}

The proof of the analogous inequality from \citet{mishchenko2020random} with conditional expectations is nearly identical, requiring only very minor changes. We provide this proof below:
	
	We denote by $\mathcal{F}_t$ the $\sigma$-algebra generated by the collection of $(\mathcal{X} \times \mathcal{Y})$-valued random variables $\left(x_0, y_0\right), \ldots,\left(x_t, y_t\right)$, for every $t \geq 0$. In this work, we consider unbiased random estimates: for every $t \geq 0$. If the method does not depend on $y_\tau$ we can still use such notation because of the independence property for conditional expectations.
	We define the forward per-epoch deviation over the $t$-th epoch $\mathcal{V}_t$ as
	$$
	\mathcal{V}_t=\sum_{i=0}^{n-1}\left\|x_t^i-x_{t+1}\right\|^2.
	$$
	
	Lemma 2. Consider the iterates of Random Reshuffling or Shuffle-Once. If the functions $f_1, \ldots, f_n$ are convex and Assumption 1 is satisfied, then
	$$
	\mathbb{E}\left[\mathcal{V}_t \mid \mathcal{F}_t\right] \leq 4 \gamma^2 n^2 L \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+\frac{1}{2} \gamma^2 n^2 \sigma_*^2,
	$$
	where $\mathcal{V}_t$ is defined above, and $\sigma_*^2$ is the variance at the optimum given by $\sigma_*^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_*\right)\right\|^2$.
	We will follow the steps from \citet{mishchenko2020random}.
\begin{proof} For any fixed $k \in \{0, \ldots, n-1\}$, by definition of $x_t^k$ and $x_{t+1}$ (according to Algorithm 1 or 2 in \citet{mishchenko2020random}) we get the decomposition
	$$
	x_t^k-x_{t+1}=\gamma \sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_t^i\right)=\gamma \sum_{i=k}^{n-1}\left(\nabla f_{\pi_i}\left(x_t^i\right)-\nabla f_{\pi_i}\left(x_*\right)\right)+\gamma \sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)
	$$
	Applying Young's inequality to the sums above yields
	$$
	\left\|x_t^k-x_{t+1}\right\|^2 \leq 2 \gamma^2\left\|\sum_{i=k}^{n-1}\left(\nabla f_{\pi_i}\left(x_t^i\right)-\nabla f_{\pi_i}\left(x_*\right)\right)\right\|^2+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
	$$
	Using Jensen's inequality we have
	$$
	\left\|x_t^k-x_{t+1}\right\|^2 \leq 2 \gamma^2 n \sum_{i=k}^{n-1}\left\|\nabla f_{\pi_i}\left(x_t^i\right)-\nabla f_{\pi_i}\left(x_*\right)\right\|^2+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
	$$
	Using $L$-smoothness property from Proposition 3 we have
	$$
	\left\|x_t^k-x_{t+1}\right\|^2 \leq 4 \gamma^2 L n \sum_{i=k}^{n-1} D_{f_{\pi_i}}\left(x_*, x_t^i\right)+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
	$$

Further, we have
$$
\left\|x_t^k-x_{t+1}\right\|^2 \leq 4 \gamma^2 L n \sum_{i=0}^{n-1} D_{f_{\pi_i}}\left(x_*, x_t^i\right)+2 \gamma^2\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2
$$
Summing up and taking conditional expectations leads to
$$
\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|x_t^k-x_{t+1}\right\|^2 \mid \mathcal{F}_t\right] \leq 4 \gamma^2 L n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+2 \gamma^2 \sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2 \mid \mathcal{F}_t\right]
$$
Since $\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2 \mid \mathcal{F}_t\right]$ does not depend on $\mathcal{F}_t$ but only on permutations we have
$$
\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|x_t^k-x_{t+1}\right\|^2 \mid \mathcal{F}_t\right] \leq 4 \gamma^2 L n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+2 \gamma^2 \sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]
$$
We now bound the second term in the right-hand side. First, using Lemma 1 from \citet{mishchenko2020random}, we get
$\mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]=(n-k)^2 \mathbb{E}\left[\left\|\frac{1}{n-k} \sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]=(n-k)^2 \frac{k}{(n-k)(n-1)} \sigma_*^2=\frac{k(n-k)}{n-1} \sigma_*^2$.
Next, by summing this for $k$ from 0 to $n-1$, we obtain
$$
\sum_{k=0}^{n-1} \mathbb{E}\left[\left\|\sum_{i=k}^{n-1} \nabla f_{\pi_i}\left(x_*\right)\right\|^2\right]=\sum_{k=0}^{n-1} \frac{k(n-k)}{n-1} \sigma_*^2=\frac{1}{6} n(n+1) \sigma_*^2 \leq \frac{n^2 \sigma_*^2}{4}
$$
where in the last step we also used $n \geq 2$. The result follows.
\end{proof}
Let us provide an analogue of Lemma 3 from \citet{mishchenko2020random}.

Lemma $3^*$. Assume that functions $f_1, \ldots, f_n$ are convex and that Assumption 1 is satisfied. If Random Reshuffling (Algorithm 1 ) or Shuffle-Once (Algorithm 2 ) is run with a stepsize satisfying $\gamma \leq \frac{1}{\sqrt{2} L n}$, then
$$
\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f_* \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2 \sigma_*^2}{2}
$$
\begin{proof}
Define the sum of gradients used in the $t$-th epoch as $g_t=\sum_{i=0}^{n-1} \nabla f_{\pi_i}\left(x_t^i\right)$. We will use $g_t$ to relate the iterates $x_t$ and $x_{t+1}$. By definition of $x_{t+1}$, we can write
$$
x_{t+1}=x_t^n=x_t^{n-1}-\gamma \nabla f_{\pi_{n-1}}\left(x_t^{n-1}\right)=\ldots=x_t^0-\gamma \sum_{i=0}^{n-1} \nabla f_{\pi_i}\left(x_t^i\right)
$$
Further, since $x_t^0=x_t$, we see that $x_{t+1}=x_t-\gamma g_t$, which leads to
$$
\left\|x_t-x_*\right\|^2=\left\|x_{t+1}+\gamma g_t-x_*\right\|^2=\left\|x_{t+1}-x_*\right\|^2+2 \gamma\left\langle g_t, x_{t+1}-x_*\right\rangle+\gamma^2\left\|g_t\right\|^2
$$
Since $\gamma^2\left\|g_t\right\|^2 \geq 0$ we have
$$
\left\|x_t-x_*\right\|^2 \geq\left\|x_{t+1}-x_*\right\|^2+2 \gamma\left\langle g_t, x_{t+1}-x_*\right\rangle=\left\|x_{t+1}-x_*\right\|^2+2 \gamma \sum_{i=0}^{n-1}\left\langle\nabla f_{\pi_i}\left(x_t^i\right), x_{t+1}-x_*\right\rangle
$$

Observe that for any $i$, we have the following decomposition
$$
\left\langle\nabla f_{\pi_i}\left(x_t^i\right), x_{t+1}-x_*\right\rangle=\left(f_{\pi_i}\left(x_{t+1}\right)-f_{\pi_i}\left(x_*\right)\right)+D_{f_{\pi_i}}\left(x_*, x_t^i\right)-D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right)
$$
Summing the first quantity over $i$ from 0 to $n-1$ gives
$$
\sum_{i=0}^{n-1}\left(f_{\pi_i}\left(x_{t+1}\right)-f_{\pi_i}\left(x_*\right)\right)=n\left(f\left(x_{t+1}\right)-f_*\right)
$$
Now, we can bound the third term in the decomposition above using $L$-smoothness as follows:
$$
D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right) \leq \frac{L}{2}\left\|x_{t+1}-x_t^i\right\|^2
$$
By summing the right-hand side over $i$ from 0 to $n-1$ we get the forward deviation over an epoch $\mathcal{V}_t$, which we bound by analogue of Lemma 2 to get
$$
\sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right) \mid \mathcal{F}_t\right] \leq \frac{L}{2} \mathbb{E}\left[\mathcal{V}_t \mid \mathcal{F}_t\right] \leq 2 \gamma^2 L^2 n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]+\frac{\gamma^2 L n^2 \sigma_*^2}{4}
$$
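Therefore, we can lower-bound the sum of the second and the third term as
$$
\begin{aligned}
	& \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right)-D_{f_{\pi_i}}\left(x_{t+1}, x_t^i\right) \mid \mathcal{F}_t\right] \geq \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right] \\
	& -2 \gamma^2 L^2 n^2 \sum_{i=0}^{n-1} \mathbb{E}\left[D_{f_{\pi_i}}\left(x_*, x_t^i\right) \mid \mathcal{F}_t\right]-\frac{\gamma^2 L n^2 \sigma_*^2}{4} .
\end{aligned}
$$
Since $\gamma \leq \frac{1}{\sqrt{2} L n}$, we have $1-2 \gamma^2 L^2 n^2 \geq 0$; moreover, $D_{f_{\pi_i}}\left(x_*, x_t^i\right) \geq 0$ by convexity. Hence the right-hand side above is bounded from below by $-\frac{\gamma^2 L n^2 \sigma_*^2}{4}$. Taking the conditional expectation in the lower bound on $\left\|x_t-x_*\right\|^2$ and plugging in the estimates above, we get
$$
\left\|x_t-x_*\right\|^2 \geq \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right]+2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f_* \mid \mathcal{F}_t\right]-\frac{\gamma^3 L n^2 \sigma_*^2}{2},
$$
which is the claim after rearranging.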

\end{proof}


We now return to the main argument. We start from the analogue of Lemma 3 in the paper of \citet{mishchenko2020random}, which we proved above:
$$
\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_*\right) \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2 \sigma_*^2}{2}
$$
Now we can apply this inequality to the reformulated problem (2). Using strong convexity, we obtain
$$
\begin{aligned}
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_*\right) \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2\left(\sigma_*^t\right)^2}{2} \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq\left\|x_t-x_*\right\|^2-\gamma n \mu \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right]+\frac{\gamma^3 L n^2\left(\sigma_*^t\right)^2}{2}
\end{aligned}
$$
Since we update $y_t=x_t$ after each epoch, this leads to
$$
\begin{aligned}
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_t-x_*\right\|^2+\frac{\gamma^3 L n^2\left(\sigma_*^t\right)^2}{2}\right) \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_t-x_*\right\|^2+\frac{\gamma^3 L n^2 \cdot 4 L^2\left\|y_t-x_*\right\|^2}{2}\right) \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_t-x_*\right\|^2+2 \gamma^3 n^2 L^3\left\|x_t-x_*\right\|^2\right) \\
	& \mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2 \mid \mathcal{F}_t\right] \leq \frac{1}{1+\gamma \mu n}\left(1+2 \gamma^3 n^2 L^3\right)\left\|x_t-x_*\right\|^2
\end{aligned}
$$
We can use the tower property of conditional expectation to obtain
$$
\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2\right] \leq \frac{1+2 \gamma^3 L^3 n^2}{1+\gamma \mu n} \mathbb{E}\left[\left\|x_t-x_*\right\|^2\right]
$$
Since $\gamma \leq \frac{1}{2 \sqrt{2} L n} \sqrt{\frac{\mu}{L}}, n \geq 1$ and $\mu \leq L$ we have
$$
\frac{1}{4 n}+\frac{1}{4 \sqrt{2}} \frac{\mu}{L} \sqrt{\frac{\mu}{L}} \leq \frac{1}{2}
$$
Multiplying this inequality by $\mu$, we obtain
$$
2 \cdot \frac{1}{8 L^2 n^2} \cdot \frac{\mu}{L} L^3 n+\frac{1}{2 \sqrt{2} L n} \sqrt{\frac{\mu}{L}} \cdot \frac{n \mu^2}{2} = \frac{1}{4 n} \mu+\frac{1}{4 \sqrt{2}} \frac{\mu}{L} \sqrt{\frac{\mu}{L}} \mu \leq \frac{\mu}{2}.
$$
The left-hand side is the value of $2 \gamma^2 L^3 n+\frac{\gamma n \mu^2}{2}$ at the largest admissible stepsize $\gamma=\frac{1}{2 \sqrt{2} L n} \sqrt{\frac{\mu}{L}}$, and this expression is increasing in $\gamma$.
We continue to derive inequalities:
$$
\begin{aligned}
	& 2 \gamma^2 L^3 n+\frac{\gamma n \mu^2}{2} \leq \frac{\mu}{2} \\
	& 2 \gamma^2 L^3 n \leq \frac{\mu}{2}-\frac{\gamma n \mu^2}{2} \\
	& 2 \gamma^2 L^3 n^2 \leq \frac{n \mu}{2}-\frac{\gamma n^2 \mu^2}{2} \\
	& 1+2 \gamma^3 L^3 n^2 \leq 1+\frac{\gamma n \mu}{2}-\frac{\gamma^2 n^2 \mu^2}{2}
\end{aligned}
$$
Finally, we obtain
$$
\frac{1+2 \gamma^3 L^3 n^2}{1+\gamma \mu n} \leq 1-\frac{\gamma n \mu}{2}
$$
Plugging this inequality into $\mathbb{E}\left[\left\|x_{t+1}-x_*\right\|^2\right] \leq \frac{1+2 \gamma^3 L^3 n^2}{1+\gamma \mu n} \mathbb{E}\left[\left\|x_t-x_*\right\|^2\right]$, we unroll the recursion and obtain the final result:
$$
\mathbb{E}\left[\left\|x_T-x_*\right\|^2\right] \leq\left(1-\frac{\gamma n \mu}{2}\right)^T\left\|x_0-x_*\right\|^2
$$
\end{proof}



\subsection{Proof of Theorem 3}
%	Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex and Assumption~\ref{L-smooth} holds. Fix constant $0<\delta<1$. If the stepsize satisfies $\gamma\leq\frac{\delta}{L}\sqrt{\frac{\mu}{2nL}}$ and if number of functions is sufficiently big, $$n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1},$$ then the iterates generated by \RR (Algorithm~\ref{alg:RRSVRG}) or by \RS (Algorithm~\ref{alg:SOSVRG}) satisfy
%	\begin{align*}
%	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( \left(1 - \gamma \mu\right)^n +\delta^2 \right)^T \|x_0 - x_*\|^2.
%	\end{align*}

% \begin{proof}

We start from the conditional analogue of Theorem 1 in \citet{mishchenko2020random} (similarly to Section~\ref{D.1}), which states that
$$\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t \right] \leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+2 \gamma^{2} \sigma_{\text {Shuffle }}^{2}\left(\sum_{i=0}^{n-1}(1-\gamma \mu)^{i}\right).$$
Using Proposition 1 from \citep{mishchenko2020random}, which says that
$$\frac{\gamma \mu n}{8} \sigma_{*}^{2} \leq \sigma_{\text {Shuffle }}^{2} \leq \frac{\gamma L n}{4} \sigma_{*}^{2},$$
we get 
\begin{align*} \mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t \right] & \leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+ \frac{\gamma^3 L n}{2} \sigma_{*}^{2}\left(\sum_{i=0}^{n-1}(1-\gamma \mu)^{i}\right) \\
	& \leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+ \frac{\gamma^2 L n}{2\mu} \sigma_{*}^{2}. \end{align*}

Now we can apply Lemma 1 and Reformulation. Using $y_t = x_t$ we have the following inequality:
\begin{align*}
	\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t \right] &\leq(1-\gamma \mu)^{n} \left\|x_{t}-x_{*}\right\|^{2}+ \frac{2\gamma^2 L^3 n}{\mu}\|x_t - x_*\|^2\\
	&=\left((1-\gamma \mu)^{n} + \frac{2\gamma^2 L^3 n}{\mu}\right)\|x_t - x_*\|^2.
\end{align*}
Applying the tower property, we get
\begin{align*}
	\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] &\leq\left((1-\gamma \mu)^{n} + \frac{2\gamma^2 L^3 n}{\mu}\right)\mathbb{E}\left[\|x_t - x_*\|^2\right],
\end{align*}
and after	unrolling this recursion, we get
\begin{align*}
	\mathbb{E}\left[\left\|x_{T}-x_{*}\right\|^{2}\right] &\leq\left((1-\gamma \mu)^{n} + \frac{2\gamma^2 L^3 n}{\mu}\right)^T\mathbb{E}\left[\|x_0 - x_*\|^2\right]\\
	&\leq\left((1-\gamma \mu)^{n} + \frac{\delta^2}{L^2}\frac{\mu}{2nL}\frac{2 L^3 n}{\mu}\right)^T\mathbb{E}\left[\|x_0 - x_*\|^2\right]\\
	&\leq\left((1-\gamma \mu)^{n} +\delta^2\right)^T\mathbb{E}\left[\|x_0 - x_*\|^2\right],
\end{align*}
where we used the stepsize restriction $\gamma\leq\frac{\delta}{L}\sqrt{\frac{\mu}{2nL}}$. In order for this to lead to convergence, we need to assume that
$(1-\gamma \mu)^{n} +\delta^2 <1.$	This is satisfied, for example, if $n$ is large enough. In particular, this holds when
$$n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1}.$$


Finally, 
using the additional assumption 
$\delta^2 \leq (1-\gamma\mu)^{\frac{n}{2}}\left(1-(1-\gamma\mu)^{\frac{n}{2}}\right),$
we get
\begin{align*}
	\delta^2+(1-\gamma\mu)^{n} \leq (1-\gamma\mu)^{\frac{n}{2}}.
\end{align*} 
Now we can apply Theorem~3 and get
\begin{align*}
	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq  \left(1 - \gamma \mu\right)^{\frac{nT}{2}} \|x_0 - x_*\|^2.
\end{align*}
Finally, we apply Lemma~\ref{lem:itercomplex} with $\gamma = \frac{\delta}{L}\sqrt{\frac{\mu}{2nL}}$ and get iteration complexity
$
T = \mathcal{O}\left(\kappa\sqrt{\frac{\kappa}{n}}\log \left(\frac{1}{\varepsilon}\right)\right).
$

% \end{proof}



\subsection{Proof of Theorem 4}
Suppose the functions $f_1, f_2, \ldots, f_n$ are convex and Assumption 1 holds. Then for \RR  or \RS  with stepsize $\gamma \leq \frac{1}{\sqrt{2}Ln},$ the average iterate $\hat{x}_{T} \eqdef \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies 
\begin{align*}
	\mathbb{E}\left[f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right] \leq \frac{3\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma n T}.
\end{align*}
\begin{proof}
	We start with conditional analogue of Lemma 3 from~\citet{mishchenko2020random} (similarly to Section \ref{D.1}), which says that
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t\right] \leq \left\|x_{t}-x_{*}\right\|^{2} -2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\mid \mathcal{F}_t\right]+\frac{\gamma^{3} Ln^{2} \sigma_{*}^{2}}{2}.\end{align*}
	
	Applying this inequality to the reformulated problem, we get
	\begin{equation}\label{eq:bug87gdfd_8y9fd}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\mid \mathcal{F}_t\right] \leq \left\|x_{t}-x_{*}\right\|^{2}-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t\right]+\frac{\gamma^{3} L n^{2} \left(\sigma_{*}^t\right)^{2}}{2}.
	\end{equation}
	Using Lemma 1 and the fact that $y_t=x_t$ and $f=f^t$, we get  
	\begin{equation}\label{eq:u987g9fdf}\left(\sigma_*^t\right)^2 \leq 8LD_{f^t}(x_t,x_*) = 8LD_{f}(x_t,x_*) =8L(f(x_t) - f(x_*)),\end{equation}
	where the last identity follows from Proposition 1.
	
	Plugging \eqref{eq:u987g9fdf} into \eqref{eq:bug87gdfd_8y9fd}, we obtain
	\begin{align*}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\mid \mathcal{F}_t\right] \leq \left\|x_{t}-x_{*}\right\|^{2}-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid \mathcal{F}_t\right]+\frac{\gamma^{3} L n^{2}}{2} \cdot 8L(f(x_t) - f(x_*)),
	\end{align*}
	which after using the tower property turns into
	\begin{align*}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]+4\gamma^{3} L^2 n^{2} \mathbb{E}\left[f(x_t) - f(x_*)\right].
	\end{align*}
	Now we subtract from both sides:
	\begin{align*}
		2 \gamma n \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] - 4\gamma^3L^2n^2\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] & \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\\
	&	\qquad+4\gamma^{3} L^2 n^{2} \mathbb{E}\left[f(x_t) - f(x_*)\right]\\
		&\qquad- 4\gamma^3L^2n^2\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\\
		\left(2 \gamma n - 4\gamma^3L^2n^2\right)\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] &\leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\\
	&	\qquad+ 4\gamma^{3} L^2 n^{2}\left( \mathbb{E}\left[f(x_t) - f(x_*)\right] - \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\right)\\
		2 \gamma n\left(1 - 2\gamma^2L^2n\right)\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]& \leq \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\\
	&	\qquad + 4\gamma^{3} L^2 n^{2}\left( \mathbb{E}\left[f(x_t) - f(x_*)\right] - \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\right).
	\end{align*}
	Summing these inequalities for $t=0,1,\ldots,T-1$ gives
	\begin{align*}
		2 \gamma n\left(1 - 2\gamma^2L^2n\right)\sum_{t=0}^{T-1}\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] &\leq \sum_{t=0}^{T-1}\left(\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]-\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]\right)\\
		&\qquad + 4\gamma^{3} L^2 n^{2}\sum_{t=0}^{T-1}\left( \mathbb{E}\left[f(x_t) - f(x_*)\right] - \mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right]\right)\\
		&= \mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right] - \mathbb{E}\left[\left\|x_{T}-x_{*}\right\|^{2}\right]\\
		&\qquad+4\gamma^{3} L^2 n^{2}\mathbb{E}\left[f\left(x_{0}\right)-f\left(x_{*}\right)\right] - 4\gamma^{3} L^2 n^{2}\mathbb{E}\left[f\left(x_{T}\right)-f\left(x_{*}\right)\right]\\
		&\leq \mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right] +4\gamma^{3} L^2 n^{2}\mathbb{E}\left[f\left(x_{0}\right)-f\left(x_{*}\right)\right]\\
		&\leq \mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right] + 2\gamma^{3} L^3 n^{2}\mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right]\\
		&= (1+2\gamma^{3} L^3 n^{2})\mathbb{E}\left[\left\|x_{0}-x_{*}\right\|^{2}\right],
	\end{align*}
	and dividing both sides by $2 \gamma n\left(1 - 2\gamma^2L^2n\right)T$, we get
	\begin{align*}
		\frac{1}{T}\sum_{t=0}^{T-1}\mathbb{E}\left[f\left(x_{t+1}\right)-f\left(x_{*}\right)\right] &\leq \frac{1+2\gamma^{3} L^3 n^{2}}{1 - 2\gamma^2L^2n}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
	\end{align*}
	Using the convexity of $f$, the average iterate $\hat{x}_{T} \stackrel{\text { def }}{=} \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies
	\begin{align*}
		\mathbb{E}\left[f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right] \leq \frac{1}{T} \sum_{t=1}^{T} \mathbb{E}\left[f\left(x_{t}\right)-f\left(x_{*}\right)\right]\leq \frac{1+2\gamma^{3} L^3 n^{2}}{1 - 2\gamma^2L^2n}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
	\end{align*}
	Let us show that 
	$$\frac{1+2\gamma^{3} L^3 n^{2}}{1 - 2\gamma^2L^2n} \leq 3.$$
	Since the left-hand side is increasing in $\gamma$, it suffices to check the largest admissible stepsize $\gamma = \frac{1}{\sqrt{2}Ln}$, for which we need
	$$\frac{1+2\frac{1}{2\sqrt{2}L^3n^3}L^3n^2}{1-2\frac{1}{2L^2n^2}L^2n} = \frac{1+\frac{1}{\sqrt{2}n}}{1-\frac{1}{n}}\leq 3.$$
	This inequality is equivalent to $4n\geq 6+\sqrt{2}$, which holds for every $n \in \mathbb{N}$ with $n>1$. Finally, we have 
	$$ \mathbb{E}\left[f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right] \leq \frac{3\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.$$
\end{proof}


%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%
\clearpage
\subsection{Proof of Theorem~5~and~Theorem~6 }
We provide analysis for non-convex settings.

Let us recall our reformulation:
$$f(x)=\frac{1}{n} \sum_{i=1}^{n} f_{i}(x)=\frac{1}{n} \sum_{i=1}^{n}\left(f_{i}(x)+\left\langle a_{t}^{i}, x\right\rangle\right):=\frac{1}{n} \sum_{i=1}^{n} f_{i}^{t}(x),$$

where $f_{i}^{t}(x):=f_{i}(x)+\left\langle a_{t}^{i}, x\right\rangle$ and $\sum_{i=1}^{n} a_{t}^{i}=0$. Note that
$$
\nabla f_{i}^{t}(x)=\nabla f_{i}(x)+a_{t}^{i}.
$$ 
In particular, we choose
$$a_{t}^{i}:=-\nabla f_{\pi_{i}}\left(y_{t}\right)+\nabla f\left(y_{t}\right).$$
Finally, we have 
$$\nabla f_{\pi_i}^{t}(x) = \nabla f_{\pi_i}(x)-\nabla f_{\pi_{i}}\left(y_{t}\right)+\nabla f\left(y_{t}\right).$$

Now we need to establish an analogue of Lemma 1 for gradient variance. Let us define 
$$ \sigma^2(x_t) = \frac{1}{n} \sum_{i=1}^{n}\left\Vert\nabla f_{i}(x_t)-\nabla f(x_t)\right\Vert^{2}.$$
\begin{lemma} If we apply the linear perturbation reformulation, then the gradient variance of the reformulated problem $\left(\sigma_t^2\right)$ is equal to zero.
\end{lemma}

\begin{proof}
	
	$$
	\sigma_t^2\left(x_t\right)=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i^t\left(x_t\right)-\nabla f\left(x_t\right)\right\|^2=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_t\right)-\nabla f_i\left(y_t\right)+\nabla f\left(y_t\right)-\nabla f\left(x_t\right)\right\|^2
	$$
	In Algorithm \ref{alg:GENERIC-SVRG} (\RR) we set $x_t=y_t$, and hence we have
	$$
	\sigma_t^2\left(x_t\right)=\frac{1}{n} \sum_{i=1}^n\left\|\nabla f_i\left(x_t\right)-\nabla f_i\left(x_t\right)+\nabla f\left(x_t\right)-\nabla f\left(x_t\right)\right\|^2=0.
	$$
	

\end{proof}

Suppose that Assumption 1 holds. Then for Algorithm \RR run for $T$ epochs with a stepsize $\gamma\leq \frac{1}{2Ln}$ we have 
$$\squeeze\frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \frac{4(f(x_0) - f_*)} {\gamma n T}.$$
Choose $\gamma = \frac{1}{2nL}$. Then the mean of gradient norms satisfies 
$ \frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \varepsilon^2 $
provided the number of iterations satisfies 
$T = \mathcal{O}\left( \frac{8 \delta_{0} L }{\varepsilon^{2}}\right),$ where $\delta_0 \eqdef f(x_0)-f_*$.

Suppose that Assumption \ref{L-smooth} holds and $f$ satisfies the Polyak-Łojasiewicz inequality with $\mu>0$, i.e., $\left\Vert\nabla f(x) \right\Vert^{2} \geq 2 \mu(f(x)-f_*)$ for any $x \in \mathbb{R}^{d}$. Then for Algorithm \RR run for $T$ epochs with a stepsize $\gamma\leq \frac{1}{2Ln}$ we have 
$$\squeeze \mathbb{E}\left[f(x_T)-f_*\right] \leq\left(1-\frac{\gamma \mu n}{2}\right)^{T}\left(f(x_0)-f_*\right),  $$
then the relative error satisfies 
$\frac{\mathbb{E}\left[f\left(x_{T}\right)-f_*\right]}{f\left(x_0\right)-f_*} \leq \varepsilon $
provided the number of iterations satisfies $T = \mathcal{O} (\kappa \log \frac{1}{\varepsilon}).$

\begin{proof}
	We start from conditional analogues of Lemmas 4 and 5 from \citet{mishchenko2020random} (similarly to Section \ref{D.1})
	\begin{align*}
		\mathbb{E} \left[f(x_{t+1})|\mathcal{F}_t\right] \leq f(x_t) - \frac{\gamma n}{2}\left\Vert \nabla f(x_t) \right\Vert^2 + \frac{\gamma L^2}{2}\left( \gamma^2 n^3 \left\Vert \nabla f(x_t) \right\Vert^2 + \gamma^2 n^2 \sigma^2(x_t)\right)
	\end{align*}
	This lemma works for the reformulated problem. Since we do not change the initial function $f(x)$, the gradient $\nabla f(x_t)$ remains the same. The only thing that changes is the variance of the gradient. According to the lemma proved above, this variance is equal to zero. Now we have the following inequality:
	\begin{align*}
		\mathbb{E} \left[f(x_{t+1})|\mathcal{F}_t\right] &\leq f(x_t) - \frac{\gamma n}{2}\left\Vert \nabla f(x_t) \right\Vert^2 + \frac{\gamma L^2}{2} \gamma^2 n^3 \left\Vert \nabla f(x_t) \right\Vert^2\\
		&\leq f\left(x_{t}\right)-\frac{\gamma n}{2}\left(1-\gamma^{2} L^{2} n^{2}\right)\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}
	\end{align*}
	Let $\delta_t = f(x_t) - f_*$. Adding $-f_*$ to both sides,
	\begin{align*}
		\mathbb{E} \left[\delta_{t+1}|\mathcal{F}_t\right] \leq \delta_{t}-\frac{\gamma n}{2}\left(1-\gamma^{2} L^{2} n^{2}\right)\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}
	\end{align*}
	Taking unconditional expectations and using that $\gamma \leq \frac{1}{2Ln}$ implies $1 - \gamma^2 L^2 n^2 \geq \frac{1}{2}$, we get 
	$$ \mathbb{E}\left[\delta_{t+1}\right] \leq\mathbb{E}\left[\delta_{t}\right]-\frac{\gamma n}{4} \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right]. $$
	It leads to
	
	\begin{align*}
		\frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \frac{4} {\gamma n }\frac{1}{T}\sum_{t = 0}^{T-1}\left(\mathbb{E}\left[\delta_{t}\right] - \mathbb{E}\left[\delta_{t+1}\right] \right) = \frac{4\left(\delta_0 - \mathbb{E}\left[\delta_{T}\right]\right)}{\gamma n T} \leq  \frac{4\delta_0} {\gamma n T}.
	\end{align*}
	
	If we have PL condition, then we start from
	$$ \mathbb{E}\left[\delta_{t+1}\right] \leq\mathbb{E}\left[\delta_{t}\right]-\frac{\gamma n}{4} \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right]. $$
	Applying $\frac{1}{2}\|\nabla f(x)\|^{2} \geq \mu(f(x)-f_*)$ leads to 
	$$ \mathbb{E}\left[\delta_{t+1}\right] \leq \mathbb{E}\left[\delta_{t}\right]-\frac{\gamma \mu n}{2} \mathbb{E}\left[f\left(x_{t}\right)-f_{*}\right].$$
	Unrolling this recursion, we get 
	$$ \mathbb{E}\left[\delta_{T}\right] \leq\left(1-\frac{\gamma \mu n}{2}\right)^{T} \delta_{0}.$$
	
	Suppose that Assumption 1 holds. Choose the stepsize $\gamma$ as $\frac{1}{2nL}$. Then the mean of gradient norms satisfies 
	$$ \frac{1}{T} \sum_{t = 0}^{T-1}  \mathbb{E}\left[\left\Vert\nabla f\left(x_{t}\right)\right\Vert^{2}\right] \leq \varepsilon^2 $$
	provided the number of iterations satisfies 
	$$T \geq \frac{8 \delta_{0} L }{\varepsilon^{2}}.$$
	If $f$ satisfies the Polyak-Łojasiewicz inequality, then the relative error satisfies 
	$$\frac{\mathbb{E}\left[f\left(x_{T}\right)-f_*\right]}{\left(f\left(x_0\right)-f_*\right)} \leq \varepsilon $$
	provided the number of iterations satisfies 
	$$T = \mathcal{O} \left(\kappa \log \frac{1}{\varepsilon}\right).$$
\end{proof}

\section{Analysis of Det-Shuffle}

%\peter{The method is deterministic. Why do we have expectations in the theorems and proofs?}

\subsection{Proof of Theorem~7}

%		Suppose that each $f_i$ is convex function, $f$ is $\mu$-strongly convex function, and Assumption~\ref{L-smooth}
%		holds. Then provided the stepsize satisfies $\gamma \leq \frac{1}{4 L n}\sqrt{\frac{\mu}{L}},$
%		the iterates generated by \DS satisfy
%		\begin{align*}
%		\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
%		\end{align*}

We start from Lemma 8 in~\citet{mishchenko2020random}
\begin{align}
	\left\|x_{t+1}-x_{*}\right\|^{2} \leq\left\|x_{t}-x_{*}\right\|^{2}-2 \gamma n\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)+\gamma^{3} L n^{3} \sigma_{*}^{2}.
\end{align}

Now we can apply this inequality to the reformulated problem~\eqref{reform}. Using strong convexity we obtain
\begin{align*}
	\left\|x_{t+1}-x_{*}\right\|^{2} &\leq \left\|x_{t}-x_{*}\right\|^{2}-2 \gamma n \left(f\left(x_{t+1}\right)-f(x_{*})\right)+\gamma^{3} L n^{3} \left(\sigma_*^t\right)^2\\
	&\leq\left\|x_{t}-x_{*}\right\|^{2}- \gamma n \mu\left(\left\|x_{t+1}-x_{*}\right\|^{2} \right) +\gamma^{3} L n^{3} \left(\sigma_*^t\right)^2.
\end{align*}
Since we update $y_t = x_t$ after each epoch, this leads to
\begin{align*}
	\left\|x_{t+1}-x_{*}\right\|^{2} &\leq \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+\gamma^{3} L n^{3} \left(\sigma_*^t\right)^2\right)\\
	&\leq \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+\gamma^{3} L n^{3} \cdot 4 L^2\|y_t-x_*\|^2\right)\\
	&= \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+4\gamma^{3} n^{3} L^3\|x_t-x_*\|^2\right)\\
	&= \frac{1}{1+\gamma \mu n}\left(1+4\gamma^{3} n^{3} L^3\right)\|x_t-x_*\|^2.
\end{align*}
We obtain
\begin{equation*}
	\left\|x_{t+1}-x_{*}\right\|^{2} \leq \frac{1+4 \gamma^{3} L^3 n^{3} }{1+\gamma \mu n} \left\|x_{t}-x_{*}\right\|^{2}.
\end{equation*}
Since we have $\mu\leq L$ we obtain 
\begin{align*}
		\frac{1}{4} + \frac{1}{8}\frac{\mu}{L}\sqrt{\frac{\mu}{L}} &\leq \frac{1}{2}\\
			\frac{1}{4}\mu + \frac{1}{8}\frac{\mu}{L}\sqrt{\frac{\mu}{L}}\mu &\leq \frac{\mu}{2}\\
	4\cdot\frac{1}{16L^2n^2}\cdot\frac{\mu}{L}L^3n^2+\frac{1}{4Ln}\sqrt{\frac{\mu}{L}}\cdot\frac{n\mu^2}{2}&\leq \frac{\mu}{2}.
\end{align*}
Now as $\gamma \leq \frac{1}{4 L n}\sqrt{\frac{\mu}{L}}$, we have
\begin{align*}
		4 \gamma^{2} L^3 n^2 + \frac{\gamma n \mu^2}{2}  &\leq  \frac{\mu}{2}\\
			4 \gamma^{2} L^3 n^2  &\leq  \frac{\mu}{2} - \frac{\gamma n \mu^2}{2}   \\
		4 \gamma^{2} L^3 n^{3}  &\leq  \frac{n \mu}{2} - \frac{\gamma n^2 \mu^2}{2}   \\
		1+4 \gamma^{3} L^3 n^{3}  &\leq 1 + \frac{\gamma n \mu}{2} - \frac{\gamma^2 n^2 \mu^2}{2}.
\end{align*}
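Since $(1+\gamma \mu n)\left(1-\frac{\gamma n \mu}{2}\right) = 1+\frac{\gamma n \mu}{2}-\frac{\gamma^2 n^2 \mu^2}{2}$, dividing both sides by $1+\gamma \mu n$ gives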
\begin{align*}
	\frac{1+4 \gamma^{3} L^3 n^{3} }{1+\gamma \mu n} \leq 1 - \frac{\gamma n \mu}{2}.
\end{align*}



We can unroll the recursion and obtain
\begin{equation*}
	\mathbb{E} \left[ \|x_T - x_* \|^2 \right] \leq \left( 1 - \frac{\gamma n \mu}{2} \right)^T \|x_0 - x_*\|^2.
\end{equation*}




\subsection{Proof of Theorem~8}
Suppose the functions $f_1, f_2, \ldots, f_n$ are convex and Assumption 1 holds. Then for Algorithm 1 (\DS) with a stepsize $\gamma \leq \frac{1}{2\sqrt{2}Ln}$, the average iterate $\hat{x}_{T} \eqdef \frac{1}{T} \sum_{j=1}^{T} x_{j}$ satisfies 
\begin{align*}
	f\left(\hat{x}_{T}\right)-f\left(x_{*}\right) \leq \frac{2\left\|x_{0}-x_{*}\right\|^{2}}{ \gamma n T}.
\end{align*}
We start with Lemma 8 from~\citet{mishchenko2020random}:
\begin{align*}
	\left\|x_{t+1}-x_{*}\right\|^{2} \leq \left\|x_{t}-x_{*}\right\|^{2} -2 \gamma n \left( f\left(x_{t+1}\right)-f\left(x_{*}\right) \right)+\gamma^{3} Ln^{3} \sigma_{*}^{2}\\
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right) \right) \leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}+\gamma^{3} L n^{3} \sigma_{*}^{2}.
\end{align*}
Using Lemma~\ref{main_lemma_lemma} and considering $y_t=x_t$, we have  
$$\left(\sigma_*^t\right)^2 \leq 8LD_{f^t}(x_t,x_*).$$
Applying Proposition~\ref{prop-reform} we get 
$$\left(\sigma_*^t\right)^2 \leq 8LD_{f}(x_t,x_*) = 8L(f(x_t) - f(x_*)).$$
Next, we utilize the inner product reformulation and get 
\begin{align*}
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) \leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2} +\gamma^{3} L n^{3} \cdot 8L(f(x_t) - f(x_*)).
\end{align*}
Simplifying the last term, we have
\begin{align*}
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) \leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}+8\gamma^{3} L^2 n^{3} \left((f(x_t) - f(x_*))\right).
\end{align*}
Now we subtract from both sides:
\begin{align*}
	2 \gamma n \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) - 8\gamma^3L^2n^3\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)  &\leq \left(\left\|x_{t}-x_{*}\right\|^{2}\right)-\left\|x_{t+1}-x_{*}\right\|^{2}\\
	&+8\gamma^{3} L^2 n^{3} \left((f(x_t) - f(x_*))\right)\\
	&- 8\gamma^3L^2n^3\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\\
	\left(2 \gamma n - 8\gamma^3L^2n^3\right)\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}\\
	&\qquad+ 8\gamma^{3} L^2 n^{3}\left( \left(f(x_t) - f(x_*)\right) - \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\right)\\
	2 \gamma n\left(1 - 4\gamma^2L^2n^2\right)\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}\\
	&\qquad+ 8\gamma^{3} L^2 n^{3}\left( \left(f(x_t) - f(x_*)\right) - \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\right).
\end{align*}
Summing these inequalities for $t=0,1,\ldots,T-1$ gives
\begin{align*}
	2 \gamma n\left(1 - 4\gamma^2L^2n^2\right)\sum_{t=0}^{T-1}\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \sum_{t=0}^{T-1}\left(\left\|x_{t}-x_{*}\right\|^{2}-\left\|x_{t+1}-x_{*}\right\|^{2}\right)\\
	&\qquad + 8\gamma^{3} L^2 n^{3}\sum_{t=0}^{T-1}\left( \left(f(x_t) - f(x_*)\right) - \left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right)\right)\\
	&= \left\|x_{0}-x_{*}\right\|^{2} - \left\|x_{T}-x_{*}\right\|^{2}\\
	&\qquad+8\gamma^{3} L^2 n^{3}\left(f\left(x_{0}\right)-f\left(x_{*}\right)\right) - 8\gamma^{3} L^2 n^{3}\left(f\left(x_{T}\right)-f\left(x_{*}\right)\right)\\
	&\leq \left\|x_{0}-x_{*}\right\|^{2} +8\gamma^{3} L^2 n^{3}\left(f\left(x_{0}\right)-f\left(x_{*}\right)\right)\\
	&\leq \left\|x_{0}-x_{*}\right\|^{2} + 4\gamma^{3} L^3 n^{3}\left\|x_{0}-x_{*}\right\|^{2}\\
	&= (1+4\gamma^{3} L^3 n^{3})\left\|x_{0}-x_{*}\right\|^{2},
\end{align*}
and dividing both sides by $2 \gamma n\left(1 - 4\gamma^2L^2n^2\right)T$, we get
\begin{align*}
	\frac{1}{T}\sum_{t=0}^{T-1}\left(f\left(x_{t+1}\right)-f\left(x_{*}\right)\right) &\leq \frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
\end{align*}
Using the convexity of $f$, the average iterate $\hat{x}_{T} \stackrel{\text { def }}{=} \frac{1}{T} \sum_{t=1}^{T} x_{t}$ satisfies
\begin{align*}
	\left(f\left(\hat{x}_{T}\right)-f\left(x_{*}\right)\right) \leq \frac{1}{T} \sum_{t=1}^{T} \left(f\left(x_{t}\right)-f\left(x_{*}\right)\right)\leq \frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2}\frac{\left\|x_{0}-x_{*}\right\|^{2}}{2 \gamma nT}.
\end{align*}
Let us show that 
$$\frac{1+4\gamma^{3} L^3 n^{3}}{1 - 4\gamma^2L^2n^2} \leq 4.$$
Applying $\gamma\leq \frac{1}{2\sqrt{2}Ln}$ we have 
$$\frac{1+4\frac{1}{16\sqrt{2}L^3n^3}L^3n^3}{1-4\frac{1}{8L^2n^2}L^2n^2} = \frac{1+\frac{1}{4\sqrt{2}}}{1-\frac{1}{2}}\leq 4.$$
Finally, we have 
$$f\left(\hat{x}_{T}\right)-f\left(x_{*}\right) \leq \frac{2\left\|x_{0}-x_{*}\right\|^{2}}{\gamma nT}.$$
This ends the proof. 



%%%%%%%%%%%%%%	
\clearpage	
\section{One More Algorithm: \algname{RR-VR}}


\subsection{New Algorithm: \algname{RR-VR}}

\begin{algorithm}[h]
	\caption{Random Reshuffling with Variance Reduction}
	\label{alg:RR_VR}	
	\begin{algorithmic}[1]
		\STATE \textbf{Input:} Stepsize $\gamma>0$, probability $p$, $x_0 = x_0^0 \in \mathbb{R}^{d}, y_0 \in \mathbb{R}^{d}$, number of epochs $T$.
		\FOR{$t =  0, 1, \dots T-1$ }
		\STATE {\color{red}Choose a random permutation $\{\pi_0, \dots, \pi_{n-1}\}$ of $\{1, \dots, n\}$}
		\STATE $x_t^0 = x_t$
		\FOR{$i= 0, \dots, n-1$ }
		\STATE $g^i_t(x_t^i,y_t) =  \nabla f_{\pi_i} (x_t^i)-\nabla f_{\pi_i} (y_t)+\nabla f (y_t) $
		\STATE $x^{i+1}_t = x^i_t - \gamma g^i_t(x_t^i,y_t)$
		\ENDFOR
		\STATE $x_{t+1} = x^n_t$
		\STATE  $y_{t+1}=\begin{cases} y_t & \text{with probability } 1-p \\ x_t & \text{with probability } p \end{cases}$
		\ENDFOR
	\end{algorithmic}
	
\end{algorithm}


In this section we formulate convergence results for a generalized version of \algname{SVRG} under random reshuffling. Analysis of \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) is more complicated. 


\subsection{Convergence Theory}	

To analyze this method, we introduce Lyapunov functions. 

	\label{th7}
	Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption 1
	holds. Then provided the parameters satisfy $n>\kappa$, $\frac{\kappa}{n}<p<1$ and $\gamma \leq \frac{1}{2\sqrt{2}Ln}$, 
	the final iterate generated by \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) satisfies
	$
	V_{T} \leq \max \left( q_1,q_2 \right)^{T} V_{0},
	$
	where
	$
	q_1 = 1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right)$,		$q_2 = 1-p+\frac{8}{\mu} \gamma^{2} L^{3} n$, 
	and the Lyapunov function is defined via
	\begin{align*}
		V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\left(\frac{4}{\gamma\mu n}\right)^{-1}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
	\end{align*}
	This means that the iteration complexity of Algorithm~\ref{alg:RR_VR} is
	$
	T = \mathcal{O}\left(\kappa\log \left(\frac{1}{\varepsilon}\right)\right).
	$

Note that the probability $p$ should not be too small. 
We obtain the same complexity as that of \RR. 


	Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex, and that Assumption~\ref{L-smooth} holds. Then for \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) with parameters that satisfy $\gamma \leq \frac{1}{2L}\sqrt{\frac{\mu}{2nL}}$, $\frac{1}{2}<\delta<\frac{1}{\sqrt{2}}$, $0<p<1$, and for a sufficiently large number of functions, $n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1}$, the iterates generated by the \algname{RR-VR} algorithm satisfy
	$	
	V_{T} 
	\leq \max \left(q_1,q_2\right)^{T} V_{0},$
	where $ \squeeze q_1 
	= (1-\gamma \mu)^{n}+\delta^2$, 
	$q_2 	
	= 1-p\left(1-\frac{2\gamma^2L^3n}{\mu\delta^2}\right),$ and 
	$$
	V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{\delta^2}{p}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
	$$
	This means that the iteration complexity of Algorithm~\ref{alg:RR_VR} is
	$
	T = \mathcal{O}\left(\max\left(\kappa\sqrt{\frac{\kappa}{n}},\frac{1}{2\log (2\delta)}\right)\log \left(\frac{1}{\varepsilon}\right)\right).
	$


We get almost the same rate as the rate of \RR, but there is one difference. Complexity depends on $\delta$ term. However, the first term dominates in most cases. 


\subsection{Proof of Theorem~9}
Suppose that each $f_i$ is convex, $f$ is $\mu$-strongly convex, and Assumption 1
holds. Then provided the parameters satisfy $n>\kappa$, $\frac{\kappa}{n}<p<1$ and $\gamma \leq \frac{1}{2\sqrt{2}Ln}$, 
the final iterate generated by \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) satisfies
\begin{align*}
	V_{T} \leq \max \left( q_1,q_2 \right)^{T} V_{0},
\end{align*}
where
\begin{align*}
	q_1 = 1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right), \quad
	q_2 = 1-p+\frac{8}{\mu} \gamma^{2} L^{3} n,
\end{align*}
and the Lyapunov function is defined via
\begin{align*}
	V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{\gamma\mu n}{4}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
\end{align*}
\begin{proof}
	For the problem $\frac{1}{n}\sum_{i=1}^{n}f^t_i(x)$ we will use an inequality from \citet{mishchenko2020random}:
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t\right] & \leq \frac{1}{1+\gamma \mu n}\left(\left\|x_{t}-x_{*}\right\|^{2}+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}\right) \\
		&=\frac{1}{1+\gamma \mu n} \left\|x_{t}-x_{*}\right\|^{2}+\frac{1}{1+\gamma \mu n} \frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2} \\
		& \leq\left(1-\frac{\gamma \mu n}{2}\right) \left\|x_{t}-x_{*}\right\|^{2}+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}.
	\end{align*}
	Now we apply Lemma 1 to bound $\sigma_{*}^{2}$ in terms of $\|y_t-x_*\|^2$:
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t, y_t\right] &\leq\left(1-\frac{\gamma \mu n}{2}\right) \left\|x_{t}-x_{*}\right\|^{2}+\frac{\gamma^{3} L n^{2} \sigma_{*}^{2}}{2}\\
		&\leq\left(1-\frac{\gamma \mu n}{2}\right) \left\|x_{t}-x_{*}\right\|^{2} + 2\gamma^3L^3n^2\|y_t-x_*\|^2.
	\end{align*}
	Using tower property we have 
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] &= \mathbb{E}\left[\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t, y_t\right]\right] \\
		&\leq\left(1-\frac{\gamma \mu n}{2}\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +2\gamma^3L^3n^2\mathbb{E}\left[\|y_t-x_*\|^2\right].
	\end{align*}
	Now we look at 
	\begin{align*}
		y_{t+1}=\left\{\begin{array}{ll} y_t & \text{with probability } 1-p \\
			x_t & \text{with probability } p
		\end{array}\right. .
	\end{align*}
	We get
	\begin{align*}
		\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right] = (1-p)\|y_t - x_*\|^2+p\| x_t - x_* \|^2.
	\end{align*}
	Using tower property 
	\begin{align*}
		\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right] &= \mathbb{E}\left[\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right]\right]\\
		&= (1-p)\mathbb{E}\left[\|y_t - x_*\|^2\right]+p\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Finally, we have 
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right]  & \leq\left(1-\frac{\gamma \mu n}{2}\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +2\gamma^3L^3n^2\mathbb{E}\left[\|y_t-x_*\|^2\right]\\\notag
		&\qquad+(1-p)M\mathbb{E}\left[\|y_t - x_*\|^2\right] +pM\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Denote $V_{t} = \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t} - x_*\|^2\right].$ Using this we obtain
	\begin{align*}
		V_{t+1}&\leq\left(1-\frac{\gamma \mu n}{2}\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +2\gamma^3L^3n^2\mathbb{E}\left[\|y_t-x_*\|^2\right]\\\notag
		&\qquad+(1-p)M\mathbb{E}\left[\|y_t - x_*\|^2\right]+pM\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Thus,
	\begin{align*}
		V_{t+1}&\leq\left(1-\frac{\gamma \mu n}{2}+pM\right) \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +\left(1-p+\frac{1}{M}2\gamma^3L^3n^2\right)M\mathbb{E}\left[\|y_t-x_*\|^2\right].
	\end{align*}
	To have contraction we use 
	\begin{align*}
		M = \frac{\gamma\mu n}{4}, \qquad \gamma = \frac{1}{2\sqrt{2}Ln}.
	\end{align*}
	We have the final rate
	\begin{align*}
		V_{t+1} &\leq \max \left(1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right),  1-p+\frac{8}{\mu} \gamma^{2} L^{3} n \right)V_t\\
		V_{T} &\leq \max\left(1-\frac{\gamma \mu n}{4}\left(1-\frac{p}{2}\right),  1-p+\frac{8}{\mu} \gamma^{2} L^{3} n \right)^TV_0.
	\end{align*}
\end{proof} 
%	The proof of Corollary~\ref{corollary7} is an application of the lemma from Section~\ref{compl_lemma}.

\subsection{Proof of Theorem~10}
Suppose that the functions $f_1, \ldots, f_n$ are $\mu$-strongly convex, and that Assumption~1 holds. Then for \algname{RR-VR} (Algorithm~\ref{alg:RR_VR}) with parameters that satisfy $\gamma \leq \frac{1}{2L}\sqrt{\frac{\mu}{2nL}}$, $\frac{1}{2}<\delta<\frac{1}{\sqrt{2}}$, $0<p<1$, and for a sufficiently large number of functions, $n>\log\left(\frac{1}{1-\delta^2}\right)\cdot\left(\log\left(\frac{1}{1-\gamma\mu}\right)\right)^{-1}$, the iterates generated by the \algname{RR-VR} algorithm satisfy
\begin{align*}	
	V_{T} &
	\leq \max \left(q_1,q_2\right)^{T} V_{0},	
\end{align*}
where $$  q_1 
= (1-\gamma \mu)^{n}+\delta^2, \quad
q_2 	
= 1-p\left(1-\frac{2\gamma^2L^3n}{\mu\delta^2}\right),$$ and 
$$
V_t \eqdef \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{\delta^2}{p}\mathbb{E}\left[\left\|y_{t}-x_{*}\right\|^{2}\right]. 
$$
\begin{proof}
	For the problem $\frac{1}{n}\sum_{i=1}^{n}f^t_i(x)$ we will use two inequalities from \citet{mishchenko2020random}:
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t\right]  &\leq\left(1-\gamma \mu\right)^n \left\|x_{t}-x_{*}\right\|^{2}+2\gamma^2\sigma_{\text {Shuffle }}^2\left(\sum_{i=0}^{n-1}(1-\gamma\mu)^i\right)\\
		\sigma_{\text {Shuffle }}^{2} &\leq \frac{\gamma L n}{4} \sigma_{*}^{2}.
	\end{align*}
	Using this result, we have 
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t,y_t\right]  &\leq\left(1-\gamma \mu\right)^n \left\|x_{t}-x_{*}\right\|^{2}+\frac{1}{2}\gamma^3Ln\sigma_{*}^2\left(\sum_{i=0}^{n-1}(1-\gamma\mu)^i\right)\\
		&\leq\left(1-\gamma \mu\right)^n\left\|x_{t}-x_{*}\right\|^{2}+\frac{1}{\mu}2\gamma^2L^2nL\|y_t - x_*\|^2.
	\end{align*}
	Using tower property
	\begin{align*}
		\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right] &= \mathbb{E}\left[\mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\mid x_t,y_t\right]\right]\\
		&  \leq\left(1-\gamma \mu\right)^n\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{1}{\mu}2\gamma^2LnL^2\mathbb{E}\left[\|y_t - x_*\|^2\right].
	\end{align*}
	Now we look at 
	\begin{align*}
		y_{t+1}=\left\{\begin{array}{ll} y_t & \text{with probability } 1-p \\
			x_t & \text{with probability } p
		\end{array}\right..
	\end{align*}
	Thus,
	$
	\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right] = (1-p)\|y_t - x_*\|^2+p\| x_t - x_* \|^2.
	$
	Using tower property 
	\begin{align*}
		\squeeze
		\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right] &= \mathbb{E}\left[\mathbb{E}\left[\|y_{t+1} - x_*\|^2\mid x_t,y_t\right]\right]\\
		&= (1-p)\mathbb{E}\left[\|y_t - x_*\|^2\right]+p\mathbb{E}\left[\| x_t - x_* \|^2\right].
	\end{align*}
	Denote $V_{t} = \mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t} - x_*\|^2\right]$ and we have
	\begin{align*}\squeeze
		V_{t+1} &= \mathbb{E}\left[\left\|x_{t+1}-x_{*}\right\|^{2}\right]+M\mathbb{E}\left[\|y_{t+1} - x_*\|^2\right]\\
		& \leq\left(1-\gamma \mu\right)^n\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right]+\frac{2}{\mu}\gamma^2L^3n\mathbb{E}\left[\|y_t - x_*\|^2\right]+(1-p)M\mathbb{E}\left[\|y_t - x_*\|^2\right]+pM\mathbb{E}\left[\| x_t - x_* \|^2\right]\\
		&\leq \left(\left(1-\gamma \mu\right)^n+pM\right)\mathbb{E}\left[\left\|x_{t}-x_{*}\right\|^{2}\right] +\left((1-p)+\frac{2\gamma^2L^3n}{\mu M}\right)M\mathbb{E}\left[\| y_t - x_* \|^2\right]\\
		&\leq\max\left(\left(\left(1-\gamma \mu\right)^n+pM\right),\left((1-p)+\frac{2\gamma^2L^3n}{\mu M}\right)\right)V_t.
	\end{align*}
	Unrolling the recursion we have 
	\begin{align*}
		\squeeze
		V_T \leq \max\left(\left((1-\gamma\mu)^n+pM\right), \left(1-p + \frac{2\gamma^2L^3n}{\mu M}\right)\right)^TV_0.
	\end{align*}
	Applying $M = \frac{\delta^2}{p}$ and $\gamma \leq \frac{1}{2L}\sqrt{\frac{\mu}{2nL}}$ we get 
	\begin{align*}	
		\squeeze
		V_{T} &	
		\leq \max \left((1-\gamma \mu)^{n}+\delta^2,1-p\left(1-\frac{2\gamma^2L^3n}{\mu\delta^2}\right)\right)^{T} V_{0}.
	\end{align*}
\end{proof}
% \bibliographystyle{plainnat} % already set in the preamble; a second \bibstyle entry makes BibTeX report an error
\bibliography{biblio.bib}

\end{document}

